diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index e8fac1a13..6cdc7b594 100644 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -92,7 +92,20 @@ #if defined(USE_INTEL_SPEEDUP) #define HAVE_INTEL_AVX1 - #define HAVE_INTEL_AVX2 + + #if defined(__GNUC__) && ((__GNUC__ < 4) || \ + (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) + #define NO_AVX2_SUPPORT + #endif + #if defined(__clang__) && ((__clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 5)) + #define NO_AVX2_SUPPORT + #endif + + #define HAVE_INTEL_AVX1 + #ifndef NO_AVX2_SUPPORT + #define HAVE_INTEL_AVX2 + #endif #endif /* USE_INTEL_SPEEDUP */ #if defined(HAVE_INTEL_AVX2) @@ -150,9 +163,9 @@ static int InitSha256(wc_Sha256* sha256) } #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) - Transform(); Function prototype + Transform_Sha256(); Function prototype #else - Transform() { } + Transform_Sha256() { } int Sha256Final() { Save/Recover XMM, YMM ... @@ -171,7 +184,7 @@ static int InitSha256(wc_Sha256* sha256) #define XMM Instructions/inline asm - int Transform() { + int Transform_Sha256() { Stitched Message Sched/Round } @@ -179,7 +192,7 @@ static int InitSha256(wc_Sha256* sha256) #define YMM Instructions/inline asm - int Transform() { + int Transform_Sha256() { More granural Stitched Message Sched/Round } @@ -192,18 +205,29 @@ static int InitSha256(wc_Sha256* sha256) */ /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */ - static int Transform(wc_Sha256* sha256); + static int Transform_Sha256(wc_Sha256* sha256); #if defined(HAVE_INTEL_AVX1) - static int Transform_AVX1(wc_Sha256 *sha256); + static int Transform_Sha256_AVX1(wc_Sha256 *sha256); + static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, word32 len); #endif #if defined(HAVE_INTEL_AVX2) - static int Transform_AVX2(wc_Sha256 *sha256); - static int Transform_AVX1_RORX(wc_Sha256 *sha256); + static int Transform_Sha256_AVX2(wc_Sha256 *sha256); + static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, word32 len); + #ifdef HAVE_INTEL_RORX + static int Transform_Sha256_AVX1_RORX(wc_Sha256 *sha256); + static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256, word32 len); + static int Transform_Sha256_AVX2_RORX(wc_Sha256 *sha256); + static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, word32 len); + #endif #endif - static int (*Transform_p)(wc_Sha256* sha256) /* = _Transform */; + static int (*Transform_Sha256_p)(wc_Sha256* sha256); + /* = _Transform_Sha256 */ + static int (*Transform_Sha256_Len_p)(wc_Sha256* sha256, word32 len); + /* = NULL */ static int transform_check = 0; static word32 intel_flags; - #define XTRANSFORM(S, B) (*Transform_p)((S)) + #define XTRANSFORM(S) (*Transform_Sha256_p)((S)) + #define XTRANSFORM_LEN(S, L) (*Transform_Sha256_Len_p)((S),(L)) static void Sha256_SetTransform(void) { @@ -213,37 +237,44 @@ static int InitSha256(wc_Sha256* sha256) intel_flags = cpuid_get_flags(); - #if defined(HAVE_INTEL_AVX2) - if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) { - if (1) - Transform_p = Transform_AVX1_RORX; + #ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_AVX2(intel_flags)) { + #ifdef HAVE_INTEL_RORX + if (IS_INTEL_BMI2(intel_flags)) { + Transform_Sha256_p = Transform_Sha256_AVX2_RORX; + Transform_Sha256_Len_p = Transform_Sha256_AVX2_RORX_Len; + } else - Transform_p = Transform_AVX2; + #endif + if (1) + { + Transform_Sha256_p = Transform_Sha256_AVX2; + Transform_Sha256_Len_p = Transform_Sha256_AVX2_Len; + } + #ifdef HAVE_INTEL_RORX + else { + Transform_Sha256_p = Transform_Sha256_AVX1_RORX; + 
Transform_Sha256_Len_p = Transform_Sha256_AVX1_RORX_Len; + } + #endif } else #endif - #if defined(HAVE_INTEL_AVX1) - if (1) { - Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 : - Transform); + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + Transform_Sha256_p = Transform_Sha256_AVX1; + Transform_Sha256_Len_p = Transform_Sha256_AVX1_Len; } else #endif - Transform_p = Transform; + { + Transform_Sha256_p = Transform_Sha256; + Transform_Sha256_Len_p = NULL; + } transform_check = 1; } - /* Dummy for saving MM_REGs on behalf of Transform */ - #if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1) - #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\ - "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15") - #elif defined(HAVE_INTEL_AVX1) - #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\ - "xmm11","xmm12","xmm13","xmm14","xmm15") - #endif - int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) { int ret = 0; @@ -288,7 +319,8 @@ static int InitSha256(wc_Sha256* sha256) #include "fsl_mmcau.h" #endif - #define XTRANSFORM(S, B) Transform((S), (B)) + #define XTRANSFORM(S) Transform_Sha256((S)) + #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L)) int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) { @@ -315,7 +347,7 @@ static int InitSha256(wc_Sha256* sha256) return ret; } - static int Transform(wc_Sha256* sha256, byte* buf) + static int Transform_Sha256(wc_Sha256* sha256, byte* buf) { int ret = wolfSSL_CryptHwMutexLock(); if (ret == 0) { @@ -511,10 +543,6 @@ static int InitSha256(wc_Sha256* sha256) } #endif /* End Hardware Acceleration */ -#ifndef SAVE_XMM_YMM - #define SAVE_XMM_YMM -#endif - #ifdef NEED_SOFT_SHA256 static const ALIGN32 word32 K[64] = { @@ -559,10 +587,11 @@ static int InitSha256(wc_Sha256* sha256) h(j) = t0 + t1 #ifndef XTRANSFORM - #define XTRANSFORM(S, B) Transform((S)) - #endif + #define XTRANSFORM(S) Transform_Sha256((S)) + #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L)) + #endif - static int Transform(wc_Sha256* sha256) + static int Transform_Sha256(wc_Sha256* sha256) { word32 S[8], t0, t1; int i; @@ -623,7 +652,7 @@ static int InitSha256(wc_Sha256* sha256) static INLINE void AddLength(wc_Sha256* sha256, word32 len) { word32 tmp = sha256->loLen; - if ( (sha256->loLen += len) < tmp) + if ((sha256->loLen += len) < tmp) sha256->hiLen++; /* carry low to high */ } #endif @@ -655,9 +684,7 @@ static INLINE void AddLength(wc_Sha256* sha256, word32 len) if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE) return BUFFER_E; - SAVE_XMM_YMM; /* for Intel AVX */ - - while (len) { + if (sha256->buffLen > 0) { word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen); XMEMCPY(&local[sha256->buffLen], data, add); @@ -672,19 +699,77 @@ static INLINE void AddLength(wc_Sha256* sha256, word32 len) #endif { ByteReverseWords(sha256->buffer, sha256->buffer, - WC_SHA256_BLOCK_SIZE); + WC_SHA256_BLOCK_SIZE); } #endif - ret = XTRANSFORM(sha256, local); - if (ret != 0) { - break; + ret = XTRANSFORM(sha256); + if (ret == 0) { + AddLength(sha256, WC_SHA256_BLOCK_SIZE); + sha256->buffLen = 0; } - - AddLength(sha256, WC_SHA256_BLOCK_SIZE); - sha256->buffLen = 0; + else + len = 0; } } + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if (Transform_Sha256_Len_p != NULL) { + word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); + + if (blocksLen > 0) { + AddLength(sha256, blocksLen); + 
sha256->data = data; + /* Byte reversal performed in function if required. */ + XTRANSFORM_LEN(sha256, blocksLen); + data += blocksLen; + len -= blocksLen; + } + } + else + #endif + #if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \ + defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + { + word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); + + AddLength(sha256, blocksLen); + while (len >= WC_SHA256_BLOCK_SIZE) { + XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE); + + data += WC_SHA256_BLOCK_SIZE; + len -= WC_SHA256_BLOCK_SIZE; + + /* Byte reversal performed in function if required. */ + ret = XTRANSFORM(sha256); + if (ret != 0) + break; + } + } + #else + { + word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); + + AddLength(sha256, blocksLen); + while (len >= WC_SHA256_BLOCK_SIZE) { + XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE); + + data += WC_SHA256_BLOCK_SIZE; + len -= WC_SHA256_BLOCK_SIZE; + + ByteReverseWords(sha256->buffer, sha256->buffer, + WC_SHA256_BLOCK_SIZE); + ret = XTRANSFORM(sha256); + if (ret != 0) + break; + } + } + #endif + + if (len > 0) { + XMEMCPY(local, data, len); + sha256->buffLen = len; + } + return ret; } @@ -703,8 +788,6 @@ static INLINE void AddLength(wc_Sha256* sha256, word32 len) return BAD_FUNC_ARG; } - SAVE_XMM_YMM; /* for Intel AVX */ - AddLength(sha256, sha256->buffLen); /* before adding pads */ local[sha256->buffLen++] = 0x80; /* add 1 */ @@ -721,12 +804,12 @@ static INLINE void AddLength(wc_Sha256* sha256, word32 len) #endif { ByteReverseWords(sha256->buffer, sha256->buffer, - WC_SHA256_BLOCK_SIZE); + WC_SHA256_BLOCK_SIZE); } #endif } - ret = XTRANSFORM(sha256, local); + ret = XTRANSFORM(sha256); if (ret != 0) return ret; @@ -768,7 +851,7 @@ static INLINE void AddLength(wc_Sha256* sha256, word32 len) } #endif - return XTRANSFORM(sha256, local); + return XTRANSFORM(sha256); } int wc_Sha256Final(wc_Sha256* sha256, byte* hash) @@ -805,477 +888,812 @@ static INLINE void AddLength(wc_Sha256* sha256, word32 len) #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) -#define _DigestToReg(S0, S1, S2, S3, S4, S5, S6, S7) \ - "leaq %[digest], %%r8\n\t" \ - "movl (%%r8), %"#S0"\n\t" \ - "movl 4(%%r8), %"#S1"\n\t" \ - "movl 8(%%r8), %"#S2"\n\t" \ - "movl 12(%%r8), %"#S3"\n\t" \ - "movl 16(%%r8), %"#S4"\n\t" \ - "movl 20(%%r8), %"#S5"\n\t" \ - "movl 24(%%r8), %"#S6"\n\t" \ - "movl 28(%%r8), %"#S7"\n\t" +#define _LOAD_DIGEST() \ + "movl (%[sha256]), %%r8d \n\t" \ + "movl 4(%[sha256]), %%r9d \n\t" \ + "movl 8(%[sha256]), %%r10d\n\t" \ + "movl 12(%[sha256]), %%r11d\n\t" \ + "movl 16(%[sha256]), %%r12d\n\t" \ + "movl 20(%[sha256]), %%r13d\n\t" \ + "movl 24(%[sha256]), %%r14d\n\t" \ + "movl 28(%[sha256]), %%r15d\n\t" -#define _RegToDigest(S0, S1, S2, S3, S4, S5, S6, S7) \ - "leaq %[digest], %%r8\n\t" \ - "addl %"#S0", (%%r8)\n\t" \ - "addl %"#S1", 4(%%r8)\n\t" \ - "addl %"#S2", 8(%%r8)\n\t" \ - "addl %"#S3", 12(%%r8)\n\t" \ - "addl %"#S4", 16(%%r8)\n\t" \ - "addl %"#S5", 20(%%r8)\n\t" \ - "addl %"#S6", 24(%%r8)\n\t" \ - "addl %"#S7", 28(%%r8)\n\t" +#define _STORE_ADD_DIGEST() \ + "addl %%r8d , (%[sha256])\n\t" \ + "addl %%r9d , 4(%[sha256])\n\t" \ + "addl %%r10d, 8(%[sha256])\n\t" \ + "addl %%r11d, 12(%[sha256])\n\t" \ + "addl %%r12d, 16(%[sha256])\n\t" \ + "addl %%r13d, 20(%[sha256])\n\t" \ + "addl %%r14d, 24(%[sha256])\n\t" \ + "addl %%r15d, 28(%[sha256])\n\t" -#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) +#define _ADD_DIGEST() \ + "addl (%[sha256]), %%r8d \n\t" \ + "addl 
4(%[sha256]), %%r9d \n\t" \ + "addl 8(%[sha256]), %%r10d\n\t" \ + "addl 12(%[sha256]), %%r11d\n\t" \ + "addl 16(%[sha256]), %%r12d\n\t" \ + "addl 20(%[sha256]), %%r13d\n\t" \ + "addl 24(%[sha256]), %%r14d\n\t" \ + "addl 28(%[sha256]), %%r15d\n\t" -#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) +#define _STORE_DIGEST() \ + "movl %%r8d , (%[sha256])\n\t" \ + "movl %%r9d , 4(%[sha256])\n\t" \ + "movl %%r10d, 8(%[sha256])\n\t" \ + "movl %%r11d, 12(%[sha256])\n\t" \ + "movl %%r12d, 16(%[sha256])\n\t" \ + "movl %%r13d, 20(%[sha256])\n\t" \ + "movl %%r14d, 24(%[sha256])\n\t" \ + "movl %%r15d, 28(%[sha256])\n\t" + +#define LOAD_DIGEST() \ + _LOAD_DIGEST() + +#define STORE_ADD_DIGEST() \ + _STORE_ADD_DIGEST() + +#define ADD_DIGEST() \ + _ADD_DIGEST() + +#define STORE_DIGEST() \ + _STORE_DIGEST() -#define S_0 %r15d -#define S_1 %r10d -#define S_2 %r11d -#define S_3 %r12d -#define S_4 %r13d -#define S_5 %r14d -#define S_6 %ebx -#define S_7 %r9d +#define S_0 %r8d +#define S_1 %r9d +#define S_2 %r10d +#define S_3 %r11d +#define S_4 %r12d +#define S_5 %r13d +#define S_6 %r14d +#define S_7 %r15d -#define SSE_REGs "%edi", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15" +#define L1 "%%edx" +#define L2 "%%ecx" +#define L3 "%%eax" +#define L4 "%%ebx" +#define WK "%%rsp" + +#define WORK_REGS "eax", "ebx", "ecx", "edx" +#define STATE_REGS "r8","r9","r10","r11","r12","r13","r14","r15" +#define XMM_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", \ + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13" #if defined(HAVE_INTEL_RORX) -#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i) \ - "# edx = e>>>6\n\t" \ - "rorx $6, %"#e", %%edx\n\t" +#define RND_STEP_RORX_0_1(a, b, c, d, e, f, g, h, i) \ + /* L3 = f */ \ + "movl %"#f", "L3"\n\t" \ + /* L2 = e>>>11 */ \ + "rorx $11, %"#e", "L2"\n\t" \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ -#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i) \ - "# edi = e>>>11\n\t" \ - "rorx $11, %"#e",%%edi\n\t" \ - "# edi = (e>>11) ^ (e>>6)\n\t" \ - "xorl %%edx, %%edi\n\t" \ - "# edx = e>>>25\n\t" \ - "rorx $25, %"#e", %%edx\n\t" +#define RND_STEP_RORX_0_2(a, b, c, d, e, f, g, h, i) \ + /* L2 = (e>>>6) ^ (e>>>11) */ \ + "xorl "L1", "L2"\n\t" \ + /* L3 = f ^ g */ \ + "xorl %"#g", "L3"\n\t" \ + /* L1 = e>>>25 */ \ + "rorx $25, %"#e", "L1"\n\t" \ -#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i) \ - "# esi = f\n\t" \ - "movl %"#f", %%esi\n\t" \ - "# esi = f ^ g\n\t" \ - "xorl %"#g", %%esi\n\t" \ - "# edx = Sigma1(e)\n\t" \ - "xorl %%edi, %%edx\n\t" \ - "# esi = (f ^ g) & e\n\t" \ - "andl %"#e", %%esi\n\t" \ - "# esi = Ch(e,f,g)\n\t" \ - "xorl %"#g", %%esi\n\t" +#define RND_STEP_RORX_0_3(a, b, c, d, e, f, g, h, i) \ + /* L3 = (f ^ g) & e */ \ + "andl %"#e", "L3"\n\t" \ + /* L1 = Sigma1(e) */ \ + "xorl "L2", "L1"\n\t" \ + /* L2 = a>>>13 */ \ + "rorx $13, %"#a", "L2"\n\t" \ -#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i) \ - "# h += w_k\n\t" \ - "leaq %[W_K], %%r8\n\t" \ - "addl ("#i")*4(%%r8), %"#h"\n\t" \ - "# h = h + w_k + Sigma1(e)\n\t" \ - "addl %%edx, %"#h"\n\t" \ - "# r8d = a>>>2\n\t" \ - "rorx $2, %"#a", %%r8d\n\t" \ - "# edi = a>>>13\n\t" \ - "rorx $13, %"#a", %%edi\n\t" +#define RND_STEP_RORX_0_4(a, b, c, d, e, f, g, h, i) \ + /* h += Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L1 = a>>>2 */ \ + "rorx $2, %"#a", "L1"\n\t" \ + /* L3 = Ch(e,f,g) */ \ + "xorl %"#g", "L3"\n\t" \ -#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i) \ - "# edx = a>>22\n\t" \ - "rorx $22, %"#a", %%edx\n\t" \ 
- "# edi = (a>>>2) ^ (a>>>13)\n\t" \ - "xorl %%r8d, %%edi\n\t" \ - "# edx = Sigma0(a)\n\t" \ - "xorl %%edi, %%edx\n\t" +#define RND_STEP_RORX_0_5(a, b, c, d, e, f, g, h, i) \ + /* L2 = (a>>>2) ^ (a>>>13) */ \ + "xorl "L1", "L2"\n\t" \ + /* L1 = a>>>22 */ \ + "rorx $22, %"#a", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L3", %"#h"\n\t" \ -#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i) \ - "# edi = b\n\t" \ - "movl %"#b", %%edi\n\t" \ - "# edi = a | b\n\t" \ - "orl %"#a", %%edi\n\t" \ - "# edi = (a | b) & c\n\t" \ - "andl %"#c", %%edi\n\t" \ - "# r8d = b\n\t" \ - "movl %"#b", %%r8d\n\t" +#define RND_STEP_RORX_0_6(a, b, c, d, e, f, g, h, i) \ + /* L1 = Sigma0(a) */ \ + "xorl "L2", "L1"\n\t" \ + /* L3 = b */ \ + "movl %"#b", "L3"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ -#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i) \ - "# h += Ch(e,f,g)\n\t" \ - "addl %%esi, %"#h"\n\t" \ - "# r8d = b & a\n\t" \ - "andl %"#a", %%r8d\n\t" \ - "# r8d = Maj(a,b,c)\n\t" \ - "orl %%edi, %%r8d\n\t" +#define RND_STEP_RORX_0_7(a, b, c, d, e, f, g, h, i) \ + /* L3 = a ^ b */ \ + "xorl %"#a", "L3"\n\t" \ + /* h += Sigma0(a) */ \ + "addl "L1", %"#h"\n\t" \ + /* L4 = (a ^ b) & (b ^ c) */ \ + "andl "L3", "L4"\n\t" \ + +#define RND_STEP_RORX_0_8(a, b, c, d, e, f, g, h, i) \ + /* L4 = Maj(a,b,c) */ \ + "xorl %"#b", "L4"\n\t" \ + /* L1 = d>>>6 (= e>>>6 next RND) */ \ + "rorx $6, %"#d", "L1"\n\t" \ + /* h += Maj(a,b,c) */ \ + "addl "L4", %"#h"\n\t" \ + +#define RND_STEP_RORX_1_1(a, b, c, d, e, f, g, h, i) \ + /* L4 = f */ \ + "movl %"#f", "L4"\n\t" \ + /* L2 = e>>>11 */ \ + "rorx $11, %"#e", "L2"\n\t" \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ + +#define RND_STEP_RORX_1_2(a, b, c, d, e, f, g, h, i) \ + /* L2 = (e>>>6) ^ (e>>>11) */ \ + "xorl "L1", "L2"\n\t" \ + /* L4 = f ^ g */ \ + "xorl %"#g", "L4"\n\t" \ + /* L1 = e>>>25 */ \ + "rorx $25, %"#e", "L1"\n\t" \ + +#define RND_STEP_RORX_1_3(a, b, c, d, e, f, g, h, i) \ + /* L4 = (f ^ g) & e */ \ + "andl %"#e", "L4"\n\t" \ + /* L1 = Sigma1(e) */ \ + "xorl "L2", "L1"\n\t" \ + /* L2 = a>>>13 */ \ + "rorx $13, %"#a", "L2"\n\t" \ + +#define RND_STEP_RORX_1_4(a, b, c, d, e, f, g, h, i) \ + /* h += Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L1 = a>>>2 */ \ + "rorx $2, %"#a", "L1"\n\t" \ + /* L4 = Ch(e,f,g) */ \ + "xorl %"#g", "L4"\n\t" \ + +#define RND_STEP_RORX_1_5(a, b, c, d, e, f, g, h, i) \ + /* L2 = (a>>>2) ^ (a>>>13) */ \ + "xorl "L1", "L2"\n\t" \ + /* L1 = a>>>22 */ \ + "rorx $22, %"#a", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L4", %"#h"\n\t" \ + +#define RND_STEP_RORX_1_6(a, b, c, d, e, f, g, h, i) \ + /* L1 = Sigma0(a) */ \ + "xorl "L2", "L1"\n\t" \ + /* L4 = b */ \ + "movl %"#b", "L4"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ + +#define RND_STEP_RORX_1_7(a, b, c, d, e, f, g, h, i) \ + /* L4 = a ^ b */ \ + "xorl %"#a", "L4"\n\t" \ + /* h += Sigma0(a) */ \ + "addl "L1", %"#h"\n\t" \ + /* L3 = (a ^ b) & (b ^ c) */ \ + "andl "L4", "L3"\n\t" \ + +#define RND_STEP_RORX_1_8(a, b, c, d, e, f, g, h, i) \ + /* L3 = Maj(a,b,c) */ \ + "xorl %"#b", "L3"\n\t" \ + /* L1 = d>>>6 (= e>>>6 next RND) */ \ + "rorx $6, %"#d", "L1"\n\t" \ + /* h += Maj(a,b,c) */ \ + "addl "L3", %"#h"\n\t" \ + +#define _RND_RORX_X_0(a, b, c, d, e, f, g, h, i) \ + /* L1 = e>>>6 */ \ + "rorx $6, %"#e", "L1"\n\t" \ + /* L2 = e>>>11 */ \ + "rorx $11, %"#e", "L2"\n\t" \ + /* Prev RND: h += Maj(a,b,c) */ \ + "addl "L3", %"#a"\n\t" \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ + /* L3 = f */ \ + "movl %"#f", 
"L3"\n\t" \ + /* L2 = (e>>>6) ^ (e>>>11) */ \ + "xorl "L1", "L2"\n\t" \ + /* L3 = f ^ g */ \ + "xorl %"#g", "L3"\n\t" \ + /* L1 = e>>>25 */ \ + "rorx $25, %"#e", "L1"\n\t" \ + /* L1 = Sigma1(e) */ \ + "xorl "L2", "L1"\n\t" \ + /* L3 = (f ^ g) & e */ \ + "andl %"#e", "L3"\n\t" \ + /* h += Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L1 = a>>>2 */ \ + "rorx $2, %"#a", "L1"\n\t" \ + /* L2 = a>>>13 */ \ + "rorx $13, %"#a", "L2"\n\t" \ + /* L3 = Ch(e,f,g) */ \ + "xorl %"#g", "L3"\n\t" \ + /* L2 = (a>>>2) ^ (a>>>13) */ \ + "xorl "L1", "L2"\n\t" \ + /* L1 = a>>>22 */ \ + "rorx $22, %"#a", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L3", %"#h"\n\t" \ + /* L1 = Sigma0(a) */ \ + "xorl "L2", "L1"\n\t" \ + /* L3 = b */ \ + "movl %"#b", "L3"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ + /* L3 = a ^ b */ \ + "xorl %"#a", "L3"\n\t" \ + /* L4 = (a ^ b) & (b ^ c) */ \ + "andl "L3", "L4"\n\t" \ + /* h += Sigma0(a) */ \ + "addl "L1", %"#h"\n\t" \ + /* L4 = Maj(a,b,c) */ \ + "xorl %"#b", "L4"\n\t" \ + +#define _RND_RORX_X_1(a, b, c, d, e, f, g, h, i) \ + /* L1 = e>>>6 */ \ + "rorx $6, %"#e", "L1"\n\t" \ + /* L2 = e>>>11 */ \ + "rorx $11, %"#e", "L2"\n\t" \ + /* Prev RND: h += Maj(a,b,c) */ \ + "addl "L4", %"#a"\n\t" \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ + /* L4 = f */ \ + "movl %"#f", "L4"\n\t" \ + /* L2 = (e>>>6) ^ (e>>>11) */ \ + "xorl "L1", "L2"\n\t" \ + /* L4 = f ^ g */ \ + "xorl %"#g", "L4"\n\t" \ + /* L1 = e>>>25 */ \ + "rorx $25, %"#e", "L1"\n\t" \ + /* L1 = Sigma1(e) */ \ + "xorl "L2", "L1"\n\t" \ + /* L4 = (f ^ g) & e */ \ + "andl %"#e", "L4"\n\t" \ + /* h += Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L1 = a>>>2 */ \ + "rorx $2, %"#a", "L1"\n\t" \ + /* L2 = a>>>13 */ \ + "rorx $13, %"#a", "L2"\n\t" \ + /* L4 = Ch(e,f,g) */ \ + "xorl %"#g", "L4"\n\t" \ + /* L2 = (a>>>2) ^ (a>>>13) */ \ + "xorl "L1", "L2"\n\t" \ + /* L1 = a>>>22 */ \ + "rorx $22, %"#a", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L4", %"#h"\n\t" \ + /* L1 = Sigma0(a) */ \ + "xorl "L2", "L1"\n\t" \ + /* L4 = b */ \ + "movl %"#b", "L4"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ + /* L4 = a ^ b */ \ + "xorl %"#a", "L4"\n\t" \ + /* L2 = (a ^ b) & (b ^ c) */ \ + "andl "L4", "L3"\n\t" \ + /* h += Sigma0(a) */ \ + "addl "L1", %"#h"\n\t" \ + /* L3 = Maj(a,b,c) */ \ + "xorl %"#b", "L3"\n\t" \ + + +#define RND_RORX_X_0(a,b,c,d,e,f,g,h,i) \ + _RND_RORX_X_0(a,b,c,d,e,f,g,h,i) +#define RND_RORX_X_1(a,b,c,d,e,f,g,h,i) \ + _RND_RORX_X_1(a,b,c,d,e,f,g,h,i) + +#define RND_RORX_X4(a,b,c,d,e,f,g,h,i) \ + RND_RORX_X_0(a,b,c,d,e,f,g,h,i+0) \ + RND_RORX_X_1(h,a,b,c,d,e,f,g,i+1) \ + RND_RORX_X_0(g,h,a,b,c,d,e,f,i+2) \ + RND_RORX_X_1(f,g,h,a,b,c,d,e,i+3) -#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i) \ - "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \ - "addl %"#h", %"#d"\n\t" \ - "addl %"#h", %%r8d\n\t" \ - "addl %%edx, %%r8d\n\t" \ - "movl %%r8d, %"#h"\n\t" #endif /* HAVE_INTEL_RORX */ -#define RND_STEP_1(a,b,c,d,e,f,g,h,i) \ - "movl %"#e", %%edx\n\t" \ - "# edx = e>>>6\n\t" \ - "roll $26, %%edx\n\t" \ - "movl %"#e", %%edi\n\t" +#define RND_STEP_0_1(a,b,c,d,e,f,g,h,i) \ + /* L1 = e>>>14 */ \ + "rorl $14, "L1"\n\t" \ -#define RND_STEP_2(a,b,c,d,e,f,g,h,i) \ - "# edi = e>>>11\n\t" \ - "roll $21, %%edi\n\t" \ - "# edi = (e>>11) ^ (e>>6)\n\t" \ - "xorl %%edx, %%edi\n\t" \ - "# edx = e\n\t" \ - "movl %"#e", %%edx\n\t" \ - "# edx = e>>>25\n\t" \ - "roll $7, %%edx\n\t" +#define RND_STEP_0_2(a,b,c,d,e,f,g,h,i) \ + /* L3 = b */ \ + "movl %"#b", "L3"\n\t" \ + /* L2 
= f */ \ + "movl %"#f", "L2"\n\t" \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ + /* L2 = f ^ g */ \ + "xorl %"#g", "L2"\n\t" \ -#define RND_STEP_3(a,b,c,d,e,f,g,h,i) \ - "# esi = f\n\t" \ - "movl %"#f", %%esi\n\t" \ - "# esi = f ^ g\n\t" \ - "xorl %"#g", %%esi\n\t" \ - "# edx = Sigma1(e)\n\t" \ - "xorl %%edi, %%edx\n\t" \ - "# esi = (f ^ g) & e\n\t" \ - "andl %"#e", %%esi\n\t" \ - "# esi = Ch(e,f,g)\n\t" \ - "xorl %"#g", %%esi\n\t" +#define RND_STEP_0_3(a,b,c,d,e,f,g,h,i) \ + /* L1 = (e>>>14) ^ e */ \ + "xorl %"#e", "L1"\n\t" \ + /* L2 = (f ^ g) & e */ \ + "andl %"#e", "L2"\n\t" \ -#define RND_STEP_4(a,b,c,d,e,f,g,h,i) \ - "# h += w_k\n\t" \ - "leaq %[W_K], %%r8\n\t" \ - "addl ("#i")*4(%%r8), %"#h"\n\t" \ - "# h = h + w_k + Sigma1(e)\n\t" \ - "addl %%edx, %"#h"\n\t" \ - "# r8d = a\n\t" \ - "movl %"#a", %%r8d\n\t" \ - "# r8d = a>>>2\n\t" \ - "roll $30, %%r8d\n\t" \ - "# edi = a\n\t" \ - "movl %"#a", %%edi\n\t" \ - "# edi = a>>>13\n\t" \ - "roll $19, %%edi\n\t" \ - "# edx = a\n\t" \ - "movl %"#a", %%edx\n\t" +#define RND_STEP_0_4(a,b,c,d,e,f,g,h,i) \ + /* L1 = ((e>>>14) ^ e) >>> 5 */ \ + "rorl $5, "L1"\n\t" \ + /* L2 = Ch(e,f,g) */ \ + "xorl %"#g", "L2"\n\t" \ + /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ + "xorl %"#e", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L2", %"#h"\n\t" \ -#define RND_STEP_5(a,b,c,d,e,f,g,h,i) \ - "# edx = a>>>22\n\t" \ - "roll $10, %%edx\n\t" \ - "# edi = (a>>>2) ^ (a>>>13)\n\t" \ - "xorl %%r8d, %%edi\n\t" \ - "# edx = Sigma0(a)\n\t" \ - "xorl %%edi, %%edx\n\t" +#define RND_STEP_0_5(a,b,c,d,e,f,g,h,i) \ + /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ + "rorl $6, "L1"\n\t" \ + /* L3 = a ^ b (= b ^ c of next RND) */ \ + "xorl %"#a", "L3"\n\t" \ + /* h = h + w_k + Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L2 = a */ \ + "movl %"#a", "L2"\n\t" \ -#define RND_STEP_6(a,b,c,d,e,f,g,h,i) \ - "# edi = b\n\t" \ - "movl %"#b", %%edi\n\t" \ - "# edi = a | b\n\t" \ - "orl %"#a", %%edi\n\t" \ - "# edi = (a | b) & c\n\t" \ - "andl %"#c", %%edi\n\t" \ - "# r8d = b\n\t" \ - "movl %"#b", %%r8d\n\t" +#define RND_STEP_0_6(a,b,c,d,e,f,g,h,i) \ + /* L3 = (a ^ b) & (b ^ c) */ \ + "andl "L3", "L4"\n\t" \ + /* L2 = a>>>9 */ \ + "rorl $9, "L2"\n\t" \ + /* L2 = (a>>>9) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* L1 = Maj(a,b,c) */ \ + "xorl %"#b", "L4"\n\t" \ -#define RND_STEP_7(a,b,c,d,e,f,g,h,i) \ - "# h += Ch(e,f,g)\n\t" \ - "addl %%esi, %"#h"\n\t" \ - "#r8d = b & a\n\t" \ - "andl %"#a", %%r8d\n\t" \ - "# r8d = Maj(a,b,c)\n\t" \ - "orl %%edi, %%r8d\n\t" +#define RND_STEP_0_7(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((a>>>9) ^ a) >>> 11 */ \ + "rorl $11, "L2"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ + /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L4", %"#h"\n\t" \ -#define RND_STEP_8(a,b,c,d,e,f,g,h,i) \ - "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \ - "addl %"#h", %"#d"\n\t" \ - "# r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c)\n\t" \ - "addl %"#h", %%r8d\n\t" \ - "# r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)\n\t" \ - "addl %%edx, %%r8d\n\t" \ - "# h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)\n\t" \ - "movl %%r8d, %"#h"\n\t" +#define RND_STEP_0_8(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ + "rorl $2, "L2"\n\t" \ + /* L1 = d (e of next RND) */ \ + "movl %"#d", "L1"\n\t" \ + /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L2", %"#h"\n\t" \ -#define 
RND_X(a,b,c,d,e,f,g,h,i) \ - RND_STEP_1(a,b,c,d,e,f,g,h,i) \ - RND_STEP_2(a,b,c,d,e,f,g,h,i) \ - RND_STEP_3(a,b,c,d,e,f,g,h,i) \ - RND_STEP_4(a,b,c,d,e,f,g,h,i) \ - RND_STEP_5(a,b,c,d,e,f,g,h,i) \ - RND_STEP_6(a,b,c,d,e,f,g,h,i) \ - RND_STEP_7(a,b,c,d,e,f,g,h,i) \ - RND_STEP_8(a,b,c,d,e,f,g,h,i) +#define RND_STEP_1_1(a,b,c,d,e,f,g,h,i) \ + /* L1 = e>>>14 */ \ + "rorl $14, "L1"\n\t" \ -#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) -#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) -#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) -#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) -#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) -#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) -#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) -#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) +#define RND_STEP_1_2(a,b,c,d,e,f,g,h,i) \ + /* L3 = b */ \ + "movl %"#b", "L4"\n\t" \ + /* L2 = f */ \ + "movl %"#f", "L2"\n\t" \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ + /* L2 = f ^ g */ \ + "xorl %"#g", "L2"\n\t" \ + +#define RND_STEP_1_3(a,b,c,d,e,f,g,h,i) \ + /* L1 = (e>>>14) ^ e */ \ + "xorl %"#e", "L1"\n\t" \ + /* L2 = (f ^ g) & e */ \ + "andl %"#e", "L2"\n\t" \ + +#define RND_STEP_1_4(a,b,c,d,e,f,g,h,i) \ + /* L1 = ((e>>>14) ^ e) >>> 5 */ \ + "rorl $5, "L1"\n\t" \ + /* L2 = Ch(e,f,g) */ \ + "xorl %"#g", "L2"\n\t" \ + /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ + "xorl %"#e", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L2", %"#h"\n\t" \ + +#define RND_STEP_1_5(a,b,c,d,e,f,g,h,i) \ + /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ + "rorl $6, "L1"\n\t" \ + /* L4 = a ^ b (= b ^ c of next RND) */ \ + "xorl %"#a", "L4"\n\t" \ + /* h = h + w_k + Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L2 = a */ \ + "movl %"#a", "L2"\n\t" \ + +#define RND_STEP_1_6(a,b,c,d,e,f,g,h,i) \ + /* L3 = (a ^ b) & (b ^ c) */ \ + "andl "L4", "L3"\n\t" \ + /* L2 = a>>>9 */ \ + "rorl $9, "L2"\n\t" \ + /* L2 = (a>>>9) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* L1 = Maj(a,b,c) */ \ + "xorl %"#b", "L3"\n\t" \ + +#define RND_STEP_1_7(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((a>>>9) ^ a) >>> 11 */ \ + "rorl $11, "L2"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ + /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L3", %"#h"\n\t" \ + +#define RND_STEP_1_8(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ + "rorl $2, "L2"\n\t" \ + /* L1 = d (e of next RND) */ \ + "movl %"#d", "L1"\n\t" \ + /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L2", %"#h"\n\t" \ + +#define _RND_ALL_0(a,b,c,d,e,f,g,h,i) \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ + /* L2 = f */ \ + "movl %"#f", "L2"\n\t" \ + /* L3 = b */ \ + "movl %"#b", "L3"\n\t" \ + /* L2 = f ^ g */ \ + "xorl %"#g", "L2"\n\t" \ + /* L1 = e>>>14 */ \ + "rorl $14, "L1"\n\t" \ + /* L2 = (f ^ g) & e */ \ + "andl %"#e", "L2"\n\t" \ + /* L1 = (e>>>14) ^ e */ \ + "xorl %"#e", "L1"\n\t" \ + /* L2 = Ch(e,f,g) */ \ + "xorl %"#g", "L2"\n\t" \ + /* L1 = ((e>>>14) ^ e) >>> 5 */ \ + "rorl $5, "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L2", %"#h"\n\t" \ + /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ 
\ + "xorl %"#e", "L1"\n\t" \ + /* L3 = a ^ b */ \ + "xorl %"#a", "L3"\n\t" \ + /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ + "rorl $6, "L1"\n\t" \ + /* L2 = a */ \ + "movl %"#a", "L2"\n\t" \ + /* h = h + w_k + Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L2 = a>>>9 */ \ + "rorl $9, "L2"\n\t" \ + /* L3 = (a ^ b) & (b ^ c) */ \ + "andl "L3", "L4"\n\t" \ + /* L2 = (a>>>9) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* L1 = Maj(a,b,c) */ \ + "xorl %"#b", "L4"\n\t" \ + /* L2 = ((a>>>9) ^ a) >>> 11 */ \ + "rorl $11, "L2"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ + /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L4", %"#h"\n\t" \ + /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ + "rorl $2, "L2"\n\t" \ + /* L1 = d (e of next RND) */ \ + "movl %"#d", "L1"\n\t" \ + /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L2", %"#h"\n\t" \ + +#define _RND_ALL_1(a,b,c,d,e,f,g,h,i) \ + /* h += w_k */ \ + "addl ("#i")*4("WK"), %"#h"\n\t" \ + /* L2 = f */ \ + "movl %"#f", "L2"\n\t" \ + /* L3 = b */ \ + "movl %"#b", "L4"\n\t" \ + /* L2 = f ^ g */ \ + "xorl %"#g", "L2"\n\t" \ + /* L1 = e>>>14 */ \ + "rorl $14, "L1"\n\t" \ + /* L2 = (f ^ g) & e */ \ + "andl %"#e", "L2"\n\t" \ + /* L1 = (e>>>14) ^ e */ \ + "xorl %"#e", "L1"\n\t" \ + /* L2 = Ch(e,f,g) */ \ + "xorl %"#g", "L2"\n\t" \ + /* L1 = ((e>>>14) ^ e) >>> 5 */ \ + "rorl $5, "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addl "L2", %"#h"\n\t" \ + /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ + "xorl %"#e", "L1"\n\t" \ + /* L3 = a ^ b */ \ + "xorl %"#a", "L4"\n\t" \ + /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ + "rorl $6, "L1"\n\t" \ + /* L2 = a */ \ + "movl %"#a", "L2"\n\t" \ + /* h = h + w_k + Sigma1(e) */ \ + "addl "L1", %"#h"\n\t" \ + /* L2 = a>>>9 */ \ + "rorl $9, "L2"\n\t" \ + /* L3 = (a ^ b) & (b ^ c) */ \ + "andl "L4", "L3"\n\t" \ + /* L2 = (a>>>9) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* L1 = Maj(a,b,c) */ \ + "xorl %"#b", "L3"\n\t" \ + /* L2 = ((a>>>9) ^ a) >>> 11 */ \ + "rorl $11, "L2"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addl %"#h", %"#d"\n\t" \ + /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ + "xorl %"#a", "L2"\n\t" \ + /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L3", %"#h"\n\t" \ + /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ + "rorl $2, "L2"\n\t" \ + /* L1 = d (e of next RND) */ \ + "movl %"#d", "L1"\n\t" \ + /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ + "addl "L2", %"#h"\n\t" \ -#define RND_1_3(a,b,c,d,e,f,g,h,i) \ - RND_STEP_1(a,b,c,d,e,f,g,h,i) \ - RND_STEP_2(a,b,c,d,e,f,g,h,i) \ - RND_STEP_3(a,b,c,d,e,f,g,h,i) +#define RND_ALL_0(a, b, c, d, e, f, g, h, i) \ + _RND_ALL_0(a, b, c, d, e, f, g, h, i) +#define RND_ALL_1(a, b, c, d, e, f, g, h, i) \ + _RND_ALL_1(a, b, c, d, e, f, g, h, i) -#define RND_4_6(a,b,c,d,e,f,g,h,i) \ - RND_STEP_4(a,b,c,d,e,f,g,h,i) \ - RND_STEP_5(a,b,c,d,e,f,g,h,i) \ - RND_STEP_6(a,b,c,d,e,f,g,h,i) - -#define RND_7_8(a,b,c,d,e,f,g,h,i) \ - RND_STEP_7(a,b,c,d,e,f,g,h,i) \ - RND_STEP_8(a,b,c,d,e,f,g,h,i) - -#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) -#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) -#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) -#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) -#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 
RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) -#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) -#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) -#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) - - -#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) -#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) -#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) -#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) -#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) -#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) -#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) -#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) - -#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) -#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) -#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) -#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) -#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) -#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) -#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) -#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) - -#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) -#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) -#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) -#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) -#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) -#define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) -#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) -#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) - -#define FOR(cnt, init, max, inc, loop) \ - __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):) -#define END(cnt, init, max, inc, loop) \ - __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::); +#define RND_ALL_4(a, b, c, d, e, f, g, h, i) \ + RND_ALL_0(a, b, c, d, e, f, g, h, i+0) \ + RND_ALL_1(h, a, b, c, d, e, f, g, i+1) \ + RND_ALL_0(g, h, a, b, c, d, e, f, i+2) \ + RND_ALL_1(f, g, h, a, b, c, d, e, i+3) #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ #if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ -#define VPALIGNR(op1,op2,op3,op4) \ +#define _VPALIGNR(op1, op2, op3, op4) \ "vpalignr $"#op4", %"#op3", %"#op2", %"#op1"\n\t" -#define VPADDD(op1,op2,op3) \ +#define VPALIGNR(op1, op2, op3, op4) \ + _VPALIGNR(op1, op2, op3, op4) +#define _VPADDD(op1, op2, op3) \ "vpaddd %"#op3", %"#op2", 
%"#op1"\n\t" -#define VPSRLD(op1,op2,op3) \ +#define VPADDD(op1, op2, op3) \ + _VPADDD(op1, op2, op3) +#define _VPSRLD(op1, op2, op3) \ "vpsrld $"#op3", %"#op2", %"#op1"\n\t" -#define VPSRLQ(op1,op2,op3) \ +#define VPSRLD(op1, op2, op3) \ + _VPSRLD(op1, op2, op3) +#define _VPSRLQ(op1, op2, op3) \ "vpsrlq $"#op3", %"#op2", %"#op1"\n\t" -#define VPSLLD(op1,op2,op3) \ +#define VPSRLQ(op1,op2,op3) \ + _VPSRLQ(op1,op2,op3) +#define _VPSLLD(op1,op2,op3) \ "vpslld $"#op3", %"#op2", %"#op1"\n\t" -#define VPOR(op1,op2,op3) \ +#define VPSLLD(op1,op2,op3) \ + _VPSLLD(op1,op2,op3) +#define _VPOR(op1,op2,op3) \ "vpor %"#op3", %"#op2", %"#op1"\n\t" -#define VPXOR(op1,op2,op3) \ +#define VPOR(op1,op2,op3) \ + _VPOR(op1,op2,op3) +#define _VPXOR(op1,op2,op3) \ "vpxor %"#op3", %"#op2", %"#op1"\n\t" -#define VPSHUFD(op1,op2,op3) \ +#define VPXOR(op1,op2,op3) \ + _VPXOR(op1,op2,op3) +#define _VPSHUFD(op1,op2,op3) \ "vpshufd $"#op3", %"#op2", %"#op1"\n\t" -#define VPSHUFB(op1,op2,op3) \ +#define VPSHUFD(op1,op2,op3) \ + _VPSHUFD(op1,op2,op3) +#define _VPSHUFB(op1,op2,op3) \ "vpshufb %"#op3", %"#op2", %"#op1"\n\t" +#define VPSHUFB(op1,op2,op3) \ + _VPSHUFB(op1,op2,op3) +#define _VPSLLDQ(op1,op2,op3) \ + "vpslldq $"#op3", %"#op2", %"#op1"\n\t" +#define VPSLLDQ(op1,op2,op3) \ + _VPSLLDQ(op1,op2,op3) -#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\ - a,b,c,d,e,f,g,h,_i)\ - RND_STEP_1(a,b,c,d,e,f,g,h,_i)\ - VPALIGNR (XTMP0, X3, X2, 4)\ - RND_STEP_2(a,b,c,d,e,f,g,h,_i)\ - VPADDD (XTMP0, XTMP0, X0)\ - RND_STEP_3(a,b,c,d,e,f,g,h,_i)\ - VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\ - RND_STEP_4(a,b,c,d,e,f,g,h,_i)\ - VPSRLD (XTMP2, XTMP1, 7)\ - RND_STEP_5(a,b,c,d,e,f,g,h,_i)\ - VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ - RND_STEP_6(a,b,c,d,e,f,g,h,_i)\ - VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\ - RND_STEP_7(a,b,c,d,e,f,g,h,_i)\ - VPSRLD (XTMP2, XTMP1,18)\ - RND_STEP_8(a,b,c,d,e,f,g,h,_i)\ -\ - RND_STEP_1(h,a,b,c,d,e,f,g,_i+1)\ - VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\ - RND_STEP_2(h,a,b,c,d,e,f,g,_i+1)\ - VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ - RND_STEP_3(h,a,b,c,d,e,f,g,_i+1)\ - VPXOR (XTMP3, XTMP3, XTMP1)\ - RND_STEP_4(h,a,b,c,d,e,f,g,_i+1)\ - VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ - RND_STEP_5(h,a,b,c,d,e,f,g,_i+1)\ - VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\ - RND_STEP_6(h,a,b,c,d,e,f,g,_i+1)\ - VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\ - RND_STEP_7(h,a,b,c,d,e,f,g,_i+1)\ - VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\ - RND_STEP_8(h,a,b,c,d,e,f,g,_i+1)\ -\ - RND_STEP_1(g,h,a,b,c,d,e,f,_i+2)\ - VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\ - RND_STEP_2(g,h,a,b,c,d,e,f,_i+2)\ - VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ - RND_STEP_3(g,h,a,b,c,d,e,f,_i+2)\ - VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ - RND_STEP_4(g,h,a,b,c,d,e,f,_i+2)\ - VPXOR (XTMP2, XTMP2, XTMP3)\ - RND_STEP_5(g,h,a,b,c,d,e,f,_i+2)\ - VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\ - RND_STEP_6(g,h,a,b,c,d,e,f,_i+2)\ - VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\ - RND_STEP_7(g,h,a,b,c,d,e,f,_i+2)\ - VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\ - RND_STEP_8(g,h,a,b,c,d,e,f,_i+2)\ -\ - RND_STEP_1(f,g,h,a,b,c,d,e,_i+3)\ - VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\ - RND_STEP_2(f,g,h,a,b,c,d,e,_i+3)\ - VPSRLD (XTMP5, XTMP2, 
10) /* XTMP5 = W[-2] >> 10 {DDCC} */\ - RND_STEP_3(f,g,h,a,b,c,d,e,_i+3)\ - VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ - RND_STEP_4(f,g,h,a,b,c,d,e,_i+3)\ - VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ - RND_STEP_5(f,g,h,a,b,c,d,e,_i+3)\ - VPXOR (XTMP2, XTMP2, XTMP3)\ - RND_STEP_6(f,g,h,a,b,c,d,e,_i+3)\ - VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\ - RND_STEP_7(f,g,h,a,b,c,d,e,_i+3)\ - VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\ - RND_STEP_8(f,g,h,a,b,c,d,e,_i+3)\ - VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\ +#define MsgSched(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \ + RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \ + VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */ \ + VPALIGNR (XTMP0, X3, X2, 4) /* XTMP0 = W[-7] */ \ + RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \ + VPSRLD (XTMP2, XTMP1, 7) /* XTMP2 = W[-15] >> 7 */ \ + VPSLLD (XTMP3, XTMP1, 25) /* XTEMP3 = W[-15] << (32-7) */ \ + RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \ + VPSRLD (XTMP4, XTMP1, 18) /* XTEMP4 = W[-15] >> 18 */ \ + VPSLLD (XTMP5, XTMP1, 14) /* XTEMP5 = W[-15] << (32-18) */ \ + RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \ + VPOR (XTMP2, XTMP3, XTMP2) /* XTMP2 = W[-15] >>> 7 */ \ + VPOR (XTMP4, XTMP5, XTMP4) /* XTMP4 = W[-15] >>> 18 */ \ + RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (XTMP5, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */ \ + VPXOR (XTMP2, XTMP4, XTMP2) \ + /* XTMP2 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \ + RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \ + VPXOR (XTMP1, XTMP5, XTMP2) /* XTMP1 = s0 */ \ + VPSHUFD (XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/ \ + RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */ \ + VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ + RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \ + VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ + VPADDD (XTMP0, XTMP0, X0) \ + RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \ + RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \ + RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (XTMP2, XTMP3, XTMP2) \ + VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \ + RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */ \ + RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \ + VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */ \ + RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \ + VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */ \ + RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \ + RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \ + VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */ \ + RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLQ (XTMP4, XTMP2, 17) /* XTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ + VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ + RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \ + RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */ \ + VPXOR (XTMP4, XTMP3, XTMP4) \ + RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \ + RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \ + VPXOR (XTMP5, XTMP4, XTMP5) /* XTMP5 = s1 {xDxC} */ \ + RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \ + VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */ \ + RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \ + VPADDD (X0, XTMP5, XTMP0) /* X0 
= {W[3], W[2], W[1], W[0]} */ #if defined(HAVE_INTEL_RORX) -#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \ - XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\ - RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i)\ - VPALIGNR (XTMP0, X3, X2, 4)\ - RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i)\ - VPADDD (XTMP0, XTMP0, X0)\ - RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i)\ - VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\ - RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i)\ - VPSRLD (XTMP2, XTMP1, 7)\ - RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i)\ - VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ - RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i)\ - VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\ - RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i)\ - VPSRLD (XTMP2, XTMP1,18)\ - RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i)\ -\ - RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1)\ - VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\ - RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1)\ - VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ - RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1)\ - VPXOR (XTMP3, XTMP3, XTMP1)\ - RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1)\ - VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ - RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1)\ - VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\ - RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1)\ - VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\ - RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1)\ - VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\ - RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1)\ -\ - RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2)\ - VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\ - RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2)\ - VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ - RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2)\ - VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ - RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2)\ - VPXOR (XTMP2, XTMP2, XTMP3)\ - RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2)\ - VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\ - RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2)\ - VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\ - RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2)\ - VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\ - RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2)\ -\ - RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3)\ - VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\ - RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3)\ - VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\ - RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3)\ - VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ - RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3)\ - VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ - RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3)\ - VPXOR (XTMP2, XTMP2, XTMP3)\ - RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3)\ - VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\ - RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3)\ - VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\ - RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3)\ - VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\ +#define MsgSched_RORX(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \ + RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \ + VPALIGNR (XTMP0, X3, X2, 4) \ + VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */ \ + RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \ + VPSRLD (XTMP2, XTMP1, 7) \ + VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */ \ + RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \ + VPSRLD (XTMP4, XTMP1, 3) 
/* XTMP4 = W[-15] >> 3 */ \ + VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */ \ + RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \ + \ + RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (XTMP2, XTMP1,18) \ + RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \ + VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */ \ + RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \ + VPXOR (XTMP3, XTMP3, XTMP1) \ + RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \ + VPXOR (XTMP3, XTMP3, XTMP2) \ + /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \ + RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \ + VPSHUFD (XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/ \ + RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \ + VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */ \ + RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */ \ + RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \ + \ + RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \ + VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ + RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \ + VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ + VPADDD (XTMP0, XTMP0, X0) \ + RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \ + VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \ + RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (XTMP2, XTMP2, XTMP3) \ + RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */ \ + RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \ + VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */ \ + RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \ + VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */ \ + RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \ + \ + RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \ + VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */ \ + RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */ \ + RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ + RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */ \ + RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \ + VPXOR (XTMP2, XTMP2, XTMP3) \ + RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \ + VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */ \ + RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \ + VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */ \ + RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \ + VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */ #endif /* HAVE_INTEL_RORX */ -#define W_K_from_buff() \ - "leaq %[buf], %%r8\n\t" \ - "vmovdqu (%%r8), %%xmm4\n\t" \ - "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" \ - "vmovdqu 16(%%r8), %%xmm5\n\t" \ - "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" \ - "vmovdqu 32(%%r8), %%xmm6\n\t" \ - "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" \ - "vmovdqu 48(%%r8), %%xmm7\n\t" \ - "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t" +#define _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \ + "# X0, X1, X2, X3 = W[0..15]\n\t" \ + "vmovdqu (%%rax), %"#X0"\n\t" \ + "vmovdqu 16(%%rax), %"#X1"\n\t" \ + VPSHUFB(X0, X0, BYTE_FLIP_MASK) \ + VPSHUFB(X1, X1, BYTE_FLIP_MASK) \ + "vmovdqu 32(%%rax), %"#X2"\n\t" \ + "vmovdqu 48(%%rax), %"#X3"\n\t" \ + VPSHUFB(X2, X2, BYTE_FLIP_MASK) \ + VPSHUFB(X3, X3, BYTE_FLIP_MASK) -#define _SET_W_K_XFER(reg, i) \ - "leaq %[K], %%r8\n\t" \ - "vpaddd ("#i")*4(%%r8), %"#reg", %%xmm9\n\t" \ - "leaq %[W_K], %%r8\n\t" \ - "vmovdqa %%xmm9, ("#i")*4(%%r8)\n\t" +#define 
W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \ + _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) -#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i) -static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ -static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ -static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b }; +#define _SET_W_K_XFER_4(i) \ + "vpaddd ("#i"*4)+ 0+%[K], %%xmm0, %%xmm4\n\t" \ + "vpaddd ("#i"*4)+16+%[K], %%xmm1, %%xmm5\n\t" \ + "vmovdqu %%xmm4, ("WK")\n\t" \ + "vmovdqu %%xmm5, 16("WK")\n\t" \ + "vpaddd ("#i"*4)+32+%[K], %%xmm2, %%xmm6\n\t" \ + "vpaddd ("#i"*4)+48+%[K], %%xmm3, %%xmm7\n\t" \ + "vmovdqu %%xmm6, 32("WK")\n\t" \ + "vmovdqu %%xmm7, 48("WK")\n\t" +#define SET_W_K_XFER_4(i) \ + _SET_W_K_XFER_4(i) + + +static const ALIGN32 word64 mSHUF_00BA[] = + { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ +static const ALIGN32 word64 mSHUF_DC00[] = + { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ +static const ALIGN32 word64 mBYTE_FLIP_MASK[] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b }; #define _Init_Masks(mask1, mask2, mask3) \ - "vmovdqu %[FLIP], %"#mask1"\n\t" \ - "vmovdqu %[SHUF00BA], %"#mask2"\n\t" \ - "vmovdqu %[SHUFDC00], %"#mask3"\n\t" + "vmovdqa %[FLIP], %"#mask1"\n\t" \ + "vmovdqa %[SHUF00BA], %"#mask2"\n\t" \ + "vmovdqa %[SHUFDC00], %"#mask3"\n\t" -#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\ - _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) +#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ + _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) -#define X0 %xmm4 -#define X1 %xmm5 -#define X2 %xmm6 -#define X3 %xmm7 -#define X_ X0 +#define X0 %xmm0 +#define X1 %xmm1 +#define X2 %xmm2 +#define X3 %xmm3 -#define XTMP0 %xmm0 -#define XTMP1 %xmm1 -#define XTMP2 %xmm2 -#define XTMP3 %xmm3 +#define XTMP0 %xmm4 +#define XTMP1 %xmm5 +#define XTMP2 %xmm6 +#define XTMP3 %xmm7 #define XTMP4 %xmm8 #define XTMP5 %xmm9 #define XFER %xmm10 @@ -1285,755 +1703,858 @@ static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0 #define BYTE_FLIP_MASK %xmm13 -static int Transform_AVX1(wc_Sha256* sha256) +SHA256_NOINLINE static int Transform_Sha256_AVX1(wc_Sha256* sha256) { - ALIGN32 word32 W_K[64]; /* temp for W+K */ - __asm__ __volatile__ ( + "subq $64, %%rsp\n\t" + + "leaq 32(%[sha256]), %%rax\n\t" Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) - "# X0, X1, X2, X3 = W[0..15]; \n\t" - W_K_from_buff() + LOAD_DIGEST() - DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) + W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) - SET_W_K_XFER(X0, 0) - MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) - SET_W_K_XFER(X1, 4) - MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) - SET_W_K_XFER(X2, 8) - MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) - SET_W_K_XFER(X3, 12) - MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) - SET_W_K_XFER(X0, 16) - MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) - SET_W_K_XFER(X1, 20) - MessageSched(X1, 
X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) - SET_W_K_XFER(X2, 24) - MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) - SET_W_K_XFER(X3, 28) - MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) - SET_W_K_XFER(X0, 32) - MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) - SET_W_K_XFER(X1, 36) - MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) - SET_W_K_XFER(X2, 40) - MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) - SET_W_K_XFER(X3, 44) - MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) + "movl %%r9d, "L4"\n\t" + "movl %%r12d, "L1"\n\t" + "xorl %%r10d, "L4"\n\t" - SET_W_K_XFER(X0, 48) - SET_W_K_XFER(X1, 52) - SET_W_K_XFER(X2, 56) - SET_W_K_XFER(X3, 60) + SET_W_K_XFER_4(0) + MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) + SET_W_K_XFER_4(16) + MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) + SET_W_K_XFER_4(32) + MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) + SET_W_K_XFER_4(48) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) + STORE_ADD_DIGEST() - RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) + "addq $64, %%rsp\n\t" : : [FLIP] "m" (mBYTE_FLIP_MASK[0]), [SHUF00BA] "m" (mSHUF_00BA[0]), [SHUFDC00] "m" (mSHUF_DC00[0]), - [digest] "m" (sha256->digest), - [buf] "m" (sha256->buffer), - [K] "m" (K), - [W_K] "m" (W_K) - : SSE_REGs, "memory" + [sha256] "r" (sha256), + [K] "m" (K) + : WORK_REGS, STATE_REGS, XMM_REGS, "memory" + ); + + return 0; +} + +SHA256_NOINLINE 
static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, + word32 len) +{ + __asm__ __volatile__ ( + + "subq $64, %%rsp\n\t" + "movq 120(%[sha256]), %%rax\n\t" + + Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) + LOAD_DIGEST() + + "# Start of loop processing a block\n" + "1:\n\t" + + W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) + + "movl %%r9d, "L4"\n\t" + "movl %%r12d, "L1"\n\t" + "xorl %%r10d, "L4"\n\t" + + SET_W_K_XFER_4(0) + MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + + SET_W_K_XFER_4(16) + MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + + SET_W_K_XFER_4(32) + MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + + SET_W_K_XFER_4(48) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + "movq 120(%[sha256]), %%rax\n\t" + + ADD_DIGEST() + + "addq $64, %%rax\n\t" + "subl $64, %[len]\n\t" + + STORE_DIGEST() + + "movq %%rax, 120(%[sha256])\n\t" + "jnz 1b\n\t" + + "addq $64, %%rsp\n\t" + + : + : [FLIP] "m" (mBYTE_FLIP_MASK[0]), + [SHUF00BA] "m" (mSHUF_00BA[0]), + [SHUFDC00] "m" (mSHUF_DC00[0]), + [sha256] "r" (sha256), + [len] "r" (len), + [K] "m" (K) + : WORK_REGS, STATE_REGS, XMM_REGS, "memory" + ); + + return 0; +} +#endif /* HAVE_INTEL_AVX1 */ + +#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) +SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX(wc_Sha256* sha256) +{ + __asm__ __volatile__ ( + + "subq $64, %%rsp\n\t" + + Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) + "leaq 32(%[sha256]), %%rax\n\t" + W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) + + LOAD_DIGEST() + + SET_W_K_XFER_4(0) + "movl %%r9d, "L4"\n\t" + "rorx $6, %%r12d, "L1"\n\t" + "xorl %%r10d, "L4"\n\t" + MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + + SET_W_K_XFER_4(16) + MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + + SET_W_K_XFER_4(32) + MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) + MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) + MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + + SET_W_K_XFER_4(48) + "xorl "L3", "L3"\n\t" + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + RND_RORX_X4(S_4, S_5, 
S_6, S_7, S_0, S_1, S_2, S_3, 4)
+        RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
+        RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
+        /* Prev RND: h += Maj(a,b,c) */
+        "addl "L3", %%r8d\n\t"
+
+        STORE_ADD_DIGEST()
+
+        "addq $64, %%rsp\n\t"
+
+        :
+        : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
+          [SHUF00BA] "m" (mSHUF_00BA[0]),
+          [SHUFDC00] "m" (mSHUF_DC00[0]),
+          [sha256] "r" (sha256),
+          [K] "m" (K)
+        : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
+    );
+
+    return 0;
+}
+
+SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256,
+                                                          word32 len)
+{
+    __asm__ __volatile__ (
+
+        "subq $64, %%rsp\n\t"
+        "movq 120(%[sha256]), %%rax\n\t"
+
+        Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
+        LOAD_DIGEST()
+
+        "# Start of loop processing a block\n"
+        "1:\n\t"
+
+        W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
+
+        SET_W_K_XFER_4(0)
+        "movl %%r9d, "L4"\n\t"
+        "rorx $6, %%r12d, "L1"\n\t"
+        "xorl %%r10d, "L4"\n\t"
+        MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
+        MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
+        MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
+        MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
+
+        SET_W_K_XFER_4(16)
+        MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
+        MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
+        MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
+        MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
+
+        SET_W_K_XFER_4(32)
+        MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
+        MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
+        MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
+        MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
+
+        SET_W_K_XFER_4(48)
+        "xorl "L3", "L3"\n\t"
+        "xorl "L2", "L2"\n\t"
+        RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
+        RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
+        RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
+        RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
+        /* Prev RND: h += Maj(a,b,c) */
+        "addl "L3", %%r8d\n\t"
+        "movq 120(%[sha256]), %%rax\n\t"
+
+        ADD_DIGEST()
+
+        "addq $64, %%rax\n\t"
+        "subl $64, %[len]\n\t"
+
+        STORE_DIGEST()
+
+        "movq %%rax, 120(%[sha256])\n\t"
+        "jnz 1b\n\t"
+
+        "addq $64, %%rsp\n\t"
+
+        :
+        : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
+          [SHUF00BA] "m" (mSHUF_00BA[0]),
+          [SHUFDC00] "m" (mSHUF_DC00[0]),
+          [sha256] "r" (sha256),
+          [len] "r" (len),
+          [K] "m" (K)
+        : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
+    );
+
+    return 0;
+}
+#endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */
+
+
+#if defined(HAVE_INTEL_AVX2)
+#define Y0 %ymm0
+#define Y1 %ymm1
+#define Y2 %ymm2
+#define Y3 %ymm3
+
+#define YTMP0 %ymm4
+#define YTMP1 %ymm5
+#define YTMP2 %ymm6
+#define YTMP3 %ymm7
+#define YTMP4 %ymm8
+#define YTMP5 %ymm9
+#define YXFER %ymm10
+
+#define SHUF_Y_00BA %ymm11 /* shuffle xBxA -> 00BA */
+#define SHUF_Y_DC00 %ymm12 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_Y_MASK %ymm13
+
+#define YMM_REGS "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", \
+                 "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13"
+
+#define MsgSched_Y(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \
+    RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \
+    VPALIGNR (YTMP1, Y1, Y0, 4)    /* YTMP1 = W[-15] */ \
+    VPALIGNR (YTMP0, Y3, Y2, 4)    /* YTMP0 = W[-7] */ \
+    RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \
+    RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \
+    VPSRLD
(YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \ + VPSLLD (YTMP3, YTMP1, 25) /* YTEMP3 = W[-15] << (32-7) */ \ + RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \ + VPSRLD (YTMP4, YTMP1, 18) /* YTEMP4 = W[-15] >> 18 */ \ + VPSLLD (YTMP5, YTMP1, 14) /* YTEMP5 = W[-15] << (32-18) */ \ + RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \ + VPOR (YTMP2, YTMP3, YTMP2) /* YTMP2 = W[-15] >>> 7 */ \ + VPOR (YTMP4, YTMP5, YTMP4) /* YTMP4 = W[-15] >>> 18 */ \ + RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \ + RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (YTMP5, YTMP1, 3) /* YTMP4 = W[-15] >> 3 */ \ + VPXOR (YTMP2, YTMP4, YTMP2) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \ + RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \ + VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \ + VPSHUFD (YTMP2, Y3, 0b11111010) /* YTMP2 = W[-2] {BBAA}*/ \ + RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (YTMP4, YTMP2, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \ + VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ + RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \ + RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \ + VPSRLQ (YTMP2, YTMP2, 17) /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ + VPADDD (YTMP0, YTMP0, Y0) \ + RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \ + RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \ + RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (YTMP2, YTMP3, YTMP2) \ + VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \ + RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \ + RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \ + VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \ + RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \ + VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \ + RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \ + RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \ + VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \ + RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ + VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ + RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \ + RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \ + VPXOR (YTMP4, YTMP3, YTMP4) \ + RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \ + RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \ + VPXOR (YTMP5, YTMP4, YTMP5) /* YTMP5 = s1 {xDxC} */ \ + RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \ + VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \ + RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \ + VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */ + +#if defined(HAVE_INTEL_RORX) + +#define MsgSched_Y_RORX(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \ + RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \ + VPALIGNR (YTMP1, Y1, Y0, 4) /* YTMP1 = W[-15] */ \ + RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \ + VPALIGNR (YTMP0, Y3, Y2, 4) /* YTMP0 = W[-7] */ \ + RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \ + VPSRLD (YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \ + RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \ + VPSLLD (YTMP3, YTMP1, 25) /* YTEMP3 = W[-15] << (32-7) */ \ + RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \ + VPSRLD (YTMP4, YTMP1, 18) /* YTEMP4 = W[-15] >> 18 */ \ + RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \ + VPSLLD (YTMP5, YTMP1, 14) /* YTEMP5 = W[-15] << (32-18) */ \ + RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \ + VPOR (YTMP2, YTMP2, YTMP3) /* YTMP2 = W[-15] >>> 7 */ \ + RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \ + VPOR (YTMP4, YTMP4, 
YTMP5) /* YTMP4 = W[-15] >>> 18 */ \ + RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (YTMP5, YTMP1, 3) /* YTMP4 = W[-15] >> 3 */ \ + RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \ + VPXOR (YTMP2, YTMP2, YTMP4) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \ + RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \ + VPSHUFD (YTMP3, Y3, 0b11111010) /* YTMP2 = W[-2] {BBAA}*/ \ + RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \ + VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \ + RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLD (YTMP4, YTMP3, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \ + RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLQ (YTMP2, YTMP3, 19) /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ + RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \ + VPSRLQ (YTMP3, YTMP3, 17) /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ + RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \ + VPADDD (YTMP0, YTMP0, Y0) \ + RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (YTMP2, YTMP2, YTMP3) \ + RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \ + VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \ + RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \ + VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \ + RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \ + VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \ + RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \ + VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \ + RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \ + VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \ + RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \ + RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \ + VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ + RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ + RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \ + VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \ + RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \ + VPXOR (YTMP4, YTMP4, YTMP3) \ + RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \ + VPXOR (YTMP5, YTMP5, YTMP4) /* YTMP5 = s1 {xDxC} */ \ + RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \ + RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \ + VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \ + RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \ + RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \ + VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */ \ + +#endif /* HAVE_INTEL_RORX */ + +#define _VINSERTI128(op1,op2,op3,op4) \ + "vinserti128 $"#op4", %"#op3", %"#op2", %"#op1"\n\t" +#define VINSERTI128(op1,op2,op3,op4) \ + _VINSERTI128(op1,op2,op3,op4) + + +#define _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \ + "# X0, X1, X2, X3 = W[0..15]\n\t" \ + "vmovdqu (%%"#reg"), %%xmm0\n\t" \ + "vmovdqu 16(%%"#reg"), %%xmm1\n\t" \ + VPSHUFB(X0, X0, BYTE_FLIP_MASK) \ + VPSHUFB(X1, X1, BYTE_FLIP_MASK) \ + "vmovdqu 32(%%"#reg"), %%xmm2\n\t" \ + "vmovdqu 48(%%"#reg"), %%xmm3\n\t" \ + VPSHUFB(X2, X2, BYTE_FLIP_MASK) \ + VPSHUFB(X3, X3, BYTE_FLIP_MASK) + +#define LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \ + _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) + + +#define _LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \ + "# X0, X1, X2, X3 = W[0..15]\n\t" \ + "vmovdqu (%%"#reg"), %%xmm0\n\t" \ + "vmovdqu 16(%%"#reg"), %%xmm1\n\t" \ + "vmovdqu 64(%%"#reg"), %%xmm4\n\t" \ + "vmovdqu 80(%%"#reg"), %%xmm5\n\t" \ + VINSERTI128(Y0, Y0, XTMP0, 1) \ + VINSERTI128(Y1, Y1, XTMP1, 1) \ + VPSHUFB(Y0, Y0, BYTE_FLIP_Y_MASK) \ + VPSHUFB(Y1, Y1, BYTE_FLIP_Y_MASK) \ + "vmovdqu 32(%%"#reg"), %%xmm2\n\t" \ + "vmovdqu 48(%%"#reg"), %%xmm3\n\t" \ + "vmovdqu 96(%%"#reg"), %%xmm6\n\t" \ + "vmovdqu 112(%%"#reg"), %%xmm7\n\t" \ + 
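/* Editorial note (not part of the patch): the stitched MsgSched, MsgSched_RORX,
 * MsgSched_Y and MsgSched_Y_RORX macros above interleave round steps with the
 * SHA-256 message expansion.  As a reference for what the vector code computes
 * four words (XMM) or eight words across two blocks (YMM) at a time, here is a
 * plain C sketch of the expansion, using wolfSSL's word32 type: */

#define ROTR32_SKETCH(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

static void sha256_msg_sched_sketch(word32 W[64])
{
    int i;
    for (i = 16; i < 64; i++) {
        word32 s0 = ROTR32_SKETCH(W[i-15],  7) ^ ROTR32_SKETCH(W[i-15], 18) ^
                    (W[i-15] >> 3);
        word32 s1 = ROTR32_SKETCH(W[i-2], 17) ^ ROTR32_SKETCH(W[i-2], 19) ^
                    (W[i-2] >> 10);
        W[i] = W[i-16] + s0 + W[i-7] + s1;  /* matches the W[-16] + W[-7] + s0
                                               + s1 comments in the macros */
    }
}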
VINSERTI128(Y2, Y2, XTMP2, 1) \ + VINSERTI128(Y3, Y3, XTMP3, 1) \ + VPSHUFB(Y2, Y2, BYTE_FLIP_Y_MASK) \ + VPSHUFB(Y3, Y3, BYTE_FLIP_Y_MASK) + +#define LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \ + _LOAD_W_K(BYTE_FLIP_Y_MASK, reg) + + +#define _SET_W_Y_4(i) \ + "vpaddd ("#i"*8)+ 0+%[K], %%ymm0, %%ymm4\n\t" \ + "vpaddd ("#i"*8)+32+%[K], %%ymm1, %%ymm5\n\t" \ + "vmovdqu %%ymm4, ("#i"*8)+ 0("WK")\n\t" \ + "vmovdqu %%ymm5, ("#i"*8)+32("WK")\n\t" \ + "vpaddd ("#i"*8)+64+%[K], %%ymm2, %%ymm4\n\t" \ + "vpaddd ("#i"*8)+96+%[K], %%ymm3, %%ymm5\n\t" \ + "vmovdqu %%ymm4, ("#i"*8)+64("WK")\n\t" \ + "vmovdqu %%ymm5, ("#i"*8)+96("WK")\n\t" + +#define SET_W_Y_4(i) \ + _SET_W_Y_4(i) + + +static const ALIGN32 word64 mSHUF_Y_00BA[] = + { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF, + 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ +static const ALIGN32 word64 mSHUF_Y_DC00[] = + { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100, + 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ +static const ALIGN32 word64 mBYTE_FLIP_Y_MASK[] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b, + 0x0405060700010203, 0x0c0d0e0f08090a0b }; + +#define _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ + "vmovdqa %[FLIP], %"#BYTE_FLIP_MASK"\n\t" \ + "vmovdqa %[SHUF00BA], %"#SHUF_00BA"\n\t" \ + "vmovdqa %[SHUFDC00], %"#SHUF_DC00"\n\t" + +#define INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ + _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) + +static const ALIGN32 word32 K256[128] = { + 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, + 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, + 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, + 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, + 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, + 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, + 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L, + 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L, + 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, + 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, + 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, + 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, + 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, + 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, + 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L, + 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L, + 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L, + 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L, + 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, + 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, + 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, + 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, + 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L, + 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L, + 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L, + 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L, + 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L, + 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L, + 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, + 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, + 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L, + 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L +}; + +SHA256_NOINLINE static int Transform_Sha256_AVX2(wc_Sha256* sha256) +{ + __asm__ __volatile__ ( + + "subq $512, %%rsp\n\t" + "leaq 32(%[sha256]), %%rax\n\t" + + INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00) + LOAD_DIGEST() + + LOAD_W_K_LOW(BYTE_FLIP_MASK, rax) + + "movl %%r9d, "L4"\n\t" + "movl %%r12d, "L1"\n\t" + 
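/* Editorial note (not part of the patch): K256[] above stores every round
 * constant twice so that a single 256-bit vpaddd in SET_W_Y_4 adds K[i..i+3]
 * to the schedule words of two message blocks at once - block 0 in the low
 * 128 bits of each YMM register, block 1 in the high 128 bits.  A scalar
 * sketch of the W+K layout this appears to write to the stack area; the exact
 * layout is an editorial reading of the SET_W_Y_4 and RND_ALL_4 offsets: */

static void set_w_k_two_blocks_sketch(const word32 W0[64], const word32 W1[64],
                                      const word32 K[64], word32 WK[128])
{
    int i, j;
    for (i = 0; i < 64; i += 4) {
        for (j = 0; j < 4; j++) {
            WK[2*i + j]     = W0[i + j] + K[i + j];  /* low lane: block 0  */
            WK[2*i + 4 + j] = W1[i + j] + K[i + j];  /* high lane: block 1 */
        }
    }
}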
"xorl %%r10d, "L4"\n\t" + + SET_W_Y_4(0) + MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) + MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) + MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) + + SET_W_Y_4(16) + MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) + MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) + MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) + MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) + + SET_W_Y_4(32) + MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) + MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) + MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) + MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) + + SET_W_Y_4(48) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) + + STORE_ADD_DIGEST() + + "addq $512, %%rsp\n\t" + + : + : [FLIP] "m" (mBYTE_FLIP_MASK[0]), + [SHUF00BA] "m" (mSHUF_Y_00BA[0]), + [SHUFDC00] "m" (mSHUF_Y_DC00[0]), + [sha256] "r" (sha256), + [K] "m" (K256) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory" + ); + + return 0; +} + +SHA256_NOINLINE static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, + word32 len) +{ + if ((len & WC_SHA256_BLOCK_SIZE) != 0) { + Transform_Sha256_AVX2(sha256); + sha256->data += WC_SHA256_BLOCK_SIZE; + len -= WC_SHA256_BLOCK_SIZE; + if (len == 0) + return 0; + } + + __asm__ __volatile__ ( + + "subq $512, %%rsp\n\t" + "movq 120(%[sha256]), %%rax\n\t" + + INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00) + LOAD_DIGEST() + + "# Start of loop processing two blocks\n" + "1:\n\t" + + LOAD_W_K(BYTE_FLIP_Y_MASK, rax) + + "movl %%r9d, "L4"\n\t" + "movl %%r12d, "L1"\n\t" + "xorl %%r10d, "L4"\n\t" + + SET_W_Y_4(0) + MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) + MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) + MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) + + SET_W_Y_4(16) + MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) + MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) + MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) + MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) + + SET_W_Y_4(32) + MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) + MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) + MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) + MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) + + SET_W_Y_4(48) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) + + ADD_DIGEST() + STORE_DIGEST() + + "movl %%r9d, "L4"\n\t" + "movl %%r12d, "L1"\n\t" + "xorl %%r10d, "L4"\n\t" + + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20) + RND_ALL_4(S_4, 
S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108) + RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116) + RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124) + + ADD_DIGEST() + + "movq 120(%[sha256]), %%rax\n\t" + "addq $128, %%rax\n\t" + "subl $128, %[len]\n\t" + + STORE_DIGEST() + + "movq %%rax, 120(%[sha256])\n\t" + "jnz 1b\n\t" + + "addq $512, %%rsp\n\t" + + : + : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]), + [SHUF00BA] "m" (mSHUF_Y_00BA[0]), + [SHUFDC00] "m" (mSHUF_Y_DC00[0]), + [sha256] "r" (sha256), + [len] "r" (len), + [K] "m" (K256) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory" ); return 0; } #if defined(HAVE_INTEL_RORX) -static int Transform_AVX1_RORX(wc_Sha256* sha256) +SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX(wc_Sha256* sha256) { - ALIGN32 word32 W_K[64]; /* temp for W+K */ - __asm__ __volatile__ ( - Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) - "# X0, X1, X2, X3 = W[0..15]; \n\t" - W_K_from_buff() + "subq $512, %%rsp\n\t" + "leaq 32(%[sha256]), %%rax\n\t" - DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) + INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00) + LOAD_W_K_LOW(BYTE_FLIP_MASK, rax) - SET_W_K_XFER(X0, 0) - MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) - SET_W_K_XFER(X1, 4) - MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) - SET_W_K_XFER(X2, 8) - MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) - SET_W_K_XFER(X3, 12) - MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) - SET_W_K_XFER(X0, 16) - MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) - SET_W_K_XFER(X1, 20) - MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) - SET_W_K_XFER(X2, 24) - MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) - SET_W_K_XFER(X3, 28) - MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) - SET_W_K_XFER(X0, 32) - MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) - SET_W_K_XFER(X1, 36) - MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) - SET_W_K_XFER(X2, 40) - MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) - SET_W_K_XFER(X3, 44) - 
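/* Editorial note (not part of the patch): the new *_Len transforms above walk
 * sha256->data two 64-byte blocks per loop iteration; when handed an odd
 * number of blocks they first peel one off with the single-block transform
 * (the "len & WC_SHA256_BLOCK_SIZE" test).  Callers never see this detail:
 * wc_Sha256Update() only passes them a byte count that is a multiple of the
 * block size, and the transform pointers are selected once by
 * Sha256_SetTransform().  Illustrative use of the public API follows;
 * hash_msg_sketch is a hypothetical helper, not part of wolfSSL. */

#include <wolfssl/wolfcrypt/sha256.h>

static int hash_msg_sketch(const byte* msg, word32 msgLen,
                           byte digest[WC_SHA256_DIGEST_SIZE])
{
    wc_Sha256 sha;
    int ret = wc_InitSha256(&sha);
    if (ret == 0)
        ret = wc_Sha256Update(&sha, msg, msgLen); /* any length; whole blocks
                                                     go through XTRANSFORM_LEN,
                                                     the tail is buffered */
    if (ret == 0)
        ret = wc_Sha256Final(&sha, digest);
    wc_Sha256Free(&sha);
    return ret;
}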
MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) + LOAD_DIGEST() - SET_W_K_XFER(X0, 48) - SET_W_K_XFER(X1, 52) - SET_W_K_XFER(X2, 56) - SET_W_K_XFER(X3, 60) + "movl %%r9d, "L4"\n\t" + "rorx $6, %%r12d, "L1"\n\t" + "xorl %%r10d, "L4"\n\t" - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) + SET_W_Y_4(0) + MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) + MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) + MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) + SET_W_Y_4(16) + MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) + MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) + MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) + MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) + SET_W_Y_4(32) + MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) + MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) + MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) + MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) + SET_W_Y_4(48) + "xorl "L3", "L3"\n\t" + "xorl "L2", "L2"\n\t" + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) + /* Prev RND: h += Maj(a,b,c) */ + "addl "L3", %%r8d\n\t" - RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) + STORE_ADD_DIGEST() + + "addq $512, %%rsp\n\t" : : [FLIP] "m" (mBYTE_FLIP_MASK[0]), - [SHUF00BA] "m" (mSHUF_00BA[0]), - [SHUFDC00] "m" (mSHUF_DC00[0]), - [digest] "m" (sha256->digest), - [buf] "m" (sha256->buffer), - [K] "m" (K), - [W_K] "m" (W_K) - : SSE_REGs, "memory" + [SHUF00BA] "m" (mSHUF_Y_00BA[0]), + [SHUFDC00] "m" (mSHUF_Y_DC00[0]), + [sha256] "r" (sha256), + [K] "m" (K256) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory" + ); + + return 0; +} + +SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, + word32 len) +{ + if ((len & WC_SHA256_BLOCK_SIZE) != 0) { + Transform_Sha256_AVX2_RORX(sha256); + sha256->data += WC_SHA256_BLOCK_SIZE; + len -= WC_SHA256_BLOCK_SIZE; + if (len == 0) + return 0; + } + + __asm__ __volatile__ ( + + "subq $512, %%rsp\n\t" + "movq 120(%[sha256]), %%rax\n\t" + + INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00) + LOAD_DIGEST() + + "# Start of loop processing two blocks\n" + "1:\n\t" + + LOAD_W_K(BYTE_FLIP_Y_MASK, rax) + + "movl %%r9d, "L4"\n\t" + "rorx $6, %%r12d, "L1"\n\t" + "xorl %%r10d, "L4"\n\t" + + SET_W_Y_4(0) + MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, 
S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) + MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) + MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) + MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) + + SET_W_Y_4(16) + MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) + MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) + MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) + MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) + + SET_W_Y_4(32) + MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) + MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) + MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) + MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) + + SET_W_Y_4(48) + "xorl "L3", "L3"\n\t" + "xorl "L2", "L2"\n\t" + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) + /* Prev RND: h += Maj(a,b,c) */ + "addl "L3", %%r8d\n\t" + "xorl "L2", "L2"\n\t" + + ADD_DIGEST() + STORE_DIGEST() + + "movl %%r9d, "L4"\n\t" + "xorl "L3", "L3"\n\t" + "xorl %%r10d, "L4"\n\t" + + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108) + RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116) + RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124) + /* Prev RND: h += Maj(a,b,c) */ + "addl "L3", %%r8d\n\t" + "movq 120(%[sha256]), %%rax\n\t" + + ADD_DIGEST() + + "addq $128, %%rax\n\t" + "subl $128, %[len]\n\t" + + STORE_DIGEST() + + "movq %%rax, 120(%[sha256])\n\t" + "jnz 1b\n\t" + + "addq $512, %%rsp\n\t" + + : + : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]), + [SHUF00BA] "m" (mSHUF_Y_00BA[0]), + [SHUFDC00] "m" (mSHUF_Y_DC00[0]), + [sha256] "r" (sha256), + [len] "r" (len), + [K] "m" (K256) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory" ); return 0; } #endif /* HAVE_INTEL_RORX */ -#endif /* HAVE_INTEL_AVX1 */ - - -#if defined(HAVE_INTEL_AVX2) - -#define _MOVE_to_REG(ymm, mem, i) \ - "leaq %["#mem"], %%r8\n\t" \ - "vmovdqu ("#i")*4(%%r8), %%"#ymm"\n\t" -#define _MOVE_to_MEM(mem, i, ymm) \ - "leaq %["#mem"], %%r8\n\t" \ - "vmovdqu %%"#ymm", "#i"*4(%%r8)\n\t" -#define _BYTE_SWAP(ymm, map) \ - "vpshufb %["#map"], %%"#ymm", %%"#ymm"\n\t" -#define _MOVE_128(ymm0, ymm1, ymm2, map) \ - "vperm2i128 $"#map", %%"#ymm2", %%"#ymm1", %%"#ymm0"\n\t" -#define _MOVE_BYTE(ymm0, ymm1, map) \ - "vpshufb %["#map"], %%"#ymm1", %%"#ymm0"\n\t" -#define _S_TEMP(dest, src, bits, temp) \ - "vpsrld $"#bits", %%"#src", %%"#dest"\n\t" \ - "vpslld $32-"#bits", 
%%"#src", %%"#temp"\n\t" \ - "vpor %%"#temp",%%"#dest", %%"#dest"\n\t" -#define _AVX2_R(dest, src, bits) \ - "vpsrld $"#bits", %%"#src", %%"#dest"\n\t" -#define _XOR(dest, src1, src2) \ - "vpxor %%"#src1", %%"#src2", %%"#dest"\n\t" -#define _OR(dest, src1, src2) \ - "vpor %%"#src1", %%"#src2", %%"#dest"\n\t" -#define _ADD(dest, src1, src2) \ - "vpaddd %%"#src1", %%"#src2", %%"#dest"\n\t" -#define _ADD_MEM(dest, src1, mem, i) \ - "leaq %["#mem"], %%r8\n\t" \ - "vpaddd "#i"*4(%%r8), %%"#src1", %%"#dest"\n\t" -#define _BLEND(map, dest, src1, src2) \ - "vpblendd $"#map", %%"#src1", %%"#src2", %%"#dest"\n\t" - -#define _EXTRACT_XMM_0(xmm, mem) \ - "vpextrd $0, %%"#xmm", %["#mem"]\n\t" -#define _EXTRACT_XMM_1(xmm, mem) \ - "vpextrd $1, %%"#xmm", %["#mem"]\n\t" -#define _EXTRACT_XMM_2(xmm, mem) \ - "vpextrd $2, %%"#xmm", %["#mem"]\n\t" -#define _EXTRACT_XMM_3(xmm, mem) \ - "vpextrd $3, %%"#xmm", %["#mem"]\n\t" -#define _EXTRACT_XMM_4(ymm, xmm, mem) \ - "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t" \ - "vpextrd $0, %%"#xmm", %["#mem"]\n\t" -#define _EXTRACT_XMM_5(xmm, mem) \ - "vpextrd $1, %%"#xmm", %["#mem"]\n\t" -#define _EXTRACT_XMM_6(xmm, mem) \ - "vpextrd $2, %%"#xmm", %["#mem"]\n\t" -#define _EXTRACT_XMM_7(xmm, mem) \ - "vpextrd $3, %%"#xmm", %["#mem"]\n\t" - -#define _SWAP_YMM_HL(ymm) \ - "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t" -#define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm) - -#define MOVE_to_REG(ymm, mem, i) _MOVE_to_REG(ymm, mem, i) -#define MOVE_to_MEM(mem, i, ymm) _MOVE_to_MEM(mem, i, ymm) -#define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map) -#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map) -#define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map) -#define XOR(dest, src1, src2) _XOR(dest, src1, src2) -#define OR(dest, src1, src2) _OR(dest, src1, src2) -#define ADD(dest, src1, src2) _ADD(dest, src1, src2) -#define ADD_MEM(dest, src1, mem, i) _ADD_MEM(dest, src1, mem, i) -#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2) - -#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp) -#define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) -#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) - -#define GAMMA0(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18) \ - XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) XOR(dest, G_TEMP, dest) -#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18) -#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) \ - XOR(dest, G_TEMP, dest) - -#define GAMMA1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19) \ - XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) XOR(dest, G_TEMP, dest) -#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19) -#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) \ - XOR(dest, G_TEMP, dest) - -#define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP1W_2) \ - BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) -#define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) \ - MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAP2W_2) BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) -#define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP3W_2) \ - BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) - -#define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08)\ - MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAPW_7) BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) - -#undef voitle - -#define W_I_16 ymm8 -#define W_I_15 ymm9 -#define W_I_7 ymm10 -#define W_I_2 ymm11 -#define W_I ymm12 -#define G_TEMP ymm13 -#define S_TEMP ymm14 -#define YMM_TEMP0 
ymm15 -#define YMM_TEMP0x xmm15 -#define W_I_TEMP ymm7 -#define W_K_TEMP ymm15 -#define W_K_TEMPx xmm15 - -#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\ - "vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15"\n\t" \ - "vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16"\n\t" \ - "vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15"\n\t" \ - "vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16"\n\t" \ - "vpshufd $0x93, %%"#w_i_16", %%"#w_i_16"\n\t" - -#define MOVE_7_to_15(w_i_15, w_i_7)\ - "vmovdqu %%"#w_i_7", %%"#w_i_15"\n\t" - -#define MOVE_I_to_7(w_i_7, w_i)\ - "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7"\n\t" \ - "vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7"\n\t" \ - "vpshufd $0x39, %%"#w_i_7", %%"#w_i_7"\n\t" - -#define MOVE_I_to_2(w_i_2, w_i)\ - "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2"\n\t" \ - "vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2"\n\t" - -#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\ - MOVE_15_to_16(w_i_16, w_i_15, w_i_7) \ - MOVE_7_to_15(w_i_15, w_i_7) \ - MOVE_I_to_7(w_i_7, w_i) \ - MOVE_I_to_2(w_i_2, w_i) - -#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - { word32 d[8];\ - __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs);\ - __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs);\ - __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs);\ - __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs);\ - __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs);\ - __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs);\ - __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs);\ - __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs);\ - printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\ - __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs);\ - __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs);\ - __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs);\ - __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs);\ - __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs);\ - __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs);\ - __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs);\ - __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs);\ -} - - -#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) - -#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) - -#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) - - - /* Byte swap Masks to ensure that rest of the words are filled with zero's. 
*/ - static const unsigned long mBYTE_FLIP_MASK_16[] = - { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b }; - static const unsigned long mBYTE_FLIP_MASK_15[] = - { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b }; - static const unsigned long mBYTE_FLIP_MASK_7 [] = - { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b }; - static const unsigned long mBYTE_FLIP_MASK_2 [] = - { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 }; - - static const unsigned long mMAPtoW_I_7[] = - { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 }; - static const unsigned long mMAP1toW_I_2[] = - { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 }; - static const unsigned long mMAP2toW_I_2[] = - { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 }; - static const unsigned long mMAP3toW_I_2[] = - { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 }; - -static int Transform_AVX2(wc_Sha256* sha256) -{ -#ifdef WOLFSSL_SMALL_STACK - word32* W_K; - W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); - if (W_K == NULL) - return MEMORY_E; -#else - word32 W_K[64]; -#endif - - __asm__ __volatile__ ( - - MOVE_to_REG(W_I_16, buf, 0) BYTE_SWAP(W_I_16, FLIP_16) - MOVE_to_REG(W_I_15, buf, 1) BYTE_SWAP(W_I_15, FLIP_15) - MOVE_to_REG(W_I, buf, 8) BYTE_SWAP(W_I, FLIP_16) - MOVE_to_REG(W_I_7, buf, 16-7) BYTE_SWAP(W_I_7, FLIP_7) - MOVE_to_REG(W_I_2, buf, 16-2) BYTE_SWAP(W_I_2, FLIP_2) - - DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) - - ADD_MEM(W_K_TEMP, W_I_16, K, 0) - MOVE_to_MEM(W_K, 0, W_K_TEMP) - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) - - ADD_MEM(YMM_TEMP0, W_I, K, 8) - MOVE_to_MEM(W_K, 8, YMM_TEMP0) - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) - GAMMA0_1(W_I_TEMP, W_I_15) - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) - GAMMA0_2(W_I_TEMP, W_I_15) - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) - ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) - ADD(W_I, W_I_7, W_I_TEMP) - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) - ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) - FEEDBACK1_to_W_I_2 - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) - FEEDBACK_to_W_I_7 - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) - ADD(W_I_TEMP, W_I_7, W_I_TEMP) - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) - ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) - FEEDBACK2_to_W_I_2 - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) - GAMMA1_2(YMM_TEMP0, W_I_2) - 
RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) - FEEDBACK3_to_W_I_2 - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) - GAMMA1(YMM_TEMP0, W_I_2) - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) - - MOVE_to_REG(YMM_TEMP0, K, 16) - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) - ADD(YMM_TEMP0, YMM_TEMP0, W_I) - MOVE_to_MEM(W_K, 16, YMM_TEMP0) - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) - GAMMA0_1(W_I_TEMP, W_I_15) - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) - GAMMA0_2(W_I_TEMP, W_I_15) - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) - ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) - ADD(W_I, W_I_7, W_I_TEMP) - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) - ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) - FEEDBACK1_to_W_I_2 - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) - FEEDBACK_to_W_I_7 - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) - ADD(W_I_TEMP, W_I_7, W_I_TEMP) - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) - GAMMA1(YMM_TEMP0, W_I_2) - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) - ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) - FEEDBACK2_to_W_I_2 - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) - FEEDBACK3_to_W_I_2 - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) - - MOVE_to_REG(YMM_TEMP0, K, 24) - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) - ADD(YMM_TEMP0, YMM_TEMP0, W_I) - MOVE_to_MEM(W_K, 24, YMM_TEMP0) - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) - GAMMA0_1(W_I_TEMP, W_I_15) - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) - GAMMA0_2(W_I_TEMP, W_I_15) - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) - ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) - ADD(W_I, W_I_7, W_I_TEMP) - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) - ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) - FEEDBACK1_to_W_I_2 - 
RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) - FEEDBACK_to_W_I_7 - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) - ADD(W_I_TEMP, W_I_7, W_I_TEMP) - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) - ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) - FEEDBACK2_to_W_I_2 - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) - FEEDBACK3_to_W_I_2 - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) - GAMMA1(YMM_TEMP0, W_I_2) - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) - - MOVE_to_REG(YMM_TEMP0, K, 32) - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) - ADD(YMM_TEMP0, YMM_TEMP0, W_I) - MOVE_to_MEM(W_K, 32, YMM_TEMP0) - - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) - GAMMA0_1(W_I_TEMP, W_I_15) - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) - GAMMA0_2(W_I_TEMP, W_I_15) - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) - ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) - ADD(W_I, W_I_7, W_I_TEMP) - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) - ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) - FEEDBACK1_to_W_I_2 - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) - FEEDBACK_to_W_I_7 - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) - ADD(W_I_TEMP, W_I_7, W_I_TEMP) - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) - ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) - FEEDBACK2_to_W_I_2 - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) - FEEDBACK3_to_W_I_2 - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) - - MOVE_to_REG(YMM_TEMP0, K, 40) - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) - ADD(YMM_TEMP0, YMM_TEMP0, W_I) - MOVE_to_MEM(W_K, 40, YMM_TEMP0) - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) - GAMMA0_1(W_I_TEMP, 
W_I_15) - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) - GAMMA0_2(W_I_TEMP, W_I_15) - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) - ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) - ADD(W_I, W_I_7, W_I_TEMP) - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) - ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) - FEEDBACK1_to_W_I_2 - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) - FEEDBACK_to_W_I_7 - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) - ADD(W_I_TEMP, W_I_7, W_I_TEMP) - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) - ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) - FEEDBACK2_to_W_I_2 - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) - FEEDBACK3_to_W_I_2 - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) - - MOVE_to_REG(YMM_TEMP0, K, 48) - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) - ADD(YMM_TEMP0, YMM_TEMP0, W_I) - MOVE_to_MEM(W_K, 48, YMM_TEMP0) - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) - GAMMA0_1(W_I_TEMP, W_I_15) - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) - GAMMA0_2(W_I_TEMP, W_I_15) - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) - ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) - ADD(W_I, W_I_7, W_I_TEMP) - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) - ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) - FEEDBACK1_to_W_I_2 - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) - FEEDBACK_to_W_I_7 - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) - ADD(W_I_TEMP, W_I_7, W_I_TEMP) - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) - ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) - FEEDBACK2_to_W_I_2 - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) - GAMMA1_1(YMM_TEMP0, W_I_2) - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) - FEEDBACK3_to_W_I_2 - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) 
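/* Editorial note (not part of the patch): the RND_* macros in this removed
 * block, like the RND_STEP_* / RND_RORX_* macros that replace them, expand one
 * SHA-256 compression round into small pieces so vector message-schedule
 * instructions can be interleaved between them.  For reference, one round in
 * plain C (S[0..7] holds the working variables a..h, ROTR32_SKETCH as in the
 * earlier sketch): */

static void sha256_round_sketch(word32 S[8], word32 w_plus_k)
{
    word32 sigma1 = ROTR32_SKETCH(S[4], 6) ^ ROTR32_SKETCH(S[4], 11) ^
                    ROTR32_SKETCH(S[4], 25);
    word32 ch     = (S[4] & S[5]) ^ (~S[4] & S[6]);
    word32 t1     = S[7] + sigma1 + ch + w_plus_k;   /* w_plus_k = W[i] + K[i] */
    word32 sigma0 = ROTR32_SKETCH(S[0], 2) ^ ROTR32_SKETCH(S[0], 13) ^
                    ROTR32_SKETCH(S[0], 22);
    word32 maj    = (S[0] & S[1]) ^ (S[0] & S[2]) ^ (S[1] & S[2]);
    word32 t2     = sigma0 + maj;
    int i;

    for (i = 7; i > 0; i--)    /* h = g, g = f, ..., b = a */
        S[i] = S[i-1];
    S[4] += t1;                /* e = d + t1 */
    S[0]  = t1 + t2;           /* a = t1 + t2 */
}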
- GAMMA1_1(YMM_TEMP0, W_I_2) - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) - GAMMA1_2(YMM_TEMP0, W_I_2) - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) - ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) - - MOVE_to_REG(YMM_TEMP0, K, 56) - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) - ADD(YMM_TEMP0, YMM_TEMP0, W_I) - MOVE_to_MEM(W_K, 56, YMM_TEMP0) - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) - - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) - - RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) - - : - : [FLIP_16] "m" (mBYTE_FLIP_MASK_16[0]), - [FLIP_15] "m" (mBYTE_FLIP_MASK_15[0]), - [FLIP_7] "m" (mBYTE_FLIP_MASK_7[0]), - [FLIP_2] "m" (mBYTE_FLIP_MASK_2), - [MAPW_7] "m" (mMAPtoW_I_7[0]), - [MAP1W_2] "m" (mMAP1toW_I_2[0]), - [MAP2W_2] "m" (mMAP2toW_I_2[0]), - [MAP3W_2] "m" (mMAP3toW_I_2[0]), - [digest] "m" (sha256->digest), - [buf] "m" (sha256->buffer), - [K] "m" (K), - [W_K] "m" (W_K) - : SSE_REGs, "memory" - ); - -#ifdef WOLFSSL_SMALL_STACK - XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return 0; -} - -#endif /* HAVE_INTEL_AVX2 */ +#endif /* HAVE_INTEL_AVX2 */ #ifdef WOLFSSL_SHA224 diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c index bada40af7..15cfeeb82 100644 --- a/wolfcrypt/src/sha512.c +++ b/wolfcrypt/src/sha512.c @@ -129,7 +129,20 @@ #if defined(USE_INTEL_SPEEDUP) #define HAVE_INTEL_AVX1 - #define HAVE_INTEL_AVX2 + + #if defined(__GNUC__) && ((__GNUC__ < 4) || \ + (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) + #define NO_AVX2_SUPPORT + #endif + #if defined(__clang__) && ((__clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 5)) + #define NO_AVX2_SUPPORT + #endif + + #define HAVE_INTEL_AVX1 + #ifndef NO_AVX2_SUPPORT + #define HAVE_INTEL_AVX2 + #endif #endif #if defined(HAVE_INTEL_AVX1) @@ -142,24 +155,6 @@ #endif -#if defined(HAVE_INTEL_RORX) - #define ROTR(func, bits, x) \ - word64 func(word64 x) { word64 ret ;\ - __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x)) ;\ - return ret ;\ - } - - static INLINE ROTR(rotrFixed64_28, 28, x); - static INLINE ROTR(rotrFixed64_34, 34, x); - static INLINE ROTR(rotrFixed64_39, 39, x); - static INLINE ROTR(rotrFixed64_14, 14, x); - static INLINE ROTR(rotrFixed64_18, 18, x); - static INLINE ROTR(rotrFixed64_41, 41, x); - - #define S0_RORX(x) (rotrFixed64_28(x)^rotrFixed64_34(x)^rotrFixed64_39(x)) - #define S1_RORX(x) (rotrFixed64_14(x)^rotrFixed64_18(x)^rotrFixed64_41(x)) -#endif /* HAVE_INTEL_RORX */ - #if defined(HAVE_BYTEREVERSE64) && \ !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size) @@ -212,11 +207,11 @@ static int InitSha512(wc_Sha512* sha512) } #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) - Transform_AVX1(); # Function prototype - Transform_AVX2(); # + Transform_Sha512_AVX1(); # Function prototype + Transform_Sha512_AVX2(); # #endif - _Transform() { # Native Transform Function body + _Transform_Sha512() { # Native Transform Function body } @@ -245,7 +240,7 @@ static int InitSha512(wc_Sha512* sha512) #if defnied(HAVE_INTEL_AVX1) - int Transform_AVX1() { + int Transform_Sha512_AVX1() { Stitched Message Sched/Round 
} @@ -253,7 +248,7 @@ static int InitSha512(wc_Sha512* sha512) #if defnied(HAVE_INTEL_AVX2) - int Transform_AVX2() { + int Transform_Sha512_AVX2() { Stitched Message Sched/Round } #endif @@ -266,30 +261,29 @@ static int InitSha512(wc_Sha512* sha512) */ #if defined(HAVE_INTEL_AVX1) - static int Transform_AVX1(wc_Sha512 *sha512); + static int Transform_Sha512_AVX1(wc_Sha512 *sha512); + static int Transform_Sha512_AVX1_Len(wc_Sha512 *sha512, word32 len); #endif #if defined(HAVE_INTEL_AVX2) - static int Transform_AVX2(wc_Sha512 *sha512); - #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) - static int Transform_AVX1_RORX(wc_Sha512 *sha512); + static int Transform_Sha512_AVX2(wc_Sha512 *sha512); + static int Transform_Sha512_AVX2_Len(wc_Sha512 *sha512, word32 len); + #if defined(HAVE_INTEL_RORX) + static int Transform_Sha512_AVX1_RORX(wc_Sha512 *sha512); + static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512 *sha512, + word32 len); + static int Transform_Sha512_AVX2_RORX(wc_Sha512 *sha512); + static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512 *sha512, + word32 len); #endif #endif - static int _Transform(wc_Sha512 *sha512); - static int (*Transform_p)(wc_Sha512* sha512) = _Transform; + static int _Transform_Sha512(wc_Sha512 *sha512); + static int (*Transform_Sha512_p)(wc_Sha512* sha512) = _Transform_Sha512; + static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL; static int transform_check = 0; static int intel_flags; - #define Transform(sha512) (*Transform_p)(sha512) - - /* Dummy for saving MM_REGs on behalf of Transform */ - /* #if defined(HAVE_INTEL_AVX2) - #define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\ - "%ymm0","%ymm1","%ymm2","%ymm3","%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\ - "%ymm12","%ymm13","%ymm14","%ymm15") - */ - #if defined(HAVE_INTEL_AVX1) - #define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15") - #endif + #define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512) + #define Transform_Sha512_Len(sha512, len) \ + (*Transform_Sha512_Len_p)(sha512, len) static void Sha512_SetTransform() { @@ -299,22 +293,35 @@ static int InitSha512(wc_Sha512* sha512) intel_flags = cpuid_get_flags(); #if defined(HAVE_INTEL_AVX2) - if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) { - if (1) - Transform_p = Transform_AVX1_RORX; + if (IS_INTEL_AVX2(intel_flags)) { + #ifdef HAVE_INTEL_RORX + if (IS_INTEL_BMI2(intel_flags)) { + Transform_Sha512_p = Transform_Sha512_AVX2_RORX; + Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len; + } else - Transform_p = Transform_AVX2; + #endif + if (1) { + Transform_Sha512_p = Transform_Sha512_AVX2; + Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len; + } + #ifdef HAVE_INTEL_RORX + else { + Transform_Sha512_p = Transform_Sha512_AVX1_RORX; + Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len; + } + #endif } else #endif #if defined(HAVE_INTEL_AVX1) - if (1) { - Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? 
Transform_AVX1 : - _Transform); + if (IS_INTEL_AVX1(intel_flags)) { + Transform_Sha512_p = Transform_Sha512_AVX1; + Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len; } else #endif - Transform_p = _Transform; + Transform_Sha512_p = _Transform_Sha512; transform_check = 1; } @@ -332,7 +339,7 @@ static int InitSha512(wc_Sha512* sha512) } #else - #define Transform(sha512) _Transform(sha512) + #define Transform_Sha512(sha512) _Transform_Sha512(sha512) int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) { @@ -359,10 +366,6 @@ static int InitSha512(wc_Sha512* sha512) #endif /* Hardware Acceleration */ -#ifndef SAVE_XMM_YMM - #define SAVE_XMM_YMM -#endif - static const word64 K512[80] = { W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), @@ -437,7 +440,7 @@ static const word64 K512[80] = { d(i) += h(i); \ h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) -static int _Transform(wc_Sha512* sha512) +static int _Transform_Sha512(wc_Sha512* sha512) { const word64* K = K512; word32 j; @@ -513,9 +516,7 @@ static INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE) return BUFFER_E; - SAVE_XMM_YMM; /* for Intel AVX */ - - while (len) { + if (sha512->buffLen > 0) { word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen); XMEMCPY(&local[sha512->buffLen], data, add); @@ -530,17 +531,76 @@ static INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) #endif { ByteReverseWords64(sha512->buffer, sha512->buffer, - WC_SHA512_BLOCK_SIZE); + WC_SHA512_BLOCK_SIZE); } #endif - ret = Transform(sha512); + ret = Transform_Sha512(sha512); + if (ret == 0) { + AddLength(sha512, WC_SHA512_BLOCK_SIZE); + sha512->buffLen = 0; + } + else + len = 0; + } + } + +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if (Transform_Sha512_Len_p != NULL) { + word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); + + if (blocksLen > 0) { + AddLength(sha512, blocksLen); + sha512->data = data; + /* Byte reversal performed in function if required. */ + Transform_Sha512_Len(sha512, blocksLen); + data += blocksLen; + len -= blocksLen; + } + } + else +#endif +#if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \ + defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + { + word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); + + AddLength(sha512, blocksLen); + while (len >= WC_SHA512_BLOCK_SIZE) { + XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE); + + data += WC_SHA512_BLOCK_SIZE; + len -= WC_SHA512_BLOCK_SIZE; + + /* Byte reversal performed in function if required. 
*/ + ret = Transform_Sha512(sha512); if (ret != 0) break; - - AddLength(sha512, WC_SHA512_BLOCK_SIZE); - sha512->buffLen = 0; } } +#else + { + word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); + + AddLength(sha512, blocksLen); + while (len >= WC_SHA512_BLOCK_SIZE) { + XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE); + + data += WC_SHA512_BLOCK_SIZE; + len -= WC_SHA512_BLOCK_SIZE; + + ByteReverseWords64(sha512->buffer, sha512->buffer, + WC_SHA512_BLOCK_SIZE); + ret = Transform_Sha512(sha512); + if (ret != 0) + break; + } + } +#endif + + if (len > 0) { + XMEMCPY(local, data, len); + sha512->buffLen = len; + } return ret; } @@ -572,7 +632,6 @@ static INLINE int Sha512Final(wc_Sha512* sha512) return BAD_FUNC_ARG; } - SAVE_XMM_YMM ; /* for Intel AVX */ AddLength(sha512, sha512->buffLen); /* before adding pads */ local[sha512->buffLen++] = 0x80; /* add 1 */ @@ -590,7 +649,7 @@ static INLINE int Sha512Final(wc_Sha512* sha512) WC_SHA512_BLOCK_SIZE); } #endif /* LITTLE_ENDIAN_ORDER */ - ret = Transform(sha512); + ret = Transform_Sha512(sha512); if (ret != 0) return ret; @@ -620,7 +679,7 @@ static INLINE int Sha512Final(wc_Sha512* sha512) &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE); #endif - ret = Transform(sha512); + ret = Transform_Sha512(sha512); if (ret != 0) return ret; @@ -676,674 +735,1799 @@ void wc_Sha512Free(wc_Sha512* sha512) #if defined(HAVE_INTEL_AVX1) -#define Rx_1(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i]; -#define Rx_2(i) d(i)+=h(i); -#define Rx_3(i) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)); +static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f }; + +#define W_0 xmm0 +#define W_2 xmm1 +#define W_4 xmm2 +#define W_6 xmm3 +#define W_8 xmm4 +#define W_10 xmm5 +#define W_12 xmm6 +#define W_14 xmm7 + +#define W_M15 xmm12 +#define W_M7 xmm13 +#define MASK xmm14 + +#define XTMP1 xmm8 +#define XTMP2 xmm9 +#define XTMP3 xmm10 +#define XTMP4 xmm11 + +#define XMM_REGS \ + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" + +#define _VPALIGNR(dest, src1, src2, bits) \ + "vpalignr $"#bits", %%"#src2", %%"#src1", %%"#dest"\n\t" +#define VPALIGNR(dest, src1, src2, bits) \ + _VPALIGNR(dest, src1, src2, bits) + +#define _V_SHIFT_R(dest, src, bits) \ + "vpsrlq $"#bits", %%"#src", %%"#dest"\n\t" +#define V_SHIFT_R(dest, src, bits) \ + _V_SHIFT_R(dest, src, bits) + +#define _V_SHIFT_L(dest, src, bits) \ + "vpsllq $"#bits", %%"#src", %%"#dest"\n\t" +#define V_SHIFT_L(dest, src, bits) \ + _V_SHIFT_L(dest, src, bits) + +#define _V_ADD(dest, src1, src2) \ + "vpaddq %%"#src1", %%"#src2", %%"#dest"\n\t" +#define V_ADD(dest, src1, src2) \ + _V_ADD(dest, src1, src2) + +#define _V_XOR(dest, src1, src2) \ + "vpxor %%"#src1", %%"#src2", %%"#dest"\n\t" +#define V_XOR(dest, src1, src2) \ + _V_XOR(dest, src1, src2) + +#define _V_OR(dest, src1, src2) \ + "vpor %%"#src1", %%"#src2", %%"#dest"\n\t" +#define V_OR(dest, src1, src2) \ + _V_OR(dest, src1, src2) + +#define RA %%r8 +#define RB %%r9 +#define RC %%r10 +#define RD %%r11 +#define RE %%r12 +#define RF %%r13 +#define RG %%r14 +#define RH %%r15 + +#define STATE_REGS "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" + +#define L1 "%%rax" +#define L2 "%%rcx" +#define L3 "%%rdx" +#define L4 "%%rbx" +#define WX "%%rsp" + +#define WORK_REGS "rax", "rbx", "rcx", "rdx" + +#define RND_0_1(a,b,c,d,e,f,g,h,i) \ + /* L1 = e >>> 23 */ \ + "rorq $23, "L1"\n\t" \ + +#define RND_0_2(a,b,c,d,e,f,g,h,i) \ + /* 
L3 = a */ \ + "movq "#a", "L3"\n\t" \ + /* L2 = f */ \ + "movq "#f", "L2"\n\t" \ + /* h += W_X[i] */ \ + "addq ("#i")*8("WX"), "#h"\n\t" \ + /* L2 = f ^ g */ \ + "xorq "#g", "L2"\n\t" \ + +#define RND_0_2_A(a,b,c,d,e,f,g,h,i) \ + /* L3 = a */ \ + "movq "#a", "L3"\n\t" \ + /* L2 = f */ \ + "movq "#f", "L2"\n\t" \ + +#define RND_0_2_B(a,b,c,d,e,f,g,h,i) \ + /* h += W_X[i] */ \ + "addq ("#i")*8("WX"), "#h"\n\t" \ + /* L2 = f ^ g */ \ + "xorq "#g", "L2"\n\t" \ + +#define RND_0_3(a,b,c,d,e,f,g,h,i) \ + /* L1 = (e >>> 23) ^ e */ \ + "xorq "#e", "L1"\n\t" \ + /* L2 = (f ^ g) & e */ \ + "andq "#e", "L2"\n\t" \ + +#define RND_0_4(a,b,c,d,e,f,g,h,i) \ + /* L1 = ((e >>> 23) ^ e) >>> 4 */ \ + "rorq $4, "L1"\n\t" \ + /* L2 = ((f ^ g) & e) ^ g */ \ + "xorq "#g", "L2"\n\t" \ + +#define RND_0_5(a,b,c,d,e,f,g,h,i) \ + /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */ \ + "xorq "#e", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addq "L2", "#h"\n\t" \ + +#define RND_0_6(a,b,c,d,e,f,g,h,i) \ + /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \ + "rorq $14, "L1"\n\t" \ + /* L3 = a ^ b */ \ + "xorq "#b", "L3"\n\t" \ + +#define RND_0_7(a,b,c,d,e,f,g,h,i) \ + /* h += Sigma1(e) */ \ + "addq "L1", "#h"\n\t" \ + /* L2 = a */ \ + "movq "#a", "L2"\n\t" \ + +#define RND_0_8(a,b,c,d,e,f,g,h,i) \ + /* L4 = (a ^ b) & (b ^ c) */ \ + "andq "L3", "L4"\n\t" \ + /* L2 = a >>> 5 */ \ + "rorq $5, "L2"\n\t" \ + +#define RND_0_9(a,b,c,d,e,f,g,h,i) \ + /* L2 = (a >>> 5) ^ a */ \ + "xorq "#a", "L2"\n\t" \ + /* L4 = ((a ^ b) & (b ^ c) ^ b */ \ + "xorq "#b", "L4"\n\t" \ + +#define RND_0_10(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((a >>> 5) ^ a) >>> 6 */ \ + "rorq $6, "L2"\n\t" \ + /* d += h */ \ + "addq "#h", "#d"\n\t" \ + +#define RND_0_11(a,b,c,d,e,f,g,h,i) \ + /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \ + "xorq "#a", "L2"\n\t" \ + /* h += Sigma0(a) */ \ + "addq "L4", "#h"\n\t" \ + +#define RND_0_12(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \ + "rorq $28, "L2"\n\t" \ + /* d (= e next RND) */ \ + "movq "#d", "L1"\n\t" \ + /* h += Maj(a,b,c) */ \ + "addq "L2", "#h"\n\t" \ + +#define RND_1_1(a,b,c,d,e,f,g,h,i) \ + /* L1 = e >>> 23 */ \ + "rorq $23, "L1"\n\t" \ + +#define RND_1_2(a,b,c,d,e,f,g,h,i) \ + /* L4 = a */ \ + "movq "#a", "L4"\n\t" \ + /* L2 = f */ \ + "movq "#f", "L2"\n\t" \ + /* h += W_X[i] */ \ + "addq ("#i")*8("WX"), "#h"\n\t" \ + /* L2 = f ^ g */ \ + "xorq "#g", "L2"\n\t" \ + +#define RND_1_2_A(a,b,c,d,e,f,g,h,i) \ + /* L4 = a */ \ + "movq "#a", "L4"\n\t" \ + /* L2 = f */ \ + "movq "#f", "L2"\n\t" \ + +#define RND_1_2_B(a,b,c,d,e,f,g,h,i) \ + /* h += W_X[i] */ \ + "addq ("#i")*8("WX"), "#h"\n\t" \ + /* L2 = f ^ g */ \ + "xorq "#g", "L2"\n\t" \ + +#define RND_1_3(a,b,c,d,e,f,g,h,i) \ + /* L1 = (e >>> 23) ^ e */ \ + "xorq "#e", "L1"\n\t" \ + /* L2 = (f ^ g) & e */ \ + "andq "#e", "L2"\n\t" \ + +#define RND_1_4(a,b,c,d,e,f,g,h,i) \ + /* ((e >>> 23) ^ e) >>> 4 */ \ + "rorq $4, "L1"\n\t" \ + /* ((f ^ g) & e) ^ g */ \ + "xorq "#g", "L2"\n\t" \ + +#define RND_1_5(a,b,c,d,e,f,g,h,i) \ + /* (((e >>> 23) ^ e) >>> 4) ^ e */ \ + "xorq "#e", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addq "L2", "#h"\n\t" \ + +#define RND_1_6(a,b,c,d,e,f,g,h,i) \ + /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \ + "rorq $14, "L1"\n\t" \ + /* L4 = a ^ b */ \ + "xorq "#b", "L4"\n\t" \ + +#define RND_1_7(a,b,c,d,e,f,g,h,i) \ + /* h += Sigma1(e) */ \ + "addq "L1", "#h"\n\t" \ + /* L2 = a */ \ + "movq "#a", "L2"\n\t" \ + +#define RND_1_8(a,b,c,d,e,f,g,h,i) \ + /* L3 = (a ^ b) & (b ^ c) */ \ + "andq "L4", "L3"\n\t" \ + /* L2 = a >>> 5 */ \ + "rorq $5, 
"L2"\n\t" \ + +#define RND_1_9(a,b,c,d,e,f,g,h,i) \ + /* L2 = (a >>> 5) ^ a */ \ + "xorq "#a", "L2"\n\t" \ + /* L3 = ((a ^ b) & (b ^ c) ^ b */ \ + "xorq "#b", "L3"\n\t" \ + +#define RND_1_10(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((a >>> 5) ^ a) >>> 6 */ \ + "rorq $6, "L2"\n\t" \ + /* d += h */ \ + "addq "#h", "#d"\n\t" \ + +#define RND_1_11(a,b,c,d,e,f,g,h,i) \ + /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \ + "xorq "#a", "L2"\n\t" \ + /* h += Sigma0(a) */ \ + "addq "L3", "#h"\n\t" \ + +#define RND_1_12(a,b,c,d,e,f,g,h,i) \ + /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \ + "rorq $28, "L2"\n\t" \ + /* d (= e next RND) */ \ + "movq "#d", "L1"\n\t" \ + /* h += Maj(a,b,c) */ \ + "addq "L2", "#h"\n\t" \ + + +#define MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ + RND_0_1(a,b,c,d,e,f,g,h,i) \ + VPALIGNR(W_M15, W_2, W_0, 8) \ + VPALIGNR(W_M7, W_10, W_8, 8) \ + RND_0_2(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(XTMP1, W_M15, 1) \ + V_SHIFT_L(XTMP2, W_M15, 63) \ + RND_0_3(a,b,c,d,e,f,g,h,i) \ + RND_0_4(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(XTMP3, W_M15, 8) \ + V_SHIFT_L(XTMP4, W_M15, 56) \ + RND_0_5(a,b,c,d,e,f,g,h,i) \ + RND_0_6(a,b,c,d,e,f,g,h,i) \ + V_OR(XTMP1, XTMP2, XTMP1) \ + V_OR(XTMP3, XTMP4, XTMP3) \ + RND_0_7(a,b,c,d,e,f,g,h,i) \ + RND_0_8(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(XTMP4, W_M15, 7) \ + V_XOR(XTMP1, XTMP3, XTMP1) \ + RND_0_9(a,b,c,d,e,f,g,h,i) \ + RND_0_10(a,b,c,d,e,f,g,h,i) \ + V_XOR(XTMP1, XTMP4, XTMP1) \ + V_ADD(W_0, W_0, W_M7) \ + RND_0_11(a,b,c,d,e,f,g,h,i) \ + RND_0_12(a,b,c,d,e,f,g,h,i) \ + RND_1_1(h,a,b,c,d,e,f,g,i+1) \ + V_ADD(W_0, W_0, XTMP1) \ + RND_1_2(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(XTMP1, W_14, 19) \ + V_SHIFT_L(XTMP2, W_14, 45) \ + RND_1_3(h,a,b,c,d,e,f,g,i+1) \ + RND_1_4(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(XTMP3, W_14, 61) \ + V_SHIFT_L(XTMP4, W_14, 3) \ + RND_1_5(h,a,b,c,d,e,f,g,i+1) \ + RND_1_6(h,a,b,c,d,e,f,g,i+1) \ + RND_1_7(h,a,b,c,d,e,f,g,i+1) \ + V_OR(XTMP1, XTMP2, XTMP1) \ + V_OR(XTMP3, XTMP4, XTMP3) \ + RND_1_8(h,a,b,c,d,e,f,g,i+1) \ + RND_1_9(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(XTMP1, XTMP3, XTMP1) \ + V_SHIFT_R(XTMP4, W_14, 6) \ + RND_1_10(h,a,b,c,d,e,f,g,i+1) \ + RND_1_11(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(XTMP1, XTMP4, XTMP1) \ + RND_1_12(h,a,b,c,d,e,f,g,i+1) \ + V_ADD(W_0, W_0, XTMP1) \ + +#define RND_ALL_2(a, b, c, d, e, f, g, h, i) \ + RND_0_1 (a, b, c, d, e, f, g, h, i ) \ + RND_0_2 (a, b, c, d, e, f, g, h, i ) \ + RND_0_3 (a, b, c, d, e, f, g, h, i ) \ + RND_0_4 (a, b, c, d, e, f, g, h, i ) \ + RND_0_5 (a, b, c, d, e, f, g, h, i ) \ + RND_0_6 (a, b, c, d, e, f, g, h, i ) \ + RND_0_7 (a, b, c, d, e, f, g, h, i ) \ + RND_0_8 (a, b, c, d, e, f, g, h, i ) \ + RND_0_9 (a, b, c, d, e, f, g, h, i ) \ + RND_0_10(a, b, c, d, e, f, g, h, i ) \ + RND_0_11(a, b, c, d, e, f, g, h, i ) \ + RND_0_12(a, b, c, d, e, f, g, h, i ) \ + RND_1_1 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_2 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_3 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_4 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_5 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_6 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_7 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_8 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_9 (h, a, b, c, d, e, f, g, i+1) \ + RND_1_10(h, a, b, c, d, e, f, g, i+1) \ + RND_1_11(h, a, b, c, d, e, f, g, i+1) \ + RND_1_12(h, a, b, c, d, e, f, g, i+1) + #if defined(HAVE_INTEL_RORX) - #define Rx_RORX_1(i) h(i)+=S1_RORX(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i]; - #define Rx_RORX_2(i) d(i)+=h(i); - #define Rx_RORX_3(i) h(i)+=S0_RORX(a(i))+Maj(a(i),b(i),c(i)); -#endif /* HAVE_INTEL_RORX */ +#define 
RND_RORX_0_1(a, b, c, d, e, f, g, h, i) \ + /* L1 = e>>>14 */ \ + "rorxq $14, "#e", "L1"\n\t" \ + /* L2 = e>>>18 */ \ + "rorxq $18, "#e", "L2"\n\t" \ + /* Prev RND: h += Maj(a,b,c) */ \ + "addq "L3", "#a"\n\t" \ + +#define RND_RORX_0_2(a, b, c, d, e, f, g, h, i) \ + /* h += w_k */ \ + "addq ("#i")*8("WX"), "#h"\n\t" \ + /* L3 = f */ \ + "movq "#f", "L3"\n\t" \ + /* L2 = (e>>>14) ^ (e>>>18) */ \ + "xorq "L1", "L2"\n\t" \ + +#define RND_RORX_0_3(a, b, c, d, e, f, g, h, i) \ + /* L3 = f ^ g */ \ + "xorq "#g", "L3"\n\t" \ + /* L1 = e>>>41 */ \ + "rorxq $41, "#e", "L1"\n\t" \ + /* L1 = Sigma1(e) */ \ + "xorq "L2", "L1"\n\t" \ + +#define RND_RORX_0_4(a, b, c, d, e, f, g, h, i) \ + /* L3 = (f ^ g) & e */ \ + "andq "#e", "L3"\n\t" \ + /* h += Sigma1(e) */ \ + "addq "L1", "#h"\n\t" \ + /* L1 = a>>>28 */ \ + "rorxq $28, "#a", "L1"\n\t" \ + +#define RND_RORX_0_5(a, b, c, d, e, f, g, h, i) \ + /* L2 = a>>>34 */ \ + "rorxq $34, "#a", "L2"\n\t" \ + /* L3 = Ch(e,f,g) */ \ + "xorq "#g", "L3"\n\t" \ + /* L2 = (a>>>28) ^ (a>>>34) */ \ + "xorq "L1", "L2"\n\t" \ + +#define RND_RORX_0_6(a, b, c, d, e, f, g, h, i) \ + /* L1 = a>>>39 */ \ + "rorxq $39, "#a", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addq "L3", "#h"\n\t" \ + /* L1 = Sigma0(a) */ \ + "xorq "L2", "L1"\n\t" \ + +#define RND_RORX_0_7(a, b, c, d, e, f, g, h, i) \ + /* L3 = b */ \ + "movq "#b", "L3"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addq "#h", "#d"\n\t" \ + /* L3 = a ^ b */ \ + "xorq "#a", "L3"\n\t" \ + +#define RND_RORX_0_8(a, b, c, d, e, f, g, h, i) \ + /* L4 = (a ^ b) & (b ^ c) */ \ + "andq "L3", "L4"\n\t" \ + /* h += Sigma0(a) */ \ + "addq "L1", "#h"\n\t" \ + /* L4 = Maj(a,b,c) */ \ + "xorq "#b", "L4"\n\t" \ + +#define RND_RORX_1_1(a, b, c, d, e, f, g, h, i) \ + /* L1 = e>>>14 */ \ + "rorxq $14, "#e", "L1"\n\t" \ + /* L2 = e>>>18 */ \ + "rorxq $18, "#e", "L2"\n\t" \ + /* Prev RND: h += Maj(a,b,c) */ \ + "addq "L4", "#a"\n\t" \ + +#define RND_RORX_1_2(a, b, c, d, e, f, g, h, i) \ + /* h += w_k */ \ + "addq ("#i")*8("WX"), "#h"\n\t" \ + /* L4 = f */ \ + "movq "#f", "L4"\n\t" \ + /* L2 = (e>>>14) ^ (e>>>18) */ \ + "xorq "L1", "L2"\n\t" \ + +#define RND_RORX_1_3(a, b, c, d, e, f, g, h, i) \ + /* L4 = f ^ g */ \ + "xorq "#g", "L4"\n\t" \ + /* L1 = e>>>41 */ \ + "rorxq $41, "#e", "L1"\n\t" \ + /* L1 = Sigma1(e) */ \ + "xorq "L2", "L1"\n\t" \ + +#define RND_RORX_1_4(a, b, c, d, e, f, g, h, i) \ + /* L4 = (f ^ g) & e */ \ + "andq "#e", "L4"\n\t" \ + /* h += Sigma1(e) */ \ + "addq "L1", "#h"\n\t" \ + /* L1 = a>>>28 */ \ + "rorxq $28, "#a", "L1"\n\t" \ + +#define RND_RORX_1_5(a, b, c, d, e, f, g, h, i) \ + /* L2 = a>>>34 */ \ + "rorxq $34, "#a", "L2"\n\t" \ + /* L4 = Ch(e,f,g) */ \ + "xorq "#g", "L4"\n\t" \ + /* L2 = (a>>>28) ^ (a>>>34) */ \ + "xorq "L1", "L2"\n\t" \ + +#define RND_RORX_1_6(a, b, c, d, e, f, g, h, i) \ + /* L1 = a>>>39 */ \ + "rorxq $39, "#a", "L1"\n\t" \ + /* h += Ch(e,f,g) */ \ + "addq "L4", "#h"\n\t" \ + /* L1 = Sigma0(a) */ \ + "xorq "L2", "L1"\n\t" \ + +#define RND_RORX_1_7(a, b, c, d, e, f, g, h, i) \ + /* L4 = b */ \ + "movq "#b", "L4"\n\t" \ + /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ + "addq "#h", "#d"\n\t" \ + /* L4 = a ^ b */ \ + "xorq "#a", "L4"\n\t" \ + +#define RND_RORX_1_8(a, b, c, d, e, f, g, h, i) \ + /* L2 = (a ^ b) & (b ^ c) */ \ + "andq "L4", "L3"\n\t" \ + /* h += Sigma0(a) */ \ + "addq "L1", "#h"\n\t" \ + /* L3 = Maj(a,b,c) */ \ + "xorq "#b", "L3"\n\t" \ + +#define RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i) \ + RND_RORX_0_1(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_0_2(a, b, c, d, e, f, g, h, i+0) 
\ + RND_RORX_0_3(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_0_4(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_0_5(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_0_6(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_0_7(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_0_8(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_1_1(h, a, b, c, d, e, f, g, i+1) \ + RND_RORX_1_2(h, a, b, c, d, e, f, g, i+1) \ + RND_RORX_1_3(h, a, b, c, d, e, f, g, i+1) \ + RND_RORX_1_4(h, a, b, c, d, e, f, g, i+1) \ + RND_RORX_1_5(h, a, b, c, d, e, f, g, i+1) \ + RND_RORX_1_6(h, a, b, c, d, e, f, g, i+1) \ + RND_RORX_1_7(h, a, b, c, d, e, f, g, i+1) \ + RND_RORX_1_8(h, a, b, c, d, e, f, g, i+1) \ + +#define RND_RORX_ALL_4(a, b, c, d, e, f, g, h, i) \ + RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i+0) \ + RND_RORX_ALL_2(g, h, a, b, c, d, e, f, i+2) + +#define MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ + RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ + VPALIGNR(W_M15, W_2, W_0, 8) \ + VPALIGNR(W_M7, W_10, W_8, 8) \ + RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(XTMP1, W_M15, 1) \ + V_SHIFT_L(XTMP2, W_M15, 63) \ + RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(XTMP3, W_M15, 8) \ + V_SHIFT_L(XTMP4, W_M15, 56) \ + RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ + V_OR(XTMP1, XTMP2, XTMP1) \ + V_OR(XTMP3, XTMP4, XTMP3) \ + RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(XTMP4, W_M15, 7) \ + V_XOR(XTMP1, XTMP3, XTMP1) \ + RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \ + V_XOR(XTMP1, XTMP4, XTMP1) \ + V_ADD(W_0, W_0, W_M7) \ + RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \ + RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \ + V_ADD(W_0, W_0, XTMP1) \ + RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(XTMP1, W_14, 19) \ + V_SHIFT_L(XTMP2, W_14, 45) \ + RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(XTMP3, W_14, 61) \ + V_SHIFT_L(XTMP4, W_14, 3) \ + RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \ + V_OR(XTMP1, XTMP2, XTMP1) \ + V_OR(XTMP3, XTMP4, XTMP3) \ + RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \ + RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(XTMP1, XTMP3, XTMP1) \ + V_SHIFT_R(XTMP4, W_14, 6) \ + RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \ + RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(XTMP1, XTMP4, XTMP1) \ + RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \ + V_ADD(W_0, W_0, XTMP1) \ + +#endif + +#define _INIT_MASK(mask) \ + "vmovdqu %[mask], %%"#mask"\n\t" +#define INIT_MASK(mask) \ + _INIT_MASK(mask) + +#define _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \ + "vmovdqu "#i1"*16(%%"#reg"), %%"#xmm1"\n\t" \ + "vmovdqu "#i2"*16(%%"#reg"), %%"#xmm2"\n\t" \ + "vpshufb %%"#mask", %%"#xmm1", %%"#xmm1"\n\t" \ + "vpshufb %%"#mask", %%"#xmm2", %%"#xmm2"\n\t" +#define LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \ + _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) + +#define LOAD_W(mask, reg) \ + /* X0..3(xmm4..7), W[0..15] = buffer[0.15]; */ \ + LOAD_W_2(0, 1, W_0 , W_2 , mask, reg) \ + LOAD_W_2(2, 3, W_4 , W_6 , mask, reg) \ + LOAD_W_2(4, 5, W_8 , W_10, mask, reg) \ + LOAD_W_2(6, 7, W_12, W_14, mask, reg) + +#define _SET_W_X_2(xmm0, xmm1, reg, i) \ + "vpaddq "#i"+ 0(%%"#reg"), %%"#xmm0", %%xmm8\n\t" \ + "vpaddq "#i"+16(%%"#reg"), %%"#xmm1", %%xmm9\n\t" \ + "vmovdqu %%xmm8, "#i"+ 0("WX")\n\t" \ + "vmovdqu %%xmm9, "#i"+16("WX")\n\t" \ + +#define SET_W_X_2(xmm0, xmm1, reg, i) \ + _SET_W_X_2(xmm0, xmm1, reg, i) + +#define SET_W_X(reg) \ + SET_W_X_2(W_0 , W_2 , reg, 0) \ + SET_W_X_2(W_4 , W_6 , reg, 32) \ + SET_W_X_2(W_8 , W_10, reg, 64) \ + SET_W_X_2(W_12, W_14, reg, 96) + +#define LOAD_DIGEST() \ + "movq (%[sha512]), %%r8 \n\t" \ + "movq 8(%[sha512]), %%r9 \n\t" \ + "movq 16(%[sha512]), %%r10\n\t" \ + "movq 24(%[sha512]), %%r11\n\t" \ + "movq 32(%[sha512]), %%r12\n\t" \ 
+ "movq 40(%[sha512]), %%r13\n\t" \ + "movq 48(%[sha512]), %%r14\n\t" \ + "movq 56(%[sha512]), %%r15\n\t" + +#define STORE_ADD_DIGEST() \ + "addq %%r8, (%[sha512])\n\t" \ + "addq %%r9, 8(%[sha512])\n\t" \ + "addq %%r10, 16(%[sha512])\n\t" \ + "addq %%r11, 24(%[sha512])\n\t" \ + "addq %%r12, 32(%[sha512])\n\t" \ + "addq %%r13, 40(%[sha512])\n\t" \ + "addq %%r14, 48(%[sha512])\n\t" \ + "addq %%r15, 56(%[sha512])\n\t" + +#define ADD_DIGEST() \ + "addq (%[sha512]), %%r8 \n\t" \ + "addq 8(%[sha512]), %%r9 \n\t" \ + "addq 16(%[sha512]), %%r10\n\t" \ + "addq 24(%[sha512]), %%r11\n\t" \ + "addq 32(%[sha512]), %%r12\n\t" \ + "addq 40(%[sha512]), %%r13\n\t" \ + "addq 48(%[sha512]), %%r14\n\t" \ + "addq 56(%[sha512]), %%r15\n\t" + +#define STORE_DIGEST() \ + "movq %%r8, (%[sha512])\n\t" \ + "movq %%r9, 8(%[sha512])\n\t" \ + "movq %%r10, 16(%[sha512])\n\t" \ + "movq %%r11, 24(%[sha512])\n\t" \ + "movq %%r12, 32(%[sha512])\n\t" \ + "movq %%r13, 40(%[sha512])\n\t" \ + "movq %%r14, 48(%[sha512])\n\t" \ + "movq %%r15, 56(%[sha512])\n\t" #endif /* HAVE_INTEL_AVX1 */ -#if defined(HAVE_INTEL_AVX2) -#define Ry_1(i, w) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + w; -#define Ry_2(i, w) d(i)+=h(i); -#define Ry_3(i, w) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)); -#endif /* HAVE_INTEL_AVX2 */ - -/* INLINE Assember for Intel AVX1 instructions */ -#if defined(HAVE_INTEL_AVX1) -#if defined(DEBUG_XMM) - #define SAVE_REG(i) __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0]):); - #define RECV_REG(i) __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0])); - - #define _DUMP_REG(REG, name)\ - { word64 buf[16];word64 reg[16][2];int k;\ - SAVE_REG(0); SAVE_REG(1); SAVE_REG(2); SAVE_REG(3); SAVE_REG(4); \ - SAVE_REG(5); SAVE_REG(6); SAVE_REG(7);SAVE_REG(8); SAVE_REG(9); SAVE_REG(10);\ - SAVE_REG(11); SAVE_REG(12); SAVE_REG(13); SAVE_REG(14); SAVE_REG(15); \ - __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0]):);\ - printf(" "#name":\t"); for(k=0; k<2; k++) printf("%016lx.", (word64)(buf[k])); printf("\n"); \ - RECV_REG(0); RECV_REG(1); RECV_REG(2); RECV_REG(3); RECV_REG(4);\ - RECV_REG(5); RECV_REG(6); RECV_REG(7); RECV_REG(8); RECV_REG(9);\ - RECV_REG(10); RECV_REG(11); RECV_REG(12); RECV_REG(13); RECV_REG(14); RECV_REG(15);\ - } - - #define DUMP_REG(REG) _DUMP_REG(REG, #REG) - #define PRINTF(fmt, ...) -#else - #define DUMP_REG(REG) - #define PRINTF(fmt, ...) 
-#endif /* DEBUG_XMM */ - -#define _MOVE_to_REG(xymm, mem) __asm__ volatile("vmovdqu %0, %%"#xymm" "\ - :: "m"(mem)); -#define _MOVE_to_MEM(mem,i, xymm) __asm__ volatile("vmovdqu %%"#xymm", %0" :\ - "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3]):); -#define _MOVE(dest, src) __asm__ volatile("vmovdqu %%"#src", %%"\ - #dest" "::); - -#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrlq $"#bits", %%"\ - #src", %%"#dest"\n\tvpsllq $64-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\ - #temp",%%"#dest", %%"#dest" "::); -#define _AVX1_R(dest, src, bits) __asm__ volatile("vpsrlq $"#bits", %%"\ - #src", %%"#dest" "::); -#define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ - #src2", %%"#dest" "::); -#define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\ - #src2", %%"#dest" "::); -#define _ADD(dest, src1, src2) __asm__ volatile("vpaddq %%"#src1", %%"\ - #src2", %%"#dest" "::); -#define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddq %0, %%"#src1", %%"\ - #dest" "::"m"(mem)); - -#define MOVE_to_REG(xymm, mem) _MOVE_to_REG(xymm, mem) -#define MOVE_to_MEM(mem, i, xymm) _MOVE_to_MEM(mem, i, xymm) -#define MOVE(dest, src) _MOVE(dest, src) - -#define XOR(dest, src1, src2) _XOR(dest, src1, src2) -#define OR(dest, src1, src2) _OR(dest, src1, src2) -#define ADD(dest, src1, src2) _ADD(dest, src1, src2) - -#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp); -#define AVX1_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) -#define AVX1_R(dest, src, bits) _AVX1_R(dest, src, bits) - -#define Init_Mask(mask) \ - __asm__ volatile("vmovdqu %0, %%xmm1\n\t"::"m"(mask):"%xmm1"); - -#define _W_from_buff1(w, buff, xmm) \ - /* X0..3(xmm4..7), W[0..15] = sha512->buffer[0.15]; */\ - __asm__ volatile("vmovdqu %1, %%"#xmm"\n\t"\ - "vpshufb %%xmm1, %%"#xmm", %%"#xmm"\n\t"\ - "vmovdqu %%"#xmm", %0"\ - :"=m"(w): "m"(buff):"%xmm0"); - -#define W_from_buff1(w, buff, xmm) _W_from_buff1(w, buff, xmm) - -#define W_from_buff(w, buff)\ - Init_Mask(mBYTE_FLIP_MASK[0]);\ - W_from_buff1(w[0], buff[0], W_0);\ - W_from_buff1(w[2], buff[2], W_2);\ - W_from_buff1(w[4], buff[4], W_4);\ - W_from_buff1(w[6], buff[6], W_6);\ - W_from_buff1(w[8], buff[8], W_8);\ - W_from_buff1(w[10],buff[10],W_10);\ - W_from_buff1(w[12],buff[12],W_12);\ - W_from_buff1(w[14],buff[14],W_14); - -static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f }; - -#define W_I_15 xmm14 -#define W_I_7 xmm11 -#define W_I_2 xmm13 -#define W_I xmm12 -#define G_TEMP xmm0 -#define S_TEMP xmm1 -#define XMM_TEMP0 xmm2 - -#define W_0 xmm12 -#define W_2 xmm3 -#define W_4 xmm4 -#define W_6 xmm5 -#define W_8 xmm6 -#define W_10 xmm7 -#define W_12 xmm8 -#define W_14 xmm9 - -#define s0_1(dest, src) AVX1_S(dest, src, 1); -#define s0_2(dest, src) AVX1_S(G_TEMP, src, 8); XOR(dest, G_TEMP, dest); -#define s0_3(dest, src) AVX1_R(G_TEMP, src, 7); XOR(dest, G_TEMP, dest); - -#define s1_1(dest, src) AVX1_S(dest, src, 19); -#define s1_2(dest, src) AVX1_S(G_TEMP, src, 61); XOR(dest, G_TEMP, dest); -#define s1_3(dest, src) AVX1_R(G_TEMP, src, 6); XOR(dest, G_TEMP, dest); - -#define s0_(dest, src) s0_1(dest, src); s0_2(dest, src); s0_3(dest, src) -#define s1_(dest, src) s1_1(dest, src); s1_2(dest, src); s1_3(dest, src) - -#define Block_xx_1(i) \ - MOVE_to_REG(W_I_15, W_X[(i-15)&15]);\ - MOVE_to_REG(W_I_7, W_X[(i- 7)&15]);\ - -#define Block_xx_2(i) \ - MOVE_to_REG(W_I_2, W_X[(i- 2)&15]);\ - MOVE_to_REG(W_I, W_X[(i)]);\ - -#define Block_xx_3(i) \ - s0_ (XMM_TEMP0, W_I_15);\ - -#define Block_xx_4(i) \ - ADD(W_I, W_I, 
XMM_TEMP0);\ - ADD(W_I, W_I, W_I_7);\ - -#define Block_xx_5(i) \ - s1_ (XMM_TEMP0, W_I_2);\ - -#define Block_xx_6(i) \ - ADD(W_I, W_I, XMM_TEMP0);\ - MOVE_to_MEM(W_X,i, W_I);\ - if (i==0)\ - MOVE_to_MEM(W_X,16, W_I);\ - -#define Block_xx_7(i) \ - MOVE_to_REG(W_I_15, W_X[(i-15)&15]);\ - MOVE_to_REG(W_I_7, W_X[(i- 7)&15]);\ - -#define Block_xx_8(i) \ - MOVE_to_REG(W_I_2, W_X[(i- 2)&15]);\ - MOVE_to_REG(W_I, W_X[(i)]);\ - -#define Block_xx_9(i) \ - s0_ (XMM_TEMP0, W_I_15);\ - -#define Block_xx_10(i) \ - ADD(W_I, W_I, XMM_TEMP0);\ - ADD(W_I, W_I, W_I_7);\ - -#define Block_xx_11(i) \ - s1_ (XMM_TEMP0, W_I_2);\ - -#define Block_xx_12(i) \ - ADD(W_I, W_I, XMM_TEMP0);\ - MOVE_to_MEM(W_X,i, W_I);\ - if ((i)==0)\ - MOVE_to_MEM(W_X,16, W_I);\ - -static INLINE void Block_0_1(word64 *W_X) { Block_xx_1(0); } -static INLINE void Block_0_2(word64 *W_X) { Block_xx_2(0); } -static INLINE void Block_0_3(void) { Block_xx_3(0); } -static INLINE void Block_0_4(void) { Block_xx_4(0); } -static INLINE void Block_0_5(void) { Block_xx_5(0); } -static INLINE void Block_0_6(word64 *W_X) { Block_xx_6(0); } -static INLINE void Block_0_7(word64 *W_X) { Block_xx_7(2); } -static INLINE void Block_0_8(word64 *W_X) { Block_xx_8(2); } -static INLINE void Block_0_9(void) { Block_xx_9(2); } -static INLINE void Block_0_10(void){ Block_xx_10(2); } -static INLINE void Block_0_11(void){ Block_xx_11(2); } -static INLINE void Block_0_12(word64 *W_X){ Block_xx_12(2); } - -static INLINE void Block_4_1(word64 *W_X) { Block_xx_1(4); } -static INLINE void Block_4_2(word64 *W_X) { Block_xx_2(4); } -static INLINE void Block_4_3(void) { Block_xx_3(4); } -static INLINE void Block_4_4(void) { Block_xx_4(4); } -static INLINE void Block_4_5(void) { Block_xx_5(4); } -static INLINE void Block_4_6(word64 *W_X) { Block_xx_6(4); } -static INLINE void Block_4_7(word64 *W_X) { Block_xx_7(6); } -static INLINE void Block_4_8(word64 *W_X) { Block_xx_8(6); } -static INLINE void Block_4_9(void) { Block_xx_9(6); } -static INLINE void Block_4_10(void){ Block_xx_10(6); } -static INLINE void Block_4_11(void){ Block_xx_11(6); } -static INLINE void Block_4_12(word64 *W_X){ Block_xx_12(6); } - -static INLINE void Block_8_1(word64 *W_X) { Block_xx_1(8); } -static INLINE void Block_8_2(word64 *W_X) { Block_xx_2(8); } -static INLINE void Block_8_3(void) { Block_xx_3(8); } -static INLINE void Block_8_4(void) { Block_xx_4(8); } -static INLINE void Block_8_5(void) { Block_xx_5(8); } -static INLINE void Block_8_6(word64 *W_X) { Block_xx_6(8); } -static INLINE void Block_8_7(word64 *W_X) { Block_xx_7(10); } -static INLINE void Block_8_8(word64 *W_X) { Block_xx_8(10); } -static INLINE void Block_8_9(void) { Block_xx_9(10); } -static INLINE void Block_8_10(void){ Block_xx_10(10); } -static INLINE void Block_8_11(void){ Block_xx_11(10); } -static INLINE void Block_8_12(word64 *W_X){ Block_xx_12(10); } - -static INLINE void Block_12_1(word64 *W_X) { Block_xx_1(12); } -static INLINE void Block_12_2(word64 *W_X) { Block_xx_2(12); } -static INLINE void Block_12_3(void) { Block_xx_3(12); } -static INLINE void Block_12_4(void) { Block_xx_4(12); } -static INLINE void Block_12_5(void) { Block_xx_5(12); } -static INLINE void Block_12_6(word64 *W_X) { Block_xx_6(12); } -static INLINE void Block_12_7(word64 *W_X) { Block_xx_7(14); } -static INLINE void Block_12_8(word64 *W_X) { Block_xx_8(14); } -static INLINE void Block_12_9(void) { Block_xx_9(14); } -static INLINE void Block_12_10(void){ Block_xx_10(14); } -static INLINE void Block_12_11(void){ Block_xx_11(14); } -static INLINE void 
Block_12_12(word64 *W_X){ Block_xx_12(14); } - -#endif /* HAVE_INTEL_AVX1 */ - -#if defined(HAVE_INTEL_AVX2) -static const unsigned long mBYTE_FLIP_MASK_Y[] = - { 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f }; - -#define W_from_buff_Y(buff)\ - { /* X0..3(ymm9..12), W_X[0..15] = sha512->buffer[0.15]; */\ - __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]));\ - __asm__ volatile("vmovdqu %0, %%ymm12\n\t"\ - "vmovdqu %1, %%ymm4\n\t"\ - "vpshufb %%ymm8, %%ymm12, %%ymm12\n\t"\ - "vpshufb %%ymm8, %%ymm4, %%ymm4\n\t"\ - :: "m"(buff[0]), "m"(buff[4]));\ - __asm__ volatile("vmovdqu %0, %%ymm5\n\t"\ - "vmovdqu %1, %%ymm6\n\t"\ - "vpshufb %%ymm8, %%ymm5, %%ymm5\n\t"\ - "vpshufb %%ymm8, %%ymm6, %%ymm6\n\t"\ - :: "m"(buff[8]), "m"(buff[12]));\ - } - -#if defined(DEBUG_YMM) - #define SAVE_REG_Y(i) __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0]):); - #define RECV_REG_Y(i) __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0])); - - #define _DUMP_REG_Y(REG, name)\ - { word64 buf[16];word64 reg[16][2];int k;\ - SAVE_REG_Y(4); SAVE_REG_Y(5); SAVE_REG_Y(6); SAVE_REG_Y(7); \ - SAVE_REG_Y(8); SAVE_REG_Y(9); SAVE_REG_Y(10); SAVE_REG_Y(11); SAVE_REG_Y(12);\ - SAVE_REG_Y(13); SAVE_REG_Y(14); SAVE_REG_Y(15); \ - __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0]):);\ - printf(" "#name":\t"); for(k=0; k<4; k++) printf("%016lx.", (word64)buf[k]); printf("\n"); \ - RECV_REG_Y(4); RECV_REG_Y(5); RECV_REG_Y(6); RECV_REG_Y(7); \ - RECV_REG_Y(8); RECV_REG_Y(9); RECV_REG_Y(10); RECV_REG_Y(11); RECV_REG_Y(12); \ - RECV_REG_Y(13); RECV_REG_Y(14); RECV_REG_Y(15);\ - } - - #define DUMP_REG_Y(REG) _DUMP_REG_Y(REG, #REG) - #define DUMP_REG2_Y(REG) _DUMP_REG_Y(REG, #REG) - #define PRINTF_Y(fmt, ...) -#else - #define DUMP_REG_Y(REG) - #define DUMP_REG2_Y(REG) - #define PRINTF_Y(fmt, ...) 
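/* A minimal C reference sketch (not from the patch) of the SHA-512 message
 * schedule that MsgSched2 and MsgSched_RORX above (and their AVX2
 * counterparts later in the patch) vectorize two or four words at a time.
 * AVX has no 64-bit vector rotate, so each rotation is emitted as a
 * shift-right / shift-left-(64-n) / OR triple; the shift pairs below
 * (1/63, 8/56, 19/45, 61/3) match the V_SHIFT_R, V_SHIFT_L and V_OR
 * constants used in those macros.  Names here are illustrative only and
 * assume wolfSSL's word64 type. */
static word64 Sigma_s0(word64 x)
{
    /* s0(x) = (x >>> 1) ^ (x >>> 8) ^ (x >> 7) */
    return ((x >> 1) | (x << 63)) ^ ((x >> 8) | (x << 56)) ^ (x >> 7);
}

static word64 Sigma_s1(word64 x)
{
    /* s1(x) = (x >>> 19) ^ (x >>> 61) ^ (x >> 6) */
    return ((x >> 19) | (x << 45)) ^ ((x >> 61) | (x << 3)) ^ (x >> 6);
}

/* W[t] for 16 <= t < 80; the asm keeps only a 16-entry sliding window in
 * xmm/ymm registers and accumulates this sum into W_0 with V_ADD. */
static word64 NextW(const word64* W, int t)
{
    return W[t - 16] + Sigma_s0(W[t - 15]) + W[t - 7] + Sigma_s1(W[t - 2]);
}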
-#endif /* DEBUG_YMM */ - -#define _MOVE_to_REGy(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" "\ - :: "m"(mem)); -#define _MOVE_to_MEMy(mem,i, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" \ - : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3]):); -#define _MOVE_128y(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"\ - #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" "::); -#define _S_TEMPy(dest, src, bits, temp) \ - __asm__ volatile("vpsrlq $"#bits", %%"#src", %%"#dest"\n\tvpsllq $64-"#bits\ - ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" "::); -#define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrlq $"#bits", %%"\ - #src", %%"#dest" "::); -#define _XORy(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ - #src2", %%"#dest" "::); -#define _ADDy(dest, src1, src2) __asm__ volatile("vpaddq %%"#src1", %%"\ - #src2", %%"#dest" "::); -#define _BLENDy(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\ - #src1", %%"#src2", %%"#dest" "::); -#define _BLENDQy(map, dest, src1, src2) __asm__ volatile("vblendpd $"#map", %%"\ - #src1", %%"#src2", %%"#dest" "::); -#define _PERMQy(map, dest, src) __asm__ volatile("vpermq $"#map", %%"\ - #src", %%"#dest" "::); - -#define MOVE_to_REGy(ymm, mem) _MOVE_to_REGy(ymm, mem) -#define MOVE_to_MEMy(mem, i, ymm) _MOVE_to_MEMy(mem, i, ymm) - -#define MOVE_128y(ymm0, ymm1, ymm2, map) _MOVE_128y(ymm0, ymm1, ymm2, map) -#define XORy(dest, src1, src2) _XORy(dest, src1, src2) -#define ADDy(dest, src1, src2) _ADDy(dest, src1, src2) -#define BLENDy(map, dest, src1, src2) _BLENDy(map, dest, src1, src2) -#define BLENDQy(map, dest, src1, src2) _BLENDQy(map, dest, src1, src2) -#define PERMQy(map, dest, src) _PERMQy(map, dest, src) - - -#define S_TMPy(dest, src, bits, temp) _S_TEMPy(dest, src, bits, temp); -#define AVX2_S(dest, src, bits) S_TMPy(dest, src, bits, S_TEMPy) -#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) - - -#define FEEDBACK1_to_W_I_2(w_i_2, w_i) MOVE_128y(YMM_TEMP0, w_i, w_i, 0x08);\ - BLENDy(0xf0, w_i_2, YMM_TEMP0, w_i_2); - -#define MOVE_W_to_W_I_15(w_i_15, w_0, w_4) BLENDQy(0x1, w_i_15, w_4, w_0);\ - PERMQy(0x39, w_i_15, w_i_15); -#define MOVE_W_to_W_I_7(w_i_7, w_8, w_12) BLENDQy(0x1, w_i_7, w_12, w_8);\ - PERMQy(0x39, w_i_7, w_i_7); -#define MOVE_W_to_W_I_2(w_i_2, w_12) BLENDQy(0xc, w_i_2, w_12, w_i_2);\ - PERMQy(0x0e, w_i_2, w_i_2); - - -#define W_I_16y ymm8 -#define W_I_15y ymm9 -#define W_I_7y ymm10 -#define W_I_2y ymm11 -#define W_Iy ymm12 -#define G_TEMPy ymm13 -#define S_TEMPy ymm14 -#define YMM_TEMP0 ymm15 -#define YMM_TEMP0x xmm15 -#define W_I_TEMPy ymm7 -#define W_K_TEMPy ymm15 -#define W_K_TEMPx xmm15 -#define W_0y ymm12 -#define W_4y ymm4 -#define W_8y ymm5 -#define W_12y ymm6 - - -#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\ - __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" "::);\ - __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" "::);\ - __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" "::);\ - __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" "::);\ - __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" "::);\ - -#define MOVE_7_to_15(w_i_15, w_i_7)\ - __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" "::);\ - -#define MOVE_I_to_7(w_i_7, w_i)\ - __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" "::);\ - __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" "::);\ - __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" "::);\ - -#define MOVE_I_to_2(w_i_2, w_i)\ - 
__asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" "::);\ - __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" "::);\ - -#endif /* HAVE_INTEL_AVX2 */ - /*** Transform Body ***/ #if defined(HAVE_INTEL_AVX1) -static int Transform_AVX1(wc_Sha512* sha512) +static int Transform_Sha512_AVX1(wc_Sha512* sha512) { - const word64* K = K512; - word64 W_X[16+4] = {0}; - word32 j; - word64 T[8]; + __asm__ __volatile__ ( - /* Copy digest to working vars */ - XMEMCPY(T, sha512->digest, sizeof(T)); + /* 16 Ws plus loop counter. */ + "subq $136, %%rsp\n\t" + "leaq 64(%[sha512]), %%rax\n\t" - W_from_buff(W_X, sha512->buffer); - for (j = 0; j < 80; j += 16) { - Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3(); - Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X); - Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9(); - Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X); + INIT_MASK(MASK) + LOAD_DIGEST() - Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3(); - Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X); - Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9(); - Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X); + LOAD_W(MASK, rax) - Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3(); - Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X); - Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9(); - Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X); + "movl $4, 16*8("WX")\n\t" + "leaq %[K512], %%rsi\n\t" + /* b */ + "movq %%r9, "L4"\n\t" + /* e */ + "movq %%r12, "L1"\n\t" + /* b ^ c */ + "xorq %%r10, "L4"\n\t" - Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3(); - Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X); - Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9(); - Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X); - } + "# Start of 16 rounds\n" + "1:\n\t" - /* Add the working vars back into digest */ - sha512->digest[0] += a(0); - sha512->digest[1] += b(0); - sha512->digest[2] += c(0); - sha512->digest[3] += d(0); - sha512->digest[4] += e(0); - sha512->digest[5] += f(0); - sha512->digest[6] += g(0); - sha512->digest[7] += h(0); + SET_W_X(rsi) - /* Wipe variables */ -#if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) - XMEMSET(W_X, 0, sizeof(word64) * 16); -#endif - XMEMSET(T, 0, sizeof(T)); + "addq $128, %%rsi\n\t" + + MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) + MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) + MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) + MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) + MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) + MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) + MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) + + "subl $1, 16*8("WX")\n\t" + "jne 1b\n\t" + + SET_W_X(rsi) + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) + 
RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) + + STORE_ADD_DIGEST() + + "addq $136, %%rsp\n\t" + + : + : [mask] "m" (mBYTE_FLIP_MASK), + [sha512] "r" (sha512), + [K512] "m" (K512) + : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" + ); + + return 0; +} + +static int Transform_Sha512_AVX1_Len(wc_Sha512* sha512, word32 len) +{ + __asm__ __volatile__ ( + + "movq 224(%[sha512]), %%rsi\n\t" + "leaq %[K512], %%rdx\n\t" + + INIT_MASK(MASK) + LOAD_DIGEST() + + "# Start of processing a block\n" + "2:\n\t" + + /* 16 Ws plus loop counter and K512. len goes into -4(%rsp). + * Debug needs more stack space. */ + "subq $256, %%rsp\n\t" + + LOAD_W(MASK, rsi) + + "movl $4, 16*8("WX")\n\t" + /* b */ + "movq %%r9, "L4"\n\t" + /* e */ + "movq %%r12, "L1"\n\t" + /* b ^ c */ + "xorq %%r10, "L4"\n\t" + + SET_W_X(rdx) + + "# Start of 16 rounds\n" + "1:\n\t" + + "addq $128, %%rdx\n\t" + "movq %%rdx, 17*8(%%rsp)\n\t" + + MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) + MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) + MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) + MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) + MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) + MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) + MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) + + "movq 17*8(%%rsp), %%rdx\n\t" + + SET_W_X(rdx) + + "subl $1, 16*8("WX")\n\t" + "jne 1b\n\t" + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) + + ADD_DIGEST() + + "addq $256, %%rsp\n\t" + "leaq %[K512], %%rdx\n\t" + "addq $128, %%rsi\n\t" + "subl $128, %[len]\n\t" + + STORE_DIGEST() + + "jnz 2b\n\t" + + : + : [mask] "m" (mBYTE_FLIP_MASK), + [len] "m" (len), + [sha512] "r" (sha512), + [K512] "m" (K512) + : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" + ); return 0; } #endif /* HAVE_INTEL_AVX1 */ -#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX) -static int Transform_AVX1_RORX(wc_Sha512* sha512) +#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) +static int Transform_Sha512_AVX1_RORX(wc_Sha512* sha512) { - const word64* K = K512; - word64 W_X[16+4] = {0}; - word32 j; - word64 T[8]; + __asm__ __volatile__ ( - /* Copy digest to working vars */ - XMEMCPY(T, sha512->digest, sizeof(T)); + /* 16 Ws plus loop counter and K512. 
*/ + "subq $144, %%rsp\n\t" + "leaq 64(%[sha512]), %%rax\n\t" - W_from_buff(W_X, sha512->buffer); - for (j = 0; j < 80; j += 16) { - Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); Block_0_2(W_X); - Rx_RORX_3( 0); Block_0_3(); - Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5(); - Rx_RORX_3( 1); Block_0_6(W_X); - Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); Block_0_8(W_X); - Rx_RORX_3( 2); Block_0_9(); - Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11(); - Rx_RORX_3( 3); Block_0_12(W_X); + INIT_MASK(MASK) + LOAD_DIGEST() - Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); Block_4_2(W_X); - Rx_RORX_3( 4); Block_4_3(); - Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5(); - Rx_RORX_3( 5); Block_4_6(W_X); - Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); Block_4_8(W_X); - Rx_RORX_3( 6); Block_4_9(); - Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11(); - Rx_RORX_3( 7); Block_4_12(W_X); + LOAD_W(MASK, rax) - Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); Block_8_2(W_X); - Rx_RORX_3( 8); Block_8_3(); - Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5(); - Rx_RORX_3( 9); Block_8_6(W_X); - Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); Block_8_8(W_X); - Rx_RORX_3(10); Block_8_9(); - Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11(); - Rx_RORX_3(11); Block_8_12(W_X); + "movl $4, 16*8("WX")\n\t" + "leaq %[K512], %%rsi\n\t" + /* L4 = b */ + "movq %%r9, "L4"\n\t" + /* L3 = 0 (add to prev h) */ + "xorq "L3", "L3"\n\t" + /* L4 = b ^ c */ + "xorq %%r10, "L4"\n\t" - Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12); Block_12_2(W_X); - Rx_RORX_3(12); Block_12_3(); - Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5(); - Rx_RORX_3(13); Block_12_6(W_X); - Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); Block_12_8(W_X); - Rx_RORX_3(14); Block_12_9(); - Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11(); - Rx_RORX_3(15); Block_12_12(W_X); - } + SET_W_X(rsi) - /* Add the working vars back into digest */ - sha512->digest[0] += a(0); - sha512->digest[1] += b(0); - sha512->digest[2] += c(0); - sha512->digest[3] += d(0); - sha512->digest[4] += e(0); - sha512->digest[5] += f(0); - sha512->digest[6] += g(0); - sha512->digest[7] += h(0); + "# Start of 16 rounds\n" + "1:\n\t" - /* Wipe variables */ -#if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2) - XMEMSET(W_X, 0, sizeof(word64) * 16); -#endif - XMEMSET(T, 0, sizeof(T)); + "addq $128, %%rsi\n\t" + + MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) + MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) + MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) + MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) + MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) + MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) + MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) + + SET_W_X(rsi) + + "subl $1, 16*8("WX")\n\t" + "jne 1b\n\t" + + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) + RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) + + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) + RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) + + /* Prev RND: h += Maj(a,b,c) */ + "addq "L3", %%r8\n\t" + "addq $144, %%rsp\n\t" + + 
STORE_ADD_DIGEST() + + : + : [mask] "m" (mBYTE_FLIP_MASK), + [sha512] "r" (sha512), + [K512] "m" (K512) + : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" + ); return 0; } -#endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_AVX1 && HAVE_INTEL_RORX */ + +static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512* sha512, word32 len) +{ + __asm__ __volatile__ ( + + "movq 224(%[sha512]), %%rsi\n\t" + "leaq %[K512], %%rcx\n\t" + + INIT_MASK(MASK) + LOAD_DIGEST() + + "# Start of processing a block\n" + "2:\n\t" + + /* 16 Ws plus loop counter and K512. len goes into -4(%rsp). + * Debug needs more stack space. */ + "subq $256, %%rsp\n\t" + + LOAD_W(MASK, rsi) + + "movl $4, 16*8("WX")\n\t" + /* L4 = b */ + "movq %%r9, "L4"\n\t" + /* L3 = 0 (add to prev h) */ + "xorq "L3", "L3"\n\t" + /* L4 = b ^ c */ + "xorq %%r10, "L4"\n\t" + + SET_W_X(rcx) + + "# Start of 16 rounds\n" + "1:\n\t" + + "addq $128, %%rcx\n\t" + "movq %%rcx, 17*8(%%rsp)\n\t" + + MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) + MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) + MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) + MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) + MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) + MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) + MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) + + "movq 17*8(%%rsp), %%rcx\n\t" + + SET_W_X(rcx) + + "subl $1, 16*8("WX")\n\t" + "jne 1b\n\t" + + SET_W_X(rcx) + + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) + RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) + + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) + RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) + + /* Prev RND: h += Maj(a,b,c) */ + "addq "L3", %%r8\n\t" + "addq $256, %%rsp\n\t" + + ADD_DIGEST() + + "leaq %[K512], %%rcx\n\t" + "addq $128, %%rsi\n\t" + "subl $128, %[len]\n\t" + + STORE_DIGEST() + + "jnz 2b\n\t" + + : + : [mask] "m" (mBYTE_FLIP_MASK), + [len] "m" (len), + [sha512] "r" (sha512), + [K512] "m" (K512) + : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" + ); + + return 0; +} +#endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */ #if defined(HAVE_INTEL_AVX2) +static const unsigned long mBYTE_FLIP_MASK_Y[] = + { 0x0001020304050607, 0x08090a0b0c0d0e0f, + 0x0001020304050607, 0x08090a0b0c0d0e0f }; -#define s0_1y(dest, src) AVX2_S(dest, src, 1); -#define s0_2y(dest, src) AVX2_S(G_TEMPy, src, 8); XORy(dest, G_TEMPy, dest); -#define s0_3y(dest, src) AVX2_R(G_TEMPy, src, 7); XORy(dest, G_TEMPy, dest); +#define W_Y_0 ymm0 +#define W_Y_4 ymm1 +#define W_Y_8 ymm2 +#define W_Y_12 ymm3 -#define s1_1y(dest, src) AVX2_S(dest, src, 19); -#define s1_2y(dest, src) AVX2_S(G_TEMPy, src, 61); XORy(dest, G_TEMPy, dest); -#define s1_3y(dest, src) AVX2_R(G_TEMPy, src, 6); XORy(dest, G_TEMPy, dest); +#define X0 xmm0 +#define X1 xmm1 +#define X2 xmm2 +#define X3 xmm3 +#define X4 xmm4 +#define X5 xmm5 +#define X6 xmm6 +#define X7 xmm7 +#define X8 xmm8 +#define X9 xmm9 +#define Y0 ymm0 +#define Y1 ymm1 +#define Y2 ymm2 +#define Y3 ymm3 +#define Y4 ymm4 +#define Y5 ymm5 +#define Y6 ymm6 +#define Y7 ymm7 -#define s0_y(dest, src) s0_1y(dest, src); s0_2y(dest, src); s0_3y(dest, src) -#define s1_y(dest, src) 
s1_1y(dest, src); s1_2y(dest, src); s1_3y(dest, src) +#define W_Y_M15 ymm12 +#define W_Y_M7 ymm13 +#define W_Y_M2 ymm14 +#define MASK_Y ymm15 + +#define YTMP1 ymm8 +#define YTMP2 ymm9 +#define YTMP3 ymm10 +#define YTMP4 ymm11 + +#define YMM_REGS \ + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", \ + "xmm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15" + +#define _VPERM2I128(dest, src1, src2, sel) \ + "vperm2I128 $"#sel", %%"#src2", %%"#src1", %%"#dest"\n\t" +#define VPERM2I128(dest, src1, src2, sel) \ + _VPERM2I128(dest, src1, src2, sel) + +#define _VPERMQ(dest, src, sel) \ + "vpermq $"#sel", %%"#src", %%"#dest"\n\t" +#define VPERMQ(dest, src, sel) \ + _VPERMQ(dest, src, sel) + +#define _VPBLENDD(dest, src1, src2, sel) \ + "vpblendd $"#sel", %%"#src2", %%"#src1", %%"#dest"\n\t" +#define VPBLENDD(dest, src1, src2, sel) \ + _VPBLENDD(dest, src1, src2, sel) + +#define _V_ADD_I(dest, src1, addr, i) \ + "vpaddq "#i"*8(%%"#addr"), %%"#src1", %%"#dest"\n\t" +#define V_ADD_I(dest, src1, addr, i) \ + _V_ADD_I(dest, src1, addr, i) + +#define _VMOVDQU_I(addr, i, src) \ + "vmovdqu %%"#src", "#i"*8(%%"#addr")\n\t" +#define VMOVDQU_I(addr, i, src) \ + _VMOVDQU_I(addr, i, src) + +#define MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \ + RND_0_1(a,b,c,d,e,f,g,h,i) \ + /* W[-13]..W[-15], W[-12] */ \ + VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \ + /* W[-5]..W[-7], W[-4] */ \ + VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \ + RND_0_2(a,b,c,d,e,f,g,h,i) \ + RND_0_3(a,b,c,d,e,f,g,h,i) \ + /* W_Y_M15 = W[-12]..W[-15] */ \ + VPERMQ(W_Y_M15, W_Y_M15, 0x39) \ + RND_0_4(a,b,c,d,e,f,g,h,i) \ + /* W_Y_M7 = W[-4]..W[-7] */ \ + VPERMQ(W_Y_M7, W_Y_M7, 0x39) \ + RND_0_5(a,b,c,d,e,f,g,h,i) \ + RND_0_6(a,b,c,d,e,f,g,h,i) \ + /* W[-15] >> 1 */ \ + V_SHIFT_R(YTMP1, W_Y_M15, 1) \ + RND_0_7(a,b,c,d,e,f,g,h,i) \ + /* W[-15] << 63 */ \ + V_SHIFT_L(YTMP2, W_Y_M15, 63) \ + RND_0_8(a,b,c,d,e,f,g,h,i) \ + /* W[-15] >> 8 */ \ + V_SHIFT_R(YTMP3, W_Y_M15, 8) \ + RND_0_9(a,b,c,d,e,f,g,h,i) \ + /* W[-15] << 56 */ \ + V_SHIFT_L(YTMP4, W_Y_M15, 56) \ + RND_0_10(a,b,c,d,e,f,g,h,i) \ + /* W[-15] >>> 1 */ \ + V_OR(YTMP1, YTMP2, YTMP1) \ + RND_0_11(a,b,c,d,e,f,g,h,i) \ + /* W[-15] >>> 8 */ \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_0_12(a,b,c,d,e,f,g,h,i) \ + RND_1_1(h,a,b,c,d,e,f,g,i+1) \ + /* W[-15] >> 7 */ \ + V_SHIFT_R(YTMP4, W_Y_M15, 7) \ + RND_1_2_A(h,a,b,c,d,e,f,g,i+1) \ + /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_1_2_B(h,a,b,c,d,e,f,g,i+1) \ + /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_1_3(h,a,b,c,d,e,f,g,i+1) \ + /* W[0] = W[-16] + W[-7] */ \ + V_ADD(W_Y_0, W_Y_0, W_Y_M7) \ + RND_1_4(h,a,b,c,d,e,f,g,i+1) \ + /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \ + V_ADD(W_Y_0, W_Y_0, YTMP1) \ + RND_1_5(h,a,b,c,d,e,f,g,i+1) \ + /* 0, 0, W[-1], W[-2] */ \ + VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \ + RND_1_6(h,a,b,c,d,e,f,g,i+1) \ + RND_1_7(h,a,b,c,d,e,f,g,i+1) \ + RND_1_8(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] >> 19 */ \ + V_SHIFT_R(YTMP1, W_Y_M2, 19) \ + RND_1_9(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] << 45 */ \ + V_SHIFT_L(YTMP2, W_Y_M2, 45) \ + RND_1_10(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] >> 61 */ \ + V_SHIFT_R(YTMP3, W_Y_M2, 61) \ + RND_1_11(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] << 3 */ \ + V_SHIFT_L(YTMP4, W_Y_M2, 3) \ + RND_1_12(h,a,b,c,d,e,f,g,i+1) \ + RND_0_1(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] >>> 19 */ \ + V_OR(YTMP1, YTMP2, YTMP1) \ + RND_0_2(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] >>> 61 */ \ + V_OR(YTMP3, YTMP4, YTMP3) \ + 
RND_0_3(g,h,a,b,c,d,e,f,i+2) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_0_4(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] >> 6 */ \ + V_SHIFT_R(YTMP4, W_Y_M2, 6) \ + RND_0_5(g,h,a,b,c,d,e,f,i+2) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_0_6(g,h,a,b,c,d,e,f,i+2) \ + /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ + V_ADD(W_Y_0, W_Y_0, YTMP1) \ + RND_0_7(g,h,a,b,c,d,e,f,i+2) \ + RND_0_8(g,h,a,b,c,d,e,f,i+2) \ + /* W[1], W[0], 0, 0 */ \ + VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \ + RND_0_9(g,h,a,b,c,d,e,f,i+2) \ + RND_0_10(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] >> 19 */ \ + V_SHIFT_R(YTMP1, W_Y_M2, 19) \ + RND_0_11(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] << 45 */ \ + V_SHIFT_L(YTMP2, W_Y_M2, 45) \ + RND_0_12(g,h,a,b,c,d,e,f,i+2) \ + RND_1_1(f,g,h,a,b,c,d,e,i+3) \ + /* W[-2] >> 61 */ \ + V_SHIFT_R(YTMP3, W_Y_M2, 61) \ + RND_1_2(f,g,h,a,b,c,d,e,i+3) \ + /* W[-2] << 3 */ \ + V_SHIFT_L(YTMP4, W_Y_M2, 3) \ + RND_1_3(f,g,h,a,b,c,d,e,i+3) \ + /* W[-2] >>> 19 */ \ + V_OR(YTMP1, YTMP2, YTMP1) \ + RND_1_4(f,g,h,a,b,c,d,e,i+3) \ + /* W[-2] >>> 61 */ \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_1_5(f,g,h,a,b,c,d,e,i+3) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_1_6(f,g,h,a,b,c,d,e,i+3) \ + /* W[-2] >> 6 */ \ + V_SHIFT_R(YTMP4, W_Y_M2, 6) \ + RND_1_7(f,g,h,a,b,c,d,e,i+3) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_1_8(f,g,h,a,b,c,d,e,i+3) \ + /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ + V_ADD(W_Y_0, W_Y_0, YTMP1) \ + RND_1_9(f,g,h,a,b,c,d,e,i+3) \ + RND_1_10(f,g,h,a,b,c,d,e,i+3) \ + RND_1_11(f,g,h,a,b,c,d,e,i+3) \ + RND_1_12(f,g,h,a,b,c,d,e,i+3) \ + +#define MsgSched2_AVX2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ + RND_0_1(a,b,c,d,e,f,g,h,i) \ + VPALIGNR(W_Y_M15, W_2, W_0, 8) \ + VPALIGNR(W_Y_M7, W_10, W_8, 8) \ + RND_0_2(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(YTMP1, W_Y_M15, 1) \ + V_SHIFT_L(YTMP2, W_Y_M15, 63) \ + RND_0_3(a,b,c,d,e,f,g,h,i) \ + RND_0_4(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(YTMP3, W_Y_M15, 8) \ + V_SHIFT_L(YTMP4, W_Y_M15, 56) \ + RND_0_5(a,b,c,d,e,f,g,h,i) \ + RND_0_6(a,b,c,d,e,f,g,h,i) \ + V_OR(YTMP1, YTMP2, YTMP1) \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_0_7(a,b,c,d,e,f,g,h,i) \ + RND_0_8(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(YTMP4, W_Y_M15, 7) \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_0_9(a,b,c,d,e,f,g,h,i) \ + RND_0_10(a,b,c,d,e,f,g,h,i) \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + V_ADD(W_0, W_0, W_Y_M7) \ + RND_0_11(a,b,c,d,e,f,g,h,i) \ + RND_0_12(a,b,c,d,e,f,g,h,i) \ + RND_1_1(h,a,b,c,d,e,f,g,i+1) \ + V_ADD(W_0, W_0, YTMP1) \ + RND_1_2(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(YTMP1, W_14, 19) \ + V_SHIFT_L(YTMP2, W_14, 45) \ + RND_1_3(h,a,b,c,d,e,f,g,i+1) \ + RND_1_4(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(YTMP3, W_14, 61) \ + V_SHIFT_L(YTMP4, W_14, 3) \ + RND_1_5(h,a,b,c,d,e,f,g,i+1) \ + RND_1_6(h,a,b,c,d,e,f,g,i+1) \ + RND_1_7(h,a,b,c,d,e,f,g,i+1) \ + V_OR(YTMP1, YTMP2, YTMP1) \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_1_8(h,a,b,c,d,e,f,g,i+1) \ + RND_1_9(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + V_SHIFT_R(YTMP4, W_14, 6) \ + RND_1_10(h,a,b,c,d,e,f,g,i+1) \ + RND_1_11(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_1_12(h,a,b,c,d,e,f,g,i+1) \ + V_ADD(W_0, W_0, YTMP1) \ + +#define MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \ + RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ + /* W[-13]..W[-15], W[-12] */ \ + VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \ + /* W[-5]..W[-7], W[-4] */ \ + VPBLENDD(W_Y_M7, W_Y_8, 
W_Y_12, 0x03) \ + RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ + /* W_Y_M15 = W[-12]..W[-15] */ \ + VPERMQ(W_Y_M15, W_Y_M15, 0x39) \ + RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ + /* W_Y_M7 = W[-4]..W[-7] */ \ + VPERMQ(W_Y_M7, W_Y_M7, 0x39) \ + RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ + /* W[-15] >> 1 */ \ + V_SHIFT_R(YTMP1, W_Y_M15, 1) \ + /* W[-15] << 63 */ \ + V_SHIFT_L(YTMP2, W_Y_M15, 63) \ + RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \ + /* W[-15] >> 8 */ \ + V_SHIFT_R(YTMP3, W_Y_M15, 8) \ + /* W[-15] << 56 */ \ + V_SHIFT_L(YTMP4, W_Y_M15, 56) \ + /* W[-15] >>> 1 */ \ + V_OR(YTMP1, YTMP2, YTMP1) \ + /* W[-15] >>> 8 */ \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \ + /* W[-15] >> 7 */ \ + V_SHIFT_R(YTMP4, W_Y_M15, 7) \ + RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \ + /* 0, 0, W[-1], W[-2] */ \ + VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \ + RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \ + RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \ + /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \ + /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \ + /* W[0] = W[-16] + W[-7] */ \ + V_ADD(W_Y_0, W_Y_0, W_Y_M7) \ + /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \ + V_ADD(W_Y_0, W_Y_0, YTMP1) \ + RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] >> 19 */ \ + V_SHIFT_R(YTMP1, W_Y_M2, 19) \ + /* W[-2] << 45 */ \ + V_SHIFT_L(YTMP2, W_Y_M2, 45) \ + RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] >> 61 */ \ + V_SHIFT_R(YTMP3, W_Y_M2, 61) \ + /* W[-2] << 3 */ \ + V_SHIFT_L(YTMP4, W_Y_M2, 3) \ + /* W[-2] >>> 19 */ \ + V_OR(YTMP1, YTMP2, YTMP1) \ + RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] >>> 61 */ \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \ + /* W[-2] >> 6 */ \ + V_SHIFT_R(YTMP4, W_Y_M2, 6) \ + RND_RORX_0_1(g,h,a,b,c,d,e,f,i+2) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_RORX_0_2(g,h,a,b,c,d,e,f,i+2) \ + /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ + V_ADD(W_Y_0, W_Y_0, YTMP1) \ + RND_RORX_0_3(g,h,a,b,c,d,e,f,i+2) \ + /* W[1], W[0], 0, 0 */ \ + VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \ + RND_RORX_0_4(g,h,a,b,c,d,e,f,i+2) \ + RND_RORX_0_5(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] >> 19 */ \ + V_SHIFT_R(YTMP1, W_Y_M2, 19) \ + /* W[-2] << 45 */ \ + V_SHIFT_L(YTMP2, W_Y_M2, 45) \ + RND_RORX_0_6(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] >> 61 */ \ + V_SHIFT_R(YTMP3, W_Y_M2, 61) \ + /* W[-2] << 3 */ \ + V_SHIFT_L(YTMP4, W_Y_M2, 3) \ + /* W[-2] >>> 19 */ \ + V_OR(YTMP1, YTMP2, YTMP1) \ + RND_RORX_0_7(g,h,a,b,c,d,e,f,i+2) \ + /* W[-2] >>> 61 */ \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_RORX_0_8(g,h,a,b,c,d,e,f,i+2) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_RORX_1_1(f,g,h,a,b,c,d,e,i+3) \ + /* W[-2] >> 6 */ \ + V_SHIFT_R(YTMP4, W_Y_M2, 6) \ + RND_RORX_1_2(f,g,h,a,b,c,d,e,i+3) \ + RND_RORX_1_3(f,g,h,a,b,c,d,e,i+3) \ + /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_RORX_1_4(f,g,h,a,b,c,d,e,i+3) \ + RND_RORX_1_5(f,g,h,a,b,c,d,e,i+3) \ + /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ + V_ADD(W_Y_0, W_Y_0, YTMP1) \ + RND_RORX_1_6(f,g,h,a,b,c,d,e,i+3) \ + V_ADD_I(YTMP1, W_Y_0, rsi, i) \ + RND_RORX_1_7(f,g,h,a,b,c,d,e,i+3) \ + RND_RORX_1_8(f,g,h,a,b,c,d,e,i+3) \ + VMOVDQU_I(rsp, i, YTMP1) \ + +#define MsgSched2_AVX2_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e, \ + f,g,h,i) \ + 
RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ + VPALIGNR(W_Y_M15, W_2, W_0, 8) \ + VPALIGNR(W_Y_M7, W_10, W_8, 8) \ + RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(YTMP1, W_Y_M15, 1) \ + V_SHIFT_L(YTMP2, W_Y_M15, 63) \ + RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(YTMP3, W_Y_M15, 8) \ + V_SHIFT_L(YTMP4, W_Y_M15, 56) \ + RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ + V_OR(YTMP1, YTMP2, YTMP1) \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \ + V_SHIFT_R(YTMP4, W_Y_M15, 7) \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + V_ADD(W_0, W_0, W_Y_M7) \ + RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \ + RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \ + V_ADD(W_0, W_0, YTMP1) \ + RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(YTMP1, W_14, 19) \ + V_SHIFT_L(YTMP2, W_14, 45) \ + RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \ + V_SHIFT_R(YTMP3, W_14, 61) \ + V_SHIFT_L(YTMP4, W_14, 3) \ + RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \ + V_OR(YTMP1, YTMP2, YTMP1) \ + V_OR(YTMP3, YTMP4, YTMP3) \ + RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \ + RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(YTMP1, YTMP3, YTMP1) \ + V_SHIFT_R(YTMP4, W_14, 6) \ + RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \ + RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \ + V_XOR(YTMP1, YTMP4, YTMP1) \ + RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \ + V_ADD(W_0, W_0, YTMP1) \ -#define Block_Y_xx_1(i, w_0, w_4, w_8, w_12)\ - MOVE_W_to_W_I_15(W_I_15y, w_0, w_4);\ - MOVE_W_to_W_I_7 (W_I_7y, w_8, w_12);\ - MOVE_W_to_W_I_2 (W_I_2y, w_12);\ +#define _INIT_MASK_Y(mask) \ + "vmovdqu %[mask], %%"#mask"\n\t" +#define INIT_MASK_Y(mask) \ + _INIT_MASK_Y(mask) -#define Block_Y_xx_2(i, w_0, w_4, w_8, w_12)\ - s0_1y (YMM_TEMP0, W_I_15y);\ +/* Load into YMM registers and swap endian. */ +#define _LOAD_BLOCK_W_Y_2(mask, ymm0, ymm1, reg, i) \ + /* buffer[0..15] => ymm0..ymm3; */ \ + "vmovdqu "#i"+ 0(%%"#reg"), %%"#ymm0"\n\t" \ + "vmovdqu "#i"+32(%%"#reg"), %%"#ymm1"\n\t" \ + "vpshufb %%"#mask", %%"#ymm0", %%"#ymm0"\n\t" \ + "vpshufb %%"#mask", %%"#ymm1", %%"#ymm1"\n\t" -#define Block_Y_xx_3(i, w_0, w_4, w_8, w_12)\ - s0_2y (YMM_TEMP0, W_I_15y);\ +#define LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) \ + _LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) -#define Block_Y_xx_4(i, w_0, w_4, w_8, w_12)\ - s0_3y (YMM_TEMP0, W_I_15y);\ +#define LOAD_BLOCK_W_Y(mask, reg) \ + LOAD_BLOCK_W_Y_2(mask, W_Y_0, W_Y_4 , reg, 0) \ + LOAD_BLOCK_W_Y_2(mask, W_Y_8, W_Y_12, reg, 64) -#define Block_Y_xx_5(i, w_0, w_4, w_8, w_12)\ - ADDy(W_I_TEMPy, w_0, YMM_TEMP0);\ +#define _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \ + "vpaddq "#i"+ 0(%%"#reg"), %%"#ymm0", %%"#ymm2"\n\t" \ + "vpaddq "#i"+32(%%"#reg"), %%"#ymm1", %%"#ymm3"\n\t" \ + "vmovdqu %%"#ymm2", "#i"+ 0("WX")\n\t" \ + "vmovdqu %%"#ymm3", "#i"+32("WX")\n\t" -#define Block_Y_xx_6(i, w_0, w_4, w_8, w_12)\ - ADDy(W_I_TEMPy, W_I_TEMPy, W_I_7y);\ - s1_1y (YMM_TEMP0, W_I_2y);\ +#define SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \ + _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) -#define Block_Y_xx_7(i, w_0, w_4, w_8, w_12)\ - s1_2y (YMM_TEMP0, W_I_2y);\ +#define SET_BLOCK_W_Y(reg) \ + SET_W_Y_2(W_Y_0, W_Y_4 , YTMP1, YTMP2, reg, 0) \ + SET_W_Y_2(W_Y_8, W_Y_12, YTMP1, YTMP2, reg, 64) -#define Block_Y_xx_8(i, w_0, w_4, w_8, w_12)\ - s1_3y (YMM_TEMP0, W_I_2y);\ - ADDy(w_0, W_I_TEMPy, YMM_TEMP0);\ +/* Load into YMM registers and swap endian. 
*/ +#define _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \ + "vmovdqu "#i"+ 0(%%"#reg"), %%"#X0"\n\t" \ + "vmovdqu "#i"+ 16(%%"#reg"), %%"#X1"\n\t" \ + "vmovdqu "#i"+128(%%"#reg"), %%"#X8"\n\t" \ + "vmovdqu "#i"+144(%%"#reg"), %%"#X9"\n\t" \ + "vinserti128 $1, %%"#X8", %%"#Y0", %%"#Y0"\n\t" \ + "vinserti128 $1, %%"#X9", %%"#Y1", %%"#Y1"\n\t" \ + "vpshufb %%"#mask", %%"#Y0", %%"#Y0"\n\t" \ + "vpshufb %%"#mask", %%"#Y1", %%"#Y1"\n\t" -#define Block_Y_xx_9(i, w_0, w_4, w_8, w_12)\ - FEEDBACK1_to_W_I_2(W_I_2y, w_0);\ +#define LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \ + _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) -#define Block_Y_xx_10(i, w_0, w_4, w_8, w_12) \ - s1_1y (YMM_TEMP0, W_I_2y);\ +#define LOAD_BLOCK2_W_Y(mask, reg) \ + LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, 0) \ + LOAD_BLOCK2_W_Y_2(mask, Y2, Y3, X2, X3, X8, X9, reg, 32) \ + LOAD_BLOCK2_W_Y_2(mask, Y4, Y5, X4, X5, X8, X9, reg, 64) \ + LOAD_BLOCK2_W_Y_2(mask, Y6, Y7, X6, X7, X8, X9, reg, 96) \ -#define Block_Y_xx_11(i, w_0, w_4, w_8, w_12) \ - s1_2y (YMM_TEMP0, W_I_2y);\ +#define SET_BLOCK2_W_Y(reg) \ + SET_W_Y_2(Y0, Y1, YTMP1, YTMP2, reg, 0) \ + SET_W_Y_2(Y2, Y3, YTMP1, YTMP2, reg, 64) \ + SET_W_Y_2(Y4, Y5, YTMP1, YTMP2, reg, 128) \ + SET_W_Y_2(Y6, Y7, YTMP1, YTMP2, reg, 192) -#define Block_Y_xx_12(i, w_0, w_4, w_8, w_12)\ - s1_3y (YMM_TEMP0, W_I_2y);\ - ADDy(w_0, W_I_TEMPy, YMM_TEMP0);\ - MOVE_to_MEMy(w,0, w_4);\ +static const word64 K512_AVX2[160] = { + W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), + W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), + W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), + W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), + W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), + W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), + W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118), + W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118), + W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe), + W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe), + W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2), + W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2), + W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1), + W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1), + W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694), + W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694), + W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3), + W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3), + W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65), + W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65), + W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483), + W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483), + W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5), + W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5), + W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210), + W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210), + W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4), + W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4), + W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725), + W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725), + W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70), + W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70), + W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926), + W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926), + W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df), + W64LIT(0x4d2c6dfc5ac42aed), 
W64LIT(0x53380d139d95b3df), + W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8), + W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8), + W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b), + W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b), + W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001), + W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001), + W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30), + W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30), + W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910), + W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910), + W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8), + W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8), + W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53), + W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53), + W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), + W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), + W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), + W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), + W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), + W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), + W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), + W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), + W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), + W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), + W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), + W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), + W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), + W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), + W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), + W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), + W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), + W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), + W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), + W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), + W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), + W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), + W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), + W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), + W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), + W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), + W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), + W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), + W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817), + W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) +}; +static const word64* K512_AVX2_END = &K512_AVX2[128]; - -static INLINE void Block_Y_0_1(void) { Block_Y_xx_1(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_2(void) { Block_Y_xx_2(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_3(void) { Block_Y_xx_3(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_4(void) { Block_Y_xx_4(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_5(void) { Block_Y_xx_5(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_6(void) { Block_Y_xx_6(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_7(void) { Block_Y_xx_7(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_8(void) { Block_Y_xx_8(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_9(void) { Block_Y_xx_9(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_10(void){ Block_Y_xx_10(0, W_0y, W_4y, W_8y, W_12y); } -static INLINE void Block_Y_0_11(void){ Block_Y_xx_11(0, W_0y, W_4y, 
W_8y, W_12y); } -static INLINE void Block_Y_0_12(word64 *w){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y); } - -static INLINE void Block_Y_4_1(void) { Block_Y_xx_1(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_2(void) { Block_Y_xx_2(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_3(void) { Block_Y_xx_3(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_4(void) { Block_Y_xx_4(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_5(void) { Block_Y_xx_5(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_6(void) { Block_Y_xx_6(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_7(void) { Block_Y_xx_7(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_8(void) { Block_Y_xx_8(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_9(void) { Block_Y_xx_9(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_10(void) { Block_Y_xx_10(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_11(void) { Block_Y_xx_11(4, W_4y, W_8y, W_12y, W_0y); } -static INLINE void Block_Y_4_12(word64 *w) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y); } - -static INLINE void Block_Y_8_1(void) { Block_Y_xx_1(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_2(void) { Block_Y_xx_2(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_3(void) { Block_Y_xx_3(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_4(void) { Block_Y_xx_4(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_5(void) { Block_Y_xx_5(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_6(void) { Block_Y_xx_6(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_7(void) { Block_Y_xx_7(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_8(void) { Block_Y_xx_8(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_9(void) { Block_Y_xx_9(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_10(void) { Block_Y_xx_10(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_11(void) { Block_Y_xx_11(8, W_8y, W_12y, W_0y, W_4y); } -static INLINE void Block_Y_8_12(word64 *w) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y); } - -static INLINE void Block_Y_12_1(void) { Block_Y_xx_1(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_2(void) { Block_Y_xx_2(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_3(void) { Block_Y_xx_3(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_4(void) { Block_Y_xx_4(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_5(void) { Block_Y_xx_5(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_6(void) { Block_Y_xx_6(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_7(void) { Block_Y_xx_7(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_8(void) { Block_Y_xx_8(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_9(void) { Block_Y_xx_9(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_10(void) { Block_Y_xx_10(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_11(void) { Block_Y_xx_11(12, W_12y, W_0y, W_4y, W_8y); } -static INLINE void Block_Y_12_12(word64 *w) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y); } - - -static int Transform_AVX2(wc_Sha512* sha512) +static int Transform_Sha512_AVX2(wc_Sha512* sha512) { - const word64* K = K512; - word64 w[4]; - word32 j; - word64 T[8]; + __asm__ __volatile__ ( - /* Copy digest to working vars */ - XMEMCPY(T, sha512->digest, sizeof(T)); + /* 16 Ws plus loop counter and K512. 
*/ + "subq $136, %%rsp\n\t" + "leaq 64(%[sha512]), %%rax\n\t" - W_from_buff_Y(sha512->buffer); - MOVE_to_MEMy(w,0, W_0y); - for (j = 0; j < 80; j += 16) { - Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]); Block_Y_0_2(); - Ry_3( 0, w[0]); Block_Y_0_3(); - Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]); Block_Y_0_5(); - Ry_3( 1, w[1]); Block_Y_0_6(); - Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); Block_Y_0_8(); - Ry_3( 2, w[2]); Block_Y_0_9(); - Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); Block_Y_0_11(); - Ry_3( 3, w[3]); Block_Y_0_12(w); + INIT_MASK(MASK_Y) + LOAD_DIGEST() - Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); Block_Y_4_2(); - Ry_3( 4, w[0]); Block_Y_4_3(); - Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]); Block_Y_4_5(); - Ry_3( 5, w[1]); Block_Y_4_6(); - Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); Block_Y_4_8(); - Ry_3( 6, w[2]); Block_Y_4_9(); - Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);Block_Y_4_11(); - Ry_3( 7, w[3]);Block_Y_4_12(w); + LOAD_BLOCK_W_Y(MASK_Y, rax) - Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); Block_Y_8_2(); - Ry_3( 8, w[0]); Block_Y_8_3(); - Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]); Block_Y_8_5(); - Ry_3( 9, w[1]); Block_Y_8_6(); - Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); Block_Y_8_8(); - Ry_3(10, w[2]); Block_Y_8_9(); - Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); Block_Y_8_11(); - Ry_3(11, w[3]); Block_Y_8_12(w); + "movl $4, 16*8("WX")\n\t" + "leaq %[K512], %%rsi\n\t" + /* b */ + "movq %%r9, "L4"\n\t" + /* e */ + "movq %%r12, "L1"\n\t" + /* b ^ c */ + "xorq %%r10, "L4"\n\t" - Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); Block_Y_12_2(); - Ry_3(12, w[0]); Block_Y_12_3(); - Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]); Block_Y_12_5(); - Ry_3(13, w[1]); Block_Y_12_6(); - Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); Block_Y_12_8(); - Ry_3(14, w[2]); Block_Y_12_9(); - Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); Block_Y_12_11(); - Ry_3(15, w[3]);Block_Y_12_12(w); - } + SET_BLOCK_W_Y(rsi) - /* Add the working vars back into digest */ - sha512->digest[0] += a(0); - sha512->digest[1] += b(0); - sha512->digest[2] += c(0); - sha512->digest[3] += d(0); - sha512->digest[4] += e(0); - sha512->digest[5] += f(0); - sha512->digest[6] += g(0); - sha512->digest[7] += h(0); + "# Start of 16 rounds\n" + "1:\n\t" - /* Wipe variables */ -#if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) - XMEMSET(W, 0, sizeof(word64) * 16); -#endif - XMEMSET(T, 0, sizeof(T)); + "addq $128, %%rsi\n\t" + + MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched4_AVX2(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4) + MsgSched4_AVX2(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8) + MsgSched4_AVX2(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12) + + SET_BLOCK_W_Y(rsi) + + "subl $1, 16*8("WX")\n\t" + "jne 1b\n\t" + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) + + STORE_ADD_DIGEST() + + "addq $136, %%rsp\n\t" + + : + : [mask] "m" (mBYTE_FLIP_MASK_Y), + [sha512] "r" (sha512), + [K512] "m" (K512) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" + ); return 0; } + +static int Transform_Sha512_AVX2_Len(wc_Sha512* sha512, word32 len) +{ + if ((len & WC_SHA512_BLOCK_SIZE) != 0) { + Transform_Sha512_AVX2(sha512); + sha512->data += WC_SHA512_BLOCK_SIZE; + 
len -= WC_SHA512_BLOCK_SIZE; + if (len == 0) + return 0; + } + + __asm__ __volatile__ ( + + "movq 224(%[sha512]), %%rcx\n\t" + + INIT_MASK(MASK_Y) + LOAD_DIGEST() + + "# Start of processing two blocks\n" + "2:\n\t" + + "subq $1344, %%rsp\n\t" + "leaq %[K512], %%rsi\n\t" + + /* L4 = b */ + "movq %%r9, "L4"\n\t" + /* e */ + "movq %%r12, "L1"\n\t" + + LOAD_BLOCK2_W_Y(MASK_Y, rcx) + + /* L4 = b ^ c */ + "xorq %%r10, "L4"\n\t" + "\n" + "1:\n\t" + SET_BLOCK2_W_Y(rsi) + MsgSched2_AVX2(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched2_AVX2(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4) + MsgSched2_AVX2(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8) + MsgSched2_AVX2(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12) + MsgSched2_AVX2(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16) + MsgSched2_AVX2(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20) + MsgSched2_AVX2(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24) + MsgSched2_AVX2(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28) + "addq $256, %%rsi\n\t" + "addq $256, %%rsp\n\t" + "cmpq %[K512_END], %%rsi\n\t" + "jne 1b\n\t" + + SET_BLOCK2_W_Y(rsi) + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12) + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28) + "subq $1024, %%rsp\n\t" + + ADD_DIGEST() + STORE_DIGEST() + + /* L4 = b */ + "movq %%r9, "L4"\n\t" + /* e */ + "movq %%r12, "L1"\n\t" + /* L4 = b ^ c */ + "xorq %%r10, "L4"\n\t" + + "movq $5, %%rsi\n\t" + "\n" + "3:\n\t" + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) + + RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18) + RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22) + RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26) + RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30) + "addq $256, %%rsp\n\t" + "subq $1, %%rsi\n\t" + "jnz 3b\n\t" + + ADD_DIGEST() + + "movq 224(%[sha512]), %%rcx\n\t" + "addq $64, %%rsp\n\t" + "addq $256, %%rcx\n\t" + "subl $256, %[len]\n\t" + "movq %%rcx, 224(%[sha512])\n\t" + + STORE_DIGEST() + + "jnz 2b\n\t" + + : + : [mask] "m" (mBYTE_FLIP_MASK_Y), + [len] "m" (len), + [sha512] "r" (sha512), + [K512] "m" (K512_AVX2), + [K512_END] "m" (K512_AVX2_END) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" + ); + + return 0; +} + +#ifdef HAVE_INTEL_RORX +static int Transform_Sha512_AVX2_RORX(wc_Sha512* sha512) +{ + __asm__ __volatile__ ( + + /* 16 Ws plus loop counter. 
*/ + "subq $136, %%rsp\n\t" + "leaq 64(%[sha512]), "L2"\n\t" + + INIT_MASK(MASK_Y) + LOAD_DIGEST() + + LOAD_BLOCK_W_Y(MASK_Y, rcx) + + "movl $4, 16*8("WX")\n\t" + "leaq %[K512], %%rsi\n\t" + /* b */ + "movq %%r9, "L4"\n\t" + /* L3 = 0 (add to prev h) */ + "xorq "L3", "L3"\n\t" + /* b ^ c */ + "xorq %%r10, "L4"\n\t" + + SET_BLOCK_W_Y(rsi) + + "# Start of 16 rounds\n" + "1:\n\t" + + "addq $128, %%rsi\n\t" + + MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched4_AVX2_RORX_SET(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4) + MsgSched4_AVX2_RORX_SET(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8) + MsgSched4_AVX2_RORX_SET(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12) + + "subl $1, 16*8(%%rsp)\n\t" + "jnz 1b\n\t" + + RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD, 4) + RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 8) + RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD,12) + /* Prev RND: h += Maj(a,b,c) */ + "addq "L3", %%r8\n\t" + "addq $136, %%rsp\n\t" + + STORE_ADD_DIGEST() + + : + : [mask] "m" (mBYTE_FLIP_MASK_Y), + [sha512] "r" (sha512), + [K512] "m" (K512) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" + ); + + return 0; +} + +static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512* sha512, word32 len) +{ + if ((len & WC_SHA512_BLOCK_SIZE) != 0) { + Transform_Sha512_AVX2_RORX(sha512); + sha512->data += WC_SHA512_BLOCK_SIZE; + len -= WC_SHA512_BLOCK_SIZE; + if (len == 0) + return 0; + } + + __asm__ __volatile__ ( + + "movq 224(%[sha512]), %%rax\n\t" + + INIT_MASK(MASK_Y) + LOAD_DIGEST() + + "# Start of processing two blocks\n" + "2:\n\t" + + "subq $1344, %%rsp\n\t" + "leaq %[K512], %%rsi\n\t" + + /* L4 = b */ + "movq %%r9, "L4"\n\t" + /* L3 = 0 (add to prev h) */ + "xorq "L3", "L3"\n\t" + + LOAD_BLOCK2_W_Y(MASK_Y, rax) + + /* L4 = b ^ c */ + "xorq %%r10, "L4"\n\t" + "\n" + "1:\n\t" + SET_BLOCK2_W_Y(rsi) + MsgSched2_AVX2_RORX(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0) + MsgSched2_AVX2_RORX(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4) + MsgSched2_AVX2_RORX(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8) + MsgSched2_AVX2_RORX(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12) + MsgSched2_AVX2_RORX(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16) + MsgSched2_AVX2_RORX(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20) + MsgSched2_AVX2_RORX(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24) + MsgSched2_AVX2_RORX(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28) + "addq $256, %%rsi\n\t" + "addq $256, %%rsp\n\t" + "cmpq %[K512_END], %%rsi\n\t" + "jne 1b\n\t" + + SET_BLOCK2_W_Y(rsi) + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4) + RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12) + + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20) + RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28) + "addq "L3", %%r8\n\t" + "subq $1024, %%rsp\n\t" + + ADD_DIGEST() + STORE_DIGEST() + + /* L4 = b */ + "movq %%r9, "L4"\n\t" + /* L3 = 0 (add to prev h) */ + "xorq "L3", "L3"\n\t" + /* L4 = b ^ c */ + "xorq %%r10, "L4"\n\t" + + "movq $5, %%rsi\n\t" + "\n" + "3:\n\t" + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6) + RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) + + RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18) + RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22) + 
RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26) + RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30) + "addq $256, %%rsp\n\t" + "subq $1, %%rsi\n\t" + "jnz 3b\n\t" + + "addq "L3", %%r8\n\t" + + ADD_DIGEST() + + "movq 224(%[sha512]), %%rax\n\t" + "addq $64, %%rsp\n\t" + "addq $256, %%rax\n\t" + "subl $256, %[len]\n\t" + "movq %%rax, 224(%[sha512])\n\t" + + STORE_DIGEST() + + "jnz 2b\n\t" + + : + : [mask] "m" (mBYTE_FLIP_MASK_Y), + [len] "m" (len), + [sha512] "r" (sha512), + [K512] "m" (K512_AVX2), + [K512_END] "m" (K512_AVX2_END) + : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" + ); + + return 0; +} +#endif /* HAVE_INTEL_RORX */ #endif /* HAVE_INTEL_AVX2 */ diff --git a/wolfssl/wolfcrypt/sha256.h b/wolfssl/wolfcrypt/sha256.h index c71dca32a..0cc9beb3c 100644 --- a/wolfssl/wolfcrypt/sha256.h +++ b/wolfssl/wolfcrypt/sha256.h @@ -66,6 +66,14 @@ #include #endif +#if defined(_MSC_VER) + #define SHA256_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #define SHA256_NOINLINE __attribute__((noinline)) +#else + #define SHA256_NOINLINE +#endif + #ifndef NO_OLD_WC_NAMES #define Sha256 wc_Sha256 #define SHA256 WC_SHA256 @@ -96,6 +104,9 @@ typedef struct wc_Sha256 { word32 loLen; /* length in bytes */ word32 hiLen; /* length in bytes */ void* heap; +#ifdef USE_INTEL_SPEEDUP + const byte* data; +#endif #ifdef WOLFSSL_PIC32MZ_HASH hashUpdCache cache; /* cache for updates */ #endif diff --git a/wolfssl/wolfcrypt/sha512.h b/wolfssl/wolfcrypt/sha512.h index f8982850a..277f1a687 100644 --- a/wolfssl/wolfcrypt/sha512.h +++ b/wolfssl/wolfcrypt/sha512.h @@ -59,6 +59,14 @@ #include #endif +#if defined(_MSC_VER) + #define SHA512_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #define SHA512_NOINLINE __attribute__((noinline)) +#else + #define SHA512_NOINLINE +#endif + #ifndef NO_OLD_WC_NAMES #define Sha512 wc_Sha512 #define SHA512 WC_SHA512 @@ -78,12 +86,15 @@ enum { /* wc_Sha512 digest */ typedef struct wc_Sha512 { + word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)]; + word64 buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64)]; word32 buffLen; /* in bytes */ word64 loLen; /* length in bytes */ word64 hiLen; /* length in bytes */ - word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)]; - word64 buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64)]; void* heap; +#ifdef USE_INTEL_SPEEDUP + const byte* data; +#endif #ifdef WOLFSSL_ASYNC_CRYPT WC_ASYNC_DEV asyncDev; #endif /* WOLFSSL_ASYNC_CRYPT */
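
/* Reviewer note, not part of the patch: a minimal scalar C sketch of the
 * SHA-512 message-schedule expansion that the MsgSched2_AVX2*/
/* and MsgSched4_AVX2* macros above vectorize, assuming the FIPS 180-4
 * definitions. The helper names (rotr64, sigma0_512, sigma1_512,
 * msg_sched_expand) are illustrative only and do not exist in this patch;
 * rotr64 mirrors the V_SHIFT_R/V_SHIFT_L/V_OR triples in the assembly,
 * which are needed because AVX2 has no 64-bit vector rotate instruction. */

#include <stdint.h>

/* Rotate right by r (1..63), built from two shifts and an OR, exactly as
 * the V_SHIFT_R/V_SHIFT_L/V_OR sequences do per 64-bit lane. */
static uint64_t rotr64(uint64_t x, unsigned r)
{
    return (x >> r) | (x << (64 - r));
}

/* FIPS 180-4 sigma0 for SHA-512: (x >>> 1) ^ (x >>> 8) ^ (x >> 7). */
static uint64_t sigma0_512(uint64_t x)
{
    return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7);
}

/* FIPS 180-4 sigma1 for SHA-512: (x >>> 19) ^ (x >>> 61) ^ (x >> 6). */
static uint64_t sigma1_512(uint64_t x)
{
    return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6);
}

/* Expand the 16 block words W[0..15] (already byte-swapped to host order)
 * to the full 80-entry schedule:
 *   W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2]),
 * the relation annotated in the macro comments as
 *   W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]). */
static void msg_sched_expand(uint64_t W[80])
{
    int t;
    for (t = 16; t < 80; t++) {
        W[t] = W[t - 16] + sigma0_512(W[t - 15]) +
               W[t - 7]  + sigma1_512(W[t - 2]);
    }
}

/* The AVX2/RORX paths interleave ("stitch") these W[t] updates with the
 * RND_* / RND_RORX_* round macros, producing two or four schedule entries
 * per group of rounds; the scalar loop above is only meant to make that
 * data flow explicit. */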