diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c
index f1ea22533..25995431c 100644
--- a/wolfcrypt/src/chacha.c
+++ b/wolfcrypt/src/chacha.c
@@ -58,8 +58,20 @@
 #ifdef USE_INTEL_CHACHA_SPEEDUP
     #include <emmintrin.h>
     #include <immintrin.h>
+
+    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
+        (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
+        #define NO_AVX2_SUPPORT
+    #endif
+    #if defined(__clang__) && ((__clang_major__ < 3) || \
+        (__clang_major__ == 3 && __clang_minor__ <= 5))
+        #define NO_AVX2_SUPPORT
+    #endif
+
     #define HAVE_INTEL_AVX1
-    #define HAVE_INTEL_AVX2
+    #ifndef NO_AVX2_SUPPORT
+        #define HAVE_INTEL_AVX2
+    #endif
 #endif

 #ifdef BIG_ENDIAN_ORDER
@@ -408,12 +420,10 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
     byte*  output;
     word32 i;
     word32 cnt = 0;
-    static const word64 add[2] = { 0x0000000100000000UL,0x0000000300000002UL };
-    static const word64 four[2] = { 0x0000000400000004UL,0x0000000400000004UL };
-    static const word64 rotl8[2] =
-        { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
-    static const word64 rotl16[2] =
-        { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
+    static const __m128i add    = { 0x0000000100000000UL,0x0000000300000002UL };
+    static const __m128i four   = { 0x0000000400000004UL,0x0000000400000004UL };
+    static const __m128i rotl8  = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
+    static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };

     if (bytes == 0) return;

@@ -638,8 +648,8 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
         : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
           [in] "+r" (m), [out] "+r" (c)
         : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
-          [add] "m" (add), [four] "m" (four),
-          [rotl8] "m" (rotl8), [rotl16] "m" (rotl16)
+          [add] "xrm" (add), [four] "xrm" (four),
+          [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)
         : "xmm0", "xmm1", "xmm2", "xmm3",
           "xmm4", "xmm5", "xmm6", "xmm7",
           "xmm8", "xmm9", "xmm10", "xmm11",
@@ -675,17 +685,14 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
     byte*  output;
     word32 i;
     word32 cnt = 0;
-    static const word64 add[4] = { 0x0000000100000000UL, 0x0000000300000002UL,
-                                   0x0000000500000004UL, 0x0000000700000006UL };
-    static const word64 eight[4] =
-        { 0x0000000800000008UL, 0x0000000800000008UL,
-          0x0000000800000008UL, 0x0000000800000008UL };
-    static const word64 rotl8[4] =
-        { 0x0605040702010003UL, 0x0e0d0c0f0a09080bUL,
-          0x0605040702010003UL, 0x0e0d0c0f0a09080bUL };
-    static const word64 rotl16[4] =
-        { 0x0504070601000302UL, 0x0d0c0f0e09080b0aUL,
-          0x0504070601000302UL, 0x0d0c0f0e09080b0aUL };
+    static const __m256i add    = { 0x0000000100000000UL,0x0000000300000002UL,
+                                    0x0000000500000004UL,0x0000000700000006UL };
+    static const __m256i eight  = { 0x0000000800000008UL,0x0000000800000008UL,
+                                    0x0000000800000008UL,0x0000000800000008UL };
+    static const __m256i rotl8  = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
+                                    0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
+    static const __m256i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
+                                    0x0504070601000302UL,0x0d0c0f0e09080b0aUL };

     if (bytes == 0) return;

@@ -926,8 +933,8 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
         : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
           [in] "+r" (m), [out] "+r" (c)
         : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
-          [add] "m" (add), [eight] "m" (eight),
-          [rotl8] "m" (rotl8), [rotl16] "m" (rotl16)
+          [add] "xrm" (add), [eight] "xrm" (eight),
+          [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)
         : "ymm0", "ymm1", "ymm2", "ymm3",
           "ymm4", "ymm5", "ymm6", "ymm7",
          "ymm8", "ymm9", "ymm10", "ymm11",
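Note (chacha.c): the first hunk only defines HAVE_INTEL_AVX2 when the compiler
is recent enough (GCC newer than 4.8, clang newer than 3.5); older compilers
define NO_AVX2_SUPPORT and keep the AVX1 path only.  The other hunks turn the
word64 constant arrays into __m128i/__m256i values and relax the asm operand
constraint from "m" to "xrm", so the compiler may hand each constant to the
asm block in a vector register instead of always reloading it from memory.
Below is a minimal standalone sketch of that operand pattern; it is not
wolfSSL code, the names kCounterAdd and add_counters are invented for the
example, and it assumes GCC or clang on x86-64 with an AVX-capable CPU at
run time.

    #include <stdint.h>
    #include <stdio.h>
    #include <emmintrin.h>   /* __m128i */

    /* Initialized the same way the patch does it: each brace entry fills one
     * 64-bit lane, so the four 32-bit dwords are { 0, 1, 2, 3 }. */
    static const __m128i kCounterAdd =
        { 0x0000000100000000LL, 0x0000000300000002LL };

    /* Add the constant to four 32-bit counters in place.  With "xrm" the
     * compiler may satisfy the [add] operand with an xmm register or a
     * memory operand, rather than being forced to use memory as "m" did. */
    static void add_counters(uint32_t counters[4])
    {
        __asm__ volatile (
            "vmovdqu (%[ctr]), %%xmm0\n\t"
            "vpaddd  %[add], %%xmm0, %%xmm0\n\t"
            "vmovdqu %%xmm0, (%[ctr])\n\t"
            :
            : [ctr] "r" (counters), [add] "xrm" (kCounterAdd)
            : "xmm0", "memory");
    }

    int main(void)
    {
        uint32_t c[4] = { 10, 10, 10, 10 };
        add_counters(c);
        printf("%u %u %u %u\n", c[0], c[1], c[2], c[3]);  /* 10 11 12 13 */
        return 0;
    }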
diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c
index 6d81facba..e50e1b1c0 100755
--- a/wolfcrypt/src/sha256.c
+++ b/wolfcrypt/src/sha256.c
@@ -857,15 +857,15 @@ __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \


 #if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */
-#define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPADDD(op1,op2,op3)       __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPSRLD(op1,op2,op3)       __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPSRLQ(op1,op2,op3)       __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPSLLD(op1,op2,op3)       __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPOR(op1,op2,op3)         __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPXOR(op1,op2,op3)        __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPSHUFD(op1,op2,op3)      __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs)
-#define VPSHUFB(op1,op2,op3)      __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs)
+#define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1::)
+#define VPADDD(op1,op2,op3)       __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1::)
+#define VPSRLD(op1,op2,op3)       __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1::)
+#define VPSRLQ(op1,op2,op3)       __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1::)
+#define VPSLLD(op1,op2,op3)       __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1::)
+#define VPOR(op1,op2,op3)         __asm__ volatile("vpor %"#op3", %"#op2", %"#op1::)
+#define VPXOR(op1,op2,op3)        __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1::)
+#define VPSHUFD(op1,op2,op3)      __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1::)
+#define VPSHUFB(op1,op2,op3)      __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1::)

 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
 a,b,c,d,e,f,g,h,_i)\
@@ -1021,8 +1021,8 @@ __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
                          ::"m"(sha256->buffer[12]):"%xmm7");\

 #define _SET_W_K_XFER(reg, i)\
-    __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs);\
-    __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs);
+    __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]));\
+    __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i]):);

 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)

@@ -1057,8 +1057,6 @@ __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0]));
 #define SHUF_DC00      %xmm12 /* shuffle xDxC -> DC00 */
 #define BYTE_FLIP_MASK %xmm13

-#define XMM_REGs /* Registers are saved in Sha256Update/Finel */
-   /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */

 static int Transform_AVX1(Sha256* sha256)
 {
@@ -1218,42 +1216,42 @@ static int Transform_AVX1_RORX(Sha256* sha256)

 #if defined(HAVE_INTEL_AVX2)

-#define _MOVE_to_REG(ymm, mem)  __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs);
-#define _MOVE_to_MEM(mem, ymm)  __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs);
+#define _MOVE_to_REG(ymm, mem)  __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem));
+#define _MOVE_to_MEM(mem, ymm)  __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem):);
 #define _BYTE_SWAP(ymm, map)    __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
-                                       :: "m"(map):YMM_REGs);
+                                       :: "m"(map));
 #define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\
-        #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs);
+        #ymm2", %%"#ymm1", %%"#ymm0" "::);
 #define _MOVE_BYTE(ymm0, ymm1, map)  __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
-        #ymm0"\n\t":: "m"(map):YMM_REGs);
+        #ymm0"\n\t":: "m"(map));
 #define _S_TEMP(dest, src, bits, temp)  __asm__ volatile("vpsrld $"#bits", %%"\
         #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
-        #temp",%%"#dest", %%"#dest" ":::YMM_REGs);
+        #temp",%%"#dest", %%"#dest" "::);
 #define _AVX2_R(dest, src, bits)  __asm__ volatile("vpsrld $"#bits", %%"\
-        #src", %%"#dest" ":::YMM_REGs);
+        #src", %%"#dest" "::);
 #define _XOR(dest, src1, src2)    __asm__ volatile("vpxor %%"#src1", %%"\
-        #src2", %%"#dest" ":::YMM_REGs);
+        #src2", %%"#dest" "::);
 #define _OR(dest, src1, src2)     __asm__ volatile("vpor %%"#src1", %%"\
-        #src2", %%"#dest" ":::YMM_REGs);
+        #src2", %%"#dest" "::);
 #define _ADD(dest, src1, src2)    __asm__ volatile("vpaddd %%"#src1", %%"\
-        #src2", %%"#dest" ":::YMM_REGs);
+        #src2", %%"#dest" "::);
 #define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\
-        #dest" "::"m"(mem):YMM_REGs);
+        #dest" "::"m"(mem));
 #define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\
-        #src1", %%"#src2", %%"#dest" ":::YMM_REGs);
+        #src1", %%"#src2", %%"#dest" "::);

-#define _EXTRACT_XMM_0(xmm, mem)  __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
-#define _EXTRACT_XMM_1(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
-#define _EXTRACT_XMM_2(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
-#define _EXTRACT_XMM_3(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
+#define _EXTRACT_XMM_0(xmm, mem)  __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem):);
+#define _EXTRACT_XMM_1(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem):);
+#define _EXTRACT_XMM_2(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem):);
+#define _EXTRACT_XMM_3(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem):);
 #define _EXTRACT_XMM_4(ymm, xmm, mem)\
-    __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);\
-    __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
-#define _EXTRACT_XMM_5(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
-#define _EXTRACT_XMM_6(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
-#define _EXTRACT_XMM_7(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
+    __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" "::);\
+    __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem):);
+#define _EXTRACT_XMM_5(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem):);
+#define _EXTRACT_XMM_6(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem):);
+#define _EXTRACT_XMM_7(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem):);

-#define _SWAP_YMM_HL(ymm)   __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);
+#define _SWAP_YMM_HL(ymm)   __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" "::);
 #define SWAP_YMM_HL(ymm)    _SWAP_YMM_HL(ymm)

 #define MOVE_to_REG(ymm, mem)      _MOVE_to_REG(ymm, mem)
@@ -1308,28 +1306,25 @@ static int Transform_AVX1_RORX(Sha256* sha256)
 #define W_K_TEMP   ymm15
 #define W_K_TEMPx  xmm15

-#define YMM_REGs /* Registers are saved in Sha256Update/Finel */
-   /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
-

 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs);\
-    __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs);\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
-    __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
-    __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" "::);\
+    __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" "::);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" "::);\
+    __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" "::);\
+    __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" "::);\

 #define MOVE_7_to_15(w_i_15, w_i_7)\
-    __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
+    __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" "::);\

 #define MOVE_I_to_7(w_i_7, w_i)\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
-    __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
-    __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" "::);\
+    __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" "::);\
+    __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" "::);\

 #define MOVE_I_to_2(w_i_2, w_i)\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs);\
-    __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" "::);\
+    __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" "::);\

 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
     MOVE_15_to_16(w_i_16, w_i_15, w_i_7); \
diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c
index 60c9a1102..f41eb9bb2 100755
--- a/wolfcrypt/src/sha512.c
+++ b/wolfcrypt/src/sha512.c
@@ -140,7 +140,7 @@
 #if defined(HAVE_INTEL_RORX)
 #define ROTR(func, bits, x) \
 word64 func(word64 x) {  word64 ret ;\
-    __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\
+    __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x)) ;\
     return ret ;\
 }

@@ -691,15 +691,15 @@ void wc_Sha512Free(Sha512* sha512)
 /* INLINE Assember for Intel AVX1 instructions */
 #if defined(HAVE_INTEL_AVX1)
 #if defined(DEBUG_XMM)
-    #define SAVE_REG(i)     __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0])::XMM_REGs);
-    #define RECV_REG(i)     __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0]):XMM_REGs);
+    #define SAVE_REG(i)     __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0]):);
+    #define RECV_REG(i)     __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0]));

     #define _DUMP_REG(REG, name)\
     { word64 buf[16];word64 reg[16][2];int k;\
       SAVE_REG(0); SAVE_REG(1); SAVE_REG(2);  SAVE_REG(3);  SAVE_REG(4); \
       SAVE_REG(5); SAVE_REG(6); SAVE_REG(7);SAVE_REG(8); SAVE_REG(9); SAVE_REG(10);\
       SAVE_REG(11); SAVE_REG(12); SAVE_REG(13); SAVE_REG(14); SAVE_REG(15); \
-      __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::XMM_REGs);\
+      __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0]):);\
       printf(" "#name":\t"); for(k=0; k<2; k++) printf("%016lx.", (word64)(buf[k])); printf("\n"); \
       RECV_REG(0); RECV_REG(1); RECV_REG(2); RECV_REG(3); RECV_REG(4);\
       RECV_REG(5); RECV_REG(6); RECV_REG(7); RECV_REG(8); RECV_REG(9);\
@@ -714,25 +714,25 @@ void wc_Sha512Free(Sha512* sha512)
 #endif /* DEBUG_XMM */

 #define _MOVE_to_REG(xymm, mem)       __asm__ volatile("vmovdqu %0, %%"#xymm" "\
-    :: "m"(mem):XMM_REGs);
+    :: "m"(mem));
 #define _MOVE_to_MEM(mem,i, xymm)     __asm__ volatile("vmovdqu %%"#xymm", %0" :\
-    "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::XMM_REGs);
+    "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3]):);
 #define _MOVE(dest, src)              __asm__ volatile("vmovdqu %%"#src",  %%"\
-    #dest" ":::XMM_REGs);
+    #dest" "::);

 #define _S_TEMP(dest, src, bits, temp)  __asm__ volatile("vpsrlq $"#bits", %%"\
     #src", %%"#dest"\n\tvpsllq $64-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
-    #temp",%%"#dest", %%"#dest" ":::XMM_REGs);
+    #temp",%%"#dest", %%"#dest" "::);
 #define _AVX1_R(dest, src, bits)      __asm__ volatile("vpsrlq $"#bits", %%"\
-    #src", %%"#dest" ":::XMM_REGs);
+    #src", %%"#dest" "::);
 #define _XOR(dest, src1, src2)        __asm__ volatile("vpxor %%"#src1", %%"\
-    #src2", %%"#dest" ":::XMM_REGs);
+    #src2", %%"#dest" "::);
 #define _OR(dest, src1, src2)         __asm__ volatile("vpor %%"#src1", %%"\
-    #src2", %%"#dest" ":::XMM_REGs);
+    #src2", %%"#dest" "::);
 #define _ADD(dest, src1, src2)        __asm__ volatile("vpaddq %%"#src1", %%"\
-    #src2", %%"#dest" ":::XMM_REGs);
+    #src2", %%"#dest" "::);
 #define _ADD_MEM(dest, src1, mem)     __asm__ volatile("vpaddq %0, %%"#src1", %%"\
-    #dest" "::"m"(mem):XMM_REGs);
+    #dest" "::"m"(mem));

 #define MOVE_to_REG(xymm, mem)      _MOVE_to_REG(xymm, mem)
 #define MOVE_to_MEM(mem, i, xymm)   _MOVE_to_MEM(mem, i, xymm)
@@ -788,8 +788,6 @@ static word64 mBYTE_FLIP_MASK[] =  { 0x0001020304050607, 0x08090a0b0c0d0e0f };
 #define W_12    xmm8
 #define W_14    xmm9

-#define XMM_REGs
-
 #define s0_1(dest, src)      AVX1_S(dest, src, 1);
 #define s0_2(dest, src)      AVX1_S(G_TEMP, src, 8); XOR(dest, G_TEMP, dest);
 #define s0_3(dest, src)      AVX1_R(G_TEMP, src, 7);  XOR(dest, G_TEMP, dest);
@@ -909,29 +907,29 @@ static const unsigned long mBYTE_FLIP_MASK_Y[] =

 #define W_from_buff_Y(buff)\
     { /* X0..3(ymm9..12), W_X[0..15] = sha512->buffer[0.15];  */\
-        __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]):YMM_REGs);\
+        __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]));\
         __asm__ volatile("vmovdqu %0, %%ymm12\n\t"\
                          "vmovdqu %1, %%ymm4\n\t"\
                          "vpshufb %%ymm8, %%ymm12, %%ymm12\n\t"\
                          "vpshufb %%ymm8, %%ymm4, %%ymm4\n\t"\
-                         :: "m"(buff[0]), "m"(buff[4]):YMM_REGs);\
+                         :: "m"(buff[0]), "m"(buff[4]));\
         __asm__ volatile("vmovdqu %0, %%ymm5\n\t"\
                          "vmovdqu %1, %%ymm6\n\t"\
                          "vpshufb %%ymm8, %%ymm5, %%ymm5\n\t"\
                          "vpshufb %%ymm8, %%ymm6, %%ymm6\n\t"\
-                         :: "m"(buff[8]), "m"(buff[12]):YMM_REGs);\
+                         :: "m"(buff[8]), "m"(buff[12]));\
     }

 #if defined(DEBUG_YMM)
-    #define SAVE_REG_Y(i)   __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0])::YMM_REGs);
-    #define RECV_REG_Y(i)   __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0]):YMM_REGs);
+    #define SAVE_REG_Y(i)   __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0]):);
+    #define RECV_REG_Y(i)   __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0]));

     #define _DUMP_REG_Y(REG, name)\
     { word64 buf[16];word64 reg[16][2];int k;\
       SAVE_REG_Y(4); SAVE_REG_Y(5);  SAVE_REG_Y(6); SAVE_REG_Y(7); \
       SAVE_REG_Y(8); SAVE_REG_Y(9); SAVE_REG_Y(10); SAVE_REG_Y(11); SAVE_REG_Y(12);\
       SAVE_REG_Y(13); SAVE_REG_Y(14); SAVE_REG_Y(15); \
-      __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::YMM_REGs);\
+      __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0]):);\
       printf(" "#name":\t"); for(k=0; k<4; k++) printf("%016lx.", (word64)buf[k]); printf("\n"); \
       RECV_REG_Y(4); RECV_REG_Y(5); RECV_REG_Y(6); RECV_REG_Y(7); \
       RECV_REG_Y(8); RECV_REG_Y(9); RECV_REG_Y(10); RECV_REG_Y(11); RECV_REG_Y(12); \
@@ -948,26 +946,26 @@ static const unsigned long mBYTE_FLIP_MASK_Y[] =
 #endif /* DEBUG_YMM */

 #define _MOVE_to_REGy(ymm, mem)     __asm__ volatile("vmovdqu %0, %%"#ymm" "\
-    :: "m"(mem):YMM_REGs);
+    :: "m"(mem));
 #define _MOVE_to_MEMy(mem,i, ymm)   __asm__ volatile("vmovdqu %%"#ymm", %0" \
-    : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::YMM_REGs);
+    : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3]):);
 #define _MOVE_128y(ymm0, ymm1, ymm2, map)  __asm__ volatile("vperm2i128 $"\
-    #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs);
+    #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" "::);
 #define _S_TEMPy(dest, src, bits, temp) \
     __asm__ volatile("vpsrlq $"#bits", %%"#src", %%"#dest"\n\tvpsllq $64-"#bits\
-    ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" ":::YMM_REGs);
+    ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" "::);
 #define _AVX2_R(dest, src, bits)    __asm__ volatile("vpsrlq $"#bits", %%"\
-    #src", %%"#dest" ":::YMM_REGs);
+    #src", %%"#dest" "::);
 #define _XORy(dest, src1, src2)     __asm__ volatile("vpxor %%"#src1", %%"\
-    #src2", %%"#dest" ":::YMM_REGs);
+    #src2", %%"#dest" "::);
 #define _ADDy(dest, src1, src2)     __asm__ volatile("vpaddq %%"#src1", %%"\
-    #src2", %%"#dest" ":::YMM_REGs);
+    #src2", %%"#dest" "::);
 #define _BLENDy(map, dest, src1, src2)  __asm__ volatile("vpblendd $"#map", %%"\
-    #src1", %%"#src2", %%"#dest" ":::YMM_REGs);
+    #src1", %%"#src2", %%"#dest" "::);
 #define _BLENDQy(map, dest, src1, src2) __asm__ volatile("vblendpd $"#map", %%"\
-    #src1", %%"#src2", %%"#dest" ":::YMM_REGs);
+    #src1", %%"#src2", %%"#dest" "::);
 #define _PERMQy(map, dest, src)     __asm__ volatile("vpermq $"#map", %%"\
-    #src", %%"#dest" ":::YMM_REGs);
+    #src", %%"#dest" "::);

 #define MOVE_to_REGy(ymm, mem)      _MOVE_to_REGy(ymm, mem)
 #define MOVE_to_MEMy(mem, i, ymm)   _MOVE_to_MEMy(mem, i, ymm)
@@ -1013,28 +1011,25 @@ static const unsigned long mBYTE_FLIP_MASK_Y[] =
 #define W_8y     ymm5
 #define W_12y    ymm6

-#define YMM_REGs
-/* Registers are saved in Sha512Update/Final */
- /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/

 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs);\
-    __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs);\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
-    __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
-    __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" "::);\
+    __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" "::);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" "::);\
+    __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" "::);\
+    __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" "::);\

 #define MOVE_7_to_15(w_i_15, w_i_7)\
-    __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
+    __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" "::);\

 #define MOVE_I_to_7(w_i_7, w_i)\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
-    __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
-    __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" "::);\
+    __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" "::);\
+    __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" "::);\

 #define MOVE_I_to_2(w_i_2, w_i)\
-    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs);\
-    __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs);\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" "::);\
+    __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" "::);\

 #endif /* HAVE_INTEL_AVX2 */