Merge branch 'kojo-intel'

toddouska
2015-03-30 11:11:28 -07:00
2 changed files with 44 additions and 50 deletions

View File

@@ -94,60 +94,64 @@ __asm__( \
     "movq %1,%%rdx\n\t" \
     "addq %2, %0\n\t" /* c0+=cy; Set CF, OF */ \
     "adoxq %%r10, %%r10\n\t" /* Reset OF */ \
-    :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r10","%r11","%r12","%rdx") ; \
-#define MULX_INNERMUL_R1(c0, c1, pre)\
+    :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r9", "%r10","%r11","%r12","%rdx") ; \
+#define MULX_INNERMUL_R1(c0, c1, pre, rdx)\
 { \
     __asm__ volatile ( \
-        "mulx %%r11,%%r9, %%r8 \n\t" \
+        "movq %3, %%rdx\n\t" \
+        "mulx %%r11,%%r9, %%r8 \n\t" \
         "movq %2, %%r12\n\t" \
         "adoxq %%r9,%0 \n\t" \
         "adcxq %%r8,%1 \n\t" \
-        :"+r"(c0),"+r"(c1):"m"(pre):"%r8","%r9","%r11","%r12","%rdx" \
+        :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
     ); }
-#define MULX_INNERMUL_R2(c0, c1, pre)\
+#define MULX_INNERMUL_R2(c0, c1, pre, rdx)\
 { \
     __asm__ volatile ( \
-        "mulx %%r12,%%r9, %%r8 \n\t" \
+        "movq %3, %%rdx\n\t" \
+        "mulx %%r12,%%r9, %%r8 \n\t" \
         "movq %2, %%r11\n\t" \
         "adoxq %%r9,%0 \n\t" \
         "adcxq %%r8,%1 \n\t" \
-        :"+r"(c0),"+r"(c1):"m"(pre):"%r8","%r9","%r11","%r12","%rdx" \
+        :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
     ); }
 #define MULX_LOAD_R1(val)\
     __asm__ volatile ( \
         "movq %0, %%r11\n\t"\
-        ::"m"(val):"%r11"\
+        ::"m"(val):"%r8","%r9", "%r10", "%r11","%r12","%rdx"\
     ) ;
-#define MULX_INNERMUL_LAST(c0, c1)\
+#define MULX_INNERMUL_LAST(c0, c1, rdx)\
 { \
     __asm__ volatile ( \
+        "movq %2, %%rdx\n\t" \
         "mulx %%r12,%%r9, %%r8 \n\t" \
         "movq $0, %%r10 \n\t" \
         "adoxq %%r10, %%r9 \n\t" \
         "adcq $0,%%r8 \n\t" \
         "addq %%r9,%0 \n\t" \
         "adcq $0,%%r8 \n\t" \
         "movq %%r8,%1 \n\t" \
-        :"+m"(c0),"=m"(c1)::"%r8","%r9","%r10","%r12","%rdx"\
+        :"+m"(c0),"=m"(c1):"r"(rdx):"%r8","%r9","%r10", "%r11", "%r12","%rdx"\
     ); }
 #define MULX_INNERMUL8(x,y,z,cy)\
+{ word64 rdx = y ;\
     MULX_LOAD_R1(x[0]) ;\
     MULX_INIT(y, _c0, cy) ; /* rdx=y; z0+=cy; */ \
-    MULX_INNERMUL_R1(_c0, _c1, x[1]) ;\
-    MULX_INNERMUL_R2(_c1, _c2, x[2]) ;\
-    MULX_INNERMUL_R1(_c2, _c3, x[3]) ;\
-    MULX_INNERMUL_R2(_c3, _c4, x[4]) ;\
-    MULX_INNERMUL_R1(_c4, _c5, x[5]) ;\
-    MULX_INNERMUL_R2(_c5, _c6, x[6]) ;\
-    MULX_INNERMUL_R1(_c6, _c7, x[7]) ;\
-    MULX_INNERMUL_LAST(_c7, cy) ;\
+    MULX_INNERMUL_R1(_c0, _c1, x[1], rdx) ;\
+    MULX_INNERMUL_R2(_c1, _c2, x[2], rdx) ;\
+    MULX_INNERMUL_R1(_c2, _c3, x[3], rdx) ;\
+    MULX_INNERMUL_R2(_c3, _c4, x[4], rdx) ;\
+    MULX_INNERMUL_R1(_c4, _c5, x[5], rdx) ;\
+    MULX_INNERMUL_R2(_c5, _c6, x[6], rdx) ;\
+    MULX_INNERMUL_R1(_c6, _c7, x[7], rdx) ;\
+    MULX_INNERMUL_LAST(_c7, cy, rdx) ;\
+}
 #define INNERMUL8_MULX \
 {\
     MULX_INNERMUL8(tmpm, mu, _c, cy);\
@@ -1233,7 +1237,7 @@ __asm__( \
     "mulx %2,%%r9, %%r8 \n\t" \
     "adoxq %%r9,%0 \n\t" \
     "adcxq %%r8,%1 \n\t" \
-    :"+r"(c0),"+r"(c1):"r"(b0):"%r8","%r9","%rdx"\
+    :"+r"(c0),"+r"(c1):"r"(b0):"%r8","%r9","%r10","%rdx"\
 )
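Note on the hunks above: each MULX_* macro is its own __asm__ statement, so nothing guarantees that %rdx (the implicit multiplier operand of mulx) survives from one statement to the next. The change therefore threads the multiplier through as an explicit rdx argument, reloads it with "movq %3, %%rdx" inside every block, and widens the clobber lists to match. Below is a minimal standalone sketch of that pattern; INNERMUL_STEP is a hypothetical name, and it uses plain mulq instead of mulx/adcx/adox so it runs on any x86_64.

#include <stdint.h>
#include <stdio.h>

/* Each step is a separate __asm__ block, so it reloads the multiplier
 * into %rdx itself and declares %rax/%rdx clobbered, instead of trusting
 * whatever a previous block left behind. */
#define INNERMUL_STEP(c0, c1, limb, mul)                         \
    __asm__ volatile(                                            \
        "movq %3, %%rdx\n\t"   /* reload multiplier into rdx */  \
        "movq %2, %%rax\n\t"   /* limb into rax                */\
        "mulq %%rdx\n\t"       /* %rdx:%rax = limb * mul       */\
        "addq %%rax, %0\n\t"   /* low half into running c0     */\
        "adcq %%rdx, %1\n\t"   /* high half + carry into c1    */\
        : "+r"(c0), "+r"(c1)                                     \
        : "m"(limb), "r"(mul)                                    \
        : "%rax", "%rdx", "cc")

int main(void)
{
    uint64_t x[2] = { 0x1111111111111111ULL, 0x2222222222222222ULL };
    uint64_t mul = 3, c0 = 0, c1 = 0;

    INNERMUL_STEP(c0, c1, x[0], mul);  /* first block */
    INNERMUL_STEP(c0, c1, x[1], mul);  /* second block re-establishes %rdx itself */

    /* prints c1:c0 = 0000000000000000:9999999999999999 */
    printf("c1:c0 = %016llx:%016llx\n",
           (unsigned long long)c1, (unsigned long long)c0);
    return 0;
}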

View File

@@ -209,12 +209,13 @@ static int set_cpuid_flags(void) {
         if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
         if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
         if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
         cpuid_check = 1 ;
         return 0 ;
     }
     return 1 ;
 }
 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */
 static int Transform(Sha256* sha256);
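The bits tested in this hunk are the architectural CPUID feature flags: leaf 7 sub-leaf 0, EBX bit 5 is AVX2; leaf 1, ECX bit 30 is RDRAND; leaf 7 sub-leaf 0, EBX bit 18 is RDSEED. A standalone sketch of the same checks using GCC/clang's <cpuid.h> follows; it is illustrative only, not wolfSSL's cpuid_flag() helper, and it assumes a compiler that provides __get_cpuid_count.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* Leaf 1, ECX bit 30: RDRAND */
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        printf("RDRAND: %s\n", (ecx & (1u << 30)) ? "yes" : "no");

    /* Leaf 7, sub-leaf 0, EBX bit 5: AVX2; bit 18: RDSEED.
     * Note: the CPUID bit alone does not guarantee the OS saves AVX state. */
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        printf("AVX2  : %s\n", (ebx & (1u << 5))  ? "yes" : "no");
        printf("RDSEED: %s\n", (ebx & (1u << 18)) ? "yes" : "no");
    }
    return 0;
}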
@@ -256,10 +257,10 @@ static void set_Transform(void) {
 /* Dummy for saving MM_REGs on behalf of Transform */
 #if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1)
-#define SAVE_XMM_YMM __asm__ volatile("or %%r8, %%r8":::\
+#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
     "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
 #elif defined(HAVE_INTEL_AVX1)
-#define SAVE_XMM_YMM __asm__ volatile("or %%r8, %%r8":::\
+#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
     "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
     "xmm11","xmm12","xmm13","xmm14","xmm15")
 #else
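SAVE_XMM_YMM is a dummy statement whose real payload is the clobber list: naming the vector registers forces the compiler to assume they are destroyed, so it spills and restores anything it was keeping in them around the Transform body. The instruction itself is incidental (this hunk only switches to the 32-bit "or %%r8d, %%r8d" form). A minimal sketch of the same trick, with a hypothetical name and an empty template, which GCC and clang also accept:

/* Hypothetical name; sketch only.  The empty template emits no
 * instructions -- the clobber list alone makes the compiler treat these
 * registers as destroyed here, forcing it to save/reload any values it
 * had live in them. */
#define CLOBBER_XMM_REGS() \
    __asm__ volatile("" ::: "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")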
@@ -947,29 +948,18 @@ __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
 #define W_K_from_buff\
-{ ALIGN32 word64 _buff[2] ; \
-    /* X0..3(xmm4..7) = sha256->buffer[0.15]; */\
-    _buff[0] = *(word64*)&sha256->buffer[0] ;\
-    _buff[1] = *(word64*)&sha256->buffer[2] ;\
-    __asm__ volatile("vmovaps %0, %%xmm4\n\t"\
+    __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\
         "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
-        :: "m"(_buff[0]):"%xmm4") ;\
-    _buff[0] = *(word64*)&sha256->buffer[4] ;\
-    _buff[1] = *(word64*)&sha256->buffer[6] ;\
-    __asm__ volatile("vmovaps %0, %%xmm5\n\t"\
+        :: "m"(sha256->buffer[0]):"%xmm4") ;\
+    __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\
         "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
-        ::"m"(_buff[0]):"%xmm5") ;\
-    _buff[0] = *(word64*)&sha256->buffer[8] ;\
-    _buff[1] = *(word64*)&sha256->buffer[10] ;\
-    __asm__ volatile("vmovaps %0, %%xmm6\n\t"\
+        ::"m"(sha256->buffer[4]):"%xmm5") ;\
+    __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\
         "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
-        ::"m"(_buff[0]):"%xmm6") ;\
-    _buff[0] = *(word64*)&sha256->buffer[12] ;\
-    _buff[1] = *(word64*)&sha256->buffer[14] ;\
-    __asm__ volatile("vmovaps %0, %%xmm7\n\t"\
+        ::"m"(sha256->buffer[8]):"%xmm6") ;\
+    __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\
         "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
-        ::"m"(_buff[0]):"%xmm7") ;\
-}\
+        ::"m"(sha256->buffer[12]):"%xmm7") ;\
 #define _SET_W_K_XFER(reg, i)\
     __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\
@@ -977,15 +967,15 @@ __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
-static const __attribute__((aligned(32))) word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */
-static const __attribute__((aligned(32))) word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */
-static const __attribute__((aligned(32))) word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
+static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */
+static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */
+static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
 #define _Init_Masks(mask1, mask2, mask3)\
-    __asm__ volatile("vmovaps %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0]):) ;\
-    __asm__ volatile("vmovaps %0, %"#mask2 ::"m"(mSHUF_00BA[0]):) ;\
-    __asm__ volatile("vmovaps %0, %"#mask3 ::"m"(mSHUF_DC00[0]):) ;
+    __asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\
+    __asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\
+    __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ;
 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
     _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
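mBYTE_FLIP_MASK, now loaded with vmovdqu as well, is the shuffle control that reverses the bytes inside each 32-bit word, i.e. it converts the big-endian SHA-256 message words to host order. A quick standalone check of that mask with _mm_shuffle_epi8 (SSSE3), reusing the same constants; a sketch only, compiled with -mssse3:

#include <immintrin.h>   /* _mm_shuffle_epi8 requires SSSE3 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Same constants as mBYTE_FLIP_MASK above. */
    static const uint64_t flip[2] = { 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL };
    uint32_t in[4]  = { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
    uint32_t out[4];
    int i;

    __m128i v = _mm_loadu_si128((const __m128i *)in);
    __m128i m = _mm_loadu_si128((const __m128i *)flip);
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(v, m));

    for (i = 0; i < 4; i++) printf("%08x ", out[i]);
    printf("\n");  /* prints: 03020100 07060504 0b0a0908 0f0e0d0c */
    return 0;
}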