diff --git a/wolfcrypt/src/asm.c b/wolfcrypt/src/asm.c
old mode 100644
new mode 100755
index bdaa51b40..2c0fa3ee5
--- a/wolfcrypt/src/asm.c
+++ b/wolfcrypt/src/asm.c
@@ -72,7 +72,7 @@ __asm__( \
 #define MONT_FINI
 #define LOOP_END
 #define LOOP_START \
- mu = c[x] * mp
+ mu = c[x] * mp;

 #define INNERMUL \
 __asm__( \
@@ -87,6 +87,73 @@ __asm__( \
 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
 : "%rax", "%rdx", "cc")

+#ifdef HAVE_INTEL_MULX
+#define MULX_INIT(a0, c0, cy)\
+ __asm__ volatile( \
+ "xorq %%r10, %%r10\n\t" \
+ "movq %1,%%rdx\n\t" \
+ "addq %2, %0\n\t" /* c0+=cy; Set CF, OF */ \
+ "adoxq %%r10, %%r10\n\t" /* Reset OF */ \
+ :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r10","%r11","%r12","%rdx") ; \
+
+#define MULX_INNERMUL_R1(c0, c1, pre)\
+ { \
+ __asm__ volatile ( \
+ "mulx %%r11,%%r9, %%r8 \n\t" \
+ "movq %2, %%r12\n\t" \
+ "adoxq %%r9,%0 \n\t" \
+ "adcxq %%r8,%1 \n\t" \
+ :"+r"(c0),"+r"(c1):"m"(pre):"%r8","%r9","%r11","%r12","%rdx" \
+ ); }
+
+
+#define MULX_INNERMUL_R2(c0, c1, pre)\
+ { \
+ __asm__ volatile ( \
+ "mulx %%r12,%%r9, %%r8 \n\t" \
+ "movq %2, %%r11\n\t" \
+ "adoxq %%r9,%0 \n\t" \
+ "adcxq %%r8,%1 \n\t" \
+ :"+r"(c0),"+r"(c1):"m"(pre):"%r8","%r9","%r11","%r12","%rdx" \
+ ); }
+
+#define MULX_LOAD_R1(val)\
+ __asm__ volatile ( \
+ "movq %0, %%r11\n\t"\
+ ::"m"(val):"%r11"\
+) ;
+
+#define MULX_INNERMUL_LAST(c0, c1)\
+ { \
+ __asm__ volatile ( \
+ "mulx %%r12,%%r9, %%r8 \n\t" \
+ "movq $0, %%r10 \n\t" \
+ "adoxq %%r10, %%r9 \n\t" \
+ "adcq $0,%%r8 \n\t" \
+ "addq %%r9,%0 \n\t" \
+ "adcq $0,%%r8 \n\t" \
+ "movq %%r8,%1 \n\t" \
+ :"+m"(c0),"=m"(c1)::"%r8","%r9","%r10","%r12","%rdx"\
+ ); }
+
+#define MULX_INNERMUL8(x,y,z,cy)\
+ MULX_LOAD_R1(x[0]) ;\
+ MULX_INIT(y, _c0, cy) ; /* rdx=y; z0+=cy; */ \
+ MULX_INNERMUL_R1(_c0, _c1, x[1]) ;\
+ MULX_INNERMUL_R2(_c1, _c2, x[2]) ;\
+ MULX_INNERMUL_R1(_c2, _c3, x[3]) ;\
+ MULX_INNERMUL_R2(_c3, _c4, x[4]) ;\
+ MULX_INNERMUL_R1(_c4, _c5, x[5]) ;\
+ MULX_INNERMUL_R2(_c5, _c6, x[6]) ;\
+ MULX_INNERMUL_R1(_c6, _c7, x[7]) ;\
+ MULX_INNERMUL_LAST(_c7, cy) ;\
+
+#define INNERMUL8_MULX \
+{\
+ MULX_INNERMUL8(tmpm, mu, _c, cy);\
+}
+#endif
+
 #define INNERMUL8 \
 __asm__( \
 "movq 0(%5),%%rax \n\t" \
@@ -178,8 +245,7 @@ __asm__( \
 \
 :"=r"(_c), "=r"(cy) \
 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
-: "%rax", "%rdx", "%r10", "%r11", "cc")
-
+: "%rax", "%rdx", "%r10", "%r11", "cc")\

 #define PROPCARRY \
 __asm__( \
@@ -1138,6 +1204,80 @@ __asm__( \
 "adcl $0,%2 \n\t" \
 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");

+#elif defined(HAVE_INTEL_MULX)
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+ c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+ do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+ x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+ x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI
+
+#define MULADD_MULX(b0, c0, c1)\
+ __asm__ volatile ( \
+ "mulx %2,%%r9, %%r8 \n\t" \
+ "adoxq %%r9,%0 \n\t" \
+ "adcxq %%r8,%1 \n\t" \
+ :"+r"(c0),"+r"(c1):"r"(b0):"%r8","%r9","%rdx"\
+ )
+
+
+#define MULADD_MULX_ADD_CARRY(c0, c1)\
+ __asm__ volatile(\
+ "mov $0, %%r10\n\t"\
+ "movq %1, %%r8\n\t" \
+ "adox %%r10, %0\n\t"\
+ "adcx %%r10, %1\n\t"\
+ :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;
+
+#define MULADD_SET_A(a0)\
+ __asm__ volatile("add $0, %%r8\n\t" \
+ "movq %0,%%rdx\n\t"::"r"(a0):"%r8","%r9","%r10","%rdx") ; \
+
+#define MULADD_BODY(a,b,c)\
+ cp = &(c->dp[iz]) ;\
+ c0 = cp[0] ; c1 = cp[1];\
+ MULADD_SET_A(a->dp[ix]) ;\
+ MULADD_MULX(b0, c0, c1) ;\
+ cp[0]=c0; c0=cp[2]; cp++ ;\
+ MULADD_MULX(b1, c1, c0) ;\
+ cp[0]=c1; c1=cp[2]; cp++ ; \
+ MULADD_MULX(b2, c0, c1) ;\
+ cp[0]=c0; c0=cp[2]; cp++ ; \
+ MULADD_MULX(b3, c1, c0) ;\
+ cp[0]=c1; c1=cp[2]; cp++ ; \
+ MULADD_MULX_ADD_CARRY(c0, c1) ;\
+ cp[0]=c0; cp[1]=c1;
+
+#define TFM_INTEL_MUL_COMBA(a, b, c)\
+ for(ix=0; ix<pa; ix++)c->dp[ix]=0 ;\
+ for(iy=0; (iy<b->used); iy+=4) {\
+ fp_digit *bp ;\
+ bp = &(b->dp[iy+0]) ; \
+ fp_digit b0 = bp[0] , b1= bp[1], b2= bp[2], b3= bp[3];\
+ ix=0, iz=iy;\
+ while(ix<a->used) {\
+ fp_digit c0, c1; \
+ fp_digit *cp ;\
+ MULADD_BODY(a,b,c); ix++ ; iz++ ; \
+ }\
+};
+
 #elif defined(TFM_X86_64)

 /* x86-64 optimized */
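The MULADD_* and TFM_INTEL_MUL_COMBA macros added above are a comba-style schoolbook multiply: MULX forms the 64x64->128 partial products and ADOX/ADCX keep two independent carry chains. As a reading aid, here is a portable C model of the arithmetic they are intended to perform; it is an illustration only, not part of the patch. It leans on the tfm.h types (fp_int, fp_digit, the double-width fp_word, DIGIT_BIT), the function name is invented, and the FP_SIZE clamping and sign handling done by fp_mul_comba() are omitted.

#include <wolfssl/wolfcrypt/tfm.h>

/* Plain-C model of the product TFM_INTEL_MUL_COMBA accumulates into c->dp[]. */
static void mul_comba_model(const fp_int *A, const fp_int *B, fp_int *C)
{
    int ix, iy;

    for (ix = 0; ix < A->used + B->used; ix++)
        C->dp[ix] = 0;

    for (ix = 0; ix < A->used; ix++) {
        fp_digit carry = 0;
        for (iy = 0; iy < B->used; iy++) {
            /* one MULADD step: 64x64 product + current column + carry */
            fp_word t = (fp_word)A->dp[ix] * (fp_word)B->dp[iy]
                      + (fp_word)C->dp[ix + iy] + (fp_word)carry;
            C->dp[ix + iy] = (fp_digit)t;        /* low half stays in the column */
            carry = (fp_digit)(t >> DIGIT_BIT);  /* high half ripples upward     */
        }
        C->dp[ix + B->used] = carry;             /* carry lands above the window */
    }
}

The assembly walks B four digits at a time (b0..b3) and A one digit at a time, keeping the live columns in registers, but the value it is meant to leave in c->dp[] is this same product.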
diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c
index 097a9c46d..ce02bb551 100755
--- a/wolfcrypt/src/sha256.c
+++ b/wolfcrypt/src/sha256.c
@@ -203,14 +203,16 @@ static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
 return 0 ;
 }

-static void set_cpuid_flags(void) {
+static int set_cpuid_flags(void) {
 if(cpuid_check==0) {
- if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ; }
- if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
- if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ;}
- if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ;}
- cpuid_check = 1 ;
- }
+ if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
+ if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
+ if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
+ if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
+ cpuid_check = 1 ;
+ return 0 ;
+ }
+ return 1 ;
 }

 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */
@@ -229,16 +231,19 @@ static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
 #define XTRANSFORM(sha256, B) (*Transform_p)(sha256)

 static void set_Transform(void) {
- set_cpuid_flags() ;
+ if(set_cpuid_flags())return ;

 #if defined(HAVE_INTEL_AVX2)
- if(IS_INTEL_AVX2){ Transform_p = Transform_AVX1_RORX; return ; }
- Transform_p = Transform_AVX2 ; /* for avoiding warning,"not used" */
+ if(IS_INTEL_AVX2){
+ Transform_p = Transform_AVX1_RORX; return ;
+ Transform_p = Transform_AVX2 ;
+ /* for avoiding warning,"not used" */
+ }
 #endif
 #if defined(HAVE_INTEL_AVX1)
 Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ;
 #endif
- Transform_p = Transform ;
+ Transform_p = Transform ; return ;
 }

 #else
@@ -251,10 +256,10 @@ static void set_Transform(void) {

 /* Dummy for saving MM_REGs on behalf of Transform */
 #if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1)
-#define SAVE_XMM_YMM __asm__ volatile("vpxor %%ymm7, %%ymm7, %%ymm7":::\
+#define SAVE_XMM_YMM __asm__ volatile("or %%r8, %%r8":::\
 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
 #elif defined(HAVE_INTEL_AVX1)
-#define SAVE_XMM_YMM __asm__ volatile("vpxor %%xmm7, %%xmm7, %%xmm7":::\
+#define SAVE_XMM_YMM __asm__ volatile("or %%r8, %%r8":::\
 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
 "xmm11","xmm12","xmm13","xmm14","xmm15")
 #else
@@ -336,25 +341,6 @@ static const ALIGN32 word32 K[64] = {

 #endif

-#if defined(HAVE_INTEL_RORX)
-#define ROTR(func, bits, x) \
-word32 func(word32 x) { word32 ret ;\
- __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\
- return ret ;\
-}
-
-static INLINE ROTR(rotrFixed_2, 2, x)
-static INLINE ROTR(rotrFixed_13, 13, x)
-static INLINE ROTR(rotrFixed_22, 22, x)
-static INLINE ROTR(rotrFixed_6, 6, x)
-static INLINE ROTR(rotrFixed_11, 11, x)
-static INLINE ROTR(rotrFixed_25, 25, x)
-static INLINE ROTR(rotrFixed_7, 7, x)
-static INLINE ROTR(rotrFixed_18, 18, x)
-static INLINE ROTR(rotrFixed_17, 17, x)
-static INLINE ROTR(rotrFixed_19, 19, x)
-#endif
-
 #if defined(FREESCALE_MMCAU)

 static int Transform(Sha256* sha256, byte* buf)
@@ -370,18 +356,11 @@ static int Transform(Sha256* sha256, byte* buf)
 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
 #define R(x, n) (((x)&0xFFFFFFFFU)>>(n))

-#if !defined(HAVE_INTEL_RORX)
 #define S(x, n) rotrFixed(x, n)
 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
-#else
-#define Sigma0(x) (rotrFixed_2(x) ^ rotrFixed_13(x) ^ rotrFixed_22(x))
-#define Sigma1(x) (rotrFixed_6(x) ^ rotrFixed_11(x) ^ rotrFixed_25(x))
-#define Gamma0(x) (rotrFixed_7(x) ^ rotrFixed_18(x) ^ R(x, 3))
-#define Gamma1(x) (rotrFixed_17(x) ^ rotrFixed_19(x) ^ R(x, 10))
-#endif

 #define RND(a,b,c,d,e,f,g,h,i) \
 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
@@ -634,7 +613,7 @@ int wc_Sha256Hash(const byte* data, word32 len, byte* hash)
 #define S_6 %ebx
 #define S_7 %r9d

-#define SSE_REGs "%esi", "%r8", "%edx", "%ebx","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
+#define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"

 #if defined(HAVE_INTEL_RORX)
 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\
@@ -732,7 +711,7 @@ __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
 /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\
 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\
 /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\
-__asm__ volatile("movl %r8d, "#h"\n\t"); \
+__asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \

 #define RND_X(a,b,c,d,e,f,g,h,i) \
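The sha256.c hunks above turn set_cpuid_flags() into a one-shot probe (it returns 0 only on the call that actually reads CPUID) and make set_Transform() bind Transform_p once to the best available routine. The pattern is ordinary runtime CPU dispatch; the sketch below restates it with GCC/Clang's <cpuid.h> so it stands alone. The transform_* stubs, pick_transform() and the other names are placeholders, not wolfSSL symbols, and the sketch assumes a toolchain that provides __get_cpuid_count.

#include <cpuid.h>

static int transform_c(void)    { return 0; }  /* stands in for Transform      */
static int transform_avx1(void) { return 0; }  /* stands in for Transform_AVX1 */
static int transform_avx2(void) { return 0; }  /* stands in for Transform_AVX2 */

static int (*transform_p)(void) = transform_c;

/* Probe CPUID once and bind the pointer.  Leaf 1 ECX bit 28 is AVX, leaf 7
 * EBX bit 5 is AVX2 -- the same bits cpuid_flag() tests in the patch.  A real
 * implementation should also confirm OS support for the YMM state (XGETBV),
 * which is omitted here. */
static void pick_transform(void)
{
    unsigned int a, b, c, d;

    if (__get_cpuid(1, &a, &b, &c, &d) && (c & (1u << 28)))         /* AVX  */
        transform_p = transform_avx1;
    if (__get_cpuid_count(7, 0, &a, &b, &c, &d) && (b & (1u << 5))) /* AVX2 */
        transform_p = transform_avx2;
}

In the patch, set_Transform() plays the role of pick_transform(): after the first call set_cpuid_flags() returns 1, the early return is taken, and every XTRANSFORM() goes through the cached pointer.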
diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c
index 92ade5941..f77c8a2cf 100755
--- a/wolfcrypt/src/sha512.c
+++ b/wolfcrypt/src/sha512.c
@@ -235,12 +235,16 @@ static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
 return 0 ;
 }

-static int set_cpuid_flags(void) {
- if(cpuid_check==0) {
+#define CHECK_SHA512 0x1
+#define CHECK_SHA384 0x2
+
+static int set_cpuid_flags(int sha) {
+ if((cpuid_check & sha) ==0) {
 if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
 if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
 if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
 if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
+ cpuid_check |= sha ;
 return 0 ;
 }
 return 1 ;
@@ -269,21 +273,19 @@ static int (*Transform_p)(Sha512* sha512) = _Transform ;
 #define Transform(sha512) (*Transform_p)(sha512)

 static void set_Transform(void) {
- if(set_cpuid_flags()) return ;
+ if(set_cpuid_flags(CHECK_SHA512)) return ;

-#if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
- Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ;
-#elif defined(HAVE_INTEL_AVX2)
- #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX)
- if(IS_INTEL_AVX2) { Transform_p = Transform_AVX1_RORX ; return ; }
- #endif
- if(IS_INTEL_AVX2) { Transform_p = Transform_AVX2 ; return ; }
- #if defined(HAVE_INTEL_AVX1)
- Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ;
- #endif
-#else
- Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ;
+#if defined(HAVE_INTEL_AVX2)
+ if(IS_INTEL_AVX2){
+ Transform_p = Transform_AVX1_RORX; return ;
+ Transform_p = Transform_AVX2 ;
+ /* for avoiding warning,"not used" */
+ }
 #endif
+#if defined(HAVE_INTEL_AVX1)
+ Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; return ;
+#endif
+ Transform_p = _Transform ; return ;
 }

 #else
@@ -1344,7 +1346,7 @@ static int (*Transform384_p)(Sha384* sha384) = _Transform384 ;
 #define Transform384(sha384) (*Transform384_p)(sha384)

 static void set_Transform384(void) {
- if(set_cpuid_flags())return ;
+ if(set_cpuid_flags(CHECK_SHA384))return ;

 #if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
 Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
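sha512.c replaces the single cpuid_check flag with per-algorithm bits so SHA-512 and SHA-384 each run their setup exactly once against the same cached feature flags. A condensed model of that bookkeeping is below; detect_cpu_features() is an assumed stand-in for the cpuid_flag() probes, and probe_once() is not a wolfSSL function.

#define CHECK_SHA512 0x1
#define CHECK_SHA384 0x2

static int cpuid_check = 0;   /* which algorithms have done their setup */
static int cpuid_flags = 0;   /* cached CPU feature bits                */

/* Stand-in for the cpuid_flag() probes in the patch. */
static int detect_cpu_features(void) { return 0; }

/* Returns 0 the first time it runs for a given algorithm (the caller should
 * now pick its transform), 1 on every later call. */
static int probe_once(int which)
{
    if ((cpuid_check & which) == 0) {
        cpuid_flags |= detect_cpu_features();
        cpuid_check |= which;
        return 0;
    }
    return 1;
}

set_Transform() and set_Transform384() above do exactly this through set_cpuid_flags(CHECK_SHA512) and set_cpuid_flags(CHECK_SHA384).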
diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
old mode 100644
new mode 100755
index 54bce6ac9..e479f8c4d
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -401,6 +401,36 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
 }

 /* generic PxQ multiplier */
+#if defined(HAVE_INTEL_MULX)
+void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
+
+{
+ int ix, iy, iz, pa;
+ fp_int tmp, *dst;
+
+ /* get size of output and trim */
+ pa = A->used + B->used;
+ if (pa >= FP_SIZE) {
+ pa = FP_SIZE-1;
+ }
+
+ if (A == C || B == C) {
+ fp_init(&tmp);
+ dst = &tmp;
+ } else {
+ fp_zero(C);
+ dst = C;
+ }
+
+ TFM_INTEL_MUL_COMBA(A, B, dst) ;
+
+ dst->used = pa;
+ dst->sign = A->sign ^ B->sign;
+ fp_clamp(dst);
+ fp_copy(dst, C);
+}
+
+#else
 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
 {
 int ix, iy, iz, tx, ty, pa;
@@ -455,6 +485,7 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
 fp_clamp(dst);
 fp_copy(dst, C);
 }
+#endif

 /* a/b => cb + d == a */
 int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
@@ -1525,6 +1556,19 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
 #include "fp_mont_small.i"
 #endif

+#ifdef HAVE_INTEL_MULX
+static inline void innermul8_mulx(fp_digit *c_mulx, fp_digit *cy_mulx, fp_digit *tmpm, fp_digit mu)
+{
+ fp_digit _c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7, cy ;
+
+ cy = *cy_mulx ;
+ _c0=c_mulx[0]; _c1=c_mulx[1]; _c2=c_mulx[2]; _c3=c_mulx[3]; _c4=c_mulx[4]; _c5=c_mulx[5]; _c6=c_mulx[6]; _c7=c_mulx[7];
+ INNERMUL8_MULX ;
+ c_mulx[0]=_c0; c_mulx[1]=_c1; c_mulx[2]=_c2; c_mulx[3]=_c3; c_mulx[4]=_c4; c_mulx[5]=_c5; c_mulx[6]=_c6; c_mulx[7]=_c7;
+ *cy_mulx = cy ;
+}
+#endif
+
 /* computes x/R == x (mod N) via Montgomery Reduction */
 void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
 {
@@ -1565,12 +1609,15 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
 y = 0;
 #if (defined(TFM_SSE2) || defined(TFM_X86_64))
 for (; y < (pa & ~7); y += 8) {
- INNERMUL8;
+ #ifdef HAVE_INTEL_MULX
+ innermul8_mulx(_c, &cy, tmpm, mu) ;
+ #else
+ INNERMUL8 ;
+ #endif
 _c += 8;
 tmpm += 8;
 }
 #endif
-
 for (; y < pa; y++) {
 INNERMUL;
 ++_c;
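The tfm.c hunks route the 8-digit inner step of fp_montgomery_reduce() through innermul8_mulx(), which copies the result words into the _c0.._c7 locals, runs INNERMUL8_MULX, and copies them back so the MULX/ADOX/ADCX chain can keep everything in registers. For reference, a portable model of what that inner step computes is sketched below; it assumes the tfm.h types (fp_digit, the double-width fp_word, DIGIT_BIT) and the function name is invented.

#include <wolfssl/wolfcrypt/tfm.h>

/* Model of the INNERMUL8 step: c[0..7] += mu * tmpm[0..7] plus the running
 * carry, returning the carry out through *cy.  Same signature shape as
 * innermul8_mulx(_c, &cy, tmpm, mu) in the patch. */
static void innermul8_model(fp_digit *c, fp_digit *cy,
                            const fp_digit *tmpm, fp_digit mu)
{
    fp_digit carry = *cy;
    int i;

    for (i = 0; i < 8; i++) {
        fp_word t = (fp_word)mu * (fp_word)tmpm[i]
                  + (fp_word)c[i] + (fp_word)carry;
        c[i]  = (fp_digit)t;                  /* low word back into the result */
        carry = (fp_digit)(t >> DIGIT_BIT);   /* high word carries forward     */
    }
    *cy = carry;
}

The surrounding loop in fp_montgomery_reduce() then advances _c and tmpm by 8, exactly as in the generic INNERMUL8 path.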