Merge branch 'kojo-intel'

toddouska
2015-04-08 11:38:27 -07:00
4 changed files with 237 additions and 87 deletions

View File

@@ -33,6 +33,89 @@
/******************************************************************/
/* fp_montgomery_reduce.c asm or generic */
/* Each platform needs to query cpuid (leaf 7, sub-leaf 0) to see if the MULX/ADX
 * extensions (BMI2/ADX) are supported. Also set up a macro for proper linkage
 * w/o ABI conflicts
 */
#if defined(HAVE_INTEL_MULX)
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b,c) __cpuidex((int*)a,b,c)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
#define CPUID_AVX1 0x1
#define CPUID_AVX2 0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2 0x10 /* MULX, RORX */
#define CPUID_ADX 0x20 /* ADCX, ADOX */
#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_ADX (cpuid_flags&CPUID_ADX)
#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
#define SET_FLAGS
static word32 cpuid_check = 0 ;
static word32 cpuid_flags = 0 ;
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
int got_intel_cpu=0;
unsigned int reg[5];
reg[4] = '\0' ;
cpuid(reg, 0, 0);
if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
got_intel_cpu = 1;
}
if (got_intel_cpu) {
cpuid(reg, leaf, sub);
return((reg[num]>>bit)&0x1) ;
}
return 0 ;
}
INLINE static int set_cpuid_flags(void) {
if(cpuid_check == 0) {
if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; }
if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX ; }
cpuid_check = 1 ;
return 0 ;
}
return 1 ;
}
#define RETURN return
#define IF_HAVE_INTEL_MULX(func, ret) \
if(cpuid_check==0)set_cpuid_flags() ; \
if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; }
#else
#define IF_HAVE_INTEL_MULX(func, ret)
#endif
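The IF_HAVE_INTEL_MULX(func, ret) macro is designed to sit at the top of a generic routine: it runs the cpuid probe once, and only when both BMI2 (MULX) and ADX (ADCX/ADOX) are reported does it execute func followed by ret, so the caller never reaches the portable fallback. A minimal sketch of that call pattern (the portable body is inlined in the real function; fp_mul_comba_generic is a hypothetical stand-in, and the real call sites appear in the fp_mul_comba and fp_montgomery_reduce hunks below):

void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
{
    /* dispatch to the MULX/ADX routine when the CPU supports it */
    IF_HAVE_INTEL_MULX(fp_mul_comba_mulx(A, B, C), return) ;

    /* otherwise fall through to the portable Comba multiply */
    fp_mul_comba_generic(A, B, C);
}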
#if defined(TFM_X86) && !defined(TFM_SSE2)
/* x86-32 code */
@@ -87,7 +170,7 @@ __asm__( \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
: "%rax", "%rdx", "cc")
#ifdef HAVE_INTEL_MULX
#if defined(HAVE_INTEL_MULX)
#define MULX_INIT(a0, c0, cy)\
__asm__ volatile( \
"xorq %%r10, %%r10\n\t" \
@@ -1208,80 +1291,6 @@ __asm__( \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
#elif defined(HAVE_INTEL_MULX)
/* anything you need at the start */
#define COMBA_START
/* clear the chaining variables */
#define COMBA_CLEAR \
c0 = c1 = c2 = 0;
/* forward the carry to the next digit */
#define COMBA_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */
#define COMBA_STORE(x) \
x = c0;
/* store the second sum [carry] */
#define COMBA_STORE2(x) \
x = c1;
/* anything you need at the end */
#define COMBA_FINI
#define MULADD_MULX(b0, c0, c1)\
__asm__ volatile ( \
"mulx %2,%%r9, %%r8 \n\t" \
"adoxq %%r9,%0 \n\t" \
"adcxq %%r8,%1 \n\t" \
:"+r"(c0),"+r"(c1):"r"(b0):"%r8","%r9","%r10","%rdx"\
)
#define MULADD_MULX_ADD_CARRY(c0, c1)\
__asm__ volatile(\
"mov $0, %%r10\n\t"\
"movq %1, %%r8\n\t" \
"adox %%r10, %0\n\t"\
"adcx %%r10, %1\n\t"\
:"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;
#define MULADD_SET_A(a0)\
__asm__ volatile("add $0, %%r8\n\t" \
"movq %0,%%rdx\n\t"::"r"(a0):"%r8","%r9","%r10","%rdx") ; \
#define MULADD_BODY(a,b,c)\
cp = &(c->dp[iz]) ;\
c0 = cp[0] ; c1 = cp[1];\
MULADD_SET_A(a->dp[ix]) ;\
MULADD_MULX(b0, c0, c1) ;\
cp[0]=c0; c0=cp[2]; cp++ ;\
MULADD_MULX(b1, c1, c0) ;\
cp[0]=c1; c1=cp[2]; cp++ ; \
MULADD_MULX(b2, c0, c1) ;\
cp[0]=c0; c0=cp[2]; cp++ ; \
MULADD_MULX(b3, c1, c0) ;\
cp[0]=c1; c1=cp[2]; cp++ ; \
MULADD_MULX_ADD_CARRY(c0, c1) ;\
cp[0]=c0; cp[1]=c1;
#define TFM_INTEL_MUL_COMBA(a, b, c)\
for(ix=0; ix<pa; ix++)c->dp[ix]=0 ;\
for(iy=0; (iy<b->used); iy+=4) {\
fp_digit *bp ;\
bp = &(b->dp[iy+0]) ; \
fp_digit b0 = bp[0] , b1= bp[1], b2= bp[2], b3= bp[3];\
ix=0, iz=iy;\
while(ix<a->used) {\
fp_digit c0, c1; \
fp_digit *cp ;\
MULADD_BODY(a,b,c); ix++ ; iz++ ; \
}\
};
#elif defined(TFM_X86_64)
/* x86-64 optimized */
@@ -1317,6 +1326,65 @@ __asm__ ( \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
#if defined(HAVE_INTEL_MULX)
#define MULADD_MULX(b0, c0, c1, rdx)\
__asm__ volatile ( \
"movq %3, %%rdx\n\t" \
"mulx %2,%%r9, %%r8 \n\t" \
"adoxq %%r9,%0 \n\t" \
"adcxq %%r8,%1 \n\t" \
:"+r"(c0),"+r"(c1):"r"(b0), "r"(rdx):"%r8","%r9","%r10","%rdx"\
)
#define MULADD_MULX_ADD_CARRY(c0, c1)\
__asm__ volatile(\
"mov $0, %%r10\n\t"\
"movq %1, %%r8\n\t"\
"adox %%r10, %0\n\t"\
"adcx %%r10, %1\n\t"\
:"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;
#define MULADD_SET_A(a0)\
__asm__ volatile("add $0, %%r8\n\t" \
"movq %0,%%rdx\n\t" \
::"r"(a0):"%r8","%r9","%r10","%rdx") ;
#define MULADD_BODY(a,b,c)\
{ word64 rdx = a->dp[ix] ; \
cp = &(c->dp[iz]) ; \
c0 = cp[0] ; c1 = cp[1]; \
MULADD_SET_A(rdx) ; \
MULADD_MULX(b0, c0, c1, rdx) ;\
cp[0]=c0; c0=cp[2]; \
MULADD_MULX(b1, c1, c0, rdx) ;\
cp[1]=c1; c1=cp[3]; \
MULADD_MULX(b2, c0, c1, rdx) ;\
cp[2]=c0; c0=cp[4]; \
MULADD_MULX(b3, c1, c0, rdx) ;\
cp[3]=c1; c1=cp[5]; \
MULADD_MULX_ADD_CARRY(c0, c1);\
cp[4]=c0; cp[5]=c1; \
}
#define TFM_INTEL_MUL_COMBA(a, b, c)\
for(ix=0; ix<pa; ix++)c->dp[ix]=0 ; \
for(iy=0; (iy<b->used); iy+=4) { \
fp_digit *bp ; \
bp = &(b->dp[iy+0]) ; \
fp_digit b0 = bp[0] , b1= bp[1], \
b2= bp[2], b3= bp[3]; \
ix=0, iz=iy; \
while(ix<a->used) { \
fp_digit c0, c1; \
fp_digit *cp ; \
MULADD_BODY(a,b,c); \
ix++ ; iz++ ; \
} \
};
#endif
#elif defined(TFM_SSE2)
/* use SSE2 optimizations */
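
For reference, the product that MULADD_BODY/TFM_INTEL_MUL_COMBA compute with MULX and the ADOX/ADCX dual carry chains (one chain in OF, one in CF) is an ordinary row-wise schoolbook multiply: each digit of a is multiplied against a block of four digits of b and accumulated into c at offset ix+iy. A portable sketch of the same accumulation, with the four-digit blocking folded back into a single per-digit loop (a sketch only, assuming 64-bit digits and compiler support for unsigned __int128; none of these names come from the diff):

#include <stdint.h>

typedef uint64_t fp_digit;

/* Illustrative portable equivalent (not the wolfSSL code): multiply a (a_used
 * digits) by b (b_used digits) into c, which must hold a_used + b_used digits. */
static void mul_comba_sketch(const fp_digit *a, int a_used,
                             const fp_digit *b, int b_used, fp_digit *c)
{
    int ix, iy;

    for (ix = 0; ix < a_used + b_used; ix++)
        c[ix] = 0;

    for (iy = 0; iy < b_used; iy++) {
        unsigned __int128 carry = 0;
        for (ix = 0; ix < a_used; ix++) {
            /* 64x64->128 product plus running column value plus carry */
            unsigned __int128 t = (unsigned __int128)a[ix] * b[iy]
                                + c[ix + iy] + carry;
            c[ix + iy] = (fp_digit)t;
            carry      = t >> 64;
        }
        c[iy + a_used] = (fp_digit)carry;  /* top digit of this row */
    }
}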

View File

@@ -176,9 +176,11 @@ int InitSha256(Sha256* sha256) {
#define CPUID_AVX2 0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2 0x10 /* MULX, RORX */
#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
@@ -207,6 +209,7 @@ static int set_cpuid_flags(void) {
if(cpuid_check==0) {
if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
cpuid_check = 1 ;
@@ -235,7 +238,7 @@ static void set_Transform(void) {
if(set_cpuid_flags())return ;
#if defined(HAVE_INTEL_AVX2)
if(IS_INTEL_AVX2){
if(IS_INTEL_AVX2 && IS_INTEL_BMI2){
Transform_p = Transform_AVX1_RORX; return ;
Transform_p = Transform_AVX2 ;
/* for avoiding warning,"not used" */
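
The change above gates the RORX transform on BMI2 as well as AVX2, since Transform_AVX1_RORX relies on the BMI2 RORX instruction; the same gating appears in the SHA-512 hunks below. A sketch of the intended selection order (only Transform_p, Transform_AVX1_RORX, Transform_AVX2 and the IS_INTEL_* flags come from the diff; the function shape, Transform_AVX1 and the plain Transform fallback are assumptions):

static void set_Transform(void) {
    if (set_cpuid_flags()) return ;               /* cpuid probed only once */
#if defined(HAVE_INTEL_AVX2)
    /* prefer the RORX variant only when BMI2 is also available */
    if (IS_INTEL_AVX2 && IS_INTEL_BMI2) { Transform_p = Transform_AVX1_RORX ; return ; }
    if (IS_INTEL_AVX2)                  { Transform_p = Transform_AVX2 ;      return ; }
#endif
#if defined(HAVE_INTEL_AVX1)
    if (IS_INTEL_AVX1)                  { Transform_p = Transform_AVX1 ;      return ; }
#endif
    Transform_p = Transform ;                     /* portable C fallback */
}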

View File

@@ -208,9 +208,11 @@ int InitSha512(Sha512* sha512) {
#define CPUID_AVX2 0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2 0x10 /* MULX, RORX */
#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
@@ -242,6 +244,7 @@ static int set_cpuid_flags(int sha) {
if((cpuid_check & sha) ==0) {
if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
cpuid_check |= sha ;
@@ -276,7 +279,7 @@ static void set_Transform(void) {
if(set_cpuid_flags(CHECK_SHA512)) return ;
#if defined(HAVE_INTEL_AVX2)
if(IS_INTEL_AVX2){
if(IS_INTEL_AVX2 && IS_INTEL_BMI2){
Transform_p = Transform_AVX1_RORX; return ;
Transform_p = Transform_AVX2 ;
/* for avoiding warning,"not used" */
@@ -1352,7 +1355,7 @@ static void set_Transform384(void) {
Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
#elif defined(HAVE_INTEL_AVX2)
#if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX)
if(IS_INTEL_AVX2) { Transform384_p = Transform384_AVX1_RORX ; return ; }
if(IS_INTEL_AVX2 && IS_INTEL_BMI2) { Transform384_p = Transform384_AVX1_RORX ; return ; }
#endif
if(IS_INTEL_AVX2) { Transform384_p = Transform384_AVX2 ; return ; }
#if defined(HAVE_INTEL_AVX1)

View File

@@ -402,7 +402,8 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
/* generic PxQ multiplier */
#if defined(HAVE_INTEL_MULX)
void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
INLINE static void fp_mul_comba_mulx(fp_int *A, fp_int *B, fp_int *C)
{
int ix, iy, iz, pa;
@@ -429,14 +430,16 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
fp_clamp(dst);
fp_copy(dst, C);
}
#endif
#else
void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
{
int ix, iy, iz, tx, ty, pa;
fp_digit c0, c1, c2, *tmpx, *tmpy;
fp_int tmp, *dst;
IF_HAVE_INTEL_MULX(fp_mul_comba_mulx(A, B, C), return) ;
COMBA_START;
COMBA_CLEAR;
@@ -485,7 +488,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
fp_clamp(dst);
fp_copy(dst, C);
}
#endif
/* a/b => cb + d == a */
int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
@@ -1567,10 +1569,9 @@ static inline void innermul8_mulx(fp_digit *c_mulx, fp_digit *cy_mulx, fp_digit
c_mulx[0]=_c0; c_mulx[1]=_c1; c_mulx[2]=_c2; c_mulx[3]=_c3; c_mulx[4]=_c4; c_mulx[5]=_c5; c_mulx[6]=_c6; c_mulx[7]=_c7;
*cy_mulx = cy ;
}
#endif
/* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp)
{
fp_digit c[FP_SIZE], *_c, *tmpm, mu = 0;
int oldused, x, y, pa;
@@ -1589,6 +1590,85 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
#endif
/* now zero the buff */
XMEMSET(c, 0, sizeof c);
pa = m->used;
/* copy the input */
oldused = a->used;
for (x = 0; x < oldused; x++) {
c[x] = a->dp[x];
}
MONT_START;
for (x = 0; x < pa; x++) {
fp_digit cy = 0;
/* get Mu for this round */
LOOP_START;
_c = c + x;
tmpm = m->dp;
y = 0;
for (; y < (pa & ~7); y += 8) {
innermul8_mulx(_c, &cy, tmpm, mu) ;
_c += 8;
tmpm += 8;
}
for (; y < pa; y++) {
INNERMUL;
++_c;
}
LOOP_END;
while (cy) {
PROPCARRY;
++_c;
}
}
/* now copy out */
_c = c + pa;
tmpm = a->dp;
for (x = 0; x < pa+1; x++) {
*tmpm++ = *_c++;
}
for (; x < oldused; x++) {
*tmpm++ = 0;
}
MONT_FINI;
a->used = pa+1;
fp_clamp(a);
/* if A >= m then A = A - m */
if (fp_cmp_mag (a, m) != FP_LT) {
s_fp_sub (a, m, a);
}
}
#endif
/* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
fp_digit c[FP_SIZE], *_c, *tmpm, mu = 0;
int oldused, x, y, pa;
IF_HAVE_INTEL_MULX(fp_montgomery_reduce_mulx(a, m, mp), return) ;
/* bail if too large */
if (m->used > (FP_SIZE/2)) {
(void)mu; /* shut up compiler */
return;
}
#ifdef TFM_SMALL_MONT_SET
if (m->used <= 16) {
fp_montgomery_reduce_small(a, m, mp);
return;
}
#endif
/* now zero the buff */
XMEMSET(c, 0, sizeof c);
pa = m->used;
@@ -1609,11 +1689,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
y = 0;
#if (defined(TFM_SSE2) || defined(TFM_X86_64))
for (; y < (pa & ~7); y += 8) {
#ifdef HAVE_INTEL_MULX
innermul8_mulx(_c, &cy, tmpm, mu) ;
#else
INNERMUL8 ;
#endif
_c += 8;
tmpm += 8;
}
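
Both fp_montgomery_reduce_mulx and the generic fp_montgomery_reduce above follow the standard word-level Montgomery reduction: for each of the pa low digits, a multiplier mu = c[x]*mp (mod 2^64) is chosen so that adding mu*m at that position clears the digit, the carry is propagated upward, and the result is then read from the upper digits and reduced by at most one subtraction of m. A portable sketch of that loop structure (a sketch only, assuming 64-bit digits, unsigned __int128 support, and a working buffer of at least 2*n+1 digits; names are illustrative, not from the diff):

#include <stdint.h>

typedef uint64_t fp_digit;

/* Illustrative portable sketch (not the wolfSSL code).  c holds the 2*n-digit
 * value to reduce in c[0..2n-1], with c[2n] = 0; m is the n-digit modulus and
 * mp is the usual Montgomery constant -m^{-1} mod 2^64.  On return c[n..2n]
 * holds c/R mod m, possibly plus m; the caller copies it down and subtracts m
 * once if needed, as fp_montgomery_reduce does. */
static void mont_reduce_sketch(fp_digit *c, const fp_digit *m, int n, fp_digit mp)
{
    int x, y;

    for (x = 0; x < n; x++) {
        fp_digit mu = c[x] * mp;               /* Mu for this round */
        unsigned __int128 cy = 0;

        /* c += mu * m, starting at digit x; this clears c[x] */
        for (y = 0; y < n; y++) {
            unsigned __int128 t = (unsigned __int128)mu * m[y] + c[x + y] + cy;
            c[x + y] = (fp_digit)t;
            cy = t >> 64;
        }
        /* propagate the final carry into the upper digits */
        for (y = x + n; cy != 0; y++) {
            unsigned __int128 t = (unsigned __int128)c[y] + cy;
            c[y] = (fp_digit)t;
            cy = t >> 64;
        }
    }
}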