From a83e85b91e883ab3b3c1bff2ca86952c69eb2865 Mon Sep 17 00:00:00 2001 From: Takashi Kojo Date: Sun, 29 Mar 2015 10:20:30 +0900 Subject: [PATCH 1/3] Intel mulx/adcx/adox --- wolfcrypt/src/asm.c | 143 +++++++++++++++++++++++++++++++++++++++++++- wolfcrypt/src/tfm.c | 51 +++++++++++++++- 2 files changed, 191 insertions(+), 3 deletions(-) mode change 100644 => 100755 wolfcrypt/src/asm.c mode change 100644 => 100755 wolfcrypt/src/tfm.c diff --git a/wolfcrypt/src/asm.c b/wolfcrypt/src/asm.c old mode 100644 new mode 100755 index bdaa51b40..6c602f4a2 --- a/wolfcrypt/src/asm.c +++ b/wolfcrypt/src/asm.c @@ -72,7 +72,7 @@ __asm__( \ #define MONT_FINI #define LOOP_END #define LOOP_START \ - mu = c[x] * mp + mu = c[x] * mp; #define INNERMUL \ __asm__( \ @@ -87,6 +87,73 @@ __asm__( \ :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ : "%rax", "%rdx", "cc") +#ifdef HAVE_INTEL_MULX +#define MULX_INIT(a0, c0, cy)\ + __asm__ volatile( \ + "xorq %%r10, %%r10\n\t" \ + "movq %1,%%rdx\n\t" \ + "addq %2, %0\n\t" /* c0+=cy; Set CF, OF */ \ + "adoxq %%r10, %%r10\n\t" /* Reset OF */ \ + :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r10","%r11","%r12","%rdx") ; \ + +#define MULX_INNERMUL_R1(c0, c1, pre)\ + { \ + __asm__ volatile ( \ + "mulx %%r11,%%r9, %%r8 \n\t" \ + "movq %2, %%r12\n\t" \ + "adoxq %%r9,%0 \n\t" \ + "adcxq %%r8,%1 \n\t" \ + :"+r"(c0),"+r"(c1):"m"(pre):"%r8","%r9","%r11","%r12","%rdx" \ + ); } + + +#define MULX_INNERMUL_R2(c0, c1, pre)\ + { \ + __asm__ volatile ( \ + "mulx %%r12,%%r9, %%r8 \n\t" \ + "movq %2, %%r11\n\t" \ + "adoxq %%r9,%0 \n\t" \ + "adcxq %%r8,%1 \n\t" \ + :"+r"(c0),"+r"(c1):"m"(pre):"%r8","%r9","%r11","%r12","%rdx" \ + ); } + +#define MULX_LOAD_R1(val)\ + __asm__ volatile ( \ + "movq %0, %%r11\n\t"\ + ::"m"(val):"%r11"\ +) ; + +#define MULX_INNERMUL_LAST(c0, c1)\ + { \ + __asm__ volatile ( \ + "mulx %%r12,%%r9, %%r8 \n\t" \ + "movq $0, %%r10 \n\t" \ + "adoxq %%r10, %%r9 \n\t" \ + "adcq $0,%%r8 \n\t" \ + "addq %%r9,%0 \n\t" \ + "adcq $0,%%r8 \n\t" \ + "movq %%r8,%1 \n\t" \ + :"+m"(c0),"=m"(c1)::"%r8","%r9","%r10","%r12","%rdx"\ + ); } + +#define MULX_INNERMUL8(x,y,z,cy)\ + MULX_LOAD_R1(x[0]) ;\ + MULX_INIT(y, _c0, cy) ; /* rdx=y; z0+=cy; */ \ + MULX_INNERMUL_R1(_c0, _c1, x[1]) ;\ + MULX_INNERMUL_R2(_c1, _c2, x[2]) ;\ + MULX_INNERMUL_R1(_c2, _c3, x[3]) ;\ + MULX_INNERMUL_R2(_c3, _c4, x[4]) ;\ + MULX_INNERMUL_R1(_c4, _c5, x[5]) ;\ + MULX_INNERMUL_R2(_c5, _c6, x[6]) ;\ + MULX_INNERMUL_R1(_c6, _c7, x[7]) ;\ + MULX_INNERMUL_LAST(_c7, cy) ;\ + +#define INNERMUL8_MULX \ +{\ + MULX_INNERMUL8(tmpm, mu, _c, cy);\ +} +#endif + #define INNERMUL8 \ __asm__( \ "movq 0(%5),%%rax \n\t" \ @@ -1138,6 +1205,80 @@ __asm__( \ "adcl $0,%2 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); +#elif defined(HAVE_INTEL_MULX) + +/* anything you need at the start */ +#define COMBA_START + +/* clear the chaining variables */ +#define COMBA_CLEAR \ + c0 = c1 = c2 = 0; + +/* forward the carry to the next digit */ +#define COMBA_FORWARD \ + do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +/* store the first sum */ +#define COMBA_STORE(x) \ + x = c0; + +/* store the second sum [carry] */ +#define COMBA_STORE2(x) \ + x = c1; + +/* anything you need at the end */ +#define COMBA_FINI + +#define MULADD_MULX(b0, c0, c1)\ + __asm__ volatile ( \ + "mulx %2,%%r9, %%r8 \n\t" \ + "adoxq %%r9,%0 \n\t" \ + "adcxq %%r8,%1 \n\t" \ + :"+r"(c0),"+r"(c1):"r"(b0):"%r8","%r9","%rdx"\ + ) + + +#define MULADD_MULX_ADD_CARRY(c0, c1)\ + __asm__ volatile(\ + "mov $0, %%r10\n\t"\ + "movq %1, %%r8\n\t" \ + 
"adox %%r10, %0\n\t"\ + "adcx %%r10, %1\n\t"\ + :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10") ; + +#define MULADD_SET_A(a0)\ + __asm__ volatile("add $0, %%r8\n\t" \ + "movq %0,%%rdx\n\t"::"r"(a0):"%r8","%rdx") ; \ + +#define MULADD_BODY(a,b,c)\ + cp = &(c->dp[iz]) ;\ + c0 = cp[0] ; c1 = cp[1];\ + MULADD_SET_A(a->dp[ix]) ;\ + MULADD_MULX(b0, c0, c1) ;\ + cp[0]=c0; c0=cp[2]; cp++ ;\ + MULADD_MULX(b1, c1, c0) ;\ + cp[0]=c1; c1=cp[2]; cp++ ; \ + MULADD_MULX(b2, c0, c1) ;\ + cp[0]=c0; c0=cp[2]; cp++ ; \ + MULADD_MULX(b3, c1, c0) ;\ + cp[0]=c1; c1=cp[2]; cp++ ; \ + MULADD_MULX_ADD_CARRY(c0, c1) ;\ + cp[0]=c0; cp[1]=c1; + +#define TFM_INTEL_MUL_COMBA(a, b, c)\ + for(ix=0; ixdp[ix]=0 ;\ + for(iy=0; (iyused); iy+=4) {\ + fp_digit *bp ;\ + bp = &(b->dp[iy+0]) ; \ + fp_digit b0 = bp[0] , b1= bp[1], b2= bp[2], b3= bp[3];\ + ix=0, iz=iy;\ + while(ixused) {\ + fp_digit c0, c1; \ + fp_digit *cp ;\ + MULADD_BODY(a,b,c); ix++ ; iz++ ; \ + }\ +}; + #elif defined(TFM_X86_64) /* x86-64 optimized */ diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c old mode 100644 new mode 100755 index 54bce6ac9..e479f8c4d --- a/wolfcrypt/src/tfm.c +++ b/wolfcrypt/src/tfm.c @@ -401,6 +401,36 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c) } /* generic PxQ multiplier */ +#if defined(HAVE_INTEL_MULX) +void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) + +{ + int ix, iy, iz, pa; + fp_int tmp, *dst; + + /* get size of output and trim */ + pa = A->used + B->used; + if (pa >= FP_SIZE) { + pa = FP_SIZE-1; + } + + if (A == C || B == C) { + fp_init(&tmp); + dst = &tmp; + } else { + fp_zero(C); + dst = C; + } + + TFM_INTEL_MUL_COMBA(A, B, dst) ; + + dst->used = pa; + dst->sign = A->sign ^ B->sign; + fp_clamp(dst); + fp_copy(dst, C); +} + +#else void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) { int ix, iy, iz, tx, ty, pa; @@ -455,6 +485,7 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) fp_clamp(dst); fp_copy(dst, C); } +#endif /* a/b => cb + d == a */ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d) @@ -1525,6 +1556,19 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b) #include "fp_mont_small.i" #endif +#ifdef HAVE_INTEL_MULX +static inline void innermul8_mulx(fp_digit *c_mulx, fp_digit *cy_mulx, fp_digit *tmpm, fp_digit mu) +{ + fp_digit _c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7, cy ; + + cy = *cy_mulx ; + _c0=c_mulx[0]; _c1=c_mulx[1]; _c2=c_mulx[2]; _c3=c_mulx[3]; _c4=c_mulx[4]; _c5=c_mulx[5]; _c6=c_mulx[6]; _c7=c_mulx[7]; + INNERMUL8_MULX ; + c_mulx[0]=_c0; c_mulx[1]=_c1; c_mulx[2]=_c2; c_mulx[3]=_c3; c_mulx[4]=_c4; c_mulx[5]=_c5; c_mulx[6]=_c6; c_mulx[7]=_c7; + *cy_mulx = cy ; +} +#endif + /* computes x/R == x (mod N) via Montgomery Reduction */ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) { @@ -1565,12 +1609,15 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) y = 0; #if (defined(TFM_SSE2) || defined(TFM_X86_64)) for (; y < (pa & ~7); y += 8) { - INNERMUL8; + #ifdef HAVE_INTEL_MULX + innermul8_mulx(_c, &cy, tmpm, mu) ; + #else + INNERMUL8 ; + #endif _c += 8; tmpm += 8; } #endif - for (; y < pa; y++) { INNERMUL; ++_c; From bd5fc1712f84916ee372ccb254cb2bea56b06f6c Mon Sep 17 00:00:00 2001 From: Takashi Kojo Date: Sun, 29 Mar 2015 19:39:03 +0900 Subject: [PATCH 2/3] fixed sha256/AVX1 crash with --enable-debug: missing register def in inline asm destroy register rorx removed when non-AVX2 build . 
cleaned set_Transform --- wolfcrypt/src/sha256.c | 79 +++++++++++++++++++----------------------- wolfcrypt/src/sha512.c | 34 +++++++++--------- 2 files changed, 54 insertions(+), 59 deletions(-) diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index d3eac5253..ce02bb551 100755 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -53,7 +53,21 @@ int wc_Sha256Hash(const byte* data, word32 len, byte* out) { return Sha256Hash(data, len, out); } + #else /* else build without fips */ + +#if !defined (ALIGN32) + #if defined (__GNUC__) + #define ALIGN32 __attribute__ ( (aligned (32))) + #elif defined(_MSC_VER) + /* disable align warning, we want alignment ! */ + #pragma warning(disable: 4324) + #define ALIGN32 __declspec (align (32)) + #else + #define ALIGN32 + #endif +#endif + #ifdef WOLFSSL_PIC32MZ_HASH #define wc_InitSha256 wc_InitSha256_sw #define wc_Sha256Update wc_Sha256Update_sw @@ -189,14 +203,16 @@ static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { return 0 ; } -static void set_cpuid_flags(void) { +static int set_cpuid_flags(void) { if(cpuid_check==0) { - if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ; } - if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } - if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ;} - if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ;} - cpuid_check = 1 ; - } + if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} + if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } + if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } + if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } + cpuid_check = 1 ; + return 0 ; + } + return 1 ; } /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */ @@ -215,16 +231,19 @@ static int (*Transform_p)(Sha256* sha256) /* = _Transform */; #define XTRANSFORM(sha256, B) (*Transform_p)(sha256) static void set_Transform(void) { - set_cpuid_flags() ; + if(set_cpuid_flags())return ; #if defined(HAVE_INTEL_AVX2) - if(IS_INTEL_AVX2){ Transform_p = Transform_AVX1_RORX; return ; } - Transform_p = Transform_AVX2 ; /* for avoiding warning,"not used" */ + if(IS_INTEL_AVX2){ + Transform_p = Transform_AVX1_RORX; return ; + Transform_p = Transform_AVX2 ; + /* for avoiding warning,"not used" */ + } #endif #if defined(HAVE_INTEL_AVX1) Transform_p = ((IS_INTEL_AVX1) ? 
Transform_AVX1 : Transform) ; return ; #endif - Transform_p = Transform ; + Transform_p = Transform ; return ; } #else @@ -237,10 +256,10 @@ static void set_Transform(void) { /* Dummy for saving MM_REGs on behalf of Transform */ #if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1) -#define SAVE_XMM_YMM __asm__ volatile("vpxor %%ymm7, %%ymm7, %%ymm7":::\ +#define SAVE_XMM_YMM __asm__ volatile("or %%r8, %%r8":::\ "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15") #elif defined(HAVE_INTEL_AVX1) -#define SAVE_XMM_YMM __asm__ volatile("vpxor %%xmm7, %%xmm7, %%xmm7":::\ +#define SAVE_XMM_YMM __asm__ volatile("or %%r8, %%r8":::\ "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\ "xmm11","xmm12","xmm13","xmm14","xmm15") #else @@ -304,7 +323,7 @@ int wc_InitSha256(Sha256* sha256) #if !defined(FREESCALE_MMCAU) -static const __attribute__((aligned(32))) word32 K[64] = { +static const ALIGN32 word32 K[64] = { 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, @@ -322,25 +341,6 @@ static const __attribute__((aligned(32))) word32 K[64] = { #endif -#if defined(HAVE_INTEL_RORX) -#define ROTR(func, bits, x) \ -word32 func(word32 x) { word32 ret ;\ - __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\ - return ret ;\ -} - -static INLINE ROTR(rotrFixed_2, 2, x) -static INLINE ROTR(rotrFixed_13, 13, x) -static INLINE ROTR(rotrFixed_22, 22, x) -static INLINE ROTR(rotrFixed_6, 6, x) -static INLINE ROTR(rotrFixed_11, 11, x) -static INLINE ROTR(rotrFixed_25, 25, x) -static INLINE ROTR(rotrFixed_7, 7, x) -static INLINE ROTR(rotrFixed_18, 18, x) -static INLINE ROTR(rotrFixed_17, 17, x) -static INLINE ROTR(rotrFixed_19, 19, x) -#endif - #if defined(FREESCALE_MMCAU) static int Transform(Sha256* sha256, byte* buf) @@ -356,18 +356,11 @@ static int Transform(Sha256* sha256, byte* buf) #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) #define R(x, n) (((x)&0xFFFFFFFFU)>>(n)) -#if !defined(HAVE_INTEL_RORX) #define S(x, n) rotrFixed(x, n) #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) -#else -#define Sigma0(x) (rotrFixed_2(x) ^ rotrFixed_13(x) ^ rotrFixed_22(x)) -#define Sigma1(x) (rotrFixed_6(x) ^ rotrFixed_11(x) ^ rotrFixed_25(x)) -#define Gamma0(x) (rotrFixed_7(x) ^ rotrFixed_18(x) ^ R(x, 3)) -#define Gamma1(x) (rotrFixed_17(x) ^ rotrFixed_19(x) ^ R(x, 10)) -#endif #define RND(a,b,c,d,e,f,g,h,i) \ t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \ @@ -620,7 +613,7 @@ int wc_Sha256Hash(const byte* data, word32 len, byte* hash) #define S_6 %ebx #define S_7 %r9d -#define SSE_REGs "%esi", "%r8", "%edx", "%ebx","%r9","%r10","%r11","%r12","%r13","%r14","%r15" +#define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15" #if defined(HAVE_INTEL_RORX) #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\ @@ -718,7 +711,7 @@ __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\ __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\ /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\ -__asm__ volatile("movl %r8d, "#h"\n\t"); \ +__asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \ /* h = h + 
w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ #define RND_X(a,b,c,d,e,f,g,h,i) \ @@ -954,7 +947,7 @@ __asm__ volatile("movl %r8d, "#h"\n\t"); \ #define W_K_from_buff\ - { __attribute__ ((aligned (32))) word64 _buff[2] ; \ + { ALIGN32 word64 _buff[2] ; \ /* X0..3(xmm4..7) = sha256->buffer[0.15]; */\ _buff[0] = *(word64*)&sha256->buffer[0] ;\ _buff[1] = *(word64*)&sha256->buffer[2] ;\ diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c index 92ade5941..f77c8a2cf 100755 --- a/wolfcrypt/src/sha512.c +++ b/wolfcrypt/src/sha512.c @@ -235,12 +235,16 @@ static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { return 0 ; } -static int set_cpuid_flags(void) { - if(cpuid_check==0) { +#define CHECK_SHA512 0x1 +#define CHECK_SHA384 0x2 + +static int set_cpuid_flags(int sha) { + if((cpuid_check & sha) ==0) { if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } + cpuid_check |= sha ; return 0 ; } return 1 ; @@ -269,21 +273,19 @@ static int (*Transform_p)(Sha512* sha512) = _Transform ; #define Transform(sha512) (*Transform_p)(sha512) static void set_Transform(void) { - if(set_cpuid_flags()) return ; + if(set_cpuid_flags(CHECK_SHA512)) return ; -#if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) - Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; -#elif defined(HAVE_INTEL_AVX2) - #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX) - if(IS_INTEL_AVX2) { Transform_p = Transform_AVX1_RORX ; return ; } - #endif - if(IS_INTEL_AVX2) { Transform_p = Transform_AVX2 ; return ; } - #if defined(HAVE_INTEL_AVX1) - Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; - #endif -#else - Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; +#if defined(HAVE_INTEL_AVX2) + if(IS_INTEL_AVX2){ + Transform_p = Transform_AVX1_RORX; return ; + Transform_p = Transform_AVX2 ; + /* for avoiding warning,"not used" */ + } #endif +#if defined(HAVE_INTEL_AVX1) + Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; return ; +#endif + Transform_p = _Transform ; return ; } #else @@ -1344,7 +1346,7 @@ static int (*Transform384_p)(Sha384* sha384) = _Transform384 ; #define Transform384(sha384) (*Transform384_p)(sha384) static void set_Transform384(void) { - if(set_cpuid_flags())return ; + if(set_cpuid_flags(CHECK_SHA384))return ; #if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) Transform384_p = ((IS_INTEL_AVX1) ? 
Transform384_AVX1 : _Transform384) ;

From 57d766f1eb2e0760a5a5285ef777de5e5bc7c98f Mon Sep 17 00:00:00 2001
From: Takashi Kojo
Date: Sun, 29 Mar 2015 19:50:32 +0900
Subject: [PATCH 3/3] fixed tfm/MULX crash with --enable-debug, missing
 register def in inline asm destroying registers

---
 wolfcrypt/src/asm.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/wolfcrypt/src/asm.c b/wolfcrypt/src/asm.c
index 6c602f4a2..2c0fa3ee5 100755
--- a/wolfcrypt/src/asm.c
+++ b/wolfcrypt/src/asm.c
@@ -245,8 +245,7 @@ __asm__( \
 \
 :"=r"(_c), "=r"(cy) \
 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
-: "%rax", "%rdx", "%r10", "%r11", "cc")
-
+: "%rax", "%rdx", "%r10", "%r11", "cc")\
 
 #define PROPCARRY \
 __asm__( \
@@ -1244,11 +1243,11 @@ __asm__( \
 "movq %1, %%r8\n\t" \
 "adox %%r10, %0\n\t"\
 "adcx %%r10, %1\n\t"\
- :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10") ;
+ :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;
 
 #define MULADD_SET_A(a0)\
 __asm__ volatile("add $0, %%r8\n\t" \
- "movq %0,%%rdx\n\t"::"r"(a0):"%r8","%rdx") ; \
+ "movq %0,%%rdx\n\t"::"r"(a0):"%r8","%r9","%r10","%rdx") ; \
 
 #define MULADD_BODY(a,b,c)\
 cp = &(c->dp[iz]) ;\
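Note on the arithmetic behind patch 1: the idea wired into MULADD_BODY and INNERMUL8_MULX is that mulx produces the full 128-bit product without touching the flags, and the low and high halves are then folded in on two independent carry chains, which is what adcx (carry flag) and adox (overflow flag) let the hand-written asm run interleaved. The sketch below is not wolfSSL code; it restates that idea with the compiler intrinsics _mulx_u64 and _addcarry_u64, using a hypothetical muladd_word helper and a fixed 4-limb operand, so the math is easier to follow. Build with something like gcc -O2 -mbmi2 on an x86-64 machine that has BMI2.

/* Sketch only: r[0..5] += a[0..3] * b, using mulx for the 64x64->128
 * products and two separate carry chains for the low and high halves.
 * In the asm of patch 1 those two chains run interleaved via adcx/adox. */
#include <stdio.h>
#include <immintrin.h>              /* _mulx_u64 (BMI2), _addcarry_u64 */

static void muladd_word(unsigned long long r[6],
                        const unsigned long long a[4], unsigned long long b)
{
    unsigned long long lo[4], hi[4];
    unsigned char c;
    int i;

    for (i = 0; i < 4; i++)
        lo[i] = _mulx_u64(a[i], b, &hi[i]);   /* hi:lo = a[i]*b, flags untouched */

    c = 0;                                    /* chain 1: low halves (adcx role) */
    for (i = 0; i < 4; i++)
        c = _addcarry_u64(c, r[i], lo[i], &r[i]);
    c = _addcarry_u64(c, r[4], 0, &r[4]);
    (void)_addcarry_u64(c, r[5], 0, &r[5]);

    c = 0;                                    /* chain 2: high halves (adox role) */
    for (i = 0; i < 4; i++)
        c = _addcarry_u64(c, r[i + 1], hi[i], &r[i + 1]);
    (void)_addcarry_u64(c, r[5], 0, &r[5]);
}

int main(void)
{
    unsigned long long a[4] = { ~0ULL, ~0ULL, ~0ULL, ~0ULL };  /* 2^256 - 1 */
    unsigned long long r[6] = { 0 };
    unsigned __int128 ref;
    int i;

    muladd_word(r, a, ~0ULL);                 /* r  = a * (2^64 - 1) */
    muladd_word(r, a, 12345);                 /* r += a * 12345      */

    /* limb 0 depends only on a[0], so check it against a 128-bit reference */
    ref = (unsigned __int128)a[0] * ~0ULL + (unsigned __int128)a[0] * 12345;
    printf("r[0] %s\n", r[0] == (unsigned long long)ref ? "ok" : "BAD");
    for (i = 0; i < 6; i++)
        printf("r[%d] = %016llx\n", i, r[i]);
    return 0;
}

The clobber fixes in patches 2 and 3 carry the general lesson for any hand-written equivalent of the above: every scratch register the asm block writes (here %r8-%r12 and %rdx) must be named in the clobber list, otherwise an unoptimized --enable-debug build can keep live data in one of those registers and crash, which is exactly what those two patches repair.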