diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm
index 439dacc51..b880762d8 100644
--- a/wolfcrypt/src/aes_asm.asm
+++ b/wolfcrypt/src/aes_asm.asm
@@ -981,69 +981,82 @@ MAKE_RK256_b:
 gfmul PROC
     ; xmm0 holds operand a (128 bits)
     ; xmm1 holds operand b (128 bits)
-    ; rdi holds the pointer to output (128 bits)
-    movdqa    %xmm0, %xmm3
-    pclmulqdq $0, %xmm1, %xmm3      ; xmm3 holds a0*b0
-    movdqa    %xmm0, %xmm4
-    pclmulqdq $16, %xmm1, %xmm4     ; xmm4 holds a0*b1
-    movdqa    %xmm0, %xmm5
-    pclmulqdq $1, %xmm1, %xmm5      ; xmm5 holds a1*b0
-    movdqa    %xmm0, %xmm6
-    pclmulqdq $17, %xmm1, %xmm6     ; xmm6 holds a1*b1
-    pxor      %xmm5, %xmm4          ; xmm4 holds a0*b1 + a1*b0
-    movdqa    %xmm4, %xmm5
-    psrldq    $8, %xmm4
-    pslldq    $8, %xmm5
-    pxor      %xmm5, %xmm3
-    pxor      %xmm4, %xmm6          ; holds the result of
+    ; r8 holds the pointer to output (128 bits)
+
+    ; on Microsoft x64, xmm6-xmm15 are non-volatile; save them on the stack and restore at the end
+    sub       rsp, 8+4*16           ; 8 = stack alignment, 4 regs (xmm6-xmm9), 16 bytes each
+    movdqa    [rsp+0],  xmm6
+    movdqa    [rsp+16], xmm7
+    movdqa    [rsp+32], xmm8
+    movdqa    [rsp+48], xmm9
+
+    movdqa    xmm3, xmm0
+    pclmulqdq xmm3, xmm1, 0         ; xmm3 holds a0*b0
+    movdqa    xmm4, xmm0
+    pclmulqdq xmm4, xmm1, 16        ; xmm4 holds a0*b1
+    movdqa    xmm5, xmm0
+    pclmulqdq xmm5, xmm1, 1         ; xmm5 holds a1*b0
+    movdqa    xmm6, xmm0
+    pclmulqdq xmm6, xmm1, 17        ; xmm6 holds a1*b1
+    pxor      xmm4, xmm5            ; xmm4 holds a0*b1 + a1*b0
+    movdqa    xmm5, xmm4
+    psrldq    xmm4, 8
+    pslldq    xmm5, 8
+    pxor      xmm3, xmm5
+    pxor      xmm6, xmm4            ; holds the result of
                                     ; the carry-less multiplication of
                                     ; xmm0 by xmm1
 
     ; shift the result by one bit position to the left cope for the fact
     ; that bits are reversed
-    movdqa    %xmm3, %xmm7
-    movdqa    %xmm6, %xmm8
-    pslld     $1, %xmm3
-    pslld     $1, %xmm6
-    psrld     $31, %xmm7
-    psrld     $31, %xmm8
-    movdqa    %xmm7, %xmm9
-    pslldq    $4, %xmm8
-    pslldq    $4, %xmm7
-    psrldq    $12, %xmm9
-    por       %xmm7, %xmm3
-    por       %xmm8, %xmm6
-    por       %xmm9, %xmm6
+    movdqa    xmm7, xmm3
+    movdqa    xmm8, xmm6
+    pslld     xmm3, 1
+    pslld     xmm6, 1
+    psrld     xmm7, 31
+    psrld     xmm8, 31
+    movdqa    xmm9, xmm7
+    pslldq    xmm8, 4
+    pslldq    xmm7, 4
+    psrldq    xmm9, 12
+    por       xmm3, xmm7
+    por       xmm6, xmm8
+    por       xmm6, xmm9
 
     ; first phase of the reduction
-    movdqa    %xmm3, %xmm7
-    movdqa    %xmm3, %xmm8
-    movdqa    %xmm3, %xmm9
-    pslld     $31, %xmm7            ; packed right shifting << 31
-    pslld     $30, %xmm8            ; packed right shifting shift << 30
-    pslld     $25, %xmm9            ; packed right shifting shift << 25
-    pxor      %xmm8, %xmm7          ; xor the shifted versions
-    pxor      %xmm9, %xmm7
+    movdqa    xmm7, xmm3
+    movdqa    xmm8, xmm3
+    movdqa    xmm9, xmm3
+    pslld     xmm7, 31              ; packed left shifting << 31
+    pslld     xmm8, 30              ; packed left shifting << 30
+    pslld     xmm9, 25              ; packed left shifting << 25
+    pxor      xmm7, xmm8            ; xor the shifted versions
+    pxor      xmm7, xmm9
 
-    movdqa    %xmm7, %xmm8
-    pslldq    $12, %xmm7
-    psrldq    $4, %xmm8
-    pxor      %xmm7, %xmm3          ; first phase of the reduction complete
-    movdqa    %xmm3,%xmm2           ; second phase of the reduction
-    movdqa    %xmm3,%xmm4
-    movdqa    %xmm3,%xmm5
-    psrld     $1, %xmm2             ; packed left shifting >> 1
-    psrld     $2, %xmm4             ; packed left shifting >> 2
-    psrld     $7, %xmm5             ; packed left shifting >> 7
+    movdqa    xmm8, xmm7
+    pslldq    xmm7, 12
+    psrldq    xmm8, 4
+    pxor      xmm3, xmm7            ; first phase of the reduction complete
+    movdqa    xmm2, xmm3            ; second phase of the reduction
+    movdqa    xmm4, xmm3
+    movdqa    xmm5, xmm3
+    psrld     xmm2, 1               ; packed right shifting >> 1
+    psrld     xmm4, 2               ; packed right shifting >> 2
+    psrld     xmm5, 7               ; packed right shifting >> 7
 
-    pxor      %xmm4, %xmm2          ; xor the shifted versions
-    pxor      %xmm5, %xmm2
-    pxor      %xmm8, %xmm2
-    pxor      %xmm2, %xmm3
-    pxor      %xmm3, %xmm6          ; the result is in xmm6
-    movdqu    %xmm6, (%rdi)         ; store the result
+    pxor      xmm2, xmm4            ; xor the shifted versions
+    pxor      xmm2, xmm5
+    pxor      xmm2, xmm8
+    pxor      xmm3, xmm2
+    pxor      xmm6, xmm3            ; the result is in xmm6
+    movdqu    [r8], xmm6            ; store the result
 
-    ; restore xmm6 and xmm7
+    ; restore the non-volatile xmms from the stack
+    movdqa    xmm6, [rsp+0]
+    movdqa    xmm7, [rsp+16]
+    movdqa    xmm8, [rsp+32]
+    movdqa    xmm9, [rsp+48]
+    add       rsp, 8+4*16           ; 8 = stack alignment, 4 regs (xmm6-xmm9), 16 bytes each
 
     ret
 gfmul ENDP
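
Note for reviewers: the Microsoft x64 calling convention does not use rdi for parameter passing (hence the output pointer moving to r8) and treats xmm6-xmm15 as non-volatile, which is why the body now saves and restores xmm6-xmm9 around the multiply. For anyone who prefers to cross-check the algebra in C, below is a minimal intrinsics sketch of the same carry-less GF(2^128) multiply and reduction. It is illustrative only and not part of this patch: the name gfmul_sketch and the standalone prototype are made up here, and it assumes a compiler with SSE2/PCLMUL support (e.g. -msse2 -mpclmul with gcc/clang, or a default MSVC x64 build).

#include <emmintrin.h>   /* SSE2 */
#include <wmmintrin.h>   /* PCLMULQDQ */

/* Illustrative transliteration of the gfmul PROC above (gfmul_sketch is a
 * hypothetical name): carry-less multiply of a and b in GF(2^128) with the
 * GHASH bit-reflected reduction, result written through out. */
static void gfmul_sketch(__m128i a, __m128i b, __m128i* out)
{
    __m128i t2, t3, t4, t5, t6, t7, t8, t9;

    /* 128x128 -> 256-bit carry-less multiply (four PCLMULQDQs) */
    t3 = _mm_clmulepi64_si128(a, b, 0x00);      /* a0*b0 */
    t4 = _mm_clmulepi64_si128(a, b, 0x10);      /* a0*b1 */
    t5 = _mm_clmulepi64_si128(a, b, 0x01);      /* a1*b0 */
    t6 = _mm_clmulepi64_si128(a, b, 0x11);      /* a1*b1 */

    t4 = _mm_xor_si128(t4, t5);                 /* a0*b1 + a1*b0 */
    t5 = _mm_slli_si128(t4, 8);
    t4 = _mm_srli_si128(t4, 8);
    t3 = _mm_xor_si128(t3, t5);                 /* low 128 bits of the product */
    t6 = _mm_xor_si128(t6, t4);                 /* high 128 bits of the product */

    /* shift the 256-bit product left by one bit (inputs are bit-reflected) */
    t7 = _mm_srli_epi32(t3, 31);
    t8 = _mm_srli_epi32(t6, 31);
    t3 = _mm_slli_epi32(t3, 1);
    t6 = _mm_slli_epi32(t6, 1);
    t9 = _mm_srli_si128(t7, 12);
    t8 = _mm_slli_si128(t8, 4);
    t7 = _mm_slli_si128(t7, 4);
    t3 = _mm_or_si128(t3, t7);
    t6 = _mm_or_si128(t6, t8);
    t6 = _mm_or_si128(t6, t9);

    /* first phase of the reduction modulo x^128 + x^7 + x^2 + x + 1 */
    t7 = _mm_slli_epi32(t3, 31);
    t8 = _mm_slli_epi32(t3, 30);
    t9 = _mm_slli_epi32(t3, 25);
    t7 = _mm_xor_si128(t7, t8);
    t7 = _mm_xor_si128(t7, t9);
    t8 = _mm_srli_si128(t7, 4);
    t7 = _mm_slli_si128(t7, 12);
    t3 = _mm_xor_si128(t3, t7);

    /* second phase of the reduction */
    t2 = _mm_srli_epi32(t3, 1);
    t4 = _mm_srli_epi32(t3, 2);
    t5 = _mm_srli_epi32(t3, 7);
    t2 = _mm_xor_si128(t2, t4);
    t2 = _mm_xor_si128(t2, t5);
    t2 = _mm_xor_si128(t2, t8);
    t3 = _mm_xor_si128(t3, t2);
    t6 = _mm_xor_si128(t6, t3);                 /* final result */

    _mm_storeu_si128(out, t6);
}

With intrinsics the compiler is responsible for preserving the non-volatile xmm registers on Windows builds, which is exactly the bookkeeping the hand-written PROC above has to do itself with the sub rsp / movdqa save and restore sequence.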