switch gfmul to intel syntax in aes_asm.asm

toddouska
2015-11-04 11:55:04 -08:00
parent 3b102862b1
commit 124f1f8ce7


@@ -981,69 +981,82 @@ MAKE_RK256_b:
 gfmul PROC
   ; xmm0 holds operand a (128 bits)
   ; xmm1 holds operand b (128 bits)
-  ; rdi holds the pointer to output (128 bits)
-  movdqa    %xmm0, %xmm3
-  pclmulqdq $0, %xmm1, %xmm3      ; xmm3 holds a0*b0
-  movdqa    %xmm0, %xmm4
-  pclmulqdq $16, %xmm1, %xmm4     ; xmm4 holds a0*b1
-  movdqa    %xmm0, %xmm5
-  pclmulqdq $1, %xmm1, %xmm5      ; xmm5 holds a1*b0
-  movdqa    %xmm0, %xmm6
-  pclmulqdq $17, %xmm1, %xmm6     ; xmm6 holds a1*b1
-  pxor      %xmm5, %xmm4          ; xmm4 holds a0*b1 + a1*b0
-  movdqa    %xmm4, %xmm5
-  psrldq    $8, %xmm4
-  pslldq    $8, %xmm5
-  pxor      %xmm5, %xmm3
-  pxor      %xmm4, %xmm6          ; <xmm6:xmm3> holds the result of
+  ; r8 holds the pointer to output (128 bits)
+  ; on microsoft xmm6-xmm15 are non volatile, let's save on stack and restore at end
+  sub       rsp, 8+4*16           ; 8 = align stack, 4 xmm6-9 16 bytes each
+  movdqa    [rsp+0],  xmm6
+  movdqa    [rsp+16], xmm7
+  movdqa    [rsp+32], xmm8
+  movdqa    [rsp+48], xmm9
+
+  movdqa    xmm3, xmm0
+  pclmulqdq xmm3, xmm1, 0         ; xmm3 holds a0*b0
+  movdqa    xmm4, xmm0
+  pclmulqdq xmm4, xmm1, 16        ; xmm4 holds a0*b1
+  movdqa    xmm5, xmm0
+  pclmulqdq xmm5, xmm1, 1         ; xmm5 holds a1*b0
+  movdqa    xmm6, xmm0
+  pclmulqdq xmm6, xmm1, 17        ; xmm6 holds a1*b1
+  pxor      xmm4, xmm5            ; xmm4 holds a0*b1 + a1*b0
+  movdqa    xmm5, xmm4
+  psrldq    xmm4, 8
+  pslldq    xmm5, 8
+  pxor      xmm3, xmm5
+  pxor      xmm6, xmm4            ; <xmm6:xmm3> holds the result of
   ; the carry-less multiplication of
   ; xmm0 by xmm1
   ; shift the result by one bit position to the left cope for the fact
   ; that bits are reversed
-  movdqa    %xmm3, %xmm7
-  movdqa    %xmm6, %xmm8
-  pslld     $1, %xmm3
-  pslld     $1, %xmm6
-  psrld     $31, %xmm7
-  psrld     $31, %xmm8
-  movdqa    %xmm7, %xmm9
-  pslldq    $4, %xmm8
-  pslldq    $4, %xmm7
-  psrldq    $12, %xmm9
-  por       %xmm7, %xmm3
-  por       %xmm8, %xmm6
-  por       %xmm9, %xmm6
+  movdqa    xmm7, xmm3
+  movdqa    xmm8, xmm6
+  pslld     xmm3, 1
+  pslld     xmm6, 1
+  psrld     xmm7, 31
+  psrld     xmm8, 31
+  movdqa    xmm9, xmm7
+  pslldq    xmm8, 4
+  pslldq    xmm7, 4
+  psrldq    xmm9, 12
+  por       xmm3, xmm7
+  por       xmm6, xmm8
+  por       xmm6, xmm9
   ; first phase of the reduction
-  movdqa    %xmm3, %xmm7
-  movdqa    %xmm3, %xmm8
-  movdqa    %xmm3, %xmm9
-  pslld     $31, %xmm7            ; packed right shifting << 31
-  pslld     $30, %xmm8            ; packed right shifting shift << 30
-  pslld     $25, %xmm9            ; packed right shifting shift << 25
-  pxor      %xmm8, %xmm7          ; xor the shifted versions
-  pxor      %xmm9, %xmm7
-  movdqa    %xmm7, %xmm8
-  pslldq    $12, %xmm7
-  psrldq    $4, %xmm8
-  pxor      %xmm7, %xmm3          ; first phase of the reduction complete
-  movdqa    %xmm3, %xmm2          ; second phase of the reduction
-  movdqa    %xmm3, %xmm4
-  movdqa    %xmm3, %xmm5
-  psrld     $1, %xmm2             ; packed left shifting >> 1
-  psrld     $2, %xmm4             ; packed left shifting >> 2
-  psrld     $7, %xmm5             ; packed left shifting >> 7
-  pxor      %xmm4, %xmm2          ; xor the shifted versions
-  pxor      %xmm5, %xmm2
-  pxor      %xmm8, %xmm2
-  pxor      %xmm2, %xmm3
-  pxor      %xmm3, %xmm6          ; the result is in xmm6
-  movdqu    %xmm6, (%rdi)         ; store the result
-  ; restore xmm6 and xmm7
+  movdqa    xmm7, xmm3
+  movdqa    xmm8, xmm3
+  movdqa    xmm9, xmm3
+  pslld     xmm7, 31              ; packed right shifting << 31
+  pslld     xmm8, 30              ; packed right shifting shift << 30
+  pslld     xmm9, 25              ; packed right shifting shift << 25
+  pxor      xmm7, xmm8            ; xor the shifted versions
+  pxor      xmm7, xmm9
+  movdqa    xmm8, xmm7
+  pslldq    xmm7, 12
+  psrldq    xmm8, 4
+  pxor      xmm3, xmm7            ; first phase of the reduction complete
+  movdqa    xmm2, xmm3            ; second phase of the reduction
+  movdqa    xmm4, xmm3
+  movdqa    xmm5, xmm3
+  psrld     xmm2, 1               ; packed left shifting >> 1
+  psrld     xmm4, 2               ; packed left shifting >> 2
+  psrld     xmm5, 7               ; packed left shifting >> 7
+  pxor      xmm2, xmm4            ; xor the shifted versions
+  pxor      xmm2, xmm5
+  pxor      xmm2, xmm8
+  pxor      xmm3, xmm2
+  pxor      xmm6, xmm3            ; the result is in xmm6
+  movdqu    [r8], xmm6            ; store the result
+  ; restore non volatile xmms from stack
+  movdqa    xmm6, [rsp+0]
+  movdqa    xmm7, [rsp+16]
+  movdqa    xmm8, [rsp+32]
+  movdqa    xmm9, [rsp+48]
+  add       rsp, 8+4*16           ; 8 = align stack, 4 xmm6-9 16 bytes each
   ret
 gfmul ENDP
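
For reference, the routine above follows the standard three steps for a GF(2^128) GHASH multiply: four carry-less 64x64 partial products folded into a 256-bit value, a one-bit left shift to account for the reversed bit order, and a two-phase reduction by the GCM polynomial. The C sketch below mirrors those same steps with SSE2/PCLMUL intrinsics. It is illustrative only and is not part of this commit; the function name gfmul_c and the by-pointer output parameter are assumptions for this sketch (build with e.g. gcc/clang -msse2 -mpclmul).

/* Illustrative C equivalent of the gfmul steps above, using PCLMULQDQ/SSE2
 * intrinsics. Not the commit's code; names and interface are assumed. */
#include <emmintrin.h>   /* SSE2 */
#include <wmmintrin.h>   /* PCLMULQDQ */

static void gfmul_c(__m128i a, __m128i b, __m128i* out)
{
    __m128i t2, t3, t4, t5, t6, t7, t8, t9;

    /* carry-less multiply: four 64x64 -> 128 partial products */
    t3 = _mm_clmulepi64_si128(a, b, 0x00);   /* a0*b0 */
    t4 = _mm_clmulepi64_si128(a, b, 0x10);   /* a0*b1 */
    t5 = _mm_clmulepi64_si128(a, b, 0x01);   /* a1*b0 */
    t6 = _mm_clmulepi64_si128(a, b, 0x11);   /* a1*b1 */

    /* fold the middle terms into the 256-bit result <t6:t3> */
    t4 = _mm_xor_si128(t4, t5);              /* a0*b1 + a1*b0 */
    t5 = _mm_slli_si128(t4, 8);
    t4 = _mm_srli_si128(t4, 8);
    t3 = _mm_xor_si128(t3, t5);
    t6 = _mm_xor_si128(t6, t4);

    /* shift the 256-bit result left by one bit (bits are reversed in GHASH) */
    t7 = _mm_srli_epi32(t3, 31);
    t8 = _mm_srli_epi32(t6, 31);
    t3 = _mm_slli_epi32(t3, 1);
    t6 = _mm_slli_epi32(t6, 1);
    t9 = _mm_srli_si128(t7, 12);
    t8 = _mm_slli_si128(t8, 4);
    t7 = _mm_slli_si128(t7, 4);
    t3 = _mm_or_si128(t3, t7);
    t6 = _mm_or_si128(t6, t8);
    t6 = _mm_or_si128(t6, t9);

    /* first phase of the reduction */
    t7 = _mm_slli_epi32(t3, 31);
    t8 = _mm_slli_epi32(t3, 30);
    t9 = _mm_slli_epi32(t3, 25);
    t7 = _mm_xor_si128(t7, t8);
    t7 = _mm_xor_si128(t7, t9);
    t8 = _mm_srli_si128(t7, 4);
    t7 = _mm_slli_si128(t7, 12);
    t3 = _mm_xor_si128(t3, t7);

    /* second phase of the reduction */
    t2 = _mm_srli_epi32(t3, 1);
    t4 = _mm_srli_epi32(t3, 2);
    t5 = _mm_srli_epi32(t3, 7);
    t2 = _mm_xor_si128(t2, t4);
    t2 = _mm_xor_si128(t2, t5);
    t2 = _mm_xor_si128(t2, t8);
    t3 = _mm_xor_si128(t3, t2);
    t6 = _mm_xor_si128(t6, t3);              /* result */

    _mm_storeu_si128(out, t6);               /* store, like movdqu [r8], xmm6 */
}

In C there is no need for the manual xmm6-xmm9 save/restore that the MASM version adds: the compiler handles the Microsoft x64 convention (where xmm6-xmm15 are non-volatile) itself, which is exactly the bookkeeping this commit introduces for the hand-written assembly.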