switch gfmul to intel syntax in aes_asm.asm
@@ -981,69 +981,82 @@ MAKE_RK256_b:
 gfmul PROC
     ; xmm0 holds operand a (128 bits)
     ; xmm1 holds operand b (128 bits)
-    ; rdi holds the pointer to output (128 bits)
-    movdqa    %xmm0, %xmm3
-    pclmulqdq $0, %xmm1, %xmm3      ; xmm3 holds a0*b0
-    movdqa    %xmm0, %xmm4
-    pclmulqdq $16, %xmm1, %xmm4     ; xmm4 holds a0*b1
-    movdqa    %xmm0, %xmm5
-    pclmulqdq $1, %xmm1, %xmm5      ; xmm5 holds a1*b0
-    movdqa    %xmm0, %xmm6
-    pclmulqdq $17, %xmm1, %xmm6     ; xmm6 holds a1*b1
-    pxor      %xmm5, %xmm4          ; xmm4 holds a0*b1 + a1*b0
-    movdqa    %xmm4, %xmm5
-    psrldq    $8, %xmm4
-    pslldq    $8, %xmm5
-    pxor      %xmm5, %xmm3
-    pxor      %xmm4, %xmm6          ; <xmm6:xmm3> holds the result of
+    ; r8 holds the pointer to output (128 bits)
+    ; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end
+    sub    rsp,8+4*16               ; 8 = align stack , 4 xmm6-9 16 bytes each
+    movdqa [rsp+0], xmm6
+    movdqa [rsp+16], xmm7
+    movdqa [rsp+32], xmm8
+    movdqa [rsp+48], xmm9
+    movdqa xmm3, xmm0
+    pclmulqdq xmm3, xmm1, 0         ; xmm3 holds a0*b0
+    movdqa xmm4, xmm0
+    pclmulqdq xmm4, xmm1, 16        ; xmm4 holds a0*b1
+    movdqa xmm5, xmm0
+    pclmulqdq xmm5, xmm1, 1         ; xmm5 holds a1*b0
+    movdqa xmm6, xmm0
+    pclmulqdq xmm6, xmm1, 17        ; xmm6 holds a1*b1
+    pxor   xmm4, xmm5               ; xmm4 holds a0*b1 + a1*b0
+    movdqa xmm5, xmm4
+    psrldq xmm4, 8
+    pslldq xmm5, 8
+    pxor   xmm3, xmm5
+    pxor   xmm6, xmm4               ; <xmm6:xmm3> holds the result of
                                     ; the carry-less multiplication of
                                     ; xmm0 by xmm1

     ; shift the result by one bit position to the left cope for the fact
     ; that bits are reversed
-    movdqa %xmm3, %xmm7
-    movdqa %xmm6, %xmm8
-    pslld  $1, %xmm3
-    pslld  $1, %xmm6
-    psrld  $31, %xmm7
-    psrld  $31, %xmm8
-    movdqa %xmm7, %xmm9
-    pslldq $4, %xmm8
-    pslldq $4, %xmm7
-    psrldq $12, %xmm9
-    por    %xmm7, %xmm3
-    por    %xmm8, %xmm6
-    por    %xmm9, %xmm6
+    movdqa xmm7, xmm3
+    movdqa xmm8, xmm6
+    pslld  xmm3, 1
+    pslld  xmm6, 1
+    psrld  xmm7, 31
+    psrld  xmm8, 31
+    movdqa xmm9, xmm7
+    pslldq xmm8, 4
+    pslldq xmm7, 4
+    psrldq xmm9, 12
+    por    xmm3, xmm7
+    por    xmm6, xmm8
+    por    xmm6, xmm9

     ; first phase of the reduction
-    movdqa %xmm3, %xmm7
-    movdqa %xmm3, %xmm8
-    movdqa %xmm3, %xmm9
-    pslld  $31, %xmm7               ; packed right shifting << 31
-    pslld  $30, %xmm8               ; packed right shifting shift << 30
-    pslld  $25, %xmm9               ; packed right shifting shift << 25
-    pxor   %xmm8, %xmm7             ; xor the shifted versions
-    pxor   %xmm9, %xmm7
+    movdqa xmm7, xmm3
+    movdqa xmm8, xmm3
+    movdqa xmm9, xmm3
+    pslld  xmm7, 31                 ; packed right shifting << 31
+    pslld  xmm8, 30                 ; packed right shifting shift << 30
+    pslld  xmm9, 25                 ; packed right shifting shift << 25
+    pxor   xmm7, xmm8               ; xor the shifted versions
+    pxor   xmm7, xmm9

-    movdqa %xmm7, %xmm8
-    pslldq $12, %xmm7
-    psrldq $4, %xmm8
-    pxor   %xmm7, %xmm3             ; first phase of the reduction complete
-    movdqa %xmm3,%xmm2              ; second phase of the reduction
-    movdqa %xmm3,%xmm4
-    movdqa %xmm3,%xmm5
-    psrld  $1, %xmm2                ; packed left shifting >> 1
-    psrld  $2, %xmm4                ; packed left shifting >> 2
-    psrld  $7, %xmm5                ; packed left shifting >> 7
+    movdqa xmm8, xmm7
+    pslldq xmm7, 12
+    psrldq xmm8, 4
+    pxor   xmm3, xmm7               ; first phase of the reduction complete
+    movdqa xmm2, xmm3               ; second phase of the reduction
+    movdqa xmm4, xmm3
+    movdqa xmm5, xmm3
+    psrld  xmm2, 1                  ; packed left shifting >> 1
+    psrld  xmm4, 2                  ; packed left shifting >> 2
+    psrld  xmm5, 7                  ; packed left shifting >> 7

-    pxor   %xmm4, %xmm2             ; xor the shifted versions
-    pxor   %xmm5, %xmm2
-    pxor   %xmm8, %xmm2
-    pxor   %xmm2, %xmm3
-    pxor   %xmm3, %xmm6             ; the result is in xmm6
-    movdqu %xmm6, (%rdi)            ; store the result
+    pxor   xmm2, xmm4               ; xor the shifted versions
+    pxor   xmm2, xmm5
+    pxor   xmm2, xmm8
+    pxor   xmm3, xmm2
+    pxor   xmm6, xmm3               ; the result is in xmm6
+    movdqu [r8],xmm6                ; store the result

-    ; restore xmm6 and xmm7
+    ; restore non volatile xmms from stack
+    movdqa xmm6, [rsp+0]
+    movdqa xmm7, [rsp+16]
+    movdqa xmm8, [rsp+32]
+    movdqa xmm9, [rsp+48]
+    add    rsp,8+4*16               ; 8 = align stack , 4 xmm6-9 16 bytes each

     ret
 gfmul ENDP
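Beyond the operand-order flip from AT&T/GAS form (%-prefixed registers, $-prefixed immediates, source before destination) to the Intel form MASM expects (destination first, no prefixes), the routine is also adjusted for the Microsoft x64 calling convention: the output pointer arrives in r8 rather than rdi, and xmm6-xmm15 are callee-saved there, so xmm6-xmm9 are spilled to the stack in the prologue and restored before ret. The extra 8 bytes in sub rsp,8+4*16 realigns rsp to a 16-byte boundary so the aligned movdqa saves are legal.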
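For readers following the algorithm rather than the assembler syntax, here is a rough C-intrinsics sketch of the same steps: the four-PCLMULQDQ carry-less 128x128-bit multiply, the one-bit left shift that compensates for GHASH's reflected bit order, and the two-phase reduction modulo x^128 + x^7 + x^2 + x + 1. This is an illustration only, not wolfSSL code; the name gfmul_sketch is made up, and it assumes a compiler providing SSE2 and PCLMUL intrinsics (e.g. GCC/Clang with -mpclmul, or MSVC targeting x64).

    #include <emmintrin.h>   /* SSE2: packed shifts, xor, or, store       */
    #include <wmmintrin.h>   /* PCLMUL: _mm_clmulepi64_si128              */

    /* a and b are the two GF(2^128) operands (xmm0 and xmm1 in the asm),
     * out receives the 128-bit product. */
    void gfmul_sketch(__m128i a, __m128i b, __m128i *out)
    {
        __m128i t2, t3, t4, t5, t6, t7, t8, t9;

        /* 128x128 -> 256-bit carry-less multiply: four PCLMULQDQs plus
         * folding of the middle term, as done with xmm3..xmm6 above.     */
        t3 = _mm_clmulepi64_si128(a, b, 0x00);  /* a0*b0                  */
        t4 = _mm_clmulepi64_si128(a, b, 0x10);  /* a0*b1                  */
        t5 = _mm_clmulepi64_si128(a, b, 0x01);  /* a1*b0                  */
        t6 = _mm_clmulepi64_si128(a, b, 0x11);  /* a1*b1                  */
        t4 = _mm_xor_si128(t4, t5);             /* a0*b1 + a1*b0          */
        t5 = _mm_slli_si128(t4, 8);             /* low half of middle term, moved up   */
        t4 = _mm_srli_si128(t4, 8);             /* high half of middle term            */
        t3 = _mm_xor_si128(t3, t5);             /* low 128 bits of product             */
        t6 = _mm_xor_si128(t6, t4);             /* high 128 bits of product            */

        /* Shift the 256-bit result <t6:t3> left by one bit to account for
         * the reflected bit order used by GHASH.                          */
        t7 = _mm_srli_epi32(t3, 31);            /* per-lane carry bits     */
        t8 = _mm_srli_epi32(t6, 31);
        t3 = _mm_slli_epi32(t3, 1);
        t6 = _mm_slli_epi32(t6, 1);
        t9 = _mm_srli_si128(t7, 12);            /* carry crossing the 128-bit halves   */
        t7 = _mm_slli_si128(t7, 4);
        t8 = _mm_slli_si128(t8, 4);
        t3 = _mm_or_si128(t3, t7);
        t6 = _mm_or_si128(t6, t8);
        t6 = _mm_or_si128(t6, t9);

        /* First phase of the reduction modulo x^128 + x^7 + x^2 + x + 1.  */
        t7 = _mm_slli_epi32(t3, 31);
        t8 = _mm_slli_epi32(t3, 30);
        t9 = _mm_slli_epi32(t3, 25);
        t7 = _mm_xor_si128(t7, t8);
        t7 = _mm_xor_si128(t7, t9);
        t8 = _mm_srli_si128(t7, 4);             /* kept for the second phase           */
        t7 = _mm_slli_si128(t7, 12);
        t3 = _mm_xor_si128(t3, t7);             /* first phase complete                */

        /* Second phase of the reduction.                                   */
        t2 = _mm_srli_epi32(t3, 1);
        t4 = _mm_srli_epi32(t3, 2);
        t5 = _mm_srli_epi32(t3, 7);
        t2 = _mm_xor_si128(t2, t4);
        t2 = _mm_xor_si128(t2, t5);
        t2 = _mm_xor_si128(t2, t8);
        t3 = _mm_xor_si128(t3, t2);
        t6 = _mm_xor_si128(t6, t3);             /* final result, as in xmm6            */

        _mm_storeu_si128(out, t6);              /* movdqu [r8],xmm6                    */
    }

The temporaries t2-t9 line up with xmm2-xmm9 in the assembly, so the mapping back to the diff is essentially instruction for instruction.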