mirror of https://github.com/wolfSSL/wolfssl.git
switch gfmul to intel syntax in aes_asm.asm
@@ -981,69 +981,82 @@ MAKE_RK256_b:
 gfmul PROC
 ; xmm0 holds operand a (128 bits)
 ; xmm1 holds operand b (128 bits)
-; rdi holds the pointer to output (128 bits)
-movdqa %xmm0, %xmm3
-pclmulqdq $0, %xmm1, %xmm3 ; xmm3 holds a0*b0
-movdqa %xmm0, %xmm4
-pclmulqdq $16, %xmm1, %xmm4 ; xmm4 holds a0*b1
-movdqa %xmm0, %xmm5
-pclmulqdq $1, %xmm1, %xmm5 ; xmm5 holds a1*b0
-movdqa %xmm0, %xmm6
-pclmulqdq $17, %xmm1, %xmm6 ; xmm6 holds a1*b1
-pxor %xmm5, %xmm4 ; xmm4 holds a0*b1 + a1*b0
-movdqa %xmm4, %xmm5
-psrldq $8, %xmm4
-pslldq $8, %xmm5
-pxor %xmm5, %xmm3
-pxor %xmm4, %xmm6 ; <xmm6:xmm3> holds the result of
+; r8 holds the pointer to output (128 bits)
+
+; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end
+sub rsp,8+4*16 ; 8 = align stack , 4 xmm6-9 16 bytes each
+movdqa [rsp+0], xmm6
+movdqa [rsp+16], xmm7
+movdqa [rsp+32], xmm8
+movdqa [rsp+48], xmm9
+
+movdqa xmm3, xmm0
+pclmulqdq xmm3, xmm1, 0 ; xmm3 holds a0*b0
+movdqa xmm4, xmm0
+pclmulqdq xmm4, xmm1, 16 ; xmm4 holds a0*b1
+movdqa xmm5, xmm0
+pclmulqdq xmm5, xmm1, 1 ; xmm5 holds a1*b0
+movdqa xmm6, xmm0
+pclmulqdq xmm6, xmm1, 17 ; xmm6 holds a1*b1
+pxor xmm4, xmm5 ; xmm4 holds a0*b1 + a1*b0
+movdqa xmm5, xmm4
+psrldq xmm4, 8
+pslldq xmm5, 8
+pxor xmm3, xmm5
+pxor xmm6, xmm4 ; <xmm6:xmm3> holds the result of
 ; the carry-less multiplication of
 ; xmm0 by xmm1
 
 ; shift the result by one bit position to the left cope for the fact
 ; that bits are reversed
-movdqa %xmm3, %xmm7
-movdqa %xmm6, %xmm8
-pslld $1, %xmm3
-pslld $1, %xmm6
-psrld $31, %xmm7
-psrld $31, %xmm8
-movdqa %xmm7, %xmm9
-pslldq $4, %xmm8
-pslldq $4, %xmm7
-psrldq $12, %xmm9
-por %xmm7, %xmm3
-por %xmm8, %xmm6
-por %xmm9, %xmm6
+movdqa xmm7, xmm3
+movdqa xmm8, xmm6
+pslld xmm3, 1
+pslld xmm6, 1
+psrld xmm7, 31
+psrld xmm8, 31
+movdqa xmm9, xmm7
+pslldq xmm8, 4
+pslldq xmm7, 4
+psrldq xmm9, 12
+por xmm3, xmm7
+por xmm6, xmm8
+por xmm6, xmm9
 
 ; first phase of the reduction
-movdqa %xmm3, %xmm7
-movdqa %xmm3, %xmm8
-movdqa %xmm3, %xmm9
-pslld $31, %xmm7 ; packed right shifting << 31
-pslld $30, %xmm8 ; packed right shifting shift << 30
-pslld $25, %xmm9 ; packed right shifting shift << 25
-pxor %xmm8, %xmm7 ; xor the shifted versions
-pxor %xmm9, %xmm7
+movdqa xmm7, xmm3
+movdqa xmm8, xmm3
+movdqa xmm9, xmm3
+pslld xmm7, 31 ; packed right shifting << 31
+pslld xmm8, 30 ; packed right shifting shift << 30
+pslld xmm9, 25 ; packed right shifting shift << 25
+pxor xmm7, xmm8 ; xor the shifted versions
+pxor xmm7, xmm9
 
-movdqa %xmm7, %xmm8
-pslldq $12, %xmm7
-psrldq $4, %xmm8
-pxor %xmm7, %xmm3 ; first phase of the reduction complete
-movdqa %xmm3,%xmm2 ; second phase of the reduction
-movdqa %xmm3,%xmm4
-movdqa %xmm3,%xmm5
-psrld $1, %xmm2 ; packed left shifting >> 1
-psrld $2, %xmm4 ; packed left shifting >> 2
-psrld $7, %xmm5 ; packed left shifting >> 7
+movdqa xmm8, xmm7
+pslldq xmm7, 12
+psrldq xmm8, 4
+pxor xmm3, xmm7 ; first phase of the reduction complete
+movdqa xmm2, xmm3 ; second phase of the reduction
+movdqa xmm4, xmm3
+movdqa xmm5, xmm3
+psrld xmm2, 1 ; packed left shifting >> 1
+psrld xmm4, 2 ; packed left shifting >> 2
+psrld xmm5, 7 ; packed left shifting >> 7
 
-pxor %xmm4, %xmm2 ; xor the shifted versions
-pxor %xmm5, %xmm2
-pxor %xmm8, %xmm2
-pxor %xmm2, %xmm3
-pxor %xmm3, %xmm6 ; the result is in xmm6
-movdqu %xmm6, (%rdi) ; store the result
+pxor xmm2, xmm4 ; xor the shifted versions
+pxor xmm2, xmm5
+pxor xmm2, xmm8
+pxor xmm3, xmm2
+pxor xmm6, xmm3 ; the result is in xmm6
+movdqu [r8],xmm6 ; store the result
 
-; restore xmm6 and xmm7
+; restore non volatile xmms from stack
+movdqa xmm6, [rsp+0]
+movdqa xmm7, [rsp+16]
+movdqa xmm8, [rsp+32]
+movdqa xmm9, [rsp+48]
+add rsp,8+4*16 ; 8 = align stack , 4 xmm6-9 16 bytes each
 
 ret
 gfmul ENDP
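
For readers following the GHASH math rather than the syntax change, the routine above maps almost one-to-one onto SSE2/PCLMULQDQ compiler intrinsics. The sketch below is only an illustration of that mapping, not wolfSSL code: the name gfmul_sketch and the temporaries are invented here, the intrinsic choices are my transcription of the MASM lines, and it assumes a compiler providing <immintrin.h> with PCLMULQDQ support (e.g. -msse2 -mpclmul on GCC/Clang).

#include <immintrin.h>   /* SSE2 + PCLMULQDQ intrinsics */

/* Illustrative C transcription of the gfmul PROC above: carry-less
 * multiply of a and b in GF(2^128), result stored to out.
 * The statements follow the MASM instructions line by line. */
static void gfmul_sketch(__m128i a, __m128i b, __m128i* out)
{
    __m128i t2, t3, t4, t5, t6, t7, t8, t9;

    /* carry-less schoolbook multiply: <t6:t3> = a * b (256 bits) */
    t3 = _mm_clmulepi64_si128(a, b, 0x00);   /* a0*b0 */
    t4 = _mm_clmulepi64_si128(a, b, 0x10);   /* a0*b1 */
    t5 = _mm_clmulepi64_si128(a, b, 0x01);   /* a1*b0 */
    t6 = _mm_clmulepi64_si128(a, b, 0x11);   /* a1*b1 */
    t4 = _mm_xor_si128(t4, t5);              /* a0*b1 + a1*b0 */
    t5 = _mm_slli_si128(t4, 8);
    t4 = _mm_srli_si128(t4, 8);
    t3 = _mm_xor_si128(t3, t5);
    t6 = _mm_xor_si128(t6, t4);

    /* shift the 256-bit result left by one bit (bit-reflected convention) */
    t7 = _mm_srli_epi32(t3, 31);
    t8 = _mm_srli_epi32(t6, 31);
    t3 = _mm_slli_epi32(t3, 1);
    t6 = _mm_slli_epi32(t6, 1);
    t9 = _mm_srli_si128(t7, 12);
    t8 = _mm_slli_si128(t8, 4);
    t7 = _mm_slli_si128(t7, 4);
    t3 = _mm_or_si128(t3, t7);
    t6 = _mm_or_si128(t6, t8);
    t6 = _mm_or_si128(t6, t9);

    /* first phase of the reduction modulo x^128 + x^7 + x^2 + x + 1 */
    t7 = _mm_slli_epi32(t3, 31);
    t8 = _mm_slli_epi32(t3, 30);
    t9 = _mm_slli_epi32(t3, 25);
    t7 = _mm_xor_si128(t7, t8);
    t7 = _mm_xor_si128(t7, t9);
    t8 = _mm_srli_si128(t7, 4);
    t7 = _mm_slli_si128(t7, 12);
    t3 = _mm_xor_si128(t3, t7);

    /* second phase of the reduction */
    t2 = _mm_srli_epi32(t3, 1);
    t4 = _mm_srli_epi32(t3, 2);
    t5 = _mm_srli_epi32(t3, 7);
    t2 = _mm_xor_si128(t2, t4);
    t2 = _mm_xor_si128(t2, t5);
    t2 = _mm_xor_si128(t2, t8);
    t3 = _mm_xor_si128(t3, t2);
    t6 = _mm_xor_si128(t6, t3);              /* final 128-bit result */

    _mm_storeu_si128(out, t6);
}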