; mirror of https://github.com/wolfSSL/wolfssl.git
; synced 2026-02-03 23:45:05 +01:00
; Configurations:
;   ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=small2048 --enable-sp-math
;   ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=2048 --enable-sp-math
;   ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=small2048 --enable-sp-math-all
;   ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=small2048 --enable-sp-math --enable-sp-asm
;   ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=2048 --enable-sp-math --enable-sp-asm
; 52720 lines, 1.5 MiB
; /* sp_x86_64_asm
|
|
; *
|
|
; * Copyright (C) 2006-2021 wolfSSL Inc.
|
|
; *
|
|
; * This file is part of wolfSSL.
|
|
; *
|
|
; * wolfSSL is free software; you can redistribute it and/or modify
|
|
; * it under the terms of the GNU General Public License as published by
|
|
; * the Free Software Foundation; either version 2 of the License, or
|
|
; * (at your option) any later version.
|
|
; *
|
|
; * wolfSSL is distributed in the hope that it will be useful,
|
|
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
; * GNU General Public License for more details.
|
|
; *
|
|
; * You should have received a copy of the GNU General Public License
|
|
; * along with this program; if not, write to the Free Software
|
|
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
|
|
; */
|
|
; Assembly-time feature switches.
;
; Every switch below is only assigned when it is not already defined, so a
; value supplied by the build (for example on the assembler command line)
; always takes precedence over the defaults chosen here.

IF @Version LT 1200
    ; AVX2 instructions not recognized by old versions of MASM
    IFNDEF NO_AVX2_SUPPORT
        NO_AVX2_SUPPORT = 1
    ENDIF
    ; MOVBE instruction not recognized by old versions of MASM
    IFNDEF NO_MOVBE_SUPPORT
        NO_MOVBE_SUPPORT = 1
    ENDIF
ENDIF

; AVX1 code is enabled unless the build already defined the switch.
IFNDEF HAVE_INTEL_AVX1
    HAVE_INTEL_AVX1 = 1
ENDIF

; AVX2 code is enabled unless it was ruled out above or by the build.
; The inner guard mirrors the one on HAVE_INTEL_AVX1: without it, a build that
; already defines HAVE_INTEL_AVX2 would have the symbol defined a second time
; here.
IFNDEF NO_AVX2_SUPPORT
    IFNDEF HAVE_INTEL_AVX2
        HAVE_INTEL_AVX2 = 1
    ENDIF
ENDIF

IFNDEF _WIN64
    _WIN64 = 1
ENDIF
|
|
|
|
IFNDEF WOLFSSL_SP_NO_2048
|
|
IFNDEF WOLFSSL_SP_NO_2048
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the bswap instruction.
;  *
;  * The array is consumed from its last byte (least significant) towards its
;  * first byte, eight bytes per 64-bit word of r. Any words of r that are not
;  * filled from the array are set to zero; r always spans 256 bytes.
;  *
;  * Calling convention: Microsoft x64.
;  *   rcx = r, rdx = size, r8 = a, r9 = n
;  * This is a leaf function: it uses volatile registers only and never
;  * touches rsp, so it needs no prologue, epilogue or unwind data.
;  * May modify: rax, rcx, rdx, r8, r9, r10, r11, flags.
;  *
;  * r     A single precision integer.
;  * size  Maximum number of bytes to convert (not read; the length of r is
;  *       fixed at 256 bytes).
;  * a     Byte array.
;  * n     Number of bytes in array to read.
;  *       NOTE(review): all 64 bits of r9 are used as the count - confirm
;  *       that callers pass it extended to 64 bits.
;  */
_text SEGMENT READONLY PARA
sp_2048_from_bin_bswap PROC
        lea     r11, [r8+r9]                    ; r11 = a + n, one past the last byte
        lea     rdx, [rcx+256]                  ; rdx = r + 256, end of the output
        jmp     L_2048_from_bin_bswap_64_end

        ; Convert 64 bytes (8 words) per iteration while at least 64 remain.
L_2048_from_bin_bswap_64_start:
        sub     r11, 64
        mov     rax, QWORD PTR [r11+56]
        mov     r10, QWORD PTR [r11+48]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     rax, QWORD PTR [r11+40]
        mov     r10, QWORD PTR [r11+32]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        mov     rax, QWORD PTR [r11+24]
        mov     r10, QWORD PTR [r11+16]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        mov     rax, QWORD PTR [r11+8]
        mov     r10, QWORD PTR [r11]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_2048_from_bin_bswap_64_end:
        cmp     r9, 63
        jg      L_2048_from_bin_bswap_64_start
        jmp     L_2048_from_bin_bswap_8_end

        ; Convert one word per iteration while at least 8 bytes remain.
L_2048_from_bin_bswap_8_start:
        sub     r11, 8
        mov     rax, QWORD PTR [r11]
        bswap   rax
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_2048_from_bin_bswap_8_end:
        cmp     r9, 7
        jg      L_2048_from_bin_bswap_8_start

        ; Fewer than 8 bytes left: they are the most significant bytes and sit
        ; at the very start of the array, where r8 still points.
        test    r9, r9
        je      L_2048_from_bin_bswap_hi_end
        xor     r10d, r10d                      ; r10 = partial top word
L_2048_from_bin_bswap_hi_start:
        movzx   eax, BYTE PTR [r8]
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_2048_from_bin_bswap_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_2048_from_bin_bswap_hi_end:

        ; Zero the remaining words. Pointers are compared unsigned and the
        ; check precedes the first store, so nothing is written at or beyond
        ; r + 256 by this loop.
        cmp     rcx, rdx
        jae     L_2048_from_bin_bswap_zero_end
L_2048_from_bin_bswap_zero_start:
        mov     QWORD PTR [rcx], 0
        add     rcx, 8
        cmp     rcx, rdx
        jb      L_2048_from_bin_bswap_zero_start
L_2048_from_bin_bswap_zero_end:
        ret
sp_2048_from_bin_bswap ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
; /* Read big endian unsigned byte array into r.
;  * Uses the movbe instruction which is an optional instruction.
;  *
;  * The array is consumed from its last byte (least significant) towards its
;  * first byte, eight bytes per 64-bit word of r. Any words of r that are not
;  * filled from the array are set to zero; r always spans 256 bytes.
;  *
;  * Calling convention: Microsoft x64.
;  *   rcx = r, rdx = size, r8 = a, r9 = n
;  * This is a leaf function: it uses volatile registers only and never
;  * touches rsp, so it needs no prologue, epilogue or unwind data.
;  * May modify: rax, rcx, rdx, r8, r9, r10, r11, flags.
;  *
;  * r     A single precision integer.
;  * size  Maximum number of bytes to convert (not read; the length of r is
;  *       fixed at 256 bytes).
;  * a     Byte array.
;  * n     Number of bytes in array to read.
;  *       NOTE(review): all 64 bits of r9 are used as the count - confirm
;  *       that callers pass it extended to 64 bits.
;  */
_text SEGMENT READONLY PARA
sp_2048_from_bin_movbe PROC
        lea     r11, [r8+r9]                    ; r11 = a + n, one past the last byte
        lea     rdx, [rcx+256]                  ; rdx = r + 256, end of the output
        jmp     L_2048_from_bin_movbe_64_end

        ; Convert 64 bytes (8 words) per iteration while at least 64 remain.
L_2048_from_bin_movbe_64_start:
        sub     r11, 64
        movbe   rax, QWORD PTR [r11+56]
        movbe   r10, QWORD PTR [r11+48]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        movbe   rax, QWORD PTR [r11+40]
        movbe   r10, QWORD PTR [r11+32]
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        movbe   rax, QWORD PTR [r11+24]
        movbe   r10, QWORD PTR [r11+16]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        movbe   rax, QWORD PTR [r11+8]
        movbe   r10, QWORD PTR [r11]
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_2048_from_bin_movbe_64_end:
        cmp     r9, 63
        jg      L_2048_from_bin_movbe_64_start
        jmp     L_2048_from_bin_movbe_8_end

        ; Convert one word per iteration while at least 8 bytes remain.
L_2048_from_bin_movbe_8_start:
        sub     r11, 8
        movbe   rax, QWORD PTR [r11]
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_2048_from_bin_movbe_8_end:
        cmp     r9, 7
        jg      L_2048_from_bin_movbe_8_start

        ; Fewer than 8 bytes left: they are the most significant bytes and sit
        ; at the very start of the array, where r8 still points.
        test    r9, r9
        je      L_2048_from_bin_movbe_hi_end
        xor     r10d, r10d                      ; r10 = partial top word
L_2048_from_bin_movbe_hi_start:
        movzx   eax, BYTE PTR [r8]
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_2048_from_bin_movbe_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_2048_from_bin_movbe_hi_end:

        ; Zero the remaining words. Pointers are compared unsigned and the
        ; check precedes the first store, so nothing is written at or beyond
        ; r + 256 by this loop.
        cmp     rcx, rdx
        jae     L_2048_from_bin_movbe_zero_end
L_2048_from_bin_movbe_zero_start:
        mov     QWORD PTR [rcx], 0
        add     rcx, 8
        cmp     rcx, rdx
        jb      L_2048_from_bin_movbe_zero_start
L_2048_from_bin_movbe_zero_end:
        ret
sp_2048_from_bin_movbe ENDP
_text ENDS
ENDIF
|
|
; /* Write r as big endian to byte array.
;  * Fixed length number of bytes written: 256
;  * Uses the bswap instruction.
;  *
;  * The 32 words of r are emitted from the highest word (offset 248) down to
;  * the lowest (offset 0), each byte-reversed, so a receives the most
;  * significant byte first.
;  *
;  * In: rcx = r, rdx = a. Modifies rax, r8 only.
;  *
;  * r  A single precision integer.
;  * a  Byte array.
;  */
_text SEGMENT READONLY PARA
sp_2048_to_bin_bswap_32 PROC
        ; Assembly-time offsets: the source offset walks r downwards while the
        ; destination offset walks a upwards. Each expansion converts two
        ; words, so 16 expansions cover all 32 words as straight-line code.
        sp_2048_to_bin_bswap_32_src_off = 248
        sp_2048_to_bin_bswap_32_dst_off = 0
        REPEAT 16
        mov     rax, QWORD PTR [rcx+sp_2048_to_bin_bswap_32_src_off]
        mov     r8, QWORD PTR [rcx+sp_2048_to_bin_bswap_32_src_off-8]
        bswap   rax
        bswap   r8
        mov     QWORD PTR [rdx+sp_2048_to_bin_bswap_32_dst_off], rax
        mov     QWORD PTR [rdx+sp_2048_to_bin_bswap_32_dst_off+8], r8
        sp_2048_to_bin_bswap_32_src_off = sp_2048_to_bin_bswap_32_src_off - 16
        sp_2048_to_bin_bswap_32_dst_off = sp_2048_to_bin_bswap_32_dst_off + 16
        ENDM
        ret
sp_2048_to_bin_bswap_32 ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
; /* Write r as big endian to byte array.
;  * Fixed length number of bytes written: 256
;  * Uses the movbe instruction which is optional.
;  *
;  * The 32 words of r are emitted from the highest word (offset 248) down to
;  * the lowest (offset 0), each loaded byte-reversed, so a receives the most
;  * significant byte first.
;  *
;  * In: rcx = r, rdx = a. Modifies rax, r8 only.
;  *
;  * r  A single precision integer.
;  * a  Byte array.
;  */
_text SEGMENT READONLY PARA
sp_2048_to_bin_movbe_32 PROC
        ; Assembly-time offsets: the source offset walks r downwards while the
        ; destination offset walks a upwards. Each expansion converts two
        ; words, so 16 expansions cover all 32 words as straight-line code.
        sp_2048_to_bin_movbe_32_src_off = 248
        sp_2048_to_bin_movbe_32_dst_off = 0
        REPEAT 16
        movbe   rax, QWORD PTR [rcx+sp_2048_to_bin_movbe_32_src_off]
        movbe   r8, QWORD PTR [rcx+sp_2048_to_bin_movbe_32_src_off-8]
        mov     QWORD PTR [rdx+sp_2048_to_bin_movbe_32_dst_off], rax
        mov     QWORD PTR [rdx+sp_2048_to_bin_movbe_32_dst_off+8], r8
        sp_2048_to_bin_movbe_32_src_off = sp_2048_to_bin_movbe_32_src_off - 16
        sp_2048_to_bin_movbe_32_dst_off = sp_2048_to_bin_movbe_32_dst_off + 16
        ENDM
        ret
sp_2048_to_bin_movbe_32 ENDP
_text ENDS
ENDIF
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_2048_mul_16 PROC
|
|
push r12
|
|
mov r9, rdx
|
|
sub rsp, 128
|
|
; A[0] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
mov QWORD PTR [rsp], rax
|
|
mov r11, rdx
|
|
; A[0] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+8], r11
|
|
; A[0] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+16], r12
|
|
; A[0] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+24], r10
|
|
; A[0] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+32], r11
|
|
; A[0] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+40], r12
|
|
; A[0] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+48], r10
|
|
; A[0] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+56], r11
|
|
; A[0] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+64], r12
|
|
; A[0] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+72], r10
|
|
; A[0] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+80], r11
|
|
; A[0] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+88], r12
|
|
; A[0] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+96], r10
|
|
; A[0] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+104], r11
|
|
; A[0] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+112], r12
|
|
; A[0] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+120], r10
|
|
; A[1] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+8]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+128], r11
|
|
; A[2] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+16]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+136], r12
|
|
; A[3] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+24]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+144], r10
|
|
; A[4] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+32]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+152], r11
|
|
; A[5] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+40]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+160], r12
|
|
; A[6] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+48]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+168], r10
|
|
; A[7] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+56]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+176], r11
|
|
; A[8] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+64]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+184], r12
|
|
; A[9] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+72]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+192], r10
|
|
; A[10] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+80]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+200], r11
|
|
; A[11] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+88]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+208], r12
|
|
; A[12] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+96]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+216], r10
|
|
; A[13] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+104]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+224], r11
|
|
; A[14] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+112]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+232], r12
|
|
; A[15] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
mov QWORD PTR [rcx+240], r10
|
|
mov QWORD PTR [rcx+248], r11
|
|
mov rax, QWORD PTR [rsp]
|
|
mov rdx, QWORD PTR [rsp+8]
|
|
mov r10, QWORD PTR [rsp+16]
|
|
mov r11, QWORD PTR [rsp+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], rdx
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov rax, QWORD PTR [rsp+32]
|
|
mov rdx, QWORD PTR [rsp+40]
|
|
mov r10, QWORD PTR [rsp+48]
|
|
mov r11, QWORD PTR [rsp+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], rdx
|
|
mov QWORD PTR [rcx+48], r10
|
|
mov QWORD PTR [rcx+56], r11
|
|
mov rax, QWORD PTR [rsp+64]
|
|
mov rdx, QWORD PTR [rsp+72]
|
|
mov r10, QWORD PTR [rsp+80]
|
|
mov r11, QWORD PTR [rsp+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], rdx
|
|
mov QWORD PTR [rcx+80], r10
|
|
mov QWORD PTR [rcx+88], r11
|
|
mov rax, QWORD PTR [rsp+96]
|
|
mov rdx, QWORD PTR [rsp+104]
|
|
mov r10, QWORD PTR [rsp+112]
|
|
mov r11, QWORD PTR [rsp+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], rdx
|
|
mov QWORD PTR [rcx+112], r10
|
|
mov QWORD PTR [rcx+120], r11
|
|
add rsp, 128
|
|
pop r12
|
|
ret
|
|
sp_2048_mul_16 ENDP
|
|
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r A single precision integer that receives the result (a * a).
|
|
; * a A single precision integer to square (16 64-bit words, A[0]..A[15]).
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_2048_sqr_16 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
mov r8, rdx
|
|
sub rsp, 128
|
|
; A[0] * A[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul rax
|
|
xor r11, r11
|
|
mov QWORD PTR [rsp], rax
|
|
mov r10, rdx
|
|
; A[0] * A[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+8], r10
|
|
; A[0] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[1] * A[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul rax
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+16], r11
|
|
; A[0] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r8+8]
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+24], r9
|
|
; A[0] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[1] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[2] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul rax
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+32], r10
|
|
; A[0] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+40], r11
|
|
; A[0] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+48], r9
|
|
; A[0] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+56], r10
|
|
; A[0] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+64], r11
|
|
; A[0] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+72], r9
|
|
; A[0] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+80], r10
|
|
; A[0] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+88], r11
|
|
; A[0] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+96], r9
|
|
; A[0] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+104], r10
|
|
; A[0] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+112], r11
|
|
; A[0] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+120], r9
|
|
; A[1] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[2] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+128], r10
|
|
; A[2] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+16]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[3] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+136], r11
|
|
; A[3] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+24]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[4] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+144], r9
|
|
; A[4] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+32]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[5] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+152], r10
|
|
; A[5] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+40]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[6] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+160], r11
|
|
; A[6] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+48]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[7] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+168], r9
|
|
; A[7] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+56]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[8] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[11] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+176], r10
|
|
; A[8] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+64]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[9] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[11] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+88]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+184], r11
|
|
; A[9] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+72]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[10] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[11] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+88]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[12] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+192], r9
|
|
; A[10] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+80]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[11] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+88]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[12] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+96]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+200], r10
|
|
; A[11] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+88]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[12] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+96]
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[13] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul rax
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+208], r11
|
|
; A[12] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+96]
|
|
xor r11, r11
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+104]
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+216], r9
|
|
; A[13] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+104]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[14] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul rax
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+224], r10
|
|
; A[14] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+112]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+232], r11
|
|
; A[15] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul rax
|
|
add r9, rax
|
|
adc r10, rdx
|
|
mov QWORD PTR [rcx+240], r9
|
|
mov QWORD PTR [rcx+248], r10
|
|
mov rax, QWORD PTR [rsp]
|
|
mov rdx, QWORD PTR [rsp+8]
|
|
mov r12, QWORD PTR [rsp+16]
|
|
mov r13, QWORD PTR [rsp+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], rdx
|
|
mov QWORD PTR [rcx+16], r12
|
|
mov QWORD PTR [rcx+24], r13
|
|
mov rax, QWORD PTR [rsp+32]
|
|
mov rdx, QWORD PTR [rsp+40]
|
|
mov r12, QWORD PTR [rsp+48]
|
|
mov r13, QWORD PTR [rsp+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], rdx
|
|
mov QWORD PTR [rcx+48], r12
|
|
mov QWORD PTR [rcx+56], r13
|
|
mov rax, QWORD PTR [rsp+64]
|
|
mov rdx, QWORD PTR [rsp+72]
|
|
mov r12, QWORD PTR [rsp+80]
|
|
mov r13, QWORD PTR [rsp+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], rdx
|
|
mov QWORD PTR [rcx+80], r12
|
|
mov QWORD PTR [rcx+88], r13
|
|
mov rax, QWORD PTR [rsp+96]
|
|
mov rdx, QWORD PTR [rsp+104]
|
|
mov r12, QWORD PTR [rsp+112]
|
|
mov r13, QWORD PTR [rsp+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], rdx
|
|
mov QWORD PTR [rcx+112], r12
|
|
mov QWORD PTR [rcx+120], r13
|
|
add rsp, 128
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_2048_sqr_16 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply a and b into r using MULX/ADCX/ADOX; only assembled when HAVE_INTEL_AVX2 is defined. (r = a * b)
|
|
; *
|
|
; * r Result of multiplication (passed in rcx).
|
|
; * a First number to multiply (passed in rdx).
|
|
; * b Second number to multiply (passed in r8).
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_2048_mul_avx2_16 PROC
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
mov rbp, r8
|
|
mov r8, rcx
|
|
mov r9, rdx
|
|
sub rsp, 128
|
|
cmp r9, r8
|
|
mov rbx, rsp
|
|
cmovne rbx, r8
|
|
cmp rbp, r8
|
|
cmove rbx, rsp
|
|
add r8, 128
|
|
xor rdi, rdi
|
|
mov rdx, QWORD PTR [r9]
|
|
; A[0] * B[0]
|
|
mulx r11, r10, QWORD PTR [rbp]
|
|
; A[0] * B[1]
|
|
mulx r12, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx], r10
|
|
adcx r11, rax
|
|
; A[0] * B[2]
|
|
mulx r13, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+8], r11
|
|
adcx r12, rax
|
|
; A[0] * B[3]
|
|
mulx r14, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r13, rax
|
|
mov QWORD PTR [rbx+24], r13
|
|
; A[0] * B[4]
|
|
mulx r10, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
; A[0] * B[5]
|
|
mulx r11, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
; A[0] * B[6]
|
|
mulx r12, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
; A[0] * B[7]
|
|
mulx r13, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
mov QWORD PTR [rbx+56], r12
|
|
; A[0] * B[8]
|
|
mulx r14, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
; A[0] * B[9]
|
|
mulx r10, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
; A[0] * B[10]
|
|
mulx r11, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
; A[0] * B[11]
|
|
mulx r12, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
mov QWORD PTR [rbx+88], r11
|
|
; A[0] * B[12]
|
|
mulx r13, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
; A[0] * B[13]
|
|
mulx r14, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
; A[0] * B[14]
|
|
mulx r10, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
; A[0] * B[15]
|
|
mulx r11, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adcx r11, rdi
|
|
mov r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov QWORD PTR [r8], r11
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mov r11, QWORD PTR [rbx+8]
|
|
mov r12, QWORD PTR [rbx+16]
|
|
mov r13, QWORD PTR [rbx+24]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
; A[1] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[1] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[1] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[1] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+24], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+32], r14
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
; A[1] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[1] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[1] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+64], r13
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
; A[1] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[1] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[1] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+96], r12
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[1] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[1] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[1] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, rdi
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8], r11
|
|
mov QWORD PTR [r8+8], r12
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mov r12, QWORD PTR [rbx+16]
|
|
mov r13, QWORD PTR [rbx+24]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
; A[2] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[2] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[2] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+24], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[2] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+40], r10
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
; A[2] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[2] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[2] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+72], r14
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
; A[2] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[2] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[2] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+104], r13
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[2] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[2] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[2] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, rdi
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
adcx r13, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+8], r12
|
|
mov QWORD PTR [r8+16], r13
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mov r13, QWORD PTR [rbx+24]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
; A[3] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[3] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+24], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[3] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+48], r11
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
; A[3] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[3] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[3] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[3] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+80], r10
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
; A[3] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[3] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[3] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[3] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+112], r14
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[3] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[3] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[3] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, rdi
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
adcx r14, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+16], r13
|
|
mov QWORD PTR [r8+24], r14
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
; A[4] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[4] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[4] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+56], r12
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
; A[4] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[4] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[4] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+88], r11
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[4] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[4] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[4] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[4] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[4] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[4] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[4] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[4] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, rdi
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
adcx r10, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+24], r14
|
|
mov QWORD PTR [r8+32], r10
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
; A[5] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[5] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[5] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+64], r13
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
; A[5] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[5] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[5] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+96], r12
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[5] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[5] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[5] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[5] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[5] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[5] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[5] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+24], r14
|
|
mov r11, rdi
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+32], r10
|
|
mov QWORD PTR [r8+40], r11
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
; A[6] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[6] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[6] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+72], r14
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
; A[6] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[6] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+104], r13
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[6] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[6] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[6] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[6] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[6] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, rdi
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+40], r11
|
|
mov QWORD PTR [r8+48], r12
|
|
mov rdx, QWORD PTR [r9+56]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
; A[7] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[7] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[7] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[7] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+80], r10
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
; A[7] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[7] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[7] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+112], r14
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[7] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[7] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[7] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
; A[7] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[7] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[7] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r13, rdi
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
adcx r13, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+48], r12
|
|
mov QWORD PTR [r8+56], r13
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
; A[8] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[8] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[8] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+88], r11
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[8] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[8] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[8] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[8] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[8] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[8] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[8] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r14
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
; A[8] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[8] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r14, rdi
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
adcx r14, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+56], r13
|
|
mov QWORD PTR [r8+64], r14
|
|
mov rdx, QWORD PTR [r9+72]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
; A[9] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[9] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[9] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+96], r12
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[9] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[9] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[9] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[9] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[9] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[9] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[9] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
; A[9] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[9] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[9] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[9] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+56], r13
|
|
mov r10, rdi
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
adcx r10, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+64], r14
|
|
mov QWORD PTR [r8+72], r10
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
; A[10] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[10] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+104], r13
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[10] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[10] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
; A[10] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[10] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[10] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[10] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[10] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[10] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[10] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+64], r14
|
|
mov r11, rdi
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+72], r10
|
|
mov QWORD PTR [r8+80], r11
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
; A[11] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[11] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[11] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+112], r14
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[11] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[11] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
; A[11] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[11] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
; A[11] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[11] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[11] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+72], r10
|
|
mov r12, rdi
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+80], r11
|
|
mov QWORD PTR [r8+88], r12
|
|
mov rdx, QWORD PTR [r9+96]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[12] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[12] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[12] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[12] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[12] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[12] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[12] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r14
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
; A[12] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[12] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[12] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+56], r13
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, QWORD PTR [r8+88]
|
|
; A[12] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[12] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[12] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+80], r11
|
|
mov r13, rdi
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
adcx r13, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+88], r12
|
|
mov QWORD PTR [r8+96], r13
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[13] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[13] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[13] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[13] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[13] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[13] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[13] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[13] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[13] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[13] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+64], r14
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, QWORD PTR [r8+88]
|
|
mov r13, QWORD PTR [r8+96]
|
|
; A[13] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+80], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[13] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+88], r12
|
|
mov r14, rdi
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
adcx r14, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+96], r13
|
|
mov QWORD PTR [r8+104], r14
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[14] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[14] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[14] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
; A[14] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[14] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[14] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
; A[14] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[14] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[14] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[14] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+72], r10
|
|
mov r12, QWORD PTR [r8+88]
|
|
mov r13, QWORD PTR [r8+96]
|
|
mov r14, QWORD PTR [r8+104]
|
|
; A[14] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[14] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+80], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[14] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+88], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[14] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+96], r13
|
|
mov r10, rdi
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
adcx r10, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+104], r14
|
|
mov QWORD PTR [r8+112], r10
|
|
mov rdx, QWORD PTR [r9+120]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[15] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[15] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
; A[15] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[15] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, QWORD PTR [r8+88]
|
|
; A[15] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[15] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[15] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+80], r11
|
|
mov r13, QWORD PTR [r8+96]
|
|
mov r14, QWORD PTR [r8+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
; A[15] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[15] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+88], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[15] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+96], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[15] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+104], r14
|
|
mov r11, rdi
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r15
|
|
mov QWORD PTR [r8+112], r10
|
|
mov QWORD PTR [r8+120], r11
|
|
sub r8, 128
|
|
cmp r9, r8
|
|
je L_start_2048_mul_avx2_16
|
|
cmp rbp, r8
|
|
jne L_end_2048_mul_avx2_16
|
|
L_start_2048_mul_avx2_16:
|
|
vmovdqu xmm0, OWORD PTR [rbx]
|
|
vmovups OWORD PTR [r8], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+16]
|
|
vmovups OWORD PTR [r8+16], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+32]
|
|
vmovups OWORD PTR [r8+32], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+48]
|
|
vmovups OWORD PTR [r8+48], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+64]
|
|
vmovups OWORD PTR [r8+64], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+80]
|
|
vmovups OWORD PTR [r8+80], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+96]
|
|
vmovups OWORD PTR [r8+96], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+112]
|
|
vmovups OWORD PTR [r8+112], xmm0
|
|
L_end_2048_mul_avx2_16:
|
|
add rsp, 128
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
sp_2048_mul_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r  A single precision integer that receives the result (a * a).
|
|
; * a  A single precision integer: the value to be squared.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_2048_sqr_avx2_16 PROC
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
mov r8, rcx
|
|
mov r9, rdx
|
|
sub rsp, 128
|
|
cmp r9, r8
|
|
mov rbp, rsp
|
|
cmovne rbp, r8
|
|
add r8, 128
|
|
xor r13, r13
|
|
; Diagonal 1
|
|
xor r12, r12
|
|
; A[1] x A[0]
|
|
mov rdx, QWORD PTR [r9]
|
|
mulx r11, r10, QWORD PTR [r9+8]
|
|
; A[2] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+16]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+8], r10
|
|
mov QWORD PTR [rbp+16], r11
|
|
mov r10, r13
|
|
mov r11, r13
|
|
; A[3] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[4] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+24], r12
|
|
mov QWORD PTR [rbp+32], r10
|
|
mov r12, r13
|
|
mov r10, r13
|
|
; A[5] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+40], r11
|
|
mov QWORD PTR [rbp+48], r12
|
|
mov r11, r13
|
|
mov r12, r13
|
|
; A[7] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+56], r10
|
|
mov QWORD PTR [rbp+64], r11
|
|
mov r10, r13
|
|
mov r11, r13
|
|
; A[9] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[10] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+72], r12
|
|
mov QWORD PTR [rbp+80], r10
|
|
mov r12, r13
|
|
mov r10, r13
|
|
; A[11] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+88], r11
|
|
mov r15, r12
|
|
mov r11, r13
|
|
mov r12, r13
|
|
; A[13] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov rdi, r10
|
|
mov rsi, r11
|
|
mov r10, r13
|
|
; A[15] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+120]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov rbx, r12
|
|
; Carry
|
|
adcx r10, r13
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8], r10
|
|
; Diagonal 2
|
|
mov r10, QWORD PTR [rbp+24]
|
|
mov r11, QWORD PTR [rbp+32]
|
|
mov r12, QWORD PTR [rbp+40]
|
|
; A[2] x A[1]
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mulx rcx, rax, QWORD PTR [r9+16]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+24], r10
|
|
mov QWORD PTR [rbp+32], r11
|
|
mov r10, QWORD PTR [rbp+48]
|
|
mov r11, QWORD PTR [rbp+56]
|
|
; A[4] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[5] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+40], r12
|
|
mov QWORD PTR [rbp+48], r10
|
|
mov r12, QWORD PTR [rbp+64]
|
|
mov r10, QWORD PTR [rbp+72]
|
|
; A[6] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+56], r11
|
|
mov QWORD PTR [rbp+64], r12
|
|
mov r11, QWORD PTR [rbp+80]
|
|
mov r12, QWORD PTR [rbp+88]
|
|
; A[8] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+72], r10
|
|
mov QWORD PTR [rbp+80], r11
|
|
; No load %r13 - %r8
|
|
; No load %r14 - %r9
|
|
; A[10] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r12, rax
|
|
adox r15, rcx
|
|
; A[11] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r12
|
|
; No store %r13
|
|
; No load %r15 - %r10
|
|
; No load %rbx - %r8
|
|
; A[12] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[13] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, r13
|
|
; A[14] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; A[15] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+120]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r11
|
|
mov r10, r13
|
|
; A[15] x A[2]
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mulx rcx, rax, QWORD PTR [r9+120]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+16], r10
|
|
; Diagonal 3
|
|
mov r10, QWORD PTR [rbp+40]
|
|
mov r11, QWORD PTR [rbp+48]
|
|
mov r12, QWORD PTR [rbp+56]
|
|
; A[3] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+40], r10
|
|
mov QWORD PTR [rbp+48], r11
|
|
mov r10, QWORD PTR [rbp+64]
|
|
mov r11, QWORD PTR [rbp+72]
|
|
; A[5] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[6] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+56], r12
|
|
mov QWORD PTR [rbp+64], r10
|
|
mov r12, QWORD PTR [rbp+80]
|
|
mov r10, QWORD PTR [rbp+88]
|
|
; A[7] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+72], r11
|
|
mov QWORD PTR [rbp+80], r12
|
|
; No load %r13 - %r9
|
|
; No load %r14 - %r10
|
|
; A[9] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r15, rcx
|
|
; A[10] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r10
|
|
; No store %r13
|
|
; No load %r15 - %r8
|
|
; No load %rbx - %r9
|
|
; A[11] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[12] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r12, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [r8+8]
|
|
; A[13] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx rbx, rax
|
|
adox r12, rcx
|
|
; A[14] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r12
|
|
mov r11, QWORD PTR [r8+16]
|
|
mov r12, r13
|
|
; A[14] x A[3]
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r10
|
|
mov QWORD PTR [r8+16], r11
|
|
mov r10, r13
|
|
; A[14] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+32], r10
|
|
; Diagonal 4
|
|
mov r10, QWORD PTR [rbp+56]
|
|
mov r11, QWORD PTR [rbp+64]
|
|
mov r12, QWORD PTR [rbp+72]
|
|
; A[4] x A[3]
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+56], r10
|
|
mov QWORD PTR [rbp+64], r11
|
|
mov r10, QWORD PTR [rbp+80]
|
|
mov r11, QWORD PTR [rbp+88]
|
|
; A[6] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[7] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+72], r12
|
|
mov QWORD PTR [rbp+80], r10
|
|
; No load %r13 - %r10
|
|
; No load %r14 - %r8
|
|
; A[8] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r15, rcx
|
|
; A[9] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r11
|
|
; No store %r13
|
|
; No load %r15 - %r9
|
|
; No load %rbx - %r10
|
|
; A[10] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[11] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[12] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx rbx, rax
|
|
adox r10, rcx
|
|
; A[13] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[13] x A[4]
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
mov r12, r13
|
|
; A[13] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, r13
|
|
; A[13] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+48], r10
|
|
; Diagonal 5
|
|
mov r10, QWORD PTR [rbp+72]
|
|
mov r11, QWORD PTR [rbp+80]
|
|
mov r12, QWORD PTR [rbp+88]
|
|
; A[5] x A[4]
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+72], r10
|
|
mov QWORD PTR [rbp+80], r11
|
|
; No load %r13 - %r8
|
|
; No load %r14 - %r9
|
|
; A[7] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r15, rcx
|
|
; A[8] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r12
|
|
; No store %r13
|
|
; No load %r15 - %r10
|
|
; No load %rbx - %r8
|
|
; A[9] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[10] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[11] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; A[12] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[12] x A[5]
|
|
mov rdx, QWORD PTR [r9+96]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[12] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov QWORD PTR [r8+16], r10
|
|
mov r12, QWORD PTR [r8+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
; A[12] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r11
|
|
mov QWORD PTR [r8+32], r12
|
|
mov r11, QWORD PTR [r8+48]
|
|
mov r12, r13
|
|
; A[12] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[12] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r10
|
|
mov QWORD PTR [r8+48], r11
|
|
mov r10, r13
|
|
; A[12] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+56], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+64], r10
|
|
; Diagonal 6
|
|
mov r10, QWORD PTR [rbp+88]
|
|
; No load %r13 - %r9
|
|
; No load %r14 - %r10
|
|
; A[6] x A[5]
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r15, rcx
|
|
; A[7] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r10
|
|
; No store %r13
|
|
; No load %r15 - %r8
|
|
; No load %rbx - %r9
|
|
; A[8] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[9] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r12, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [r8+8]
|
|
; A[10] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx rbx, rax
|
|
adox r12, rcx
|
|
; A[11] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r12
|
|
mov r11, QWORD PTR [r8+16]
|
|
mov r12, QWORD PTR [r8+24]
|
|
; A[11] x A[6]
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r10
|
|
mov QWORD PTR [r8+16], r11
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[11] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[11] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+24], r12
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r10, QWORD PTR [r8+56]
|
|
; A[11] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] x A[9]
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r11, QWORD PTR [r8+64]
|
|
mov r12, r13
|
|
; A[13] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+56], r10
|
|
mov QWORD PTR [r8+64], r11
|
|
mov r10, r13
|
|
; A[13] x A[12]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+72], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+80], r10
|
|
; Diagonal 7
|
|
; No load %r14 - %r8
|
|
; No load %r15 - %r9
|
|
; No load %rbx - %r10
|
|
; A[7] x A[6]
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[8] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[9] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rbx, rax
|
|
adox r10, rcx
|
|
; A[10] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[10] x A[7]
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
mov r12, QWORD PTR [r8+40]
|
|
; A[10] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[6]
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[14] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[14] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+40], r12
|
|
mov QWORD PTR [r8+48], r10
|
|
mov r12, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[14] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[14] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+56], r11
|
|
mov QWORD PTR [r8+64], r12
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, r13
|
|
; A[14] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[12]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+72], r10
|
|
mov QWORD PTR [r8+80], r11
|
|
mov r10, r13
|
|
; A[14] x A[13]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+88], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+96], r10
|
|
; Diagonal 8
|
|
; No load %rbx - %r8
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[8] x A[7]
|
|
mov rdx, QWORD PTR [r9+56]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; A[9] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[9] x A[8]
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[15] x A[3]
|
|
mov rdx, QWORD PTR [r9+120]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov QWORD PTR [r8+16], r10
|
|
mov r12, QWORD PTR [r8+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
; A[15] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r11
|
|
mov QWORD PTR [r8+32], r12
|
|
mov r11, QWORD PTR [r8+48]
|
|
mov r12, QWORD PTR [r8+56]
|
|
; A[15] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r10
|
|
mov QWORD PTR [r8+48], r11
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
; A[15] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[15] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+56], r12
|
|
mov QWORD PTR [r8+64], r10
|
|
mov r12, QWORD PTR [r8+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
; A[15] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+72], r11
|
|
mov QWORD PTR [r8+80], r12
|
|
mov r11, QWORD PTR [r8+96]
|
|
mov r12, r13
|
|
; A[15] x A[12]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] x A[13]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+88], r10
|
|
mov QWORD PTR [r8+96], r11
|
|
mov r10, r13
|
|
; A[15] x A[14]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+104], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+112], r10
|
|
mov QWORD PTR [r8+120], r14
|
|
; Double and Add in A[i] x A[i]
|
|
mov r11, QWORD PTR [rbp+8]
|
|
; A[0] x A[0]
|
|
mov rdx, QWORD PTR [r9]
|
|
mulx rcx, rax, rdx
|
|
mov QWORD PTR [rbp], rax
|
|
adox r11, r11
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+8], r11
|
|
mov r10, QWORD PTR [rbp+16]
|
|
mov r11, QWORD PTR [rbp+24]
|
|
; A[1] x A[1]
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+16], r10
|
|
mov QWORD PTR [rbp+24], r11
|
|
mov r10, QWORD PTR [rbp+32]
|
|
mov r11, QWORD PTR [rbp+40]
|
|
; A[2] x A[2]
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+32], r10
|
|
mov QWORD PTR [rbp+40], r11
|
|
mov r10, QWORD PTR [rbp+48]
|
|
mov r11, QWORD PTR [rbp+56]
|
|
; A[3] x A[3]
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+48], r10
|
|
mov QWORD PTR [rbp+56], r11
|
|
mov r10, QWORD PTR [rbp+64]
|
|
mov r11, QWORD PTR [rbp+72]
|
|
; A[4] x A[4]
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+64], r10
|
|
mov QWORD PTR [rbp+72], r11
|
|
mov r10, QWORD PTR [rbp+80]
|
|
mov r11, QWORD PTR [rbp+88]
|
|
; A[5] x A[5]
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+80], r10
|
|
mov QWORD PTR [rbp+88], r11
|
|
; A[6] x A[6]
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mulx rcx, rax, rdx
|
|
adox r15, r15
|
|
adox rdi, rdi
|
|
adcx r15, rax
|
|
adcx rdi, rcx
|
|
; A[7] x A[7]
|
|
mov rdx, QWORD PTR [r9+56]
|
|
mulx rcx, rax, rdx
|
|
adox rsi, rsi
|
|
adox rbx, rbx
|
|
adcx rsi, rax
|
|
adcx rbx, rcx
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[8] x A[8]
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[9] x A[9]
|
|
mov rdx, QWORD PTR [r9+72]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+16], r10
|
|
mov QWORD PTR [r8+24], r11
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[10] x A[10]
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[11] x A[11]
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+48], r10
|
|
mov QWORD PTR [r8+56], r11
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
; A[12] x A[12]
|
|
mov rdx, QWORD PTR [r9+96]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+64], r10
|
|
mov QWORD PTR [r8+72], r11
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
; A[13] x A[13]
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+80], r10
|
|
mov QWORD PTR [r8+88], r11
|
|
mov r10, QWORD PTR [r8+96]
|
|
mov r11, QWORD PTR [r8+104]
|
|
; A[14] x A[14]
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+96], r10
|
|
mov QWORD PTR [r8+104], r11
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r11, QWORD PTR [r8+120]
|
|
; A[15] x A[15]
|
|
mov rdx, QWORD PTR [r9+120]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+112], r10
|
|
mov QWORD PTR [r8+120], r11
|
|
mov QWORD PTR [r8+-32], r15
|
|
mov QWORD PTR [r8+-24], rdi
|
|
mov QWORD PTR [r8+-16], rsi
|
|
mov QWORD PTR [r8+-8], rbx
|
|
sub r8, 128
|
|
cmp r9, r8
|
|
jne L_end_2048_sqr_avx2_16
|
|
vmovdqu xmm0, OWORD PTR [rbp]
|
|
vmovups OWORD PTR [r8], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+16]
|
|
vmovups OWORD PTR [r8+16], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+32]
|
|
vmovups OWORD PTR [r8+32], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+48]
|
|
vmovups OWORD PTR [r8+48], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+64]
|
|
vmovups OWORD PTR [r8+64], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+80]
|
|
vmovups OWORD PTR [r8+80], xmm0
|
|
L_end_2048_sqr_avx2_16:
|
|
add rsp, 128
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
ret
|
|
sp_2048_sqr_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Add two 16-word integers. (r = a + b)
;  *
;  * Each operand is 16 qwords (128 bytes), least significant word first.
;  *
;  * rcx  r  Result, 16 qwords.
;  * rdx  a  First operand, 16 qwords.
;  * r8   b  Second operand, 16 qwords.
;  *
;  * Returns the carry out of the top word in rax (0 or 1).
;  * Scratch: r9, r10, flags.
;  *
;  * Word k+1 of a is always loaded before word k of r is stored, and
;  * word k of b is consumed before word k of r is stored.
;  */
_text SEGMENT READONLY PARA
sp_2048_add_16 PROC
        ; rax = 0 now; the closing adc turns it into the carry out.
        xor     eax, eax
        ; Word 0 opens the carry chain with a plain add.
        mov     r9, QWORD PTR [rdx]
        add     r9, QWORD PTR [r8]
        ; Words 1..14, two per pass, ping-ponging between r10 and r9.
        ; Only mov sits between the adc instructions, so CF is preserved.
        add16_off = 8
        REPEAT 7
        mov     r10, QWORD PTR [rdx+add16_off]
        mov     QWORD PTR [rcx+add16_off-8], r9
        adc     r10, QWORD PTR [r8+add16_off]
        mov     r9, QWORD PTR [rdx+add16_off+8]
        mov     QWORD PTR [rcx+add16_off], r10
        adc     r9, QWORD PTR [r8+add16_off+8]
        add16_off = add16_off + 16
        ENDM
        ; Word 15 (top word).
        mov     r10, QWORD PTR [rdx+120]
        mov     QWORD PTR [rcx+112], r9
        adc     r10, QWORD PTR [r8+120]
        mov     QWORD PTR [rcx+120], r10
        ; Fold the final carry into the return value.
        adc     rax, 0
        ret
sp_2048_add_16 ENDP
_text ENDS
|
|
; /* Subtract b from a, writing the result back over a. (a -= b)
;  *
;  * Each operand is 32 qwords (256 bytes), least significant word first.
;  *
;  * rcx  a  Minuend on entry, difference on return, 32 qwords.
;  * rdx  b  Subtrahend, 32 qwords.
;  *
;  * Returns the borrow out of the top word in rax as a mask:
;  * 0 when there was no borrow, all ones (-1) when there was.
;  * Scratch: r8, r9, flags.
;  */
_text SEGMENT READONLY PARA
sp_2048_sub_in_place_32 PROC
        ; rax = 0 now; the closing sbb turns it into 0 or -1.
        xor     eax, eax
        ; Word 0 opens the borrow chain with a plain sub.
        mov     r8, QWORD PTR [rcx]
        sub     r8, QWORD PTR [rdx]
        ; Words 1..30, two per pass, ping-ponging between r9 and r8.
        ; Only mov sits between the sbb instructions, so CF is preserved.
        sub32_off = 8
        REPEAT 15
        mov     r9, QWORD PTR [rcx+sub32_off]
        mov     QWORD PTR [rcx+sub32_off-8], r8
        sbb     r9, QWORD PTR [rdx+sub32_off]
        mov     r8, QWORD PTR [rcx+sub32_off+8]
        mov     QWORD PTR [rcx+sub32_off], r9
        sbb     r8, QWORD PTR [rdx+sub32_off+8]
        sub32_off = sub32_off + 16
        ENDM
        ; Word 31 (top word).
        mov     r9, QWORD PTR [rcx+248]
        mov     QWORD PTR [rcx+240], r8
        sbb     r9, QWORD PTR [rdx+248]
        mov     QWORD PTR [rcx+248], r9
        ; 0 - 0 - CF: yields 0 or all ones.
        sbb     rax, 0
        ret
sp_2048_sub_in_place_32 ENDP
_text ENDS
|
|
; /* Add two 32-word integers. (r = a + b)
;  *
;  * Each operand is 32 qwords (256 bytes), least significant word first.
;  *
;  * rcx  r  Result, 32 qwords.
;  * rdx  a  First operand, 32 qwords.
;  * r8   b  Second operand, 32 qwords.
;  *
;  * Returns the carry out of the top word in rax (0 or 1).
;  * Scratch: r9, r10, flags.
;  *
;  * Word k+1 of a is always loaded before word k of r is stored, and
;  * word k of b is consumed before word k of r is stored.
;  */
_text SEGMENT READONLY PARA
sp_2048_add_32 PROC
        ; rax = 0 now; the closing adc turns it into the carry out.
        xor     eax, eax
        ; Word 0 opens the carry chain with a plain add.
        mov     r9, QWORD PTR [rdx]
        add     r9, QWORD PTR [r8]
        ; Words 1..30, two per pass, ping-ponging between r10 and r9.
        ; Only mov sits between the adc instructions, so CF is preserved.
        add32_off = 8
        REPEAT 15
        mov     r10, QWORD PTR [rdx+add32_off]
        mov     QWORD PTR [rcx+add32_off-8], r9
        adc     r10, QWORD PTR [r8+add32_off]
        mov     r9, QWORD PTR [rdx+add32_off+8]
        mov     QWORD PTR [rcx+add32_off], r10
        adc     r9, QWORD PTR [r8+add32_off+8]
        add32_off = add32_off + 16
        ENDM
        ; Word 31 (top word).
        mov     r10, QWORD PTR [rdx+248]
        mov     QWORD PTR [rcx+240], r9
        adc     r10, QWORD PTR [r8+248]
        mov     QWORD PTR [rcx+248], r10
        ; Fold the final carry into the return value.
        adc     rax, 0
        ret
sp_2048_add_32 ENDP
_text ENDS
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_2048_mul_32 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 808
|
|
mov QWORD PTR [rsp+768], rcx
|
|
mov QWORD PTR [rsp+776], rdx
|
|
mov QWORD PTR [rsp+784], r8
|
|
lea r12, QWORD PTR [rsp+512]
|
|
lea r14, QWORD PTR [rdx+128]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r15, r15
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r12], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov QWORD PTR [r12+120], rax
|
|
adc r15, 0
|
|
mov QWORD PTR [rsp+792], r15
|
|
lea r13, QWORD PTR [rsp+640]
|
|
lea r14, QWORD PTR [r8+128]
|
|
; Add
|
|
mov rax, QWORD PTR [r8]
|
|
xor rdi, rdi
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [r8+8]
|
|
mov QWORD PTR [r13], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov QWORD PTR [r13+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mov QWORD PTR [r13+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [r8+32]
|
|
mov QWORD PTR [r13+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov QWORD PTR [r13+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mov QWORD PTR [r13+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [r8+56]
|
|
mov QWORD PTR [r13+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov QWORD PTR [r13+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mov QWORD PTR [r13+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [r8+80]
|
|
mov QWORD PTR [r13+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [r13+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mov QWORD PTR [r13+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [r8+104]
|
|
mov QWORD PTR [r13+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov QWORD PTR [r13+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mov QWORD PTR [r13+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov QWORD PTR [r13+120], rax
|
|
adc rdi, 0
|
|
mov QWORD PTR [rsp+800], rdi
|
|
mov r8, r13
|
|
mov rdx, r12
|
|
mov rcx, rsp
|
|
call sp_2048_mul_16
|
|
mov r8, QWORD PTR [rsp+784]
|
|
mov rdx, QWORD PTR [rsp+776]
|
|
lea rcx, QWORD PTR [rsp+256]
|
|
add r8, 128
|
|
add rdx, 128
|
|
call sp_2048_mul_16
|
|
mov r8, QWORD PTR [rsp+784]
|
|
mov rdx, QWORD PTR [rsp+776]
|
|
mov rcx, QWORD PTR [rsp+768]
|
|
call sp_2048_mul_16
|
|
IFDEF _WIN64
|
|
mov r8, QWORD PTR [rsp+784]
|
|
mov rdx, QWORD PTR [rsp+776]
|
|
mov rcx, QWORD PTR [rsp+768]
|
|
ENDIF
|
|
mov r15, QWORD PTR [rsp+792]
|
|
mov rdi, QWORD PTR [rsp+800]
|
|
mov rsi, QWORD PTR [rsp+768]
|
|
mov r11, r15
|
|
lea r12, QWORD PTR [rsp+512]
|
|
lea r13, QWORD PTR [rsp+640]
|
|
and r11, rdi
|
|
neg r15
|
|
neg rdi
|
|
add rsi, 256
|
|
mov rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [r13]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12], rax
|
|
mov QWORD PTR [r13], r9
|
|
mov rax, QWORD PTR [r12+8]
|
|
mov r9, QWORD PTR [r13+8]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+8], rax
|
|
mov QWORD PTR [r13+8], r9
|
|
mov rax, QWORD PTR [r12+16]
|
|
mov r9, QWORD PTR [r13+16]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+16], rax
|
|
mov QWORD PTR [r13+16], r9
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [r13+24]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+24], rax
|
|
mov QWORD PTR [r13+24], r9
|
|
mov rax, QWORD PTR [r12+32]
|
|
mov r9, QWORD PTR [r13+32]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+32], rax
|
|
mov QWORD PTR [r13+32], r9
|
|
mov rax, QWORD PTR [r12+40]
|
|
mov r9, QWORD PTR [r13+40]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+40], rax
|
|
mov QWORD PTR [r13+40], r9
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [r13+48]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+48], rax
|
|
mov QWORD PTR [r13+48], r9
|
|
mov rax, QWORD PTR [r12+56]
|
|
mov r9, QWORD PTR [r13+56]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+56], rax
|
|
mov QWORD PTR [r13+56], r9
|
|
mov rax, QWORD PTR [r12+64]
|
|
mov r9, QWORD PTR [r13+64]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+64], rax
|
|
mov QWORD PTR [r13+64], r9
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [r13+72]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+72], rax
|
|
mov QWORD PTR [r13+72], r9
|
|
mov rax, QWORD PTR [r12+80]
|
|
mov r9, QWORD PTR [r13+80]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+80], rax
|
|
mov QWORD PTR [r13+80], r9
|
|
mov rax, QWORD PTR [r12+88]
|
|
mov r9, QWORD PTR [r13+88]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+88], rax
|
|
mov QWORD PTR [r13+88], r9
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [r13+96]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+96], rax
|
|
mov QWORD PTR [r13+96], r9
|
|
mov rax, QWORD PTR [r12+104]
|
|
mov r9, QWORD PTR [r13+104]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+104], rax
|
|
mov QWORD PTR [r13+104], r9
|
|
mov rax, QWORD PTR [r12+112]
|
|
mov r9, QWORD PTR [r13+112]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+112], rax
|
|
mov QWORD PTR [r13+112], r9
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [r13+120]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+120], rax
|
|
mov QWORD PTR [r13+120], r9
|
|
mov rax, QWORD PTR [r12]
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r11, 0
|
|
lea r13, QWORD PTR [rsp+256]
|
|
mov r12, rsp
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [r13+248]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r11, 0
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [rcx+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [rcx+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [rcx+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [rcx+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [rcx+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [rcx+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [rcx+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [rcx+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [rcx+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [rcx+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [rcx+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [rcx+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [rcx+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [rcx+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [rcx+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [rcx+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [rcx+248]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r11, 0
|
|
sub rsi, 128
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r12+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r12+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r12+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r12+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+384], r11
|
|
add rsi, 128
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
xor r11, r11
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r11, 0
|
|
; Add to zero
|
|
mov rax, QWORD PTR [r13+136]
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+144]
|
|
mov QWORD PTR [rsi+136], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+152]
|
|
mov QWORD PTR [rsi+144], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+160]
|
|
mov QWORD PTR [rsi+152], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+168]
|
|
mov QWORD PTR [rsi+160], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+176]
|
|
mov QWORD PTR [rsi+168], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+184]
|
|
mov QWORD PTR [rsi+176], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+192]
|
|
mov QWORD PTR [rsi+184], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+200]
|
|
mov QWORD PTR [rsi+192], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+208]
|
|
mov QWORD PTR [rsi+200], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+216]
|
|
mov QWORD PTR [rsi+208], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+224]
|
|
mov QWORD PTR [rsi+216], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+232]
|
|
mov QWORD PTR [rsi+224], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+240]
|
|
mov QWORD PTR [rsi+232], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+248]
|
|
mov QWORD PTR [rsi+240], r9
|
|
adc r10, 0
|
|
mov QWORD PTR [rsi+248], r10
|
|
add rsp, 808
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_2048_mul_32 ENDP
|
|
_text ENDS
|
|
; /* Double a 16-word number. (r = a + a)
;  *
;  * In:   rcx = r  destination, 16 64-bit words
;  *       rdx = a  source, 16 64-bit words
;  * Out:  rax = carry out of the most significant word (0 or 1)
;  * Uses: r8, r9 as alternating scratch registers; flags are modified.
;  *
;  * Word i+1 of a is always loaded before word i of r is stored, so the
;  * load of the next word overlaps the store of the previous one.  The
;  * carry lives in CF for the whole chain; only mov (which leaves flags
;  * untouched) is placed between the add/adc instructions.
;  */
_text SEGMENT READONLY PARA
sp_2048_dbl_16 PROC
        xor     eax, eax                        ; rax = 0, collects final carry
        mov     r8, QWORD PTR [rdx]
        add     r8, r8                          ; word 0 starts the carry chain
        mov     r9, QWORD PTR [rdx+8]
        mov     QWORD PTR [rcx], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+16]
        mov     QWORD PTR [rcx+8], r9
        adc     r8, r8
        mov     r9, QWORD PTR [rdx+24]
        mov     QWORD PTR [rcx+16], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+32]
        mov     QWORD PTR [rcx+24], r9
        adc     r8, r8
        mov     r9, QWORD PTR [rdx+40]
        mov     QWORD PTR [rcx+32], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+48]
        mov     QWORD PTR [rcx+40], r9
        adc     r8, r8
        mov     r9, QWORD PTR [rdx+56]
        mov     QWORD PTR [rcx+48], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+64]
        mov     QWORD PTR [rcx+56], r9
        adc     r8, r8
        mov     r9, QWORD PTR [rdx+72]
        mov     QWORD PTR [rcx+64], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+80]
        mov     QWORD PTR [rcx+72], r9
        adc     r8, r8
        mov     r9, QWORD PTR [rdx+88]
        mov     QWORD PTR [rcx+80], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+96]
        mov     QWORD PTR [rcx+88], r9
        adc     r8, r8
        mov     r9, QWORD PTR [rdx+104]
        mov     QWORD PTR [rcx+96], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+112]
        mov     QWORD PTR [rcx+104], r9
        adc     r8, r8
        mov     r9, QWORD PTR [rdx+120]
        mov     QWORD PTR [rcx+112], r8
        adc     r9, r9
        mov     QWORD PTR [rcx+120], r9
        adc     rax, 0                          ; rax = carry out of word 15
        ret
sp_2048_dbl_16 ENDP
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
;  *
;  * In:   rcx = r  destination, 64 64-bit words (all 64 are written)
;  *       rdx = a  source, 32 64-bit words
;  * Uses: rax, r8, r9, r10, r11 as scratch; r12 is saved and restored.
;  *
;  * Karatsuba-style squaring built on three calls to sp_2048_sqr_16.
;  * With a = a0 + a1 * 2^1024 (a0 = words 0..15, a1 = words 16..31):
;  *
;  *   z0 = a0^2                  -> written straight into r[0..31]
;  *   z2 = a1^2                  -> [rsp+256]
;  *   z1 = ((a0 + a1) mod 2^1024)^2 -> [rsp+0]
;  *   r  = z0 + (z1 + carry terms - z0 - z2) * 2^1024 + z2 * 2^2048
;  *
;  * Stack frame (664 bytes):
;  *   [rsp+0   .. rsp+255]  z1, later the middle term
;  *   [rsp+256 .. rsp+511]  z2
;  *   [rsp+512 .. rsp+639]  low 16 words of a0 + a1
;  *   [rsp+640]             saved r
;  *   [rsp+648]             saved a
;  *   [rsp+656]             carry out of a0 + a1 (0 or 1)
;  *
;  * NOTE(review): sp_2048_sqr_16 is defined elsewhere; this code assumes it
;  * writes 32 words to its rcx argument and does not use caller-provided
;  * shadow space (the first call passes rcx = rsp) - confirm.
;  * NOTE(review): with an ABI-aligned entry, rsp is 8 mod 16 at the three
;  * call sites (8 pushed + 664 allocated) - confirm the callee does not
;  * need 16-byte stack alignment.
;  *
;  * Fix: the final "add z2" chain previously executed "adc r9, 0" (r9 == 0)
;  * between the 17-word add and the "Add to zero" chain.  That instruction
;  * always leaves CF clear, so a carry out of r[48] was never propagated
;  * into r[49..63].  The instruction (and the xor that zeroed r9 for it)
;  * has been removed so CF flows directly into the "Add to zero" chain.
;  */
_text SEGMENT READONLY PARA
sp_2048_sqr_32 PROC
        push    r12
        sub     rsp, 664
        mov     QWORD PTR [rsp+640], rcx        ; save r
        mov     QWORD PTR [rsp+648], rdx        ; save a
        lea     r10, QWORD PTR [rsp+512]        ; r10 -> buffer for a0 + a1
        lea     r11, QWORD PTR [rdx+128]        ; r11 -> a1
        ; Add: [r10] = a0 + a1 (16 words), carry out collected in r9
        mov     rax, QWORD PTR [rdx]
        xor     r9, r9
        add     rax, QWORD PTR [r11]
        mov     r8, QWORD PTR [rdx+8]
        mov     QWORD PTR [r10], rax
        adc     r8, QWORD PTR [r11+8]
        mov     rax, QWORD PTR [rdx+16]
        mov     QWORD PTR [r10+8], r8
        adc     rax, QWORD PTR [r11+16]
        mov     r8, QWORD PTR [rdx+24]
        mov     QWORD PTR [r10+16], rax
        adc     r8, QWORD PTR [r11+24]
        mov     rax, QWORD PTR [rdx+32]
        mov     QWORD PTR [r10+24], r8
        adc     rax, QWORD PTR [r11+32]
        mov     r8, QWORD PTR [rdx+40]
        mov     QWORD PTR [r10+32], rax
        adc     r8, QWORD PTR [r11+40]
        mov     rax, QWORD PTR [rdx+48]
        mov     QWORD PTR [r10+40], r8
        adc     rax, QWORD PTR [r11+48]
        mov     r8, QWORD PTR [rdx+56]
        mov     QWORD PTR [r10+48], rax
        adc     r8, QWORD PTR [r11+56]
        mov     rax, QWORD PTR [rdx+64]
        mov     QWORD PTR [r10+56], r8
        adc     rax, QWORD PTR [r11+64]
        mov     r8, QWORD PTR [rdx+72]
        mov     QWORD PTR [r10+64], rax
        adc     r8, QWORD PTR [r11+72]
        mov     rax, QWORD PTR [rdx+80]
        mov     QWORD PTR [r10+72], r8
        adc     rax, QWORD PTR [r11+80]
        mov     r8, QWORD PTR [rdx+88]
        mov     QWORD PTR [r10+80], rax
        adc     r8, QWORD PTR [r11+88]
        mov     rax, QWORD PTR [rdx+96]
        mov     QWORD PTR [r10+88], r8
        adc     rax, QWORD PTR [r11+96]
        mov     r8, QWORD PTR [rdx+104]
        mov     QWORD PTR [r10+96], rax
        adc     r8, QWORD PTR [r11+104]
        mov     rax, QWORD PTR [rdx+112]
        mov     QWORD PTR [r10+104], r8
        adc     rax, QWORD PTR [r11+112]
        mov     r8, QWORD PTR [rdx+120]
        mov     QWORD PTR [r10+112], rax
        adc     r8, QWORD PTR [r11+120]
        mov     QWORD PTR [r10+120], r8
        adc     r9, 0
        mov     QWORD PTR [rsp+656], r9         ; keep the carry across the calls
        ; z1 = (a0 + a1 mod 2^1024)^2 -> [rsp]
        mov     rdx, r10
        mov     rcx, rsp
        call    sp_2048_sqr_16
        ; z2 = a1^2 -> [rsp+256]
        mov     rdx, QWORD PTR [rsp+648]
        lea     rcx, QWORD PTR [rsp+256]
        add     rdx, 128
        call    sp_2048_sqr_16
        ; z0 = a0^2 -> r[0..31]
        mov     rdx, QWORD PTR [rsp+648]
        mov     rcx, QWORD PTR [rsp+640]
        call    sp_2048_sqr_16
IFDEF _WIN64
        mov     rdx, QWORD PTR [rsp+648]
        mov     rcx, QWORD PTR [rsp+640]
ENDIF
        mov     r12, QWORD PTR [rsp+656]
        lea     r10, QWORD PTR [rsp+512]
        mov     r9, r12                         ; r9 = carry of a0 + a1
        neg     r12                             ; r12 = all-ones mask if carry, else 0
        ; r[32..47] = (a0 + a1 low words) AND mask
        mov     rax, QWORD PTR [r10]
        mov     r8, QWORD PTR [r10+8]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+256], rax
        mov     QWORD PTR [rcx+264], r8
        mov     rax, QWORD PTR [r10+16]
        mov     r8, QWORD PTR [r10+24]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+272], rax
        mov     QWORD PTR [rcx+280], r8
        mov     rax, QWORD PTR [r10+32]
        mov     r8, QWORD PTR [r10+40]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+288], rax
        mov     QWORD PTR [rcx+296], r8
        mov     rax, QWORD PTR [r10+48]
        mov     r8, QWORD PTR [r10+56]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+304], rax
        mov     QWORD PTR [rcx+312], r8
        mov     rax, QWORD PTR [r10+64]
        mov     r8, QWORD PTR [r10+72]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+320], rax
        mov     QWORD PTR [rcx+328], r8
        mov     rax, QWORD PTR [r10+80]
        mov     r8, QWORD PTR [r10+88]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+336], rax
        mov     QWORD PTR [rcx+344], r8
        mov     rax, QWORD PTR [r10+96]
        mov     r8, QWORD PTR [r10+104]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+352], rax
        mov     QWORD PTR [rcx+360], r8
        mov     rax, QWORD PTR [r10+112]
        mov     r8, QWORD PTR [r10+120]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [rcx+368], rax
        mov     QWORD PTR [rcx+376], r8
        ; Double r[32..47] in place; carry out is added to r9
        mov     rax, QWORD PTR [rcx+256]
        add     rax, rax
        mov     r8, QWORD PTR [rcx+264]
        mov     QWORD PTR [rcx+256], rax
        adc     r8, r8
        mov     rax, QWORD PTR [rcx+272]
        mov     QWORD PTR [rcx+264], r8
        adc     rax, rax
        mov     r8, QWORD PTR [rcx+280]
        mov     QWORD PTR [rcx+272], rax
        adc     r8, r8
        mov     rax, QWORD PTR [rcx+288]
        mov     QWORD PTR [rcx+280], r8
        adc     rax, rax
        mov     r8, QWORD PTR [rcx+296]
        mov     QWORD PTR [rcx+288], rax
        adc     r8, r8
        mov     rax, QWORD PTR [rcx+304]
        mov     QWORD PTR [rcx+296], r8
        adc     rax, rax
        mov     r8, QWORD PTR [rcx+312]
        mov     QWORD PTR [rcx+304], rax
        adc     r8, r8
        mov     rax, QWORD PTR [rcx+320]
        mov     QWORD PTR [rcx+312], r8
        adc     rax, rax
        mov     r8, QWORD PTR [rcx+328]
        mov     QWORD PTR [rcx+320], rax
        adc     r8, r8
        mov     rax, QWORD PTR [rcx+336]
        mov     QWORD PTR [rcx+328], r8
        adc     rax, rax
        mov     r8, QWORD PTR [rcx+344]
        mov     QWORD PTR [rcx+336], rax
        adc     r8, r8
        mov     rax, QWORD PTR [rcx+352]
        mov     QWORD PTR [rcx+344], r8
        adc     rax, rax
        mov     r8, QWORD PTR [rcx+360]
        mov     QWORD PTR [rcx+352], rax
        adc     r8, r8
        mov     rax, QWORD PTR [rcx+368]
        mov     QWORD PTR [rcx+360], r8
        adc     rax, rax
        mov     r8, QWORD PTR [rcx+376]
        mov     QWORD PTR [rcx+368], rax
        adc     r8, r8
        mov     QWORD PTR [rcx+376], r8
        adc     r9, 0
        lea     rdx, QWORD PTR [rsp+256]        ; rdx -> z2
        mov     r10, rsp                        ; r10 -> z1
        ; Sub: z1 -= z2 (32 words), borrow taken from r9
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [r10+8]
        mov     QWORD PTR [r10], rax
        sbb     r8, QWORD PTR [rdx+8]
        mov     rax, QWORD PTR [r10+16]
        mov     QWORD PTR [r10+8], r8
        sbb     rax, QWORD PTR [rdx+16]
        mov     r8, QWORD PTR [r10+24]
        mov     QWORD PTR [r10+16], rax
        sbb     r8, QWORD PTR [rdx+24]
        mov     rax, QWORD PTR [r10+32]
        mov     QWORD PTR [r10+24], r8
        sbb     rax, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [r10+40]
        mov     QWORD PTR [r10+32], rax
        sbb     r8, QWORD PTR [rdx+40]
        mov     rax, QWORD PTR [r10+48]
        mov     QWORD PTR [r10+40], r8
        sbb     rax, QWORD PTR [rdx+48]
        mov     r8, QWORD PTR [r10+56]
        mov     QWORD PTR [r10+48], rax
        sbb     r8, QWORD PTR [rdx+56]
        mov     rax, QWORD PTR [r10+64]
        mov     QWORD PTR [r10+56], r8
        sbb     rax, QWORD PTR [rdx+64]
        mov     r8, QWORD PTR [r10+72]
        mov     QWORD PTR [r10+64], rax
        sbb     r8, QWORD PTR [rdx+72]
        mov     rax, QWORD PTR [r10+80]
        mov     QWORD PTR [r10+72], r8
        sbb     rax, QWORD PTR [rdx+80]
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [r10+80], rax
        sbb     r8, QWORD PTR [rdx+88]
        mov     rax, QWORD PTR [r10+96]
        mov     QWORD PTR [r10+88], r8
        sbb     rax, QWORD PTR [rdx+96]
        mov     r8, QWORD PTR [r10+104]
        mov     QWORD PTR [r10+96], rax
        sbb     r8, QWORD PTR [rdx+104]
        mov     rax, QWORD PTR [r10+112]
        mov     QWORD PTR [r10+104], r8
        sbb     rax, QWORD PTR [rdx+112]
        mov     r8, QWORD PTR [r10+120]
        mov     QWORD PTR [r10+112], rax
        sbb     r8, QWORD PTR [rdx+120]
        mov     rax, QWORD PTR [r10+128]
        mov     QWORD PTR [r10+120], r8
        sbb     rax, QWORD PTR [rdx+128]
        mov     r8, QWORD PTR [r10+136]
        mov     QWORD PTR [r10+128], rax
        sbb     r8, QWORD PTR [rdx+136]
        mov     rax, QWORD PTR [r10+144]
        mov     QWORD PTR [r10+136], r8
        sbb     rax, QWORD PTR [rdx+144]
        mov     r8, QWORD PTR [r10+152]
        mov     QWORD PTR [r10+144], rax
        sbb     r8, QWORD PTR [rdx+152]
        mov     rax, QWORD PTR [r10+160]
        mov     QWORD PTR [r10+152], r8
        sbb     rax, QWORD PTR [rdx+160]
        mov     r8, QWORD PTR [r10+168]
        mov     QWORD PTR [r10+160], rax
        sbb     r8, QWORD PTR [rdx+168]
        mov     rax, QWORD PTR [r10+176]
        mov     QWORD PTR [r10+168], r8
        sbb     rax, QWORD PTR [rdx+176]
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rdx+184]
        mov     rax, QWORD PTR [r10+192]
        mov     QWORD PTR [r10+184], r8
        sbb     rax, QWORD PTR [rdx+192]
        mov     r8, QWORD PTR [r10+200]
        mov     QWORD PTR [r10+192], rax
        sbb     r8, QWORD PTR [rdx+200]
        mov     rax, QWORD PTR [r10+208]
        mov     QWORD PTR [r10+200], r8
        sbb     rax, QWORD PTR [rdx+208]
        mov     r8, QWORD PTR [r10+216]
        mov     QWORD PTR [r10+208], rax
        sbb     r8, QWORD PTR [rdx+216]
        mov     rax, QWORD PTR [r10+224]
        mov     QWORD PTR [r10+216], r8
        sbb     rax, QWORD PTR [rdx+224]
        mov     r8, QWORD PTR [r10+232]
        mov     QWORD PTR [r10+224], rax
        sbb     r8, QWORD PTR [rdx+232]
        mov     rax, QWORD PTR [r10+240]
        mov     QWORD PTR [r10+232], r8
        sbb     rax, QWORD PTR [rdx+240]
        mov     r8, QWORD PTR [r10+248]
        mov     QWORD PTR [r10+240], rax
        sbb     r8, QWORD PTR [rdx+248]
        mov     QWORD PTR [r10+248], r8
        sbb     r9, 0
        ; Sub: z1 -= z0 (r[0..31]), borrow taken from r9
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rcx]
        mov     r8, QWORD PTR [r10+8]
        mov     QWORD PTR [r10], rax
        sbb     r8, QWORD PTR [rcx+8]
        mov     rax, QWORD PTR [r10+16]
        mov     QWORD PTR [r10+8], r8
        sbb     rax, QWORD PTR [rcx+16]
        mov     r8, QWORD PTR [r10+24]
        mov     QWORD PTR [r10+16], rax
        sbb     r8, QWORD PTR [rcx+24]
        mov     rax, QWORD PTR [r10+32]
        mov     QWORD PTR [r10+24], r8
        sbb     rax, QWORD PTR [rcx+32]
        mov     r8, QWORD PTR [r10+40]
        mov     QWORD PTR [r10+32], rax
        sbb     r8, QWORD PTR [rcx+40]
        mov     rax, QWORD PTR [r10+48]
        mov     QWORD PTR [r10+40], r8
        sbb     rax, QWORD PTR [rcx+48]
        mov     r8, QWORD PTR [r10+56]
        mov     QWORD PTR [r10+48], rax
        sbb     r8, QWORD PTR [rcx+56]
        mov     rax, QWORD PTR [r10+64]
        mov     QWORD PTR [r10+56], r8
        sbb     rax, QWORD PTR [rcx+64]
        mov     r8, QWORD PTR [r10+72]
        mov     QWORD PTR [r10+64], rax
        sbb     r8, QWORD PTR [rcx+72]
        mov     rax, QWORD PTR [r10+80]
        mov     QWORD PTR [r10+72], r8
        sbb     rax, QWORD PTR [rcx+80]
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [r10+80], rax
        sbb     r8, QWORD PTR [rcx+88]
        mov     rax, QWORD PTR [r10+96]
        mov     QWORD PTR [r10+88], r8
        sbb     rax, QWORD PTR [rcx+96]
        mov     r8, QWORD PTR [r10+104]
        mov     QWORD PTR [r10+96], rax
        sbb     r8, QWORD PTR [rcx+104]
        mov     rax, QWORD PTR [r10+112]
        mov     QWORD PTR [r10+104], r8
        sbb     rax, QWORD PTR [rcx+112]
        mov     r8, QWORD PTR [r10+120]
        mov     QWORD PTR [r10+112], rax
        sbb     r8, QWORD PTR [rcx+120]
        mov     rax, QWORD PTR [r10+128]
        mov     QWORD PTR [r10+120], r8
        sbb     rax, QWORD PTR [rcx+128]
        mov     r8, QWORD PTR [r10+136]
        mov     QWORD PTR [r10+128], rax
        sbb     r8, QWORD PTR [rcx+136]
        mov     rax, QWORD PTR [r10+144]
        mov     QWORD PTR [r10+136], r8
        sbb     rax, QWORD PTR [rcx+144]
        mov     r8, QWORD PTR [r10+152]
        mov     QWORD PTR [r10+144], rax
        sbb     r8, QWORD PTR [rcx+152]
        mov     rax, QWORD PTR [r10+160]
        mov     QWORD PTR [r10+152], r8
        sbb     rax, QWORD PTR [rcx+160]
        mov     r8, QWORD PTR [r10+168]
        mov     QWORD PTR [r10+160], rax
        sbb     r8, QWORD PTR [rcx+168]
        mov     rax, QWORD PTR [r10+176]
        mov     QWORD PTR [r10+168], r8
        sbb     rax, QWORD PTR [rcx+176]
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rcx+184]
        mov     rax, QWORD PTR [r10+192]
        mov     QWORD PTR [r10+184], r8
        sbb     rax, QWORD PTR [rcx+192]
        mov     r8, QWORD PTR [r10+200]
        mov     QWORD PTR [r10+192], rax
        sbb     r8, QWORD PTR [rcx+200]
        mov     rax, QWORD PTR [r10+208]
        mov     QWORD PTR [r10+200], r8
        sbb     rax, QWORD PTR [rcx+208]
        mov     r8, QWORD PTR [r10+216]
        mov     QWORD PTR [r10+208], rax
        sbb     r8, QWORD PTR [rcx+216]
        mov     rax, QWORD PTR [r10+224]
        mov     QWORD PTR [r10+216], r8
        sbb     rax, QWORD PTR [rcx+224]
        mov     r8, QWORD PTR [r10+232]
        mov     QWORD PTR [r10+224], rax
        sbb     r8, QWORD PTR [rcx+232]
        mov     rax, QWORD PTR [r10+240]
        mov     QWORD PTR [r10+232], r8
        sbb     rax, QWORD PTR [rcx+240]
        mov     r8, QWORD PTR [r10+248]
        mov     QWORD PTR [r10+240], rax
        sbb     r8, QWORD PTR [rcx+248]
        mov     QWORD PTR [r10+248], r8
        sbb     r9, 0
        ; Add in place: r[16..47] += middle term (32 words), carry into r9
        mov     rax, QWORD PTR [rcx+128]
        add     rax, QWORD PTR [r10]
        mov     r8, QWORD PTR [rcx+136]
        mov     QWORD PTR [rcx+128], rax
        adc     r8, QWORD PTR [r10+8]
        mov     rax, QWORD PTR [rcx+144]
        mov     QWORD PTR [rcx+136], r8
        adc     rax, QWORD PTR [r10+16]
        mov     r8, QWORD PTR [rcx+152]
        mov     QWORD PTR [rcx+144], rax
        adc     r8, QWORD PTR [r10+24]
        mov     rax, QWORD PTR [rcx+160]
        mov     QWORD PTR [rcx+152], r8
        adc     rax, QWORD PTR [r10+32]
        mov     r8, QWORD PTR [rcx+168]
        mov     QWORD PTR [rcx+160], rax
        adc     r8, QWORD PTR [r10+40]
        mov     rax, QWORD PTR [rcx+176]
        mov     QWORD PTR [rcx+168], r8
        adc     rax, QWORD PTR [r10+48]
        mov     r8, QWORD PTR [rcx+184]
        mov     QWORD PTR [rcx+176], rax
        adc     r8, QWORD PTR [r10+56]
        mov     rax, QWORD PTR [rcx+192]
        mov     QWORD PTR [rcx+184], r8
        adc     rax, QWORD PTR [r10+64]
        mov     r8, QWORD PTR [rcx+200]
        mov     QWORD PTR [rcx+192], rax
        adc     r8, QWORD PTR [r10+72]
        mov     rax, QWORD PTR [rcx+208]
        mov     QWORD PTR [rcx+200], r8
        adc     rax, QWORD PTR [r10+80]
        mov     r8, QWORD PTR [rcx+216]
        mov     QWORD PTR [rcx+208], rax
        adc     r8, QWORD PTR [r10+88]
        mov     rax, QWORD PTR [rcx+224]
        mov     QWORD PTR [rcx+216], r8
        adc     rax, QWORD PTR [r10+96]
        mov     r8, QWORD PTR [rcx+232]
        mov     QWORD PTR [rcx+224], rax
        adc     r8, QWORD PTR [r10+104]
        mov     rax, QWORD PTR [rcx+240]
        mov     QWORD PTR [rcx+232], r8
        adc     rax, QWORD PTR [r10+112]
        mov     r8, QWORD PTR [rcx+248]
        mov     QWORD PTR [rcx+240], rax
        adc     r8, QWORD PTR [r10+120]
        mov     rax, QWORD PTR [rcx+256]
        mov     QWORD PTR [rcx+248], r8
        adc     rax, QWORD PTR [r10+128]
        mov     r8, QWORD PTR [rcx+264]
        mov     QWORD PTR [rcx+256], rax
        adc     r8, QWORD PTR [r10+136]
        mov     rax, QWORD PTR [rcx+272]
        mov     QWORD PTR [rcx+264], r8
        adc     rax, QWORD PTR [r10+144]
        mov     r8, QWORD PTR [rcx+280]
        mov     QWORD PTR [rcx+272], rax
        adc     r8, QWORD PTR [r10+152]
        mov     rax, QWORD PTR [rcx+288]
        mov     QWORD PTR [rcx+280], r8
        adc     rax, QWORD PTR [r10+160]
        mov     r8, QWORD PTR [rcx+296]
        mov     QWORD PTR [rcx+288], rax
        adc     r8, QWORD PTR [r10+168]
        mov     rax, QWORD PTR [rcx+304]
        mov     QWORD PTR [rcx+296], r8
        adc     rax, QWORD PTR [r10+176]
        mov     r8, QWORD PTR [rcx+312]
        mov     QWORD PTR [rcx+304], rax
        adc     r8, QWORD PTR [r10+184]
        mov     rax, QWORD PTR [rcx+320]
        mov     QWORD PTR [rcx+312], r8
        adc     rax, QWORD PTR [r10+192]
        mov     r8, QWORD PTR [rcx+328]
        mov     QWORD PTR [rcx+320], rax
        adc     r8, QWORD PTR [r10+200]
        mov     rax, QWORD PTR [rcx+336]
        mov     QWORD PTR [rcx+328], r8
        adc     rax, QWORD PTR [r10+208]
        mov     r8, QWORD PTR [rcx+344]
        mov     QWORD PTR [rcx+336], rax
        adc     r8, QWORD PTR [r10+216]
        mov     rax, QWORD PTR [rcx+352]
        mov     QWORD PTR [rcx+344], r8
        adc     rax, QWORD PTR [r10+224]
        mov     r8, QWORD PTR [rcx+360]
        mov     QWORD PTR [rcx+352], rax
        adc     r8, QWORD PTR [r10+232]
        mov     rax, QWORD PTR [rcx+368]
        mov     QWORD PTR [rcx+360], r8
        adc     rax, QWORD PTR [r10+240]
        mov     r8, QWORD PTR [rcx+376]
        mov     QWORD PTR [rcx+368], rax
        adc     r8, QWORD PTR [r10+248]
        mov     QWORD PTR [rcx+376], r8
        adc     r9, 0
        mov     QWORD PTR [rcx+384], r9         ; r[48] = top word of the middle sum
        ; Add in place: r[32..48] += z2[0..16] (17 words).
        ; CF must survive from the last adc here into the "Add to zero"
        ; chain below; only mov instructions may sit in between.
        mov     rax, QWORD PTR [rcx+256]
        add     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rcx+264]
        mov     QWORD PTR [rcx+256], rax
        adc     r8, QWORD PTR [rdx+8]
        mov     rax, QWORD PTR [rcx+272]
        mov     QWORD PTR [rcx+264], r8
        adc     rax, QWORD PTR [rdx+16]
        mov     r8, QWORD PTR [rcx+280]
        mov     QWORD PTR [rcx+272], rax
        adc     r8, QWORD PTR [rdx+24]
        mov     rax, QWORD PTR [rcx+288]
        mov     QWORD PTR [rcx+280], r8
        adc     rax, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [rcx+296]
        mov     QWORD PTR [rcx+288], rax
        adc     r8, QWORD PTR [rdx+40]
        mov     rax, QWORD PTR [rcx+304]
        mov     QWORD PTR [rcx+296], r8
        adc     rax, QWORD PTR [rdx+48]
        mov     r8, QWORD PTR [rcx+312]
        mov     QWORD PTR [rcx+304], rax
        adc     r8, QWORD PTR [rdx+56]
        mov     rax, QWORD PTR [rcx+320]
        mov     QWORD PTR [rcx+312], r8
        adc     rax, QWORD PTR [rdx+64]
        mov     r8, QWORD PTR [rcx+328]
        mov     QWORD PTR [rcx+320], rax
        adc     r8, QWORD PTR [rdx+72]
        mov     rax, QWORD PTR [rcx+336]
        mov     QWORD PTR [rcx+328], r8
        adc     rax, QWORD PTR [rdx+80]
        mov     r8, QWORD PTR [rcx+344]
        mov     QWORD PTR [rcx+336], rax
        adc     r8, QWORD PTR [rdx+88]
        mov     rax, QWORD PTR [rcx+352]
        mov     QWORD PTR [rcx+344], r8
        adc     rax, QWORD PTR [rdx+96]
        mov     r8, QWORD PTR [rcx+360]
        mov     QWORD PTR [rcx+352], rax
        adc     r8, QWORD PTR [rdx+104]
        mov     rax, QWORD PTR [rcx+368]
        mov     QWORD PTR [rcx+360], r8
        adc     rax, QWORD PTR [rdx+112]
        mov     r8, QWORD PTR [rcx+376]
        mov     QWORD PTR [rcx+368], rax
        adc     r8, QWORD PTR [rdx+120]
        mov     rax, QWORD PTR [rcx+384]
        mov     QWORD PTR [rcx+376], r8
        adc     rax, QWORD PTR [rdx+128]
        mov     QWORD PTR [rcx+384], rax
        ; Add to zero: r[49..63] = z2[17..31] + carry from the add above
        mov     rax, QWORD PTR [rdx+136]
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+144]
        mov     QWORD PTR [rcx+392], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+152]
        mov     QWORD PTR [rcx+400], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+160]
        mov     QWORD PTR [rcx+408], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+168]
        mov     QWORD PTR [rcx+416], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+176]
        mov     QWORD PTR [rcx+424], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+184]
        mov     QWORD PTR [rcx+432], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+192]
        mov     QWORD PTR [rcx+440], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+200]
        mov     QWORD PTR [rcx+448], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+208]
        mov     QWORD PTR [rcx+456], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+216]
        mov     QWORD PTR [rcx+464], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+224]
        mov     QWORD PTR [rcx+472], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+232]
        mov     QWORD PTR [rcx+480], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+240]
        mov     QWORD PTR [rcx+488], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+248]
        mov     QWORD PTR [rcx+496], r8
        adc     rax, 0
        mov     QWORD PTR [rcx+504], rax
        add     rsp, 664
        pop     r12
        ret
sp_2048_sqr_32 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_2048_mul_avx2_32 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 808
|
|
mov QWORD PTR [rsp+768], rcx
|
|
mov QWORD PTR [rsp+776], rdx
|
|
mov QWORD PTR [rsp+784], r8
|
|
lea r12, QWORD PTR [rsp+512]
|
|
lea r14, QWORD PTR [rdx+128]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r15, r15
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r12], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov QWORD PTR [r12+120], rax
|
|
adc r15, 0
|
|
mov QWORD PTR [rsp+792], r15
|
|
lea r13, QWORD PTR [rsp+640]
|
|
lea r14, QWORD PTR [r8+128]
|
|
; Add
|
|
mov rax, QWORD PTR [r8]
|
|
xor rdi, rdi
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [r8+8]
|
|
mov QWORD PTR [r13], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov QWORD PTR [r13+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mov QWORD PTR [r13+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [r8+32]
|
|
mov QWORD PTR [r13+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov QWORD PTR [r13+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mov QWORD PTR [r13+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [r8+56]
|
|
mov QWORD PTR [r13+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov QWORD PTR [r13+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mov QWORD PTR [r13+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [r8+80]
|
|
mov QWORD PTR [r13+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [r13+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mov QWORD PTR [r13+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [r8+104]
|
|
mov QWORD PTR [r13+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov QWORD PTR [r13+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mov QWORD PTR [r13+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov QWORD PTR [r13+120], rax
|
|
adc rdi, 0
|
|
mov QWORD PTR [rsp+800], rdi
|
|
mov r8, r13
|
|
mov rdx, r12
|
|
mov rcx, rsp
|
|
call sp_2048_mul_avx2_16
|
|
mov r8, QWORD PTR [rsp+784]
|
|
mov rdx, QWORD PTR [rsp+776]
|
|
lea rcx, QWORD PTR [rsp+256]
|
|
add r8, 128
|
|
add rdx, 128
|
|
call sp_2048_mul_avx2_16
|
|
mov r8, QWORD PTR [rsp+784]
|
|
mov rdx, QWORD PTR [rsp+776]
|
|
mov rcx, QWORD PTR [rsp+768]
|
|
call sp_2048_mul_avx2_16
|
|
IFDEF _WIN64
|
|
mov r8, QWORD PTR [rsp+784]
|
|
mov rdx, QWORD PTR [rsp+776]
|
|
mov rcx, QWORD PTR [rsp+768]
|
|
ENDIF
|
|
mov r15, QWORD PTR [rsp+792]
|
|
mov rdi, QWORD PTR [rsp+800]
|
|
mov rsi, QWORD PTR [rsp+768]
|
|
mov r11, r15
|
|
lea r12, QWORD PTR [rsp+512]
|
|
lea r13, QWORD PTR [rsp+640]
|
|
and r11, rdi
|
|
neg r15
|
|
neg rdi
|
|
add rsi, 256
|
|
mov rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [r13]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
add rax, r9
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [r13+8]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [r13+16]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [r13+24]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [r13+32]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [r13+40]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [r13+48]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [r13+56]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [r13+64]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [r13+72]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [r13+80]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [r13+88]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [r13+96]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [r13+104]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [r13+112]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [r13+120]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, r9
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r11, 0
|
|
lea r13, QWORD PTR [rsp+256]
|
|
mov r12, rsp
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [r13+248]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r11, 0
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [rcx+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [rcx+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [rcx+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [rcx+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [rcx+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [rcx+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [rcx+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [rcx+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [rcx+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [rcx+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [rcx+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [rcx+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [rcx+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [rcx+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [rcx+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [rcx+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [rcx+248]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r11, 0
|
|
sub rsi, 128
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r12+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r12+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r12+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r12+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+384], r11
|
|
add rsi, 128
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
xor r11, r11
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r11, 0
|
|
; Add to zero
|
|
mov rax, QWORD PTR [r13+136]
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+144]
|
|
mov QWORD PTR [rsi+136], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+152]
|
|
mov QWORD PTR [rsi+144], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+160]
|
|
mov QWORD PTR [rsi+152], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+168]
|
|
mov QWORD PTR [rsi+160], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+176]
|
|
mov QWORD PTR [rsi+168], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+184]
|
|
mov QWORD PTR [rsi+176], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+192]
|
|
mov QWORD PTR [rsi+184], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+200]
|
|
mov QWORD PTR [rsi+192], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+208]
|
|
mov QWORD PTR [rsi+200], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+216]
|
|
mov QWORD PTR [rsi+208], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+224]
|
|
mov QWORD PTR [rsi+216], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+232]
|
|
mov QWORD PTR [rsi+224], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+240]
|
|
mov QWORD PTR [rsi+232], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+248]
|
|
mov QWORD PTR [rsi+240], r9
|
|
adc r10, 0
|
|
mov QWORD PTR [rsi+248], r10
|
|
add rsp, 808
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_2048_mul_avx2_32 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
; /* Square a and put result in r. (r = a * a)
; *
; * r  A single precision integer. (rcx; words r[0..63] are written)
; * a  A single precision integer. (rdx; words a[0..31] are read)
; *
; * Syntax: MASM (ml64), Intel operand order.  Arguments arrive in rcx/rdx
; * (Microsoft x64 argument registers).
; *
; * Method: one level of Karatsuba on 16-word halves, a = a0 + a1 * 2^1024.
; * Three calls to sp_2048_sqr_avx2_16 (defined elsewhere in this file;
; * presumably squares the 16 words at rdx into 32 words at rcx) produce:
; *   z1 = ((a0 + a1) mod 2^1024)^2   at [rsp+0   .. rsp+255]
; *   z2 = a1^2                       at [rsp+256 .. rsp+511]
; *   z0 = a0^2                       at r[0..31]
; * and the result is z0 + (z1' - z0 - z2) * 2^1024 + z2 * 2^2048, where z1'
; * additionally accounts for the carry bit c of a0 + a1.
; *
; * Stack frame (664 bytes):
; *   [rsp+0  ..255]  z1, later the middle term z1 - z2 - z0
; *   [rsp+256..511]  z2
; *   [rsp+512..639]  low 16 words of a0 + a1
; *   [rsp+640]       saved r pointer
; *   [rsp+648]       saved a pointer
; *   [rsp+656]       carry c (0 or 1) of a0 + a1
; *
; * Registers: r12 is saved and restored; rax, rcx, rdx, r8, r9, r10, r11
; * and the flags are modified here, in addition to whatever
; * sp_2048_sqr_avx2_16 modifies.
; *
; * Fix relative to the previous revision: an "adc r9, 0" used to sit
; * between the addition into r[48] and the "Add to zero" chain.  It
; * swallowed the carry out of r[48] (r9 was never read afterwards), so the
; * carry was not propagated into r[49..63].  The carry chain is now
; * unbroken.
; */
_text SEGMENT READONLY PARA
sp_2048_sqr_avx2_32 PROC
        push    r12
        sub     rsp, 664
        mov     QWORD PTR [rsp+640], rcx
        mov     QWORD PTR [rsp+648], rdx
        lea     r10, QWORD PTR [rsp+512]
        lea     r11, QWORD PTR [rdx+128]
        ; Add: [rsp+512..] = a0 + a1 (16 words), carry collected in r9
        mov     rax, QWORD PTR [rdx]
        xor     r9, r9
        add     rax, QWORD PTR [r11]
        mov     r8, QWORD PTR [rdx+8]
        mov     QWORD PTR [r10], rax
        adc     r8, QWORD PTR [r11+8]
        mov     rax, QWORD PTR [rdx+16]
        mov     QWORD PTR [r10+8], r8
        adc     rax, QWORD PTR [r11+16]
        mov     r8, QWORD PTR [rdx+24]
        mov     QWORD PTR [r10+16], rax
        adc     r8, QWORD PTR [r11+24]
        mov     rax, QWORD PTR [rdx+32]
        mov     QWORD PTR [r10+24], r8
        adc     rax, QWORD PTR [r11+32]
        mov     r8, QWORD PTR [rdx+40]
        mov     QWORD PTR [r10+32], rax
        adc     r8, QWORD PTR [r11+40]
        mov     rax, QWORD PTR [rdx+48]
        mov     QWORD PTR [r10+40], r8
        adc     rax, QWORD PTR [r11+48]
        mov     r8, QWORD PTR [rdx+56]
        mov     QWORD PTR [r10+48], rax
        adc     r8, QWORD PTR [r11+56]
        mov     rax, QWORD PTR [rdx+64]
        mov     QWORD PTR [r10+56], r8
        adc     rax, QWORD PTR [r11+64]
        mov     r8, QWORD PTR [rdx+72]
        mov     QWORD PTR [r10+64], rax
        adc     r8, QWORD PTR [r11+72]
        mov     rax, QWORD PTR [rdx+80]
        mov     QWORD PTR [r10+72], r8
        adc     rax, QWORD PTR [r11+80]
        mov     r8, QWORD PTR [rdx+88]
        mov     QWORD PTR [r10+80], rax
        adc     r8, QWORD PTR [r11+88]
        mov     rax, QWORD PTR [rdx+96]
        mov     QWORD PTR [r10+88], r8
        adc     rax, QWORD PTR [r11+96]
        mov     r8, QWORD PTR [rdx+104]
        mov     QWORD PTR [r10+96], rax
        adc     r8, QWORD PTR [r11+104]
        mov     rax, QWORD PTR [rdx+112]
        mov     QWORD PTR [r10+104], r8
        adc     rax, QWORD PTR [r11+112]
        mov     r8, QWORD PTR [rdx+120]
        mov     QWORD PTR [r10+112], rax
        adc     r8, QWORD PTR [r11+120]
        mov     QWORD PTR [r10+120], r8
        adc     r9, 0
        mov     QWORD PTR [rsp+656], r9
        ; z1 = (low words of a0 + a1)^2 -> [rsp+0]
        mov     rdx, r10
        mov     rcx, rsp
        call    sp_2048_sqr_avx2_16
        ; z2 = a1^2 -> [rsp+256]
        mov     rdx, QWORD PTR [rsp+648]
        lea     rcx, QWORD PTR [rsp+256]
        add     rdx, 128
        call    sp_2048_sqr_avx2_16
        ; z0 = a0^2 -> r[0..31]
        mov     rdx, QWORD PTR [rsp+648]
        mov     rcx, QWORD PTR [rsp+640]
        call    sp_2048_sqr_avx2_16
IFDEF _WIN64
        ; Reload the argument pointers after the call.
        mov     rdx, QWORD PTR [rsp+648]
        mov     rcx, QWORD PTR [rsp+640]
ENDIF
        ; r9 = c, r12 = -c (0 or all ones).  With such a mask pext returns
        ; either the source word unchanged or 0, and leaves the flags alone,
        ; so the add/adc doubling chain below is not disturbed.
        ; r[32..47] = 2 * (c ? (a0 + a1) low words : 0); carry out goes to r9.
        mov     r12, QWORD PTR [rsp+656]
        lea     r10, QWORD PTR [rsp+512]
        mov     r9, r12
        neg     r12
        mov     rax, QWORD PTR [r10]
        pext    rax, rax, r12
        add     rax, rax
        mov     r8, QWORD PTR [r10+8]
        mov     QWORD PTR [rcx+256], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+16]
        mov     QWORD PTR [rcx+264], r8
        pext    rax, rax, r12
        adc     rax, rax
        mov     r8, QWORD PTR [r10+24]
        mov     QWORD PTR [rcx+272], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+32]
        mov     QWORD PTR [rcx+280], r8
        pext    rax, rax, r12
        adc     rax, rax
        mov     r8, QWORD PTR [r10+40]
        mov     QWORD PTR [rcx+288], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+48]
        mov     QWORD PTR [rcx+296], r8
        pext    rax, rax, r12
        adc     rax, rax
        mov     r8, QWORD PTR [r10+56]
        mov     QWORD PTR [rcx+304], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+64]
        mov     QWORD PTR [rcx+312], r8
        pext    rax, rax, r12
        adc     rax, rax
        mov     r8, QWORD PTR [r10+72]
        mov     QWORD PTR [rcx+320], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+80]
        mov     QWORD PTR [rcx+328], r8
        pext    rax, rax, r12
        adc     rax, rax
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [rcx+336], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+96]
        mov     QWORD PTR [rcx+344], r8
        pext    rax, rax, r12
        adc     rax, rax
        mov     r8, QWORD PTR [r10+104]
        mov     QWORD PTR [rcx+352], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+112]
        mov     QWORD PTR [rcx+360], r8
        pext    rax, rax, r12
        adc     rax, rax
        mov     r8, QWORD PTR [r10+120]
        mov     QWORD PTR [rcx+368], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     QWORD PTR [rcx+376], r8
        adc     r9, 0
        ; z1 -= z2 (32 words, in place at [rsp]); borrow is taken from r9
        lea     rdx, QWORD PTR [rsp+256]
        mov     r10, rsp
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [r10+8]
        mov     QWORD PTR [r10], rax
        sbb     r8, QWORD PTR [rdx+8]
        mov     rax, QWORD PTR [r10+16]
        mov     QWORD PTR [r10+8], r8
        sbb     rax, QWORD PTR [rdx+16]
        mov     r8, QWORD PTR [r10+24]
        mov     QWORD PTR [r10+16], rax
        sbb     r8, QWORD PTR [rdx+24]
        mov     rax, QWORD PTR [r10+32]
        mov     QWORD PTR [r10+24], r8
        sbb     rax, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [r10+40]
        mov     QWORD PTR [r10+32], rax
        sbb     r8, QWORD PTR [rdx+40]
        mov     rax, QWORD PTR [r10+48]
        mov     QWORD PTR [r10+40], r8
        sbb     rax, QWORD PTR [rdx+48]
        mov     r8, QWORD PTR [r10+56]
        mov     QWORD PTR [r10+48], rax
        sbb     r8, QWORD PTR [rdx+56]
        mov     rax, QWORD PTR [r10+64]
        mov     QWORD PTR [r10+56], r8
        sbb     rax, QWORD PTR [rdx+64]
        mov     r8, QWORD PTR [r10+72]
        mov     QWORD PTR [r10+64], rax
        sbb     r8, QWORD PTR [rdx+72]
        mov     rax, QWORD PTR [r10+80]
        mov     QWORD PTR [r10+72], r8
        sbb     rax, QWORD PTR [rdx+80]
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [r10+80], rax
        sbb     r8, QWORD PTR [rdx+88]
        mov     rax, QWORD PTR [r10+96]
        mov     QWORD PTR [r10+88], r8
        sbb     rax, QWORD PTR [rdx+96]
        mov     r8, QWORD PTR [r10+104]
        mov     QWORD PTR [r10+96], rax
        sbb     r8, QWORD PTR [rdx+104]
        mov     rax, QWORD PTR [r10+112]
        mov     QWORD PTR [r10+104], r8
        sbb     rax, QWORD PTR [rdx+112]
        mov     r8, QWORD PTR [r10+120]
        mov     QWORD PTR [r10+112], rax
        sbb     r8, QWORD PTR [rdx+120]
        mov     rax, QWORD PTR [r10+128]
        mov     QWORD PTR [r10+120], r8
        sbb     rax, QWORD PTR [rdx+128]
        mov     r8, QWORD PTR [r10+136]
        mov     QWORD PTR [r10+128], rax
        sbb     r8, QWORD PTR [rdx+136]
        mov     rax, QWORD PTR [r10+144]
        mov     QWORD PTR [r10+136], r8
        sbb     rax, QWORD PTR [rdx+144]
        mov     r8, QWORD PTR [r10+152]
        mov     QWORD PTR [r10+144], rax
        sbb     r8, QWORD PTR [rdx+152]
        mov     rax, QWORD PTR [r10+160]
        mov     QWORD PTR [r10+152], r8
        sbb     rax, QWORD PTR [rdx+160]
        mov     r8, QWORD PTR [r10+168]
        mov     QWORD PTR [r10+160], rax
        sbb     r8, QWORD PTR [rdx+168]
        mov     rax, QWORD PTR [r10+176]
        mov     QWORD PTR [r10+168], r8
        sbb     rax, QWORD PTR [rdx+176]
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rdx+184]
        mov     rax, QWORD PTR [r10+192]
        mov     QWORD PTR [r10+184], r8
        sbb     rax, QWORD PTR [rdx+192]
        mov     r8, QWORD PTR [r10+200]
        mov     QWORD PTR [r10+192], rax
        sbb     r8, QWORD PTR [rdx+200]
        mov     rax, QWORD PTR [r10+208]
        mov     QWORD PTR [r10+200], r8
        sbb     rax, QWORD PTR [rdx+208]
        mov     r8, QWORD PTR [r10+216]
        mov     QWORD PTR [r10+208], rax
        sbb     r8, QWORD PTR [rdx+216]
        mov     rax, QWORD PTR [r10+224]
        mov     QWORD PTR [r10+216], r8
        sbb     rax, QWORD PTR [rdx+224]
        mov     r8, QWORD PTR [r10+232]
        mov     QWORD PTR [r10+224], rax
        sbb     r8, QWORD PTR [rdx+232]
        mov     rax, QWORD PTR [r10+240]
        mov     QWORD PTR [r10+232], r8
        sbb     rax, QWORD PTR [rdx+240]
        mov     r8, QWORD PTR [r10+248]
        mov     QWORD PTR [r10+240], rax
        sbb     r8, QWORD PTR [rdx+248]
        mov     QWORD PTR [r10+248], r8
        sbb     r9, 0
        ; z1 -= z0 (32 words; z0 is r[0..31]); borrow is taken from r9
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rcx]
        mov     r8, QWORD PTR [r10+8]
        mov     QWORD PTR [r10], rax
        sbb     r8, QWORD PTR [rcx+8]
        mov     rax, QWORD PTR [r10+16]
        mov     QWORD PTR [r10+8], r8
        sbb     rax, QWORD PTR [rcx+16]
        mov     r8, QWORD PTR [r10+24]
        mov     QWORD PTR [r10+16], rax
        sbb     r8, QWORD PTR [rcx+24]
        mov     rax, QWORD PTR [r10+32]
        mov     QWORD PTR [r10+24], r8
        sbb     rax, QWORD PTR [rcx+32]
        mov     r8, QWORD PTR [r10+40]
        mov     QWORD PTR [r10+32], rax
        sbb     r8, QWORD PTR [rcx+40]
        mov     rax, QWORD PTR [r10+48]
        mov     QWORD PTR [r10+40], r8
        sbb     rax, QWORD PTR [rcx+48]
        mov     r8, QWORD PTR [r10+56]
        mov     QWORD PTR [r10+48], rax
        sbb     r8, QWORD PTR [rcx+56]
        mov     rax, QWORD PTR [r10+64]
        mov     QWORD PTR [r10+56], r8
        sbb     rax, QWORD PTR [rcx+64]
        mov     r8, QWORD PTR [r10+72]
        mov     QWORD PTR [r10+64], rax
        sbb     r8, QWORD PTR [rcx+72]
        mov     rax, QWORD PTR [r10+80]
        mov     QWORD PTR [r10+72], r8
        sbb     rax, QWORD PTR [rcx+80]
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [r10+80], rax
        sbb     r8, QWORD PTR [rcx+88]
        mov     rax, QWORD PTR [r10+96]
        mov     QWORD PTR [r10+88], r8
        sbb     rax, QWORD PTR [rcx+96]
        mov     r8, QWORD PTR [r10+104]
        mov     QWORD PTR [r10+96], rax
        sbb     r8, QWORD PTR [rcx+104]
        mov     rax, QWORD PTR [r10+112]
        mov     QWORD PTR [r10+104], r8
        sbb     rax, QWORD PTR [rcx+112]
        mov     r8, QWORD PTR [r10+120]
        mov     QWORD PTR [r10+112], rax
        sbb     r8, QWORD PTR [rcx+120]
        mov     rax, QWORD PTR [r10+128]
        mov     QWORD PTR [r10+120], r8
        sbb     rax, QWORD PTR [rcx+128]
        mov     r8, QWORD PTR [r10+136]
        mov     QWORD PTR [r10+128], rax
        sbb     r8, QWORD PTR [rcx+136]
        mov     rax, QWORD PTR [r10+144]
        mov     QWORD PTR [r10+136], r8
        sbb     rax, QWORD PTR [rcx+144]
        mov     r8, QWORD PTR [r10+152]
        mov     QWORD PTR [r10+144], rax
        sbb     r8, QWORD PTR [rcx+152]
        mov     rax, QWORD PTR [r10+160]
        mov     QWORD PTR [r10+152], r8
        sbb     rax, QWORD PTR [rcx+160]
        mov     r8, QWORD PTR [r10+168]
        mov     QWORD PTR [r10+160], rax
        sbb     r8, QWORD PTR [rcx+168]
        mov     rax, QWORD PTR [r10+176]
        mov     QWORD PTR [r10+168], r8
        sbb     rax, QWORD PTR [rcx+176]
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rcx+184]
        mov     rax, QWORD PTR [r10+192]
        mov     QWORD PTR [r10+184], r8
        sbb     rax, QWORD PTR [rcx+192]
        mov     r8, QWORD PTR [r10+200]
        mov     QWORD PTR [r10+192], rax
        sbb     r8, QWORD PTR [rcx+200]
        mov     rax, QWORD PTR [r10+208]
        mov     QWORD PTR [r10+200], r8
        sbb     rax, QWORD PTR [rcx+208]
        mov     r8, QWORD PTR [r10+216]
        mov     QWORD PTR [r10+208], rax
        sbb     r8, QWORD PTR [rcx+216]
        mov     rax, QWORD PTR [r10+224]
        mov     QWORD PTR [r10+216], r8
        sbb     rax, QWORD PTR [rcx+224]
        mov     r8, QWORD PTR [r10+232]
        mov     QWORD PTR [r10+224], rax
        sbb     r8, QWORD PTR [rcx+232]
        mov     rax, QWORD PTR [r10+240]
        mov     QWORD PTR [r10+232], r8
        sbb     rax, QWORD PTR [rcx+240]
        mov     r8, QWORD PTR [r10+248]
        mov     QWORD PTR [r10+240], rax
        sbb     r8, QWORD PTR [rcx+248]
        mov     QWORD PTR [r10+248], r8
        sbb     r9, 0
        ; Add in place: r[16..47] += middle term at [rsp]; carry goes to r9,
        ; which then becomes r[48]
        mov     rax, QWORD PTR [rcx+128]
        add     rax, QWORD PTR [r10]
        mov     r8, QWORD PTR [rcx+136]
        mov     QWORD PTR [rcx+128], rax
        adc     r8, QWORD PTR [r10+8]
        mov     rax, QWORD PTR [rcx+144]
        mov     QWORD PTR [rcx+136], r8
        adc     rax, QWORD PTR [r10+16]
        mov     r8, QWORD PTR [rcx+152]
        mov     QWORD PTR [rcx+144], rax
        adc     r8, QWORD PTR [r10+24]
        mov     rax, QWORD PTR [rcx+160]
        mov     QWORD PTR [rcx+152], r8
        adc     rax, QWORD PTR [r10+32]
        mov     r8, QWORD PTR [rcx+168]
        mov     QWORD PTR [rcx+160], rax
        adc     r8, QWORD PTR [r10+40]
        mov     rax, QWORD PTR [rcx+176]
        mov     QWORD PTR [rcx+168], r8
        adc     rax, QWORD PTR [r10+48]
        mov     r8, QWORD PTR [rcx+184]
        mov     QWORD PTR [rcx+176], rax
        adc     r8, QWORD PTR [r10+56]
        mov     rax, QWORD PTR [rcx+192]
        mov     QWORD PTR [rcx+184], r8
        adc     rax, QWORD PTR [r10+64]
        mov     r8, QWORD PTR [rcx+200]
        mov     QWORD PTR [rcx+192], rax
        adc     r8, QWORD PTR [r10+72]
        mov     rax, QWORD PTR [rcx+208]
        mov     QWORD PTR [rcx+200], r8
        adc     rax, QWORD PTR [r10+80]
        mov     r8, QWORD PTR [rcx+216]
        mov     QWORD PTR [rcx+208], rax
        adc     r8, QWORD PTR [r10+88]
        mov     rax, QWORD PTR [rcx+224]
        mov     QWORD PTR [rcx+216], r8
        adc     rax, QWORD PTR [r10+96]
        mov     r8, QWORD PTR [rcx+232]
        mov     QWORD PTR [rcx+224], rax
        adc     r8, QWORD PTR [r10+104]
        mov     rax, QWORD PTR [rcx+240]
        mov     QWORD PTR [rcx+232], r8
        adc     rax, QWORD PTR [r10+112]
        mov     r8, QWORD PTR [rcx+248]
        mov     QWORD PTR [rcx+240], rax
        adc     r8, QWORD PTR [r10+120]
        mov     rax, QWORD PTR [rcx+256]
        mov     QWORD PTR [rcx+248], r8
        adc     rax, QWORD PTR [r10+128]
        mov     r8, QWORD PTR [rcx+264]
        mov     QWORD PTR [rcx+256], rax
        adc     r8, QWORD PTR [r10+136]
        mov     rax, QWORD PTR [rcx+272]
        mov     QWORD PTR [rcx+264], r8
        adc     rax, QWORD PTR [r10+144]
        mov     r8, QWORD PTR [rcx+280]
        mov     QWORD PTR [rcx+272], rax
        adc     r8, QWORD PTR [r10+152]
        mov     rax, QWORD PTR [rcx+288]
        mov     QWORD PTR [rcx+280], r8
        adc     rax, QWORD PTR [r10+160]
        mov     r8, QWORD PTR [rcx+296]
        mov     QWORD PTR [rcx+288], rax
        adc     r8, QWORD PTR [r10+168]
        mov     rax, QWORD PTR [rcx+304]
        mov     QWORD PTR [rcx+296], r8
        adc     rax, QWORD PTR [r10+176]
        mov     r8, QWORD PTR [rcx+312]
        mov     QWORD PTR [rcx+304], rax
        adc     r8, QWORD PTR [r10+184]
        mov     rax, QWORD PTR [rcx+320]
        mov     QWORD PTR [rcx+312], r8
        adc     rax, QWORD PTR [r10+192]
        mov     r8, QWORD PTR [rcx+328]
        mov     QWORD PTR [rcx+320], rax
        adc     r8, QWORD PTR [r10+200]
        mov     rax, QWORD PTR [rcx+336]
        mov     QWORD PTR [rcx+328], r8
        adc     rax, QWORD PTR [r10+208]
        mov     r8, QWORD PTR [rcx+344]
        mov     QWORD PTR [rcx+336], rax
        adc     r8, QWORD PTR [r10+216]
        mov     rax, QWORD PTR [rcx+352]
        mov     QWORD PTR [rcx+344], r8
        adc     rax, QWORD PTR [r10+224]
        mov     r8, QWORD PTR [rcx+360]
        mov     QWORD PTR [rcx+352], rax
        adc     r8, QWORD PTR [r10+232]
        mov     rax, QWORD PTR [rcx+368]
        mov     QWORD PTR [rcx+360], r8
        adc     rax, QWORD PTR [r10+240]
        mov     r8, QWORD PTR [rcx+376]
        mov     QWORD PTR [rcx+368], rax
        adc     r8, QWORD PTR [r10+248]
        mov     QWORD PTR [rcx+376], r8
        adc     r9, 0
        mov     QWORD PTR [rcx+384], r9
        ; Add in place: r[32..48] += z2[0..16] (rdx still points at z2).
        ; The xor is kept only so that r9 is zero on return, as it was on
        ; the no-carry path of the previous revision; nothing reads r9.
        mov     rax, QWORD PTR [rcx+256]
        xor     r9, r9
        add     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rcx+264]
        mov     QWORD PTR [rcx+256], rax
        adc     r8, QWORD PTR [rdx+8]
        mov     rax, QWORD PTR [rcx+272]
        mov     QWORD PTR [rcx+264], r8
        adc     rax, QWORD PTR [rdx+16]
        mov     r8, QWORD PTR [rcx+280]
        mov     QWORD PTR [rcx+272], rax
        adc     r8, QWORD PTR [rdx+24]
        mov     rax, QWORD PTR [rcx+288]
        mov     QWORD PTR [rcx+280], r8
        adc     rax, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [rcx+296]
        mov     QWORD PTR [rcx+288], rax
        adc     r8, QWORD PTR [rdx+40]
        mov     rax, QWORD PTR [rcx+304]
        mov     QWORD PTR [rcx+296], r8
        adc     rax, QWORD PTR [rdx+48]
        mov     r8, QWORD PTR [rcx+312]
        mov     QWORD PTR [rcx+304], rax
        adc     r8, QWORD PTR [rdx+56]
        mov     rax, QWORD PTR [rcx+320]
        mov     QWORD PTR [rcx+312], r8
        adc     rax, QWORD PTR [rdx+64]
        mov     r8, QWORD PTR [rcx+328]
        mov     QWORD PTR [rcx+320], rax
        adc     r8, QWORD PTR [rdx+72]
        mov     rax, QWORD PTR [rcx+336]
        mov     QWORD PTR [rcx+328], r8
        adc     rax, QWORD PTR [rdx+80]
        mov     r8, QWORD PTR [rcx+344]
        mov     QWORD PTR [rcx+336], rax
        adc     r8, QWORD PTR [rdx+88]
        mov     rax, QWORD PTR [rcx+352]
        mov     QWORD PTR [rcx+344], r8
        adc     rax, QWORD PTR [rdx+96]
        mov     r8, QWORD PTR [rcx+360]
        mov     QWORD PTR [rcx+352], rax
        adc     r8, QWORD PTR [rdx+104]
        mov     rax, QWORD PTR [rcx+368]
        mov     QWORD PTR [rcx+360], r8
        adc     rax, QWORD PTR [rdx+112]
        mov     r8, QWORD PTR [rcx+376]
        mov     QWORD PTR [rcx+368], rax
        adc     r8, QWORD PTR [rdx+120]
        mov     rax, QWORD PTR [rcx+384]
        mov     QWORD PTR [rcx+376], r8
        adc     rax, QWORD PTR [rdx+128]
        mov     QWORD PTR [rcx+384], rax
        ; Add to zero: r[49..63] = z2[17..31] + carry out of r[48].
        ; No flag-writing instruction may be placed between the adc above
        ; and the adc below: CF must flow straight through (mov leaves the
        ; flags untouched).
        mov     rax, QWORD PTR [rdx+136]
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+144]
        mov     QWORD PTR [rcx+392], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+152]
        mov     QWORD PTR [rcx+400], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+160]
        mov     QWORD PTR [rcx+408], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+168]
        mov     QWORD PTR [rcx+416], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+176]
        mov     QWORD PTR [rcx+424], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+184]
        mov     QWORD PTR [rcx+432], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+192]
        mov     QWORD PTR [rcx+440], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+200]
        mov     QWORD PTR [rcx+448], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+208]
        mov     QWORD PTR [rcx+456], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+216]
        mov     QWORD PTR [rcx+464], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+224]
        mov     QWORD PTR [rcx+472], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+232]
        mov     QWORD PTR [rcx+480], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+240]
        mov     QWORD PTR [rcx+488], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+248]
        mov     QWORD PTR [rcx+496], r8
        adc     rax, 0
        mov     QWORD PTR [rcx+504], rax
        add     rsp, 664
        pop     r12
        ret
sp_2048_sqr_avx2_32 ENDP
_text ENDS
ENDIF
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rdx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rdx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rdx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb r9, 0
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rcx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rcx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rcx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rcx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rcx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rcx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rcx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rcx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rcx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rcx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rcx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rcx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rcx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rcx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rcx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rcx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rcx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rcx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rcx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rcx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rcx+248]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb r9, 0
|
|
; Add in place
|
|
mov rax, QWORD PTR [rcx+128]
|
|
add rax, QWORD PTR [r10]
|
|
mov r8, QWORD PTR [rcx+136]
|
|
mov QWORD PTR [rcx+128], rax
|
|
adc r8, QWORD PTR [r10+8]
|
|
mov rax, QWORD PTR [rcx+144]
|
|
mov QWORD PTR [rcx+136], r8
|
|
adc rax, QWORD PTR [r10+16]
|
|
mov r8, QWORD PTR [rcx+152]
|
|
mov QWORD PTR [rcx+144], rax
|
|
adc r8, QWORD PTR [r10+24]
|
|
mov rax, QWORD PTR [rcx+160]
|
|
mov QWORD PTR [rcx+152], r8
|
|
adc rax, QWORD PTR [r10+32]
|
|
mov r8, QWORD PTR [rcx+168]
|
|
mov QWORD PTR [rcx+160], rax
|
|
adc r8, QWORD PTR [r10+40]
|
|
mov rax, QWORD PTR [rcx+176]
|
|
mov QWORD PTR [rcx+168], r8
|
|
adc rax, QWORD PTR [r10+48]
|
|
mov r8, QWORD PTR [rcx+184]
|
|
mov QWORD PTR [rcx+176], rax
|
|
adc r8, QWORD PTR [r10+56]
|
|
mov rax, QWORD PTR [rcx+192]
|
|
mov QWORD PTR [rcx+184], r8
|
|
adc rax, QWORD PTR [r10+64]
|
|
mov r8, QWORD PTR [rcx+200]
|
|
mov QWORD PTR [rcx+192], rax
|
|
adc r8, QWORD PTR [r10+72]
|
|
mov rax, QWORD PTR [rcx+208]
|
|
mov QWORD PTR [rcx+200], r8
|
|
adc rax, QWORD PTR [r10+80]
|
|
mov r8, QWORD PTR [rcx+216]
|
|
mov QWORD PTR [rcx+208], rax
|
|
adc r8, QWORD PTR [r10+88]
|
|
mov rax, QWORD PTR [rcx+224]
|
|
mov QWORD PTR [rcx+216], r8
|
|
adc rax, QWORD PTR [r10+96]
|
|
mov r8, QWORD PTR [rcx+232]
|
|
mov QWORD PTR [rcx+224], rax
|
|
adc r8, QWORD PTR [r10+104]
|
|
mov rax, QWORD PTR [rcx+240]
|
|
mov QWORD PTR [rcx+232], r8
|
|
adc rax, QWORD PTR [r10+112]
|
|
mov r8, QWORD PTR [rcx+248]
|
|
mov QWORD PTR [rcx+240], rax
|
|
adc r8, QWORD PTR [r10+120]
|
|
mov rax, QWORD PTR [rcx+256]
|
|
mov QWORD PTR [rcx+248], r8
|
|
adc rax, QWORD PTR [r10+128]
|
|
mov r8, QWORD PTR [rcx+264]
|
|
mov QWORD PTR [rcx+256], rax
|
|
adc r8, QWORD PTR [r10+136]
|
|
mov rax, QWORD PTR [rcx+272]
|
|
mov QWORD PTR [rcx+264], r8
|
|
adc rax, QWORD PTR [r10+144]
|
|
mov r8, QWORD PTR [rcx+280]
|
|
mov QWORD PTR [rcx+272], rax
|
|
adc r8, QWORD PTR [r10+152]
|
|
mov rax, QWORD PTR [rcx+288]
|
|
mov QWORD PTR [rcx+280], r8
|
|
adc rax, QWORD PTR [r10+160]
|
|
mov r8, QWORD PTR [rcx+296]
|
|
mov QWORD PTR [rcx+288], rax
|
|
adc r8, QWORD PTR [r10+168]
|
|
mov rax, QWORD PTR [rcx+304]
|
|
mov QWORD PTR [rcx+296], r8
|
|
adc rax, QWORD PTR [r10+176]
|
|
mov r8, QWORD PTR [rcx+312]
|
|
mov QWORD PTR [rcx+304], rax
|
|
adc r8, QWORD PTR [r10+184]
|
|
mov rax, QWORD PTR [rcx+320]
|
|
mov QWORD PTR [rcx+312], r8
|
|
adc rax, QWORD PTR [r10+192]
|
|
mov r8, QWORD PTR [rcx+328]
|
|
mov QWORD PTR [rcx+320], rax
|
|
adc r8, QWORD PTR [r10+200]
|
|
mov rax, QWORD PTR [rcx+336]
|
|
mov QWORD PTR [rcx+328], r8
|
|
adc rax, QWORD PTR [r10+208]
|
|
mov r8, QWORD PTR [rcx+344]
|
|
mov QWORD PTR [rcx+336], rax
|
|
adc r8, QWORD PTR [r10+216]
|
|
mov rax, QWORD PTR [rcx+352]
|
|
mov QWORD PTR [rcx+344], r8
|
|
adc rax, QWORD PTR [r10+224]
|
|
mov r8, QWORD PTR [rcx+360]
|
|
mov QWORD PTR [rcx+352], rax
|
|
adc r8, QWORD PTR [r10+232]
|
|
mov rax, QWORD PTR [rcx+368]
|
|
mov QWORD PTR [rcx+360], r8
|
|
adc rax, QWORD PTR [r10+240]
|
|
mov r8, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [rcx+368], rax
|
|
adc r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [rcx+376], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+384], r9
|
|
; Add in place
|
|
mov rax, QWORD PTR [rcx+256]
|
|
xor r9, r9
|
|
add rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [rcx+264]
|
|
mov QWORD PTR [rcx+256], rax
|
|
adc r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [rcx+272]
|
|
mov QWORD PTR [rcx+264], r8
|
|
adc rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [rcx+280]
|
|
mov QWORD PTR [rcx+272], rax
|
|
adc r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [rcx+288]
|
|
mov QWORD PTR [rcx+280], r8
|
|
adc rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [rcx+296]
|
|
mov QWORD PTR [rcx+288], rax
|
|
adc r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [rcx+304]
|
|
mov QWORD PTR [rcx+296], r8
|
|
adc rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [rcx+312]
|
|
mov QWORD PTR [rcx+304], rax
|
|
adc r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [rcx+320]
|
|
mov QWORD PTR [rcx+312], r8
|
|
adc rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [rcx+328]
|
|
mov QWORD PTR [rcx+320], rax
|
|
adc r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [rcx+336]
|
|
mov QWORD PTR [rcx+328], r8
|
|
adc rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [rcx+344]
|
|
mov QWORD PTR [rcx+336], rax
|
|
adc r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [rcx+352]
|
|
mov QWORD PTR [rcx+344], r8
|
|
adc rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [rcx+360]
|
|
mov QWORD PTR [rcx+352], rax
|
|
adc r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [rcx+368]
|
|
mov QWORD PTR [rcx+360], r8
|
|
adc rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [rcx+368], rax
|
|
adc r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [rcx+384]
|
|
mov QWORD PTR [rcx+376], r8
|
|
adc rax, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [rcx+384], rax
|
|
adc r9, 0
|
|
; Add to zero
|
|
mov rax, QWORD PTR [rdx+136]
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [rcx+392], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [rcx+400], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [rcx+408], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [rcx+416], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [rcx+424], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [rcx+432], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [rcx+440], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+200]
|
|
mov QWORD PTR [rcx+448], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [rcx+456], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [rcx+464], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [rcx+472], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [rcx+480], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [rcx+488], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [rcx+496], r8
|
|
adc rax, 0
|
|
mov QWORD PTR [rcx+504], rax
|
|
add rsp, 664
|
|
pop r12
|
|
ret
|
|
sp_2048_sqr_avx2_32 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Sub b from a into a. (a -= b)
; *
; * a  A single precision integer and result. (rcx; words a[0..15])
; * b  A single precision integer. (rdx; words b[0..15])
; *
; * Syntax: MASM (ml64), Intel operand order.  Arguments arrive in rcx/rdx
; * (Microsoft x64 argument registers).
; *
; * Returns in rax: 0 when the subtraction did not borrow out of the top
; * word, all ones (-1) when it did.
; *
; * Modifies rax, r8, r9 and the flags.  Leaf routine: no stack use.
; *
; * The loads and stores are interleaved between the sub/sbb instructions;
; * this is safe because mov does not change CF, so the borrow chain runs
; * unbroken from word 0 to word 15.
; */
_text SEGMENT READONLY PARA
sp_2048_sub_in_place_16 PROC
        mov     r8, QWORD PTR [rcx]
        ; Zero the return value before the chain starts; the sub below
        ; overwrites whatever flags xor produced.
        xor     rax, rax
        sub     r8, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rcx+8]
        mov     QWORD PTR [rcx], r8
        sbb     r9, QWORD PTR [rdx+8]
        mov     r8, QWORD PTR [rcx+16]
        mov     QWORD PTR [rcx+8], r9
        sbb     r8, QWORD PTR [rdx+16]
        mov     r9, QWORD PTR [rcx+24]
        mov     QWORD PTR [rcx+16], r8
        sbb     r9, QWORD PTR [rdx+24]
        mov     r8, QWORD PTR [rcx+32]
        mov     QWORD PTR [rcx+24], r9
        sbb     r8, QWORD PTR [rdx+32]
        mov     r9, QWORD PTR [rcx+40]
        mov     QWORD PTR [rcx+32], r8
        sbb     r9, QWORD PTR [rdx+40]
        mov     r8, QWORD PTR [rcx+48]
        mov     QWORD PTR [rcx+40], r9
        sbb     r8, QWORD PTR [rdx+48]
        mov     r9, QWORD PTR [rcx+56]
        mov     QWORD PTR [rcx+48], r8
        sbb     r9, QWORD PTR [rdx+56]
        mov     r8, QWORD PTR [rcx+64]
        mov     QWORD PTR [rcx+56], r9
        sbb     r8, QWORD PTR [rdx+64]
        mov     r9, QWORD PTR [rcx+72]
        mov     QWORD PTR [rcx+64], r8
        sbb     r9, QWORD PTR [rdx+72]
        mov     r8, QWORD PTR [rcx+80]
        mov     QWORD PTR [rcx+72], r9
        sbb     r8, QWORD PTR [rdx+80]
        mov     r9, QWORD PTR [rcx+88]
        mov     QWORD PTR [rcx+80], r8
        sbb     r9, QWORD PTR [rdx+88]
        mov     r8, QWORD PTR [rcx+96]
        mov     QWORD PTR [rcx+88], r9
        sbb     r8, QWORD PTR [rdx+96]
        mov     r9, QWORD PTR [rcx+104]
        mov     QWORD PTR [rcx+96], r8
        sbb     r9, QWORD PTR [rdx+104]
        mov     r8, QWORD PTR [rcx+112]
        mov     QWORD PTR [rcx+104], r9
        sbb     r8, QWORD PTR [rdx+112]
        mov     r9, QWORD PTR [rcx+120]
        mov     QWORD PTR [rcx+112], r8
        sbb     r9, QWORD PTR [rdx+120]
        mov     QWORD PTR [rcx+120], r9
        ; rax = 0 - 0 - CF: 0 when no borrow, -1 on borrow
        sbb     rax, 0
        ret
sp_2048_sub_in_place_16 ENDP
_text ENDS
|
|
; /* Multiply a 32-word number by a single digit. (r = a * b)
; *
; * Register arguments as used by this routine:
; *   rcx  r  Result buffer; 33 words (r[0]..r[32]) are written.
; *   rdx  a  Multiplicand; 32 words (a[0]..a[31]) are read.
; *   r8   b  The single digit to multiply by.
; *
; * rax, rdx, r9, r10 and r11 are overwritten; r12 is saved and restored.
; */

; One middle column of the product.
; accLo:accMid hold the running sum on entry.  a[ofs/8] * b is added in, the
; completed low word is stored to r[ofs/8], and accHi is zeroed so that it can
; take the carry and become the middle word of the next column.
SP2048_MULD32_COL MACRO ofs, accLo, accMid, accHi
        mov     rax, r8
        xor     accHi, accHi
        mul     QWORD PTR [r9+ofs]
        add     accLo, rax
        mov     QWORD PTR [rcx+ofs], accLo
        adc     accMid, rdx
        adc     accHi, 0
ENDM

_text SEGMENT READONLY PARA
sp_2048_mul_d_32 PROC
        push    r12
        mov     r9, rdx                         ; r9 = a; rdx is overwritten by every mul

        ; A[0] * B
        mov     rax, r8
        xor     r12, r12
        mul     QWORD PTR [r9]
        mov     r10, rax
        mov     r11, rdx
        mov     QWORD PTR [rcx], r10

        ; A[1] * B .. A[30] * B; the three accumulators rotate every column
        SP2048_MULD32_COL   8, r11, r12, r10
        SP2048_MULD32_COL  16, r12, r10, r11
        SP2048_MULD32_COL  24, r10, r11, r12
        SP2048_MULD32_COL  32, r11, r12, r10
        SP2048_MULD32_COL  40, r12, r10, r11
        SP2048_MULD32_COL  48, r10, r11, r12
        SP2048_MULD32_COL  56, r11, r12, r10
        SP2048_MULD32_COL  64, r12, r10, r11
        SP2048_MULD32_COL  72, r10, r11, r12
        SP2048_MULD32_COL  80, r11, r12, r10
        SP2048_MULD32_COL  88, r12, r10, r11
        SP2048_MULD32_COL  96, r10, r11, r12
        SP2048_MULD32_COL 104, r11, r12, r10
        SP2048_MULD32_COL 112, r12, r10, r11
        SP2048_MULD32_COL 120, r10, r11, r12
        SP2048_MULD32_COL 128, r11, r12, r10
        SP2048_MULD32_COL 136, r12, r10, r11
        SP2048_MULD32_COL 144, r10, r11, r12
        SP2048_MULD32_COL 152, r11, r12, r10
        SP2048_MULD32_COL 160, r12, r10, r11
        SP2048_MULD32_COL 168, r10, r11, r12
        SP2048_MULD32_COL 176, r11, r12, r10
        SP2048_MULD32_COL 184, r12, r10, r11
        SP2048_MULD32_COL 192, r10, r11, r12
        SP2048_MULD32_COL 200, r11, r12, r10
        SP2048_MULD32_COL 208, r12, r10, r11
        SP2048_MULD32_COL 216, r10, r11, r12
        SP2048_MULD32_COL 224, r11, r12, r10
        SP2048_MULD32_COL 232, r12, r10, r11
        SP2048_MULD32_COL 240, r10, r11, r12

        ; A[31] * B: the high half becomes the extra top word r[32]
        mov     rax, r8
        mul     QWORD PTR [r9+248]
        add     r11, rax
        adc     r12, rdx
        mov     QWORD PTR [rcx+248], r11
        mov     QWORD PTR [rcx+256], r12

        pop     r12
        ret
sp_2048_mul_d_32 ENDP
_text ENDS
|
|
; /* Conditionally subtract b from a using the mask m. (r = a - (b & m))
; * m is -1 to subtract and 0 to leave a unchanged (r = a).
; *
; * Register arguments as used by this routine:
; *   rcx  r  Result; 16 words are written.
; *   rdx  a  Number to subtract from; 16 words are read.
; *   r8   b  Number to subtract; 16 words are read.
; *   r9   m  Mask ANDed with every word of b.
; *
; * Returns in rax: 0 when there was no borrow out of the top word, -1 when
; * there was.  The masked copy of b is staged in 128 bytes of stack first.
; * r8, r10 and r11 are overwritten.
; */

; Mask two consecutive words of b and park them in the stack scratch area.
SP2048_CSUB16_MASK2 MACRO ofs
        mov     r10, QWORD PTR [r8+ofs]
        mov     r11, QWORD PTR [r8+ofs+8]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+ofs], r10
        mov     QWORD PTR [rsp+ofs+8], r11
ENDM

; Subtract-with-borrow for the word at ofs, then store the previous word's
; result.  Only mov sits between the sbb instructions, so the borrow flag is
; carried from one word to the next.
SP2048_CSUB16_WORD MACRO ofs, curW, prevW
        mov     curW, QWORD PTR [rdx+ofs]
        mov     r8, QWORD PTR [rsp+ofs]
        sbb     curW, r8
        mov     QWORD PTR [rcx+ofs-8], prevW
ENDM

_text SEGMENT READONLY PARA
sp_2048_cond_sub_16 PROC
        sub     rsp, 128                        ; scratch for the 16 masked words of b
        mov     rax, 0

        ; scratch[i] = b[i] & m
        SP2048_CSUB16_MASK2   0
        SP2048_CSUB16_MASK2  16
        SP2048_CSUB16_MASK2  32
        SP2048_CSUB16_MASK2  48
        SP2048_CSUB16_MASK2  64
        SP2048_CSUB16_MASK2  80
        SP2048_CSUB16_MASK2  96
        SP2048_CSUB16_MASK2 112

        ; r = a - scratch; r8 is free to reuse now that b has been consumed
        mov     r10, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rsp]
        sub     r10, r8
        SP2048_CSUB16_WORD   8, r11, r10
        SP2048_CSUB16_WORD  16, r10, r11
        SP2048_CSUB16_WORD  24, r11, r10
        SP2048_CSUB16_WORD  32, r10, r11
        SP2048_CSUB16_WORD  40, r11, r10
        SP2048_CSUB16_WORD  48, r10, r11
        SP2048_CSUB16_WORD  56, r11, r10
        SP2048_CSUB16_WORD  64, r10, r11
        SP2048_CSUB16_WORD  72, r11, r10
        SP2048_CSUB16_WORD  80, r10, r11
        SP2048_CSUB16_WORD  88, r11, r10
        SP2048_CSUB16_WORD  96, r10, r11
        SP2048_CSUB16_WORD 104, r11, r10
        SP2048_CSUB16_WORD 112, r10, r11
        SP2048_CSUB16_WORD 120, r11, r10
        mov     QWORD PTR [rcx+120], r11

        sbb     rax, 0                          ; rax = 0 - final borrow
        add     rsp, 128
        ret
sp_2048_cond_sub_16 ENDP
_text ENDS
|
|
; /* Reduce the number back to 1024 bits (16 words) using Montgomery
; * reduction.
; *
; * Register arguments as used by this routine:
; *   rcx  a   Number to reduce in place; words a[0]..a[31] are read and
; *            updated, the result is left in a[0]..a[15].
; *   rdx  m   The modulus; 16 words are read.
; *   r8   mp  The digit representing the negative inverse of m mod 2^n.
; *
; * Sixteen rounds each add mu * m into a (mu = a[i] * mp), which zeroes the
; * lowest live word.  The final overflow bit is turned into an all-ones or
; * all-zero mask and sp_2048_cond_sub_16 then writes
; * a[0..15] = a[16..31] - (m & mask).
; *
; * NOTE(review): the call below is made after six pushes with no shadow
; * space reserved; the callee visible in this file is a leaf that only
; * touches its own scratch area, so this is tolerated here.
; */

; One middle column of a round: a[i+k] += m[k] * mu + cIn, carry out in cOut.
SP2048_MR16_COL MACRO ofs, cIn, cOut
        mov     rax, r13
        xor     cOut, cOut
        mul     QWORD PTR [r9+ofs]
        mov     r14, QWORD PTR [rcx+ofs]
        add     r14, rax
        adc     cOut, rdx
        add     r14, cIn
        mov     QWORD PTR [rcx+ofs], r14
        adc     cOut, 0
ENDM

_text SEGMENT READONLY PARA
sp_2048_mont_reduce_16 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        mov     r9, rdx                         ; r9 = m; rdx is overwritten by every mul
        xor     rsi, rsi                        ; rsi = carry passed between rounds
        ; i = 16
        mov     r10, 16
        mov     r15, QWORD PTR [rcx]            ; r15 = a[i+0], kept in a register
        mov     rdi, QWORD PTR [rcx+8]          ; rdi = a[i+1], kept in a register
L_2048_mont_loop_16:
        ; mu = a[i] * mp
        mov     r13, r15
        imul    r13, r8
        ; a[i+0] += m[0] * mu  (only the carry is kept; the word itself is dropped)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9]
        add     r15, rax
        adc     r12, rdx
        ; a[i+1] += m[1] * mu  (result becomes next round's a[i+0] in r15)
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+8]
        mov     r15, rdi
        add     r15, rax
        adc     r11, rdx
        add     r15, r12
        adc     r11, 0
        ; a[i+2] += m[2] * mu  (result becomes next round's a[i+1] in rdi)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+16]
        mov     rdi, QWORD PTR [rcx+16]
        add     rdi, rax
        adc     r12, rdx
        add     rdi, r11
        adc     r12, 0
        ; a[i+3] .. a[i+14] += m[3..14] * mu, carry alternating r12 / r11
        SP2048_MR16_COL  24, r12, r11
        SP2048_MR16_COL  32, r11, r12
        SP2048_MR16_COL  40, r12, r11
        SP2048_MR16_COL  48, r11, r12
        SP2048_MR16_COL  56, r12, r11
        SP2048_MR16_COL  64, r11, r12
        SP2048_MR16_COL  72, r12, r11
        SP2048_MR16_COL  80, r11, r12
        SP2048_MR16_COL  88, r12, r11
        SP2048_MR16_COL  96, r11, r12
        SP2048_MR16_COL 104, r12, r11
        SP2048_MR16_COL 112, r11, r12
        ; a[i+15] += m[15] * mu, then fold the previous round's carry into a[i+16]
        mov     rax, r13
        mul     QWORD PTR [r9+120]
        mov     r14, QWORD PTR [rcx+120]
        add     r12, rax
        adc     rdx, rsi
        mov     rsi, 0                          ; mov, not xor: the carry flag must survive
        adc     rsi, 0
        add     r14, r12
        mov     QWORD PTR [rcx+120], r14
        adc     QWORD PTR [rcx+128], rdx
        adc     rsi, 0
        ; i -= 1
        add     rcx, 8
        dec     r10
        jnz     L_2048_mont_loop_16
        ; rcx now points at a[16]; write back the two words held in registers
        mov     QWORD PTR [rcx], r15
        mov     QWORD PTR [rcx+8], rdi
        neg     rsi                             ; overflow bit 0/1 -> mask 0/-1
        ; Set up sp_2048_cond_sub_16(r, a, b, m).  m must be copied out of r9
        ; before r9 is overwritten with the mask.
        mov     r8, r9                          ; b    = modulus
        mov     r9, rsi                         ; mask
        mov     rdx, rcx                        ; a    = upper half, a[16..31]
        sub     rcx, 128                        ; r    = lower half, a[0..15]
        call    sp_2048_cond_sub_16
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_2048_mont_reduce_16 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
; /* Conditionally subtract b from a using the mask m. (BMI2 pext variant)
; * m is -1 to subtract and 0 to leave a unchanged (r = a).
; *
; * Register arguments as used by this routine:
; *   rcx  r  Result; 16 words are written.
; *   rdx  a  Number to subtract from; 16 words are read.
; *   r8   b  Number to subtract; 16 words are read.
; *   r9   m  Mask applied to every word of b with pext.
; *
; * Returns in rax: 0 when there was no borrow out of the top word, -1 when
; * there was.  r10 and r11 are overwritten; r12 is saved and restored.
; */

; Words 1..15: load and mask b, store the previous word's result, then
; subtract with borrow.  mov and pext leave the carry flag untouched, so the
; borrow chains from word to word.
SP2048_CSUBX16_WORD MACRO ofs, bW, aW, prevW
        mov     bW, QWORD PTR [r8+ofs]
        mov     aW, QWORD PTR [rdx+ofs]
        pext    bW, bW, r9
        mov     QWORD PTR [rcx+ofs-8], prevW
        sbb     aW, bW
ENDM

_text SEGMENT READONLY PARA
sp_2048_cond_sub_avx2_16 PROC
        push    r12
        mov     rax, 0

        ; word 0 starts the borrow chain
        mov     r12, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    r12, r12, r9
        sub     r10, r12

        ; words 1..15; the three scratch registers rotate every word
        SP2048_CSUBX16_WORD   8, r12, r11, r10
        SP2048_CSUBX16_WORD  16, r10, r12, r11
        SP2048_CSUBX16_WORD  24, r11, r10, r12
        SP2048_CSUBX16_WORD  32, r12, r11, r10
        SP2048_CSUBX16_WORD  40, r10, r12, r11
        SP2048_CSUBX16_WORD  48, r11, r10, r12
        SP2048_CSUBX16_WORD  56, r12, r11, r10
        SP2048_CSUBX16_WORD  64, r10, r12, r11
        SP2048_CSUBX16_WORD  72, r11, r10, r12
        SP2048_CSUBX16_WORD  80, r12, r11, r10
        SP2048_CSUBX16_WORD  88, r10, r12, r11
        SP2048_CSUBX16_WORD  96, r11, r10, r12
        SP2048_CSUBX16_WORD 104, r12, r11, r10
        SP2048_CSUBX16_WORD 112, r10, r12, r11
        SP2048_CSUBX16_WORD 120, r11, r10, r12
        mov     QWORD PTR [rcx+120], r10

        sbb     rax, 0                          ; rax = 0 - final borrow
        pop     r12
        ret
sp_2048_cond_sub_avx2_16 ENDP
_text ENDS
ENDIF
|
|
; /* Multiply a 16-word number by a single digit. (r = a * b)
; *
; * Register arguments as used by this routine:
; *   rcx  r  Result buffer; 17 words (r[0]..r[16]) are written.
; *   rdx  a  Multiplicand; 16 words (a[0]..a[15]) are read.
; *   r8   b  The single digit to multiply by.
; *
; * rax, rdx, r9, r10 and r11 are overwritten; r12 is saved and restored.
; */

; One middle column of the product.
; accLo:accMid hold the running sum on entry.  a[ofs/8] * b is added in, the
; completed low word is stored to r[ofs/8], and accHi is zeroed so that it can
; take the carry and become the middle word of the next column.
SP2048_MULD16_COL MACRO ofs, accLo, accMid, accHi
        mov     rax, r8
        xor     accHi, accHi
        mul     QWORD PTR [r9+ofs]
        add     accLo, rax
        mov     QWORD PTR [rcx+ofs], accLo
        adc     accMid, rdx
        adc     accHi, 0
ENDM

_text SEGMENT READONLY PARA
sp_2048_mul_d_16 PROC
        push    r12
        mov     r9, rdx                         ; r9 = a; rdx is overwritten by every mul

        ; A[0] * B
        mov     rax, r8
        xor     r12, r12
        mul     QWORD PTR [r9]
        mov     r10, rax
        mov     r11, rdx
        mov     QWORD PTR [rcx], r10

        ; A[1] * B .. A[14] * B; the three accumulators rotate every column
        SP2048_MULD16_COL   8, r11, r12, r10
        SP2048_MULD16_COL  16, r12, r10, r11
        SP2048_MULD16_COL  24, r10, r11, r12
        SP2048_MULD16_COL  32, r11, r12, r10
        SP2048_MULD16_COL  40, r12, r10, r11
        SP2048_MULD16_COL  48, r10, r11, r12
        SP2048_MULD16_COL  56, r11, r12, r10
        SP2048_MULD16_COL  64, r12, r10, r11
        SP2048_MULD16_COL  72, r10, r11, r12
        SP2048_MULD16_COL  80, r11, r12, r10
        SP2048_MULD16_COL  88, r12, r10, r11
        SP2048_MULD16_COL  96, r10, r11, r12
        SP2048_MULD16_COL 104, r11, r12, r10
        SP2048_MULD16_COL 112, r12, r10, r11

        ; A[15] * B: the high half becomes the extra top word r[16]
        mov     rax, r8
        mul     QWORD PTR [r9+120]
        add     r10, rax
        adc     r11, rdx
        mov     QWORD PTR [rcx+120], r10
        mov     QWORD PTR [rcx+128], r11

        pop     r12
        ret
sp_2048_mul_d_16 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
; /* Multiply a 16-word number by a single digit. (r = a * b)
; * Variant using mulx with the adcx / adox dual carry chains.
; *
; * Register arguments as used by this routine:
; *   rcx  r  Result buffer; 17 words (r[0]..r[16]) are written.
; *   rdx  a  Multiplicand; 16 words (a[0]..a[15]) are read.
; *   r8   b  The single digit to multiply by.
; *
; * rax, rdx, r9, r10 and r11 are overwritten; r12 and r13 are saved and
; * restored.  r13 is held at zero for the whole routine.
; */

; One middle column: cur holds the high half carried in from the previous
; column.  The low half of a[ofs/8] * b joins it on the CF chain (adcx), the
; high half seeds nxt on the OF chain (adox), and cur is stored to r[ofs/8].
SP2048_MULDX16_COL MACRO ofs, cur, nxt
        mulx    r10, r9, QWORD PTR [rax+ofs]
        mov     nxt, r13
        adcx    cur, r9
        adox    nxt, r10
        mov     QWORD PTR [rcx+ofs], cur
ENDM

_text SEGMENT READONLY PARA
sp_2048_mul_d_avx2_16 PROC
        push    r12
        push    r13
        mov     rax, rdx                        ; rax = a; rdx is mulx's implicit operand

        ; A[0] * B
        mov     rdx, r8
        xor     r13, r13                        ; zero register; also clears CF and OF
        mulx    r12, r11, QWORD PTR [rax]
        mov     QWORD PTR [rcx], r11

        ; A[1] * B .. A[14] * B; r12 and r11 swap roles every column
        SP2048_MULDX16_COL   8, r12, r11
        SP2048_MULDX16_COL  16, r11, r12
        SP2048_MULDX16_COL  24, r12, r11
        SP2048_MULDX16_COL  32, r11, r12
        SP2048_MULDX16_COL  40, r12, r11
        SP2048_MULDX16_COL  48, r11, r12
        SP2048_MULDX16_COL  56, r12, r11
        SP2048_MULDX16_COL  64, r11, r12
        SP2048_MULDX16_COL  72, r12, r11
        SP2048_MULDX16_COL  80, r11, r12
        SP2048_MULDX16_COL  88, r12, r11
        SP2048_MULDX16_COL  96, r11, r12
        SP2048_MULDX16_COL 104, r12, r11
        SP2048_MULDX16_COL 112, r11, r12

        ; A[15] * B: merge both carry chains into the top word r[16]
        mulx    r10, r9, QWORD PTR [rax+120]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        adcx    r11, r13
        mov     QWORD PTR [rcx+120], r12
        mov     QWORD PTR [rcx+128], r11

        pop     r13
        pop     r12
        ret
sp_2048_mul_d_avx2_16 ENDP
_text ENDS
ENDIF
|
|
IFDEF _WIN64
; /* Divide the double width number (d1|d0) by the divisor. ((d1|d0) / div)
; *
; * Register arguments as used by this routine:
; *   rcx  d1   The high order half of the number to divide.
; *   rdx  d0   The low order half of the number to divide.
; *   r8   div  The divisor.
; *
; * Returns the quotient in rax (the DIV instruction leaves the remainder in
; * rdx).  DIV raises a divide error when div is zero or when the quotient
; * does not fit in 64 bits, i.e. unless d1 < div; the caller must ensure this.
; */
_text SEGMENT READONLY PARA
div_2048_word_asm_16 PROC
        mov     rax, rdx                        ; rax = d0 (low half of the dividend)
        mov     rdx, rcx                        ; rdx = d1 (high half of the dividend)
        div     r8                              ; rax = rdx:rax / div
        ret
div_2048_word_asm_16 ENDP
_text ENDS
ENDIF
|
|
; /* Compare a with b in constant time.
; *
; * Register arguments as used by this routine:
; *   rcx  a  A single precision integer; 16 words are read.
; *   rdx  b  A single precision integer; 16 words are read.
; *
; * Returns in rax: -1, 0 or 1 when a is less than, equal to or greater than
; * b respectively.
; *
; * All 16 words are always visited, from most to least significant, with no
; * branches.  r8 is a mask that stays all-ones until the first differing
; * word; after that both operands are masked to zero so later words cannot
; * change the outcome.  r8..r11 are overwritten; r12 is saved and restored.
; */

; Compare one word under the "still equal" mask.
SP2048_CMP16_WORD MACRO ofs
        mov     r11, QWORD PTR [rcx+ofs]
        mov     r12, QWORD PTR [rdx+ofs]
        and     r11, r8
        and     r12, r8
        sub     r11, r12
        cmova   rax, r10                        ; a word above b word  -> result  1
        cmovc   rax, r8                         ; a word below b word  -> result -1 (mask is still -1)
        cmovnz  r8, r9                          ; words differ         -> clear the mask
ENDM

_text SEGMENT READONLY PARA
sp_2048_cmp_16 PROC
        push    r12
        xor     r9, r9                          ; r9  = 0, replacement mask
        mov     r8, -1                          ; r8  = "all words equal so far" mask
        mov     rax, -1                         ; rax = result; cancelled to 0 below when equal
        mov     r10, 1                          ; r10 = constant 1 for cmova

        SP2048_CMP16_WORD 120
        SP2048_CMP16_WORD 112
        SP2048_CMP16_WORD 104
        SP2048_CMP16_WORD  96
        SP2048_CMP16_WORD  88
        SP2048_CMP16_WORD  80
        SP2048_CMP16_WORD  72
        SP2048_CMP16_WORD  64
        SP2048_CMP16_WORD  56
        SP2048_CMP16_WORD  48
        SP2048_CMP16_WORD  40
        SP2048_CMP16_WORD  32
        SP2048_CMP16_WORD  24
        SP2048_CMP16_WORD  16
        SP2048_CMP16_WORD   8
        SP2048_CMP16_WORD   0

        xor     rax, r8                         ; equal: -1 xor -1 = 0; otherwise the mask is 0
        pop     r12
        ret
sp_2048_cmp_16 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
; /* Reduce the number back to 1024 bits (16 words) using Montgomery
; * reduction.  Variant using mulx, adcx / adox and pext.
; *
; * Register arguments as used by this routine:
; *   rcx  a   Number to reduce in place; words a[0]..a[31] are read and
; *            updated, the result is left in a[0]..a[15].
; *   rdx  m   The modulus; 16 words are read.
; *   r8   mp  The digit representing the negative inverse of m mod 2^n.
; *
; * The loop runs 8 times and performs two reduction rounds per pass.  The
; * four lowest live words of a are kept in r15, rdi, rsi and rbx; rbp holds
; * the carry passed from one round to the next.  After the loop the carry is
; * negated into a mask and a[0..15] = a[16..31] - (m & mask) is computed
; * inline.  rax, rcx, rdx, r8..r11 are overwritten; the eight pushed
; * registers are restored.
; */

; Middle column of a round: cur (already holding a[i+k]) takes the low
; product half on the CF chain, nxt is loaded with a[i+k+1] and takes the
; high half on the OF chain, and cur is stored one word below where nxt was
; loaded from.
SP2048_MRX16_COL MACRO mOfs, aOfs, cur, nxt
        mulx    rcx, rax, QWORD PTR [r10+mOfs]
        mov     nxt, QWORD PTR [r9+aOfs]
        adcx    cur, rax
        adox    nxt, rcx
        mov     QWORD PTR [r9+aOfs-8], cur
ENDM

; One full reduction round.  aShift is the byte offset of this round's words
; relative to r9: 0 for the first round of a loop pass, 8 for the second.
SP2048_MRX16_ROUND MACRO aShift
        ; mu = a[i] * mp
        mov     rdx, r15
        mov     r12, r15
        imul    rdx, r8
        xor     r14, r14                        ; r14 = 0; also clears CF and OF
        ; a[i+0] += m[0] * mu  (the word itself is dropped, only its carry matters)
        mulx    rcx, rax, QWORD PTR [r10]
        mov     r15, rdi
        adcx    r12, rax
        adox    r15, rcx
        ; a[i+1] += m[1] * mu
        mulx    rcx, rax, QWORD PTR [r10+8]
        mov     rdi, rsi
        adcx    r15, rax
        adox    rdi, rcx
        ; a[i+2] += m[2] * mu
        mulx    rcx, rax, QWORD PTR [r10+16]
        mov     rsi, rbx
        adcx    rdi, rax
        adox    rsi, rcx
        ; a[i+3] += m[3] * mu
        mulx    rcx, rax, QWORD PTR [r10+24]
        mov     rbx, QWORD PTR [r9+aShift-32]
        adcx    rsi, rax
        adox    rbx, rcx
        ; a[i+4] += m[4] * mu
        mulx    rcx, rax, QWORD PTR [r10+32]
        mov     r13, QWORD PTR [r9+aShift-24]
        adcx    rbx, rax
        adox    r13, rcx
        ; a[i+5] .. a[i+15] += m[5..15] * mu
        SP2048_MRX16_COL  40, aShift-16, r13, r12
        SP2048_MRX16_COL  48, aShift-8,  r12, r13
        SP2048_MRX16_COL  56, aShift,    r13, r12
        SP2048_MRX16_COL  64, aShift+8,  r12, r13
        SP2048_MRX16_COL  72, aShift+16, r13, r12
        SP2048_MRX16_COL  80, aShift+24, r12, r13
        SP2048_MRX16_COL  88, aShift+32, r13, r12
        SP2048_MRX16_COL  96, aShift+40, r12, r13
        SP2048_MRX16_COL 104, aShift+48, r13, r12
        SP2048_MRX16_COL 112, aShift+56, r12, r13
        SP2048_MRX16_COL 120, aShift+64, r13, r12
        ; a[i+16] += previous round's carry; collect both chains into rbp
        adcx    r12, rbp
        mov     rbp, r14
        mov     QWORD PTR [r9+aShift+64], r12
        adox    rbp, r14
        adcx    rbp, r14
ENDM

; Words 4..15 of the final masked subtract: load and mask m, load a from
; memory, store the previous word's result, subtract with borrow.
SP2048_MRX16_SUB MACRO ofs, mW, aW, prevW
        mov     mW, QWORD PTR [r10+ofs]
        mov     aW, QWORD PTR [r8+ofs]
        pext    mW, mW, rbp
        mov     QWORD PTR [r9+ofs-8], prevW
        sbb     aW, mW
ENDM

_text SEGMENT READONLY PARA
sp_2048_mont_reduce_avx2_16 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        push    rbp
        mov     r9, rcx                         ; r9  = a
        mov     r10, rdx                        ; r10 = m; rdx is mulx's implicit operand
        xor     rbp, rbp
        ; i = 16
        mov     r11, 16
        mov     r15, QWORD PTR [r9]
        mov     rdi, QWORD PTR [r9+8]
        mov     rsi, QWORD PTR [r9+16]
        mov     rbx, QWORD PTR [r9+24]
        add     r9, 64                          ; bias the pointer so displacements stay small
        xor     rbp, rbp                        ; rbp = carry between rounds, starts at 0
L_2048_mont_loop_avx2_16:
        SP2048_MRX16_ROUND 0
        SP2048_MRX16_ROUND 8
        ; a += 2
        add     r9, 16
        ; i -= 2
        sub     r11, 2
        jnz     L_2048_mont_loop_avx2_16

        sub     r9, 64                          ; undo the bias: r9 = &a[16]
        neg     rbp                             ; carry 0/1 -> mask 0/-1
        mov     r8, r9                          ; r8 = upper half (source)
        sub     r9, 128                         ; r9 = lower half (destination)

        ; words 0..3 of the upper half are still in r15, rdi, rsi, rbx
        mov     rcx, QWORD PTR [r10]
        mov     rdx, r15
        pext    rcx, rcx, rbp
        sub     rdx, rcx
        mov     rcx, QWORD PTR [r10+8]
        mov     rax, rdi
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+16]
        mov     rcx, rsi
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+8], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+24]
        mov     rdx, rbx
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+16], rcx
        sbb     rdx, rax
        ; words 4..15 come from memory; the three scratch registers rotate
        SP2048_MRX16_SUB  32, rcx, rax, rdx
        SP2048_MRX16_SUB  40, rdx, rcx, rax
        SP2048_MRX16_SUB  48, rax, rdx, rcx
        SP2048_MRX16_SUB  56, rcx, rax, rdx
        SP2048_MRX16_SUB  64, rdx, rcx, rax
        SP2048_MRX16_SUB  72, rax, rdx, rcx
        SP2048_MRX16_SUB  80, rcx, rax, rdx
        SP2048_MRX16_SUB  88, rdx, rcx, rax
        SP2048_MRX16_SUB  96, rax, rdx, rcx
        SP2048_MRX16_SUB 104, rcx, rax, rdx
        SP2048_MRX16_SUB 112, rdx, rcx, rax
        SP2048_MRX16_SUB 120, rax, rdx, rcx
        mov     QWORD PTR [r9+120], rdx

        pop     rbp
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_2048_mont_reduce_avx2_16 ENDP
_text ENDS
ENDIF
|
|
; /* Conditionally subtract b from a using the mask m.
|
|
; * m is -1 to subtract and 0 when not subtracting.
|
|
; *
|
|
; * r A single precision number representing condition subtract result.
|
|
; * a A single precision number to subtract from.
|
|
; * b A single precision number to subtract.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_2048_cond_sub_32 PROC
|
|
sub rsp, 256
|
|
mov rax, 0
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp], r10
|
|
mov QWORD PTR [rsp+8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+16], r10
|
|
mov QWORD PTR [rsp+24], r11
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+32], r10
|
|
mov QWORD PTR [rsp+40], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+48], r10
|
|
mov QWORD PTR [rsp+56], r11
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+64], r10
|
|
mov QWORD PTR [rsp+72], r11
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+80], r10
|
|
mov QWORD PTR [rsp+88], r11
|
|
mov r10, QWORD PTR [r8+96]
|
|
mov r11, QWORD PTR [r8+104]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+96], r10
|
|
mov QWORD PTR [rsp+104], r11
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r11, QWORD PTR [r8+120]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+112], r10
|
|
mov QWORD PTR [rsp+120], r11
|
|
mov r10, QWORD PTR [r8+128]
|
|
mov r11, QWORD PTR [r8+136]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+128], r10
|
|
mov QWORD PTR [rsp+136], r11
|
|
mov r10, QWORD PTR [r8+144]
|
|
mov r11, QWORD PTR [r8+152]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+144], r10
|
|
mov QWORD PTR [rsp+152], r11
|
|
mov r10, QWORD PTR [r8+160]
|
|
mov r11, QWORD PTR [r8+168]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+160], r10
|
|
mov QWORD PTR [rsp+168], r11
|
|
mov r10, QWORD PTR [r8+176]
|
|
mov r11, QWORD PTR [r8+184]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+176], r10
|
|
mov QWORD PTR [rsp+184], r11
|
|
mov r10, QWORD PTR [r8+192]
|
|
mov r11, QWORD PTR [r8+200]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+192], r10
|
|
mov QWORD PTR [rsp+200], r11
|
|
mov r10, QWORD PTR [r8+208]
|
|
mov r11, QWORD PTR [r8+216]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+208], r10
|
|
mov QWORD PTR [rsp+216], r11
|
|
mov r10, QWORD PTR [r8+224]
|
|
mov r11, QWORD PTR [r8+232]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+224], r10
|
|
mov QWORD PTR [rsp+232], r11
|
|
mov r10, QWORD PTR [r8+240]
|
|
mov r11, QWORD PTR [r8+248]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+240], r10
|
|
mov QWORD PTR [rsp+248], r11
|
|
mov r10, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [rsp]
|
|
sub r10, r8
|
|
mov r11, QWORD PTR [rdx+8]
|
|
mov r8, QWORD PTR [rsp+8]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx], r10
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [rsp+16]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+8], r11
|
|
mov r11, QWORD PTR [rdx+24]
|
|
mov r8, QWORD PTR [rsp+24]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov r10, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [rsp+32]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov r11, QWORD PTR [rdx+40]
|
|
mov r8, QWORD PTR [rsp+40]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+32], r10
|
|
mov r10, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [rsp+48]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+40], r11
|
|
mov r11, QWORD PTR [rdx+56]
|
|
mov r8, QWORD PTR [rsp+56]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+48], r10
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [rsp+64]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+56], r11
|
|
mov r11, QWORD PTR [rdx+72]
|
|
mov r8, QWORD PTR [rsp+72]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+64], r10
|
|
mov r10, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [rsp+80]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+72], r11
|
|
mov r11, QWORD PTR [rdx+88]
|
|
mov r8, QWORD PTR [rsp+88]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+80], r10
|
|
mov r10, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [rsp+96]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+88], r11
|
|
mov r11, QWORD PTR [rdx+104]
|
|
mov r8, QWORD PTR [rsp+104]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+96], r10
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [rsp+112]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+104], r11
|
|
mov r11, QWORD PTR [rdx+120]
|
|
mov r8, QWORD PTR [rsp+120]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+112], r10
|
|
mov r10, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [rsp+128]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+120], r11
|
|
mov r11, QWORD PTR [rdx+136]
|
|
mov r8, QWORD PTR [rsp+136]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+128], r10
|
|
mov r10, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [rsp+144]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+136], r11
|
|
mov r11, QWORD PTR [rdx+152]
|
|
mov r8, QWORD PTR [rsp+152]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+144], r10
|
|
mov r10, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [rsp+160]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+152], r11
|
|
mov r11, QWORD PTR [rdx+168]
|
|
mov r8, QWORD PTR [rsp+168]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+160], r10
|
|
mov r10, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [rsp+176]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+168], r11
|
|
mov r11, QWORD PTR [rdx+184]
|
|
mov r8, QWORD PTR [rsp+184]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+176], r10
|
|
mov r10, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [rsp+192]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+184], r11
|
|
mov r11, QWORD PTR [rdx+200]
|
|
mov r8, QWORD PTR [rsp+200]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+192], r10
|
|
mov r10, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [rsp+208]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+200], r11
|
|
mov r11, QWORD PTR [rdx+216]
|
|
mov r8, QWORD PTR [rsp+216]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+208], r10
|
|
mov r10, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [rsp+224]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+216], r11
|
|
mov r11, QWORD PTR [rdx+232]
|
|
mov r8, QWORD PTR [rsp+232]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+224], r10
|
|
mov r10, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [rsp+240]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+232], r11
|
|
mov r11, QWORD PTR [rdx+248]
|
|
mov r8, QWORD PTR [rsp+248]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+240], r10
|
|
mov QWORD PTR [rcx+248], r11
|
|
sbb rax, 0
|
|
add rsp, 256
|
|
ret
|
|
sp_2048_cond_sub_32 ENDP
|
|
_text ENDS
|
|
; /* Reduce the number back to 2048 bits using Montgomery reduction.
;  *
;  * rcx  a   Number to reduce in place. Words a[0..63] are read and
;  *          written; the reduced 32-word result is left in a[0..31].
;  * rdx  m   The modulus: 32 64-bit words.
;  * r8   mp  The digit representing the negative inverse of m mod 2^n.
;  *
;  * For each of 32 rounds: mu = a[i] * mp, then a[i..i+31] += m * mu with
;  * the carry rippled into a[i+32]. Afterwards m is subtracted from the
;  * upper half, under a mask derived from the final carry, by calling
;  * sp_2048_cond_sub_32 with the result directed at a[0..31].
;  *
;  * Register use inside the loop:
;  *   r9       m (moved out of rdx, which mul overwrites)
;  *   r10      rounds remaining
;  *   r13      mu
;  *   r15/rdi  a[i] / a[i+1], kept in registers from round to round
;  *   r11/r12  alternating carry words
;  *   r14      scratch for the word being updated
;  *   rsi      carry out of the top word of the previous round (0 or 1)
;  */
_text SEGMENT READONLY PARA
sp_2048_mont_reduce_32 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        mov     r9, rdx
        xor     rsi, rsi
        ; i = 32
        mov     r10, 32
        mov     r15, QWORD PTR [rcx]
        mov     rdi, QWORD PTR [rcx+8]
L_2048_mont_loop_32:
        ; mu = a[i] * mp
        mov     r13, r15
        imul    r13, r8
        ; a[i+0] += m[0] * mu
        ; Only the carry (r12) is kept; the low word is never stored.
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9]
        add     r15, rax
        adc     r12, rdx
        ; a[i+1] += m[1] * mu
        ; The sum stays in r15 and is the next round's a[i].
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+8]
        mov     r15, rdi
        add     r15, rax
        adc     r11, rdx
        add     r15, r12
        adc     r11, 0
        ; a[i+2] += m[2] * mu
        ; The sum stays in rdi and is the next round's a[i+1].
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+16]
        mov     rdi, QWORD PTR [rcx+16]
        add     rdi, rax
        adc     r12, rdx
        add     rdi, r11
        adc     r12, 0
        ; a[i+3] .. a[i+30] += m[3] .. m[30] * mu
        ; Two words per repetition so the carry alternates r12 -> r11 -> r12.
SP2048_MR32_OFF = 24
        REPEAT  14
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+SP2048_MR32_OFF]
        mov     r14, QWORD PTR [rcx+SP2048_MR32_OFF]
        add     r14, rax
        adc     r11, rdx
        add     r14, r12
        mov     QWORD PTR [rcx+SP2048_MR32_OFF], r14
        adc     r11, 0
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+SP2048_MR32_OFF+8]
        mov     r14, QWORD PTR [rcx+SP2048_MR32_OFF+8]
        add     r14, rax
        adc     r12, rdx
        add     r14, r11
        mov     QWORD PTR [rcx+SP2048_MR32_OFF+8], r14
        adc     r12, 0
SP2048_MR32_OFF = SP2048_MR32_OFF + 16
        ENDM
        ; a[i+31] += m[31] * mu
        mov     rax, r13
        mul     QWORD PTR [r9+248]
        mov     r14, QWORD PTR [rcx+248]
        add     r12, rax
        adc     rdx, rsi
        ; mov (not xor) is required here: CF from the adc above must survive
        ; into the adc below.
        mov     rsi, 0
        adc     rsi, 0
        add     r14, r12
        mov     QWORD PTR [rcx+248], r14
        adc     QWORD PTR [rcx+256], rdx
        adc     rsi, 0
        ; i -= 1
        add     rcx, 8
        dec     r10
        jnz     L_2048_mont_loop_32
        ; rcx now points at a[32]; write back the two words held in registers.
        mov     QWORD PTR [rcx], r15
        mov     QWORD PTR [rcx+8], rdi
        ; mask = 0 - carry: all ones when the final round carried out.
        neg     rsi
        ; Arguments for sp_2048_cond_sub_32(r, a, b, m).
        ; m must be copied to r8 BEFORE r9 is overwritten with the mask.
        mov     r8, r9
        mov     r9, rsi
        mov     rdx, rcx
        sub     rcx, 256
        call    sp_2048_cond_sub_32
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_2048_mont_reduce_32 ENDP
_text ENDS
|
|
; /* Sub b from a into r. (r = a - b)
;  *
;  * rcx  r  Result: 32 64-bit words.
;  * rdx  a  Number to subtract from: 32 64-bit words.
;  * r8   b  Number to subtract: 32 64-bit words.
;  *
;  * Returns in rax: 0 when no borrow left the top word, -1 when one did.
;  * Overwrites r9 and r10.
;  */
_text SEGMENT READONLY PARA
sp_2048_sub_32 PROC
        mov     r9, QWORD PTR [rdx]
        ; rax is cleared before the first sub, so the borrow chain that
        ; follows is not disturbed by the flags xor writes.
        xor     rax, rax
        sub     r9, QWORD PTR [r8]
        ; Words 1..30, two per repetition, alternating r10 and r9. Each store
        ; is delayed by one word; only mov separates the sbb instructions so
        ; the borrow in CF is carried from word to word.
SP2048_SUB32_OFF = 8
        REPEAT  15
        mov     r10, QWORD PTR [rdx+SP2048_SUB32_OFF]
        mov     QWORD PTR [rcx+SP2048_SUB32_OFF-8], r9
        sbb     r10, QWORD PTR [r8+SP2048_SUB32_OFF]
        mov     r9, QWORD PTR [rdx+SP2048_SUB32_OFF+8]
        mov     QWORD PTR [rcx+SP2048_SUB32_OFF], r10
        sbb     r9, QWORD PTR [r8+SP2048_SUB32_OFF+8]
SP2048_SUB32_OFF = SP2048_SUB32_OFF + 16
        ENDM
        ; Word 31 and the two stores still pending.
        mov     r10, QWORD PTR [rdx+248]
        mov     QWORD PTR [rcx+240], r9
        sbb     r10, QWORD PTR [r8+248]
        mov     QWORD PTR [rcx+248], r10
        ; rax = 0 - final borrow: 0 or -1.
        sbb     rax, 0
        ret
sp_2048_sub_32 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
; /* Mul a by digit b into r. (r = a * b)
;  *
;  * rcx  r  Result: 33 64-bit words (r[0..32]).
;  * rdx  a  Number to multiply: 32 64-bit words.
;  * r8   b  The 64-bit digit to multiply by.
;  *
;  * Uses mulx with two independent carry chains: adcx (CF) accumulates the
;  * low halves of the products, adox (OF) accumulates the high halves.
;  * Overwrites rax, rdx, r9, r10 and r11; r12 and r13 are saved and restored.
;  */
_text SEGMENT READONLY PARA
sp_2048_mul_d_avx2_32 PROC
        push    r12
        push    r13
        ; rax = a; rdx must hold the digit because it is mulx's implicit operand.
        mov     rax, rdx
        ; A[0] * B
        mov     rdx, r8
        ; r13 = 0 is the zero source for each new high word; the xor also
        ; clears CF and OF ahead of the adcx/adox chains.
        xor     r13, r13
        mulx    r12, r11, QWORD PTR [rax]
        mov     QWORD PTR [rcx], r11
        ; A[1] * B .. A[30] * B, two words per repetition; the running
        ; word alternates between r12 and r11.
SP2048_MULD32_OFF = 8
        REPEAT  15
        mulx    r10, r9, QWORD PTR [rax+SP2048_MULD32_OFF]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+SP2048_MULD32_OFF], r12
        mulx    r10, r9, QWORD PTR [rax+SP2048_MULD32_OFF+8]
        mov     r12, r13
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+SP2048_MULD32_OFF+8], r11
SP2048_MULD32_OFF = SP2048_MULD32_OFF + 16
        ENDM
        ; A[31] * B
        mulx    r10, r9, QWORD PTR [rax+248]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        ; Fold the last CF into the top word.
        adcx    r11, r13
        mov     QWORD PTR [rcx+248], r12
        mov     QWORD PTR [rcx+256], r11
        pop     r13
        pop     r12
        ret
sp_2048_mul_d_avx2_32 ENDP
_text ENDS
ENDIF
|
|
IFDEF _WIN64
; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
;  *
;  * rcx  d1   The high order half of the number to divide.
;  * rdx  d0   The low order half of the number to divide.
;  * r8   div  The divisor.
;  *
;  * Returns the quotient in rax; the remainder is left in rdx.
;  * The div instruction faults when div is 0 or when the quotient does not
;  * fit in 64 bits (d1 >= div); no check is made here, so the caller has to
;  * rule both out.
;  */
_text SEGMENT READONLY PARA
div_2048_word_asm_32 PROC
        ; div takes the dividend in rdx:rax. Load rax from rdx (d0) first,
        ; then overwrite rdx with d1.
        mov     rax, rdx
        mov     rdx, rcx
        div     r8
        ret
div_2048_word_asm_32 ENDP
_text ENDS
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
; /* Conditionally subtract b from a using the mask m. (r = a - (b & m))
;  *
;  * m is -1 (all ones) to subtract b, or 0 to copy a into r unchanged.
;  * All 32 words are processed whatever the mask is; there are no branches.
;  *
;  * rcx  r  Result of the conditional subtraction: 32 64-bit words.
;  * rdx  a  Number to subtract from: 32 64-bit words.
;  * r8   b  Number to subtract: 32 64-bit words.
;  * r9   m  Mask value to apply to every word of b.
;  *
;  * Returns in rax: 0 when no borrow left the top word, -1 when one did.
;  * Overwrites r10 and r11; r12 is saved and restored.
;  *
;  * pext does the masking: with a mask of all ones it returns the word
;  * unchanged and with a mask of 0 it returns 0. It is used in place of
;  * AND because it leaves the flags alone, so it can sit inside the
;  * sub/sbb borrow chain.
;  */
_text SEGMENT READONLY PARA
sp_2048_cond_sub_avx2_32 PROC
        push    r12
        mov     rax, 0
        ; Word 0 starts the borrow chain.
        mov     r12, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    r12, r12, r9
        sub     r10, r12
        ; Words 1..30, three per repetition. The three scratch registers
        ; rotate roles (masked b word, a word/result, result waiting to be
        ; stored), so each store is delayed by one word.
SP2048_CSA32_OFF = 8
        REPEAT  10
        mov     r12, QWORD PTR [r8+SP2048_CSA32_OFF]
        mov     r11, QWORD PTR [rdx+SP2048_CSA32_OFF]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx+SP2048_CSA32_OFF-8], r10
        sbb     r11, r12
        mov     r10, QWORD PTR [r8+SP2048_CSA32_OFF+8]
        mov     r12, QWORD PTR [rdx+SP2048_CSA32_OFF+8]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+SP2048_CSA32_OFF], r11
        sbb     r12, r10
        mov     r11, QWORD PTR [r8+SP2048_CSA32_OFF+16]
        mov     r10, QWORD PTR [rdx+SP2048_CSA32_OFF+16]
        pext    r11, r11, r9
        mov     QWORD PTR [rcx+SP2048_CSA32_OFF+8], r12
        sbb     r10, r11
SP2048_CSA32_OFF = SP2048_CSA32_OFF + 24
        ENDM
        ; Word 31 and the two stores still pending.
        mov     r12, QWORD PTR [r8+248]
        mov     r11, QWORD PTR [rdx+248]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx+240], r10
        sbb     r11, r12
        mov     QWORD PTR [rcx+248], r11
        ; rax = 0 - final borrow: 0 or -1.
        sbb     rax, 0
        pop     r12
        ret
sp_2048_cond_sub_avx2_32 ENDP
_text ENDS
ENDIF
|
|
; /* Compare a with b in constant time.
;  *
;  * rcx  a  A single precision integer: 32 64-bit words.
;  * rdx  b  A single precision integer: 32 64-bit words.
;  *
;  * Returns in rax: -1, 0 or 1 when a is less than, equal to or greater
;  * than b respectively.
;  *
;  * All 32 words are visited from the most significant down, with no
;  * branches. Register use:
;  *   r8   "still equal" mask: -1 until the first differing word, then 0
;  *   r9   constant 0
;  *   r10  constant 1
;  *   rax  result so far (starts at -1)
;  * r11 and r12 are scratch; r12 is saved and restored.
;  */
_text SEGMENT READONLY PARA
sp_2048_cmp_32 PROC
        push    r12
        xor     r9, r9
        mov     r8, -1
        mov     rax, -1
        mov     r10, 1
        ; Per word: mask both operands with r8 and subtract.
        ;   above (a word > b word)  -> rax = 1
        ;   carry (a word < b word)  -> rax = r8, which is still -1 here
        ;   not zero                 -> r8 = 0
        ; Once r8 is 0 every later word is masked to 0, compares equal and
        ; leaves rax unchanged.
SP2048_CMP32_OFF = 248
        REPEAT  32
        mov     r11, QWORD PTR [rcx+SP2048_CMP32_OFF]
        mov     r12, QWORD PTR [rdx+SP2048_CMP32_OFF]
        and     r11, r8
        and     r12, r8
        sub     r11, r12
        cmova   rax, r10
        cmovc   rax, r8
        cmovnz  r8, r9
SP2048_CMP32_OFF = SP2048_CMP32_OFF - 8
        ENDM
        ; No difference found: rax and r8 are both -1, so the xor gives 0.
        ; Otherwise r8 is 0 and rax keeps its 1 or -1.
        xor     rax, r8
        pop     r12
        ret
sp_2048_cmp_32 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Reduce the number back to 2048 bits using Montgomery reduction.
;  *
;  * a   A single precision number to reduce in place.
;  * m   The single precision number representing the modulus.
;  * mp  The digit representing the negative inverse of m mod 2^n.
;  *
;  * Arguments are taken from rcx (a), rdx (m) and r8 (mp).
;  * The loop runs 32 times. Pass i computes mu = a[i] * mp, adds
;  * mu * m[0..31] into a[i..i+31] and folds the carries into a[i+32], so
;  * words a[0..63] are accessed. The low word a[i] of each pass is not
;  * written back, and the four lowest live words are carried between
;  * passes in r15, rdi, rsi and rbx instead of memory.
;  * After the loop, m masked by the negated final carry is subtracted
;  * from a[32..63] and the 32-word difference is stored to a[0..31].
;  *
;  * Register roles inside the loop:
;  *   r9  = a + 128 + 8*i (biased so most offsets fit a signed byte)
;  *   r10 = m, r8 = mp, r11 = remaining pass count, rdx = mu
;  *   r12/r13 = alternating accumulators, r14 = 0
;  *   rbp = carry word handed from one pass to the next
;  * r12-r15, rdi, rsi, rbx and rbp are saved on entry and restored
;  * before returning. Requires MULX, ADCX, ADOX and PEXT.
;  */
_text SEGMENT READONLY PARA
sp_2048_mont_reduce_avx2_32 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        push    rbp
        mov     r9, rcx
        mov     r10, rdx
        ; i = 32
        mov     r11, 32
        mov     r15, QWORD PTR [r9]
        mov     rdi, QWORD PTR [r9+8]
        mov     rsi, QWORD PTR [r9+16]
        mov     rbx, QWORD PTR [r9+24]
        add     r9, 128
        ; No carry is pending before the first pass.
        xor     rbp, rbp
L_2048_mont_loop_avx2_32:
        ; mu = a[i] * mp
        mov     rdx, r15
        mov     r12, r15
        imul    rdx, r8
        ; r14 = 0; this also clears CF and OF so the adcx (CF) and adox (OF)
        ; chains below both start without a stale carry.
        xor     r14, r14
        ; a[i+0] += m[0] * mu
        mulx    rcx, rax, QWORD PTR [r10]
        mov     r15, rdi
        adcx    r12, rax
        adox    r15, rcx
        ; a[i+1] += m[1] * mu
        mulx    rcx, rax, QWORD PTR [r10+8]
        mov     rdi, rsi
        adcx    r15, rax
        adox    rdi, rcx
        ; a[i+2] += m[2] * mu
        mulx    rcx, rax, QWORD PTR [r10+16]
        mov     rsi, rbx
        adcx    rdi, rax
        adox    rsi, rcx
        ; a[i+3] += m[3] * mu
        mulx    rcx, rax, QWORD PTR [r10+24]
        mov     rbx, QWORD PTR [r9+-96]
        adcx    rsi, rax
        adox    rbx, rcx
        ; a[i+4] += m[4] * mu
        mulx    rcx, rax, QWORD PTR [r10+32]
        mov     r13, QWORD PTR [r9+-88]
        adcx    rbx, rax
        adox    r13, rcx
        ; a[i+5] += m[5] * mu
        mulx    rcx, rax, QWORD PTR [r10+40]
        mov     r12, QWORD PTR [r9+-80]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+-88], r13
        ; a[i+6] += m[6] * mu
        mulx    rcx, rax, QWORD PTR [r10+48]
        mov     r13, QWORD PTR [r9+-72]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+-80], r12
        ; a[i+7] += m[7] * mu
        mulx    rcx, rax, QWORD PTR [r10+56]
        mov     r12, QWORD PTR [r9+-64]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+-72], r13
        ; a[i+8] += m[8] * mu
        mulx    rcx, rax, QWORD PTR [r10+64]
        mov     r13, QWORD PTR [r9+-56]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+-64], r12
        ; a[i+9] += m[9] * mu
        mulx    rcx, rax, QWORD PTR [r10+72]
        mov     r12, QWORD PTR [r9+-48]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+-56], r13
        ; a[i+10] += m[10] * mu
        mulx    rcx, rax, QWORD PTR [r10+80]
        mov     r13, QWORD PTR [r9+-40]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+-48], r12
        ; a[i+11] += m[11] * mu
        mulx    rcx, rax, QWORD PTR [r10+88]
        mov     r12, QWORD PTR [r9+-32]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+-40], r13
        ; a[i+12] += m[12] * mu
        mulx    rcx, rax, QWORD PTR [r10+96]
        mov     r13, QWORD PTR [r9+-24]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+-32], r12
        ; a[i+13] += m[13] * mu
        mulx    rcx, rax, QWORD PTR [r10+104]
        mov     r12, QWORD PTR [r9+-16]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+-24], r13
        ; a[i+14] += m[14] * mu
        mulx    rcx, rax, QWORD PTR [r10+112]
        mov     r13, QWORD PTR [r9+-8]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+-16], r12
        ; a[i+15] += m[15] * mu
        mulx    rcx, rax, QWORD PTR [r10+120]
        mov     r12, QWORD PTR [r9]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+-8], r13
        ; a[i+16] += m[16] * mu
        mulx    rcx, rax, QWORD PTR [r10+128]
        mov     r13, QWORD PTR [r9+8]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9], r12
        ; a[i+17] += m[17] * mu
        mulx    rcx, rax, QWORD PTR [r10+136]
        mov     r12, QWORD PTR [r9+16]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+8], r13
        ; a[i+18] += m[18] * mu
        mulx    rcx, rax, QWORD PTR [r10+144]
        mov     r13, QWORD PTR [r9+24]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+16], r12
        ; a[i+19] += m[19] * mu
        mulx    rcx, rax, QWORD PTR [r10+152]
        mov     r12, QWORD PTR [r9+32]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+24], r13
        ; a[i+20] += m[20] * mu
        mulx    rcx, rax, QWORD PTR [r10+160]
        mov     r13, QWORD PTR [r9+40]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+32], r12
        ; a[i+21] += m[21] * mu
        mulx    rcx, rax, QWORD PTR [r10+168]
        mov     r12, QWORD PTR [r9+48]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+40], r13
        ; a[i+22] += m[22] * mu
        mulx    rcx, rax, QWORD PTR [r10+176]
        mov     r13, QWORD PTR [r9+56]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+48], r12
        ; a[i+23] += m[23] * mu
        mulx    rcx, rax, QWORD PTR [r10+184]
        mov     r12, QWORD PTR [r9+64]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+56], r13
        ; a[i+24] += m[24] * mu
        mulx    rcx, rax, QWORD PTR [r10+192]
        mov     r13, QWORD PTR [r9+72]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+64], r12
        ; a[i+25] += m[25] * mu
        mulx    rcx, rax, QWORD PTR [r10+200]
        mov     r12, QWORD PTR [r9+80]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+72], r13
        ; a[i+26] += m[26] * mu
        mulx    rcx, rax, QWORD PTR [r10+208]
        mov     r13, QWORD PTR [r9+88]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+80], r12
        ; a[i+27] += m[27] * mu
        mulx    rcx, rax, QWORD PTR [r10+216]
        mov     r12, QWORD PTR [r9+96]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+88], r13
        ; a[i+28] += m[28] * mu
        mulx    rcx, rax, QWORD PTR [r10+224]
        mov     r13, QWORD PTR [r9+104]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+96], r12
        ; a[i+29] += m[29] * mu
        mulx    rcx, rax, QWORD PTR [r10+232]
        mov     r12, QWORD PTR [r9+112]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+104], r13
        ; a[i+30] += m[30] * mu
        mulx    rcx, rax, QWORD PTR [r10+240]
        mov     r13, QWORD PTR [r9+120]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+112], r12
        ; a[i+31] += m[31] * mu
        mulx    rcx, rax, QWORD PTR [r10+248]
        mov     r12, QWORD PTR [r9+128]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+120], r13
        ; a[i+32] += carry from the previous pass (rbp) plus this pass's
        ; CF chain; then rbp = OF + CF becomes the carry for the next pass.
        adcx    r12, rbp
        mov     rbp, r14
        mov     QWORD PTR [r9+128], r12
        adox    rbp, r14
        adcx    rbp, r14
        ; a += 1
        add     r9, 8
        ; i -= 1
        sub     r11, 1
        jnz     L_2048_mont_loop_avx2_32
        ; r9 has advanced by 256 over the loop: r8 = a + 256 (upper half,
        ; the subtraction source), r9 = a (the destination).
        sub     r9, 128
        ; Negating the final carry word turns a carry of 1 into an all-ones
        ; mask and leaves 0 as 0.
        neg     rbp
        mov     r8, r9
        sub     r9, 256
        ; a[k] = a[32+k] - (m[k] & mask), k = 0..31.
        ; pext with an all-ones mask returns its source and with a zero mask
        ; returns 0; it does not modify the flags, so the sbb borrow chain is
        ; preserved between steps. Words 32..35 are still held in
        ; r15, rdi, rsi and rbx from the loop.
        mov     rcx, QWORD PTR [r10]
        mov     rdx, r15
        pext    rcx, rcx, rbp
        sub     rdx, rcx
        mov     rcx, QWORD PTR [r10+8]
        mov     rax, rdi
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+16]
        mov     rcx, rsi
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+8], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+24]
        mov     rdx, rbx
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+16], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+32]
        mov     rax, QWORD PTR [r8+32]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+24], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+40]
        mov     rcx, QWORD PTR [r8+40]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+32], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+48]
        mov     rdx, QWORD PTR [r8+48]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+40], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+56]
        mov     rax, QWORD PTR [r8+56]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+48], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+64]
        mov     rcx, QWORD PTR [r8+64]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+56], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+72]
        mov     rdx, QWORD PTR [r8+72]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+64], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+80]
        mov     rax, QWORD PTR [r8+80]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+72], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+88]
        mov     rcx, QWORD PTR [r8+88]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+80], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+96]
        mov     rdx, QWORD PTR [r8+96]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+88], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+104]
        mov     rax, QWORD PTR [r8+104]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+96], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+112]
        mov     rcx, QWORD PTR [r8+112]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+104], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+120]
        mov     rdx, QWORD PTR [r8+120]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+112], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+128]
        mov     rax, QWORD PTR [r8+128]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+120], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+136]
        mov     rcx, QWORD PTR [r8+136]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+128], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+144]
        mov     rdx, QWORD PTR [r8+144]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+136], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+152]
        mov     rax, QWORD PTR [r8+152]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+144], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+160]
        mov     rcx, QWORD PTR [r8+160]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+152], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+168]
        mov     rdx, QWORD PTR [r8+168]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+160], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+176]
        mov     rax, QWORD PTR [r8+176]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+168], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+184]
        mov     rcx, QWORD PTR [r8+184]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+176], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+192]
        mov     rdx, QWORD PTR [r8+192]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+184], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+200]
        mov     rax, QWORD PTR [r8+200]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+192], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+208]
        mov     rcx, QWORD PTR [r8+208]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+200], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+216]
        mov     rdx, QWORD PTR [r8+216]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+208], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+224]
        mov     rax, QWORD PTR [r8+224]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+216], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+232]
        mov     rcx, QWORD PTR [r8+232]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+224], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+240]
        mov     rdx, QWORD PTR [r8+240]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+232], rcx
        sbb     rdx, rax
        mov     rcx, QWORD PTR [r10+248]
        mov     rax, QWORD PTR [r8+248]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+240], rdx
        sbb     rax, rcx
        mov     QWORD PTR [r9+248], rax
        pop     rbp
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_2048_mont_reduce_avx2_32 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Conditionally add a and b using the mask m.
;  * m is -1 to add and 0 when not.
;  *
;  * r  A single precision number representing conditional add result.
;  * a  A single precision number to add with.
;  * b  A single precision number to add.
;  * m  Mask value to apply.
;  *
;  * Arguments arrive in rcx (r), rdx (a), r8 (b) and r9 (m); each number is
;  * 16 words (128 bytes). rax returns the carry out of the top word (0 or 1).
;  *
;  * The and instruction overwrites the carry flag, so masking cannot be
;  * interleaved with the add/adc chain. The masked copy of b is therefore
;  * built first in a 128-byte stack buffer and the chain then adds it to a.
;  */
_text SEGMENT READONLY PARA
sp_2048_cond_add_16 PROC
        sub     rsp, 128
        ; Return value starts at 0; flags are not live yet, so xor is safe.
        xor     eax, eax
        ; tmp[k] = b[k] & m
        mov     r10, QWORD PTR [r8]
        mov     r11, QWORD PTR [r8+8]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp], r10
        mov     QWORD PTR [rsp+8], r11
        mov     r10, QWORD PTR [r8+16]
        mov     r11, QWORD PTR [r8+24]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+16], r10
        mov     QWORD PTR [rsp+24], r11
        mov     r10, QWORD PTR [r8+32]
        mov     r11, QWORD PTR [r8+40]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+32], r10
        mov     QWORD PTR [rsp+40], r11
        mov     r10, QWORD PTR [r8+48]
        mov     r11, QWORD PTR [r8+56]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+48], r10
        mov     QWORD PTR [rsp+56], r11
        mov     r10, QWORD PTR [r8+64]
        mov     r11, QWORD PTR [r8+72]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+64], r10
        mov     QWORD PTR [rsp+72], r11
        mov     r10, QWORD PTR [r8+80]
        mov     r11, QWORD PTR [r8+88]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+80], r10
        mov     QWORD PTR [rsp+88], r11
        mov     r10, QWORD PTR [r8+96]
        mov     r11, QWORD PTR [r8+104]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+96], r10
        mov     QWORD PTR [rsp+104], r11
        mov     r10, QWORD PTR [r8+112]
        mov     r11, QWORD PTR [r8+120]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+112], r10
        mov     QWORD PTR [rsp+120], r11
        ; r[k] = a[k] + tmp[k] + carry. Only mov separates the adc steps, so
        ; the carry flag survives from one word to the next. Each store of
        ; r[k] is issued after a[k+1] has been loaded.
        mov     r10, QWORD PTR [rdx]
        add     r10, QWORD PTR [rsp]
        mov     r11, QWORD PTR [rdx+8]
        adc     r11, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx], r10
        mov     r10, QWORD PTR [rdx+16]
        adc     r10, QWORD PTR [rsp+16]
        mov     QWORD PTR [rcx+8], r11
        mov     r11, QWORD PTR [rdx+24]
        adc     r11, QWORD PTR [rsp+24]
        mov     QWORD PTR [rcx+16], r10
        mov     r10, QWORD PTR [rdx+32]
        adc     r10, QWORD PTR [rsp+32]
        mov     QWORD PTR [rcx+24], r11
        mov     r11, QWORD PTR [rdx+40]
        adc     r11, QWORD PTR [rsp+40]
        mov     QWORD PTR [rcx+32], r10
        mov     r10, QWORD PTR [rdx+48]
        adc     r10, QWORD PTR [rsp+48]
        mov     QWORD PTR [rcx+40], r11
        mov     r11, QWORD PTR [rdx+56]
        adc     r11, QWORD PTR [rsp+56]
        mov     QWORD PTR [rcx+48], r10
        mov     r10, QWORD PTR [rdx+64]
        adc     r10, QWORD PTR [rsp+64]
        mov     QWORD PTR [rcx+56], r11
        mov     r11, QWORD PTR [rdx+72]
        adc     r11, QWORD PTR [rsp+72]
        mov     QWORD PTR [rcx+64], r10
        mov     r10, QWORD PTR [rdx+80]
        adc     r10, QWORD PTR [rsp+80]
        mov     QWORD PTR [rcx+72], r11
        mov     r11, QWORD PTR [rdx+88]
        adc     r11, QWORD PTR [rsp+88]
        mov     QWORD PTR [rcx+80], r10
        mov     r10, QWORD PTR [rdx+96]
        adc     r10, QWORD PTR [rsp+96]
        mov     QWORD PTR [rcx+88], r11
        mov     r11, QWORD PTR [rdx+104]
        adc     r11, QWORD PTR [rsp+104]
        mov     QWORD PTR [rcx+96], r10
        mov     r10, QWORD PTR [rdx+112]
        adc     r10, QWORD PTR [rsp+112]
        mov     QWORD PTR [rcx+104], r11
        mov     r11, QWORD PTR [rdx+120]
        adc     r11, QWORD PTR [rsp+120]
        mov     QWORD PTR [rcx+112], r10
        mov     QWORD PTR [rcx+120], r11
        ; Capture the final carry before add rsp overwrites the flags.
        adc     rax, 0
        add     rsp, 128
        ret
sp_2048_cond_add_16 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally add a and b using the mask m.
;  * m is -1 to add and 0 when not.
;  *
;  * r  A single precision number representing conditional add result.
;  * a  A single precision number to add with.
;  * b  A single precision number to add.
;  * m  Mask value to apply.
;  *
;  * Arguments arrive in rcx (r), rdx (a), r8 (b) and r9 (m); each number is
;  * 16 words (128 bytes). rax returns the carry out of the top word (0 or 1).
;  *
;  * b is masked with pext: an all-ones mask returns the word unchanged and
;  * a zero mask returns 0. pext does not modify the flags, so the masking
;  * can sit between the adc steps without a stack copy. Requires BMI2.
;  * r12 is saved on entry and restored before returning.
;  */
_text SEGMENT READONLY PARA
sp_2048_cond_add_avx2_16 PROC
        push    r12
        ; Return value starts at 0; flags are not live yet, so xor is safe.
        xor     eax, eax
        ; r[k] = a[k] + (b[k] & m) + carry, with r10/r11/r12 rotating so the
        ; store of r[k-1] is issued after the loads for word k.
        mov     r12, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    r12, r12, r9
        add     r10, r12
        mov     r12, QWORD PTR [r8+8]
        mov     r11, QWORD PTR [rdx+8]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx], r10
        adc     r11, r12
        mov     r10, QWORD PTR [r8+16]
        mov     r12, QWORD PTR [rdx+16]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+8], r11
        adc     r12, r10
        mov     r11, QWORD PTR [r8+24]
        mov     r10, QWORD PTR [rdx+24]
        pext    r11, r11, r9
        mov     QWORD PTR [rcx+16], r12
        adc     r10, r11
        mov     r12, QWORD PTR [r8+32]
        mov     r11, QWORD PTR [rdx+32]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx+24], r10
        adc     r11, r12
        mov     r10, QWORD PTR [r8+40]
        mov     r12, QWORD PTR [rdx+40]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+32], r11
        adc     r12, r10
        mov     r11, QWORD PTR [r8+48]
        mov     r10, QWORD PTR [rdx+48]
        pext    r11, r11, r9
        mov     QWORD PTR [rcx+40], r12
        adc     r10, r11
        mov     r12, QWORD PTR [r8+56]
        mov     r11, QWORD PTR [rdx+56]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx+48], r10
        adc     r11, r12
        mov     r10, QWORD PTR [r8+64]
        mov     r12, QWORD PTR [rdx+64]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+56], r11
        adc     r12, r10
        mov     r11, QWORD PTR [r8+72]
        mov     r10, QWORD PTR [rdx+72]
        pext    r11, r11, r9
        mov     QWORD PTR [rcx+64], r12
        adc     r10, r11
        mov     r12, QWORD PTR [r8+80]
        mov     r11, QWORD PTR [rdx+80]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx+72], r10
        adc     r11, r12
        mov     r10, QWORD PTR [r8+88]
        mov     r12, QWORD PTR [rdx+88]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+80], r11
        adc     r12, r10
        mov     r11, QWORD PTR [r8+96]
        mov     r10, QWORD PTR [rdx+96]
        pext    r11, r11, r9
        mov     QWORD PTR [rcx+88], r12
        adc     r10, r11
        mov     r12, QWORD PTR [r8+104]
        mov     r11, QWORD PTR [rdx+104]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx+96], r10
        adc     r11, r12
        mov     r10, QWORD PTR [r8+112]
        mov     r12, QWORD PTR [rdx+112]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+104], r11
        adc     r12, r10
        mov     r11, QWORD PTR [r8+120]
        mov     r10, QWORD PTR [rdx+120]
        pext    r11, r11, r9
        mov     QWORD PTR [rcx+112], r12
        adc     r10, r11
        mov     QWORD PTR [rcx+120], r10
        ; rax = carry out of the top word.
        adc     rax, 0
        pop     r12
        ret
sp_2048_cond_add_avx2_16 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Shift number left by n bit. (r = a << n)
;  *
;  * r  Result of left shift by n.
;  * a  Number to shift.
;  * n  Amount to shift.
;  *
;  * Arguments arrive in rcx (r), rdx (a) and r8 (n). a[0..31] is read and
;  * r[0..32] is written: 33 words, the extra top word receiving the bits
;  * shifted out of a[31].
;  * Only cl is used as the shift count and the CPU masks a 64-bit shld/shl
;  * count to its low 6 bits, so the shift is effectively n mod 64.
;  * NOTE(review): callers presumably pass n in 0..63 - confirm.
;  * Words are processed from the top down, so every a[k] is loaded before
;  * r[k] is stored. r12 and r13 are saved on entry and restored on exit.
;  */
_text SEGMENT READONLY PARA
sp_2048_lshift_32 PROC
        push    r12
        push    r13
        ; The variable shift count must be in cl, so r moves to r9 first.
        mov     r9, rcx
        mov     rcx, r8
        ; r12 collects the bits shifted out of a[31]; it starts at 0.
        xor     r12d, r12d
        ; Words 27..31 -> r[28..32]. r13 (a[27]) is kept for the next group.
        mov     r13, QWORD PTR [rdx+216]
        mov     rax, QWORD PTR [rdx+224]
        mov     r8, QWORD PTR [rdx+232]
        mov     r10, QWORD PTR [rdx+240]
        mov     r11, QWORD PTR [rdx+248]
        shld    r12, r11, cl
        shld    r11, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r13, cl
        mov     QWORD PTR [r9+224], rax
        mov     QWORD PTR [r9+232], r8
        mov     QWORD PTR [r9+240], r10
        mov     QWORD PTR [r9+248], r11
        mov     QWORD PTR [r9+256], r12
        ; Words 23..27 -> r[24..27]. r11 (a[23]) is kept for the next group.
        mov     r11, QWORD PTR [rdx+184]
        mov     rax, QWORD PTR [rdx+192]
        mov     r8, QWORD PTR [rdx+200]
        mov     r10, QWORD PTR [rdx+208]
        shld    r13, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r11, cl
        mov     QWORD PTR [r9+192], rax
        mov     QWORD PTR [r9+200], r8
        mov     QWORD PTR [r9+208], r10
        mov     QWORD PTR [r9+216], r13
        ; Words 19..23 -> r[20..23].
        mov     r13, QWORD PTR [rdx+152]
        mov     rax, QWORD PTR [rdx+160]
        mov     r8, QWORD PTR [rdx+168]
        mov     r10, QWORD PTR [rdx+176]
        shld    r11, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r13, cl
        mov     QWORD PTR [r9+160], rax
        mov     QWORD PTR [r9+168], r8
        mov     QWORD PTR [r9+176], r10
        mov     QWORD PTR [r9+184], r11
        ; Words 15..19 -> r[16..19].
        mov     r11, QWORD PTR [rdx+120]
        mov     rax, QWORD PTR [rdx+128]
        mov     r8, QWORD PTR [rdx+136]
        mov     r10, QWORD PTR [rdx+144]
        shld    r13, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r11, cl
        mov     QWORD PTR [r9+128], rax
        mov     QWORD PTR [r9+136], r8
        mov     QWORD PTR [r9+144], r10
        mov     QWORD PTR [r9+152], r13
        ; Words 11..15 -> r[12..15].
        mov     r13, QWORD PTR [rdx+88]
        mov     rax, QWORD PTR [rdx+96]
        mov     r8, QWORD PTR [rdx+104]
        mov     r10, QWORD PTR [rdx+112]
        shld    r11, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r13, cl
        mov     QWORD PTR [r9+96], rax
        mov     QWORD PTR [r9+104], r8
        mov     QWORD PTR [r9+112], r10
        mov     QWORD PTR [r9+120], r11
        ; Words 7..11 -> r[8..11].
        mov     r11, QWORD PTR [rdx+56]
        mov     rax, QWORD PTR [rdx+64]
        mov     r8, QWORD PTR [rdx+72]
        mov     r10, QWORD PTR [rdx+80]
        shld    r13, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r11, cl
        mov     QWORD PTR [r9+64], rax
        mov     QWORD PTR [r9+72], r8
        mov     QWORD PTR [r9+80], r10
        mov     QWORD PTR [r9+88], r13
        ; Words 3..7 -> r[4..7].
        mov     r13, QWORD PTR [rdx+24]
        mov     rax, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [rdx+40]
        mov     r10, QWORD PTR [rdx+48]
        shld    r11, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r13, cl
        mov     QWORD PTR [r9+32], rax
        mov     QWORD PTR [r9+40], r8
        mov     QWORD PTR [r9+48], r10
        mov     QWORD PTR [r9+56], r11
        ; Words 0..3 -> r[0..3]; the lowest word is filled with zero bits.
        mov     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        shld    r13, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shl     rax, cl
        mov     QWORD PTR [r9], rax
        mov     QWORD PTR [r9+8], r8
        mov     QWORD PTR [r9+16], r10
        mov     QWORD PTR [r9+24], r13
        pop     r13
        pop     r12
        ret
sp_2048_lshift_32 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFNDEF WOLFSSL_SP_NO_3072
|
|
IFNDEF WOLFSSL_SP_NO_3072
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the bswap instruction.
;  *
;  * r     A single precision integer.
;  * size  Maximum number of bytes to convert
;  * a     Byte array.
;  * n     Number of bytes in array to read.
;  *
;  * Arguments arrive in rcx (r), rdx (size), r8 (a) and r9 (n).
;  * The array is consumed from its end (least significant bytes) towards
;  * its start: first in 64-byte groups, then in 8-byte words, then the
;  * remaining 1..7 leading bytes are assembled into one word. Every word
;  * of r after that, up to r + 384 bytes, is set to zero.
;  * NOTE(review): size (rdx) is never read here; the output length is the
;  * fixed 384 bytes and n is not checked against it. Presumably the caller
;  * guarantees n <= 384 - confirm.
;  * r12 and r13 are saved on entry and restored before returning.
;  */
_text SEGMENT READONLY PARA
sp_3072_from_bin_bswap PROC
        push    r12
        push    r13
        ; r11 = one past the last input byte, r12 = end of r, r13 = 0
        mov     r11, r8
        mov     r12, rcx
        add     r11, r9
        add     r12, 384
        xor     r13, r13
        jmp     L_3072_from_bin_bswap_64_end
L_3072_from_bin_bswap_64_start:
        ; Convert 64 input bytes into 8 words, lowest word first.
        sub     r11, 64
        mov     rax, QWORD PTR [r11+56]
        mov     r10, QWORD PTR [r11+48]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     rax, QWORD PTR [r11+40]
        mov     r10, QWORD PTR [r11+32]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        mov     rax, QWORD PTR [r11+24]
        mov     r10, QWORD PTR [r11+16]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        mov     rax, QWORD PTR [r11+8]
        mov     r10, QWORD PTR [r11]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_3072_from_bin_bswap_64_end:
        cmp     r9, 63
        jg      L_3072_from_bin_bswap_64_start
        jmp     L_3072_from_bin_bswap_8_end
L_3072_from_bin_bswap_8_start:
        ; Convert one 8-byte word.
        sub     r11, 8
        mov     rax, QWORD PTR [r11]
        bswap   rax
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_3072_from_bin_bswap_8_end:
        cmp     r9, 7
        jg      L_3072_from_bin_bswap_8_start
        cmp     r9, r13
        je      L_3072_from_bin_bswap_hi_end
        ; The bytes still left are the first ones of the array (r8 still
        ; points at its start); build the top word one byte at a time.
        ; rax is cleared so that writing al leaves a zero-extended byte.
        mov     r10, r13
        mov     rax, r13
L_3072_from_bin_bswap_hi_start:
        mov     al, BYTE PTR [r8]
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_3072_from_bin_bswap_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_3072_from_bin_bswap_hi_end:
        cmp     rcx, r12
        je      L_3072_from_bin_bswap_zero_end
L_3072_from_bin_bswap_zero_start:
        ; Zero the unused high words of r.
        mov     QWORD PTR [rcx], r13
        add     rcx, 8
        cmp     rcx, r12
        ; rcx and r12 are addresses, so the comparison is unsigned.
        jb      L_3072_from_bin_bswap_zero_start
L_3072_from_bin_bswap_zero_end:
        pop     r13
        pop     r12
        ret
sp_3072_from_bin_bswap ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the movbe instruction which is an optional instruction.
;  *
;  * r     A single precision integer.
;  * size  Maximum number of bytes to convert
;  * a     Byte array.
;  * n     Number of bytes in array to read.
;  *
;  * Arguments arrive in rcx (r), rdx (size), r8 (a) and r9 (n).
;  * The array is consumed from its end (least significant bytes) towards
;  * its start: first in 64-byte groups, then in 8-byte words, then the
;  * remaining 1..7 leading bytes are assembled into one word. Every word
;  * of r after that, up to r + 384 bytes, is set to zero.
;  * NOTE(review): size (rdx) is never read here; the output length is the
;  * fixed 384 bytes and n is not checked against it. Presumably the caller
;  * guarantees n <= 384 - confirm.
;  * r12 and r13 are saved on entry and restored before returning.
;  */
_text SEGMENT READONLY PARA
sp_3072_from_bin_movbe PROC
        push    r12
        push    r13
        ; r11 = one past the last input byte, r12 = end of r, r13 = 0
        mov     r11, r8
        mov     r12, rcx
        add     r11, r9
        add     r12, 384
        xor     r13, r13
        jmp     L_3072_from_bin_movbe_64_end
L_3072_from_bin_movbe_64_start:
        ; Convert 64 input bytes into 8 words, lowest word first.
        sub     r11, 64
        movbe   rax, QWORD PTR [r11+56]
        movbe   r10, QWORD PTR [r11+48]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        movbe   rax, QWORD PTR [r11+40]
        movbe   r10, QWORD PTR [r11+32]
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        movbe   rax, QWORD PTR [r11+24]
        movbe   r10, QWORD PTR [r11+16]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        movbe   rax, QWORD PTR [r11+8]
        movbe   r10, QWORD PTR [r11]
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_3072_from_bin_movbe_64_end:
        cmp     r9, 63
        jg      L_3072_from_bin_movbe_64_start
        jmp     L_3072_from_bin_movbe_8_end
L_3072_from_bin_movbe_8_start:
        ; Convert one 8-byte word.
        sub     r11, 8
        movbe   rax, QWORD PTR [r11]
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_3072_from_bin_movbe_8_end:
        cmp     r9, 7
        jg      L_3072_from_bin_movbe_8_start
        cmp     r9, r13
        je      L_3072_from_bin_movbe_hi_end
        ; The bytes still left are the first ones of the array (r8 still
        ; points at its start); build the top word one byte at a time.
        ; rax is cleared so that writing al leaves a zero-extended byte.
        mov     r10, r13
        mov     rax, r13
L_3072_from_bin_movbe_hi_start:
        mov     al, BYTE PTR [r8]
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_3072_from_bin_movbe_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_3072_from_bin_movbe_hi_end:
        cmp     rcx, r12
        je      L_3072_from_bin_movbe_zero_end
L_3072_from_bin_movbe_zero_start:
        ; Zero the unused high words of r.
        mov     QWORD PTR [rcx], r13
        add     rcx, 8
        cmp     rcx, r12
        ; rcx and r12 are addresses, so the comparison is unsigned.
        jb      L_3072_from_bin_movbe_zero_start
L_3072_from_bin_movbe_zero_end:
        pop     r13
        pop     r12
        ret
sp_3072_from_bin_movbe ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Write r as big endian to byte array.
;  * Fixed length number of bytes written: 384
;  * Uses the bswap instruction.
;  *
;  * r  A single precision integer.
;  * a  Byte array.
;  *
;  * Arguments arrive in rcx (r) and rdx (a). The 48 words of r are read
;  * from the most significant (offset 376) down to the least, byte-swapped
;  * two at a time in rax and r8, and stored to a at ascending offsets.
;  * The 24 identical groups are emitted by the REPEAT block below; the two
;  * counters exist only at assembly time, so the generated code is the
;  * fully unrolled sequence with constant offsets.
;  */
_text SEGMENT READONLY PARA
sp_3072_to_bin_bswap_48 PROC
; Assembly-time offsets: source word counts down, destination counts up.
to_bin_bswap_48_src = 376
to_bin_bswap_48_dst = 0
        REPEAT  24
        mov     rax, QWORD PTR [rcx+to_bin_bswap_48_src]
        mov     r8, QWORD PTR [rcx+to_bin_bswap_48_src-8]
        bswap   rax
        bswap   r8
        mov     QWORD PTR [rdx+to_bin_bswap_48_dst], rax
        mov     QWORD PTR [rdx+to_bin_bswap_48_dst+8], r8
to_bin_bswap_48_src = to_bin_bswap_48_src - 16
to_bin_bswap_48_dst = to_bin_bswap_48_dst + 16
        ENDM
        ret
sp_3072_to_bin_bswap_48 ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
|
|
; /* Write r as big endian to byte array.
;  * Fixed length number of bytes written: 384
;  * Uses the movbe instruction which is optional.
;  *
;  * r  A single precision integer.
;  * a  Byte array.
;  *
;  * Arguments arrive in rcx (r) and rdx (a). The 48 words of r are loaded
;  * byte-reversed with movbe, from the most significant (offset 376) down
;  * to the least, two at a time in rax and r8, and stored to a at
;  * ascending offsets.
;  * The 24 identical groups are emitted by the REPEAT block below; the two
;  * counters exist only at assembly time, so the generated code is the
;  * fully unrolled sequence with constant offsets.
;  */
_text SEGMENT READONLY PARA
sp_3072_to_bin_movbe_48 PROC
; Assembly-time offsets: source word counts down, destination counts up.
to_bin_movbe_48_src = 376
to_bin_movbe_48_dst = 0
        REPEAT  24
        movbe   rax, QWORD PTR [rcx+to_bin_movbe_48_src]
        movbe   r8, QWORD PTR [rcx+to_bin_movbe_48_src-8]
        mov     QWORD PTR [rdx+to_bin_movbe_48_dst], rax
        mov     QWORD PTR [rdx+to_bin_movbe_48_dst+8], r8
to_bin_movbe_48_src = to_bin_movbe_48_src - 16
to_bin_movbe_48_dst = to_bin_movbe_48_dst + 16
        ENDM
        ret
sp_3072_to_bin_movbe_48 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Multiply a and b into r. (r = a * b)
;  *
;  * 12 x 12 word multiplication done one result word at a time: result word
;  * k is the sum of every A[i] * B[k-i] with both indices in 0..11, plus the
;  * carry out of word k-1. A rotating triple of registers (r10, r11, r12)
;  * holds the running sum.
;  *
;  * Result words 0..11 are kept in a 96-byte stack buffer and copied to r
;  * after the last product; words 12..23 are written to r directly.
;  * NOTE(review): presumably this is so r may overlap a or b - confirm
;  * against the callers.
;  *
;  * The procedure saves r12 and allocates stack, so it is declared with
;  * PROC FRAME and unwind directives; the Windows x64 ABI requires unwind
;  * data for any function that changes rsp or a non-volatile register.
;  *
;  * rcx  r  A single precision integer (24 words written).
;  * rdx  a  A single precision integer (12 words read).
;  * r8   b  A single precision integer (12 words read).
;  * Scratch registers: rax, rdx, r9, r10, r11. r12 is saved and restored.
;  */

; Emit the code for one result word k (1..21).
;   acc0  register holding the low word of the running sum; stored as word k
;   acc1  register holding the middle word (carry-in from word k-1)
;   acc2  register that receives the carries; cleared after the first mul
;   dst   base register of the destination (word k is stored at [dst+k*8])
; Products are taken in order of increasing A index, starting at
; max(0, k-11) and stopping at min(k, 11).
SP_3072_MUL_12_WORD MACRO k, acc0, acc1, acc2, dst
        sp_3072_mul_12_i = 0
        IF k GT 11
        sp_3072_mul_12_i = k - 11
        ENDIF
        sp_3072_mul_12_new = 1
        WHILE (sp_3072_mul_12_i LE k) AND (sp_3072_mul_12_i LE 11)
        ; A[i] * B[k-i]
        mov     rax, QWORD PTR [r8+(k-sp_3072_mul_12_i)*8]
        mul     QWORD PTR [r9+sp_3072_mul_12_i*8]
        IF sp_3072_mul_12_new
        xor     acc2, acc2
        sp_3072_mul_12_new = 0
        ENDIF
        add     acc0, rax
        adc     acc1, rdx
        adc     acc2, 0
        sp_3072_mul_12_i = sp_3072_mul_12_i + 1
        ENDM
        mov     QWORD PTR [dst+k*8], acc0
ENDM

_text SEGMENT READONLY PARA
sp_3072_mul_12 PROC FRAME
        ; Prologue: only the register save and the stack allocation, each
        ; described to the unwinder.
        push    r12
        .pushreg r12
        sub     rsp, 96
        .allocstack 96
        .endprolog
        ; mul writes rdx:rax, so the pointer to a is moved out of rdx.
        mov     r9, rdx
        ; Word 0: A[0] * B[0]. The high half becomes the carry into word 1.
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r9]
        xor     r12, r12
        mov     QWORD PTR [rsp], rax
        mov     r11, rdx
        ; Words 1..11 go to the stack buffer.
        SP_3072_MUL_12_WORD 1, r11, r12, r10, rsp
        SP_3072_MUL_12_WORD 2, r12, r10, r11, rsp
        SP_3072_MUL_12_WORD 3, r10, r11, r12, rsp
        SP_3072_MUL_12_WORD 4, r11, r12, r10, rsp
        SP_3072_MUL_12_WORD 5, r12, r10, r11, rsp
        SP_3072_MUL_12_WORD 6, r10, r11, r12, rsp
        SP_3072_MUL_12_WORD 7, r11, r12, r10, rsp
        SP_3072_MUL_12_WORD 8, r12, r10, r11, rsp
        SP_3072_MUL_12_WORD 9, r10, r11, r12, rsp
        SP_3072_MUL_12_WORD 10, r11, r12, r10, rsp
        SP_3072_MUL_12_WORD 11, r12, r10, r11, rsp
        ; Words 12..21 are written straight to r.
        SP_3072_MUL_12_WORD 12, r10, r11, r12, rcx
        SP_3072_MUL_12_WORD 13, r11, r12, r10, rcx
        SP_3072_MUL_12_WORD 14, r12, r10, r11, rcx
        SP_3072_MUL_12_WORD 15, r10, r11, r12, rcx
        SP_3072_MUL_12_WORD 16, r11, r12, r10, rcx
        SP_3072_MUL_12_WORD 17, r12, r10, r11, rcx
        SP_3072_MUL_12_WORD 18, r10, r11, r12, rcx
        SP_3072_MUL_12_WORD 19, r11, r12, r10, rcx
        SP_3072_MUL_12_WORD 20, r12, r10, r11, rcx
        SP_3072_MUL_12_WORD 21, r10, r11, r12, rcx
        ; Words 22 and 23: A[11] * B[11] added to the remaining carry.
        ; No third accumulator is used for this last product.
        mov     rax, QWORD PTR [r8+88]
        mul     QWORD PTR [r9+88]
        add     r11, rax
        adc     r12, rdx
        mov     QWORD PTR [rcx+176], r11
        mov     QWORD PTR [rcx+184], r12
        ; Copy words 0..11 from the stack buffer to r, four words per step.
        sp_3072_mul_12_i = 0
        REPT 3
        mov     rax, QWORD PTR [rsp+sp_3072_mul_12_i]
        mov     rdx, QWORD PTR [rsp+(sp_3072_mul_12_i+8)]
        mov     r10, QWORD PTR [rsp+(sp_3072_mul_12_i+16)]
        mov     r11, QWORD PTR [rsp+(sp_3072_mul_12_i+24)]
        mov     QWORD PTR [rcx+sp_3072_mul_12_i], rax
        mov     QWORD PTR [rcx+(sp_3072_mul_12_i+8)], rdx
        mov     QWORD PTR [rcx+(sp_3072_mul_12_i+16)], r10
        mov     QWORD PTR [rcx+(sp_3072_mul_12_i+24)], r11
        sp_3072_mul_12_i = sp_3072_mul_12_i + 32
        ENDM
        ; Epilogue: release the buffer, restore r12.
        add     rsp, 96
        pop     r12
        ret
sp_3072_mul_12 ENDP
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_sqr_12 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
mov r8, rdx
|
|
sub rsp, 96
|
|
; A[0] * A[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul rax
|
|
xor r11, r11
|
|
mov QWORD PTR [rsp], rax
|
|
mov r10, rdx
|
|
; A[0] * A[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+8], r10
|
|
; A[0] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[1] * A[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul rax
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+16], r11
|
|
; A[0] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r8+8]
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+24], r9
|
|
; A[0] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[1] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[2] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul rax
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+32], r10
|
|
; A[0] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+40], r11
|
|
; A[0] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+48], r9
|
|
; A[0] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+56], r10
|
|
; A[0] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+64], r11
|
|
; A[0] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+72], r9
|
|
; A[0] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+80], r10
|
|
; A[0] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+88], r11
|
|
; A[1] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[2] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+96], r9
|
|
; A[2] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+16]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[3] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+104], r10
|
|
; A[3] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+24]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[4] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+112], r11
|
|
; A[4] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+32]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[5] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+120], r9
|
|
; A[5] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+40]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[6] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+128], r10
|
|
; A[6] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+48]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[7] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+136], r11
|
|
; A[7] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+56]
|
|
xor r11, r11
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+64]
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul rax
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+144], r9
|
|
; A[8] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+64]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[9] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+152], r10
|
|
; A[9] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+72]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[10] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul rax
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+160], r11
|
|
; A[10] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+80]
|
|
xor r11, r11
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+168], r9
|
|
; A[11] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul rax
|
|
add r10, rax
|
|
adc r11, rdx
|
|
mov QWORD PTR [rcx+176], r10
|
|
mov QWORD PTR [rcx+184], r11
|
|
mov rax, QWORD PTR [rsp]
|
|
mov rdx, QWORD PTR [rsp+8]
|
|
mov r12, QWORD PTR [rsp+16]
|
|
mov r13, QWORD PTR [rsp+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], rdx
|
|
mov QWORD PTR [rcx+16], r12
|
|
mov QWORD PTR [rcx+24], r13
|
|
mov rax, QWORD PTR [rsp+32]
|
|
mov rdx, QWORD PTR [rsp+40]
|
|
mov r12, QWORD PTR [rsp+48]
|
|
mov r13, QWORD PTR [rsp+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], rdx
|
|
mov QWORD PTR [rcx+48], r12
|
|
mov QWORD PTR [rcx+56], r13
|
|
mov rax, QWORD PTR [rsp+64]
|
|
mov rdx, QWORD PTR [rsp+72]
|
|
mov r12, QWORD PTR [rsp+80]
|
|
mov r13, QWORD PTR [rsp+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], rdx
|
|
mov QWORD PTR [rcx+80], r12
|
|
mov QWORD PTR [rcx+88], r13
|
|
add rsp, 96
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_3072_sqr_12 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply a and b into r. (r = a * b)
; *
; * Fully unrolled 12 x 12 word (64-bit words) multiplication producing a
; * 24-word result. The product is built one row at a time: for each word
; * A[i] the twelve products A[i] * B[0..11] are added into r[i..i+12].
; * MULX leaves the flags untouched, so two carry chains run side by side:
; * ADCX (carry in CF) adds the low product words, ADOX (carry in OF) adds
; * the high product words. r13 carries the leftover carry from one row to
; * the next and r14 is held at zero throughout.
; *
; * Arguments are read from rcx (r), rdx (a) and r8 (b). rbx, rbp and
; * r12-r14 are saved on entry and restored before returning.
; *
; * Aliasing: when r is the same pointer as a or b, the low 12 result words
; * are accumulated in a 96-byte stack buffer and copied into r at the end,
; * so the inputs are not overwritten while still being read. Only exact
; * pointer equality is detected.
; *
; * Uses MULX (BMI2), ADCX/ADOX (ADX) and VMOVDQU/VMOVUPS (AVX).
|
|
; *
|
|
; * r Result of multiplication.
|
|
; * a First number to multiply.
|
|
; * b Second number to multiply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_mul_avx2_12 PROC
|
|
push rbx ; save the registers this routine overwrites (popped in reverse order before ret)
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
mov rbp, r8 ; rbp = b (third argument)
|
|
mov r8, rcx ; r8 = r (first argument)
|
|
mov r9, rdx ; r9 = a (second argument); rdx is needed as the MULX source
|
|
sub rsp, 96 ; 96-byte (12-word) scratch buffer on the stack
|
|
cmp r9, r8 ; a == r ?
|
|
mov rbx, rsp ; assume the low half goes to the scratch buffer (mov keeps the flags)
|
|
cmovne rbx, r8 ; a != r: write the low half directly into r
|
|
cmp rbp, r8 ; b == r ?
|
|
cmove rbx, rsp ; b == r: use the scratch buffer after all
|
|
add r8, 96 ; r8 -> r[12], the upper half of the result
|
|
xor r14, r14 ; r14 = 0 (never written again); also clears CF and OF for the carry chains
|
|
mov rdx, QWORD PTR [r9] ; rdx = A[0]; row 0 writes r[0..12] using the CF chain only
|
|
; A[0] * B[0]
|
|
mulx r11, r10, QWORD PTR [rbp]
|
|
; A[0] * B[1]
|
|
mulx r12, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx], r10
|
|
adcx r11, rax
|
|
mov QWORD PTR [rbx+8], r11
|
|
; A[0] * B[2]
|
|
mulx r10, rax, QWORD PTR [rbp+16]
|
|
adcx r12, rax
|
|
; A[0] * B[3]
|
|
mulx r11, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r10, rax
|
|
mov QWORD PTR [rbx+24], r10
|
|
; A[0] * B[4]
|
|
mulx r12, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
; A[0] * B[5]
|
|
mulx r10, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+32], r11
|
|
adcx r12, rax
|
|
mov QWORD PTR [rbx+40], r12
|
|
; A[0] * B[6]
|
|
mulx r11, rax, QWORD PTR [rbp+48]
|
|
adcx r10, rax
|
|
; A[0] * B[7]
|
|
mulx r12, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+48], r10
|
|
adcx r11, rax
|
|
mov QWORD PTR [rbx+56], r11
|
|
; A[0] * B[8]
|
|
mulx r10, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
; A[0] * B[9]
|
|
mulx r11, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+64], r12
|
|
adcx r10, rax
|
|
mov QWORD PTR [rbx+72], r10
|
|
; A[0] * B[10]
|
|
mulx r12, rax, QWORD PTR [rbp+80]
|
|
adcx r11, rax
|
|
; A[0] * B[11]
|
|
mulx r10, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+80], r11
|
|
adcx r12, rax
|
|
adcx r10, r14 ; fold the last CF into the top word of this row
|
|
mov r13, r14 ; r13 = 0 ...
|
|
adcx r13, r14 ; ... + CF: carry handed to the next row
|
|
mov QWORD PTR [rbx+88], r12
|
|
mov QWORD PTR [r8], r10 ; r[12]
|
|
mov rdx, QWORD PTR [r9+8] ; rdx = A[1]; row 1 adds A[1] * B[0..11] into r[1..13]
|
|
mov r11, QWORD PTR [rbx+8]
|
|
mov r12, QWORD PTR [rbx+16]
|
|
mov r10, QWORD PTR [rbx+24]
|
|
; A[1] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax ; low product word joins the CF chain
|
|
adox r12, rcx ; high product word joins the OF chain, one word higher
|
|
; A[1] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+8], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+16], r12
|
|
mov r11, QWORD PTR [rbx+32]
|
|
mov r12, QWORD PTR [rbx+40]
|
|
; A[1] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+24], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+32], r11
|
|
mov r10, QWORD PTR [rbx+48]
|
|
mov r11, QWORD PTR [rbx+56]
|
|
; A[1] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[1] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+40], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+48], r10
|
|
mov r12, QWORD PTR [rbx+64]
|
|
mov r10, QWORD PTR [rbx+72]
|
|
; A[1] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[1] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+56], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+64], r12
|
|
mov r11, QWORD PTR [rbx+80]
|
|
mov r12, QWORD PTR [rbx+88]
|
|
; A[1] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+80], r11
|
|
mov r10, QWORD PTR [r8]
|
|
; A[1] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[1] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+88], r12
|
|
mov r11, r14 ; new top word starts at zero
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8], r10
|
|
mov QWORD PTR [r8+8], r11
|
|
mov rdx, QWORD PTR [r9+16] ; rdx = A[2]; row 2 adds A[2] * B[0..11] into r[2..14]
|
|
mov r12, QWORD PTR [rbx+16]
|
|
mov r10, QWORD PTR [rbx+24]
|
|
mov r11, QWORD PTR [rbx+32]
|
|
; A[2] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[2] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+24], r10
|
|
mov r12, QWORD PTR [rbx+40]
|
|
mov r10, QWORD PTR [rbx+48]
|
|
; A[2] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+32], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+40], r12
|
|
mov r11, QWORD PTR [rbx+56]
|
|
mov r12, QWORD PTR [rbx+64]
|
|
; A[2] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[2] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+48], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+56], r11
|
|
mov r10, QWORD PTR [rbx+72]
|
|
mov r11, QWORD PTR [rbx+80]
|
|
; A[2] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[2] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+64], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+72], r10
|
|
mov r12, QWORD PTR [rbx+88]
|
|
mov r10, QWORD PTR [r8]
|
|
; A[2] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+80], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+88], r12
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[2] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[2] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, r14
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+8], r11
|
|
mov QWORD PTR [r8+16], r12
|
|
mov rdx, QWORD PTR [r9+24] ; rdx = A[3]; row 3 adds A[3] * B[0..11] into r[3..15]
|
|
mov r10, QWORD PTR [rbx+24]
|
|
mov r11, QWORD PTR [rbx+32]
|
|
mov r12, QWORD PTR [rbx+40]
|
|
; A[3] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+24], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+32], r11
|
|
mov r10, QWORD PTR [rbx+48]
|
|
mov r11, QWORD PTR [rbx+56]
|
|
; A[3] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[3] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+40], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+48], r10
|
|
mov r12, QWORD PTR [rbx+64]
|
|
mov r10, QWORD PTR [rbx+72]
|
|
; A[3] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[3] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+56], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+64], r12
|
|
mov r11, QWORD PTR [rbx+80]
|
|
mov r12, QWORD PTR [rbx+88]
|
|
; A[3] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+80], r11
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[3] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[3] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+88], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
; A[3] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[3] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, r14
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
adcx r10, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+16], r12
|
|
mov QWORD PTR [r8+24], r10
|
|
mov rdx, QWORD PTR [r9+32] ; rdx = A[4]; row 4 adds A[4] * B[0..11] into r[4..16]
|
|
mov r11, QWORD PTR [rbx+32]
|
|
mov r12, QWORD PTR [rbx+40]
|
|
mov r10, QWORD PTR [rbx+48]
|
|
; A[4] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[4] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+32], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+40], r12
|
|
mov r11, QWORD PTR [rbx+56]
|
|
mov r12, QWORD PTR [rbx+64]
|
|
; A[4] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+48], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+56], r11
|
|
mov r10, QWORD PTR [rbx+72]
|
|
mov r11, QWORD PTR [rbx+80]
|
|
; A[4] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[4] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+64], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+72], r10
|
|
mov r12, QWORD PTR [rbx+88]
|
|
mov r10, QWORD PTR [r8]
|
|
; A[4] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[4] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+80], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+88], r12
|
|
mov r11, QWORD PTR [r8+8]
|
|
mov r12, QWORD PTR [r8+16]
|
|
; A[4] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[4] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[4] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+24], r10
|
|
mov QWORD PTR [r8+32], r11
|
|
mov rdx, QWORD PTR [r9+40] ; rdx = A[5]; row 5 adds A[5] * B[0..11] into r[5..17]
|
|
mov r12, QWORD PTR [rbx+40]
|
|
mov r10, QWORD PTR [rbx+48]
|
|
mov r11, QWORD PTR [rbx+56]
|
|
; A[5] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[5] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+40], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+48], r10
|
|
mov r12, QWORD PTR [rbx+64]
|
|
mov r10, QWORD PTR [rbx+72]
|
|
; A[5] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[5] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+56], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+64], r12
|
|
mov r11, QWORD PTR [rbx+80]
|
|
mov r12, QWORD PTR [rbx+88]
|
|
; A[5] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+80], r11
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[5] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[5] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+88], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[5] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[5] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+8], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
; A[5] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+24], r10
|
|
mov r12, r14
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+32], r11
|
|
mov QWORD PTR [r8+40], r12
|
|
mov rdx, QWORD PTR [r9+48] ; rdx = A[6]; row 6 adds A[6] * B[0..11] into r[6..18]
|
|
mov r10, QWORD PTR [rbx+48]
|
|
mov r11, QWORD PTR [rbx+56]
|
|
mov r12, QWORD PTR [rbx+64]
|
|
; A[6] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+48], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+56], r11
|
|
mov r10, QWORD PTR [rbx+72]
|
|
mov r11, QWORD PTR [rbx+80]
|
|
; A[6] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[6] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+64], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+72], r10
|
|
mov r12, QWORD PTR [rbx+88]
|
|
mov r10, QWORD PTR [r8]
|
|
; A[6] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+80], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+88], r12
|
|
mov r11, QWORD PTR [r8+8]
|
|
mov r12, QWORD PTR [r8+16]
|
|
; A[6] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, QWORD PTR [r8+24]
|
|
mov r11, QWORD PTR [r8+32]
|
|
; A[6] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[6] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+16], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov r12, QWORD PTR [r8+40]
|
|
; A[6] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, r14
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
adcx r10, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+40], r12
|
|
mov QWORD PTR [r8+48], r10
|
|
mov rdx, QWORD PTR [r9+56] ; rdx = A[7]; row 7 adds A[7] * B[0..11] into r[7..19]
|
|
mov r11, QWORD PTR [rbx+56]
|
|
mov r12, QWORD PTR [rbx+64]
|
|
mov r10, QWORD PTR [rbx+72]
|
|
; A[7] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+56], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+64], r12
|
|
mov r11, QWORD PTR [rbx+80]
|
|
mov r12, QWORD PTR [rbx+88]
|
|
; A[7] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[7] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+80], r11
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[7] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[7] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+88], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[7] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+8], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
mov r12, QWORD PTR [r8+40]
|
|
; A[7] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[7] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+24], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
; A[7] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[7] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+40], r12
|
|
mov r11, r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+48], r10
|
|
mov QWORD PTR [r8+56], r11
|
|
mov rdx, QWORD PTR [r9+64] ; rdx = A[8]; row 8 adds A[8] * B[0..11] into r[8..20]
|
|
mov r12, QWORD PTR [rbx+64]
|
|
mov r10, QWORD PTR [rbx+72]
|
|
mov r11, QWORD PTR [rbx+80]
|
|
; A[8] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[8] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+64], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+72], r10
|
|
mov r12, QWORD PTR [rbx+88]
|
|
mov r10, QWORD PTR [r8]
|
|
; A[8] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+80], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+88], r12
|
|
mov r11, QWORD PTR [r8+8]
|
|
mov r12, QWORD PTR [r8+16]
|
|
; A[8] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, QWORD PTR [r8+24]
|
|
mov r11, QWORD PTR [r8+32]
|
|
; A[8] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[8] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+16], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov r12, QWORD PTR [r8+40]
|
|
mov r10, QWORD PTR [r8+48]
|
|
; A[8] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+32], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r12
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[8] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+48], r10
|
|
mov r12, r14
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+56], r11
|
|
mov QWORD PTR [r8+64], r12
|
|
mov rdx, QWORD PTR [r9+72] ; rdx = A[9]; row 9 adds A[9] * B[0..11] into r[9..21]
|
|
mov r10, QWORD PTR [rbx+72]
|
|
mov r11, QWORD PTR [rbx+80]
|
|
mov r12, QWORD PTR [rbx+88]
|
|
; A[9] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+80], r11
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[9] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[9] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+88], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[9] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[9] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+8], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
mov r12, QWORD PTR [r8+40]
|
|
; A[9] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+24], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[9] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[9] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+40], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+48], r10
|
|
mov r12, QWORD PTR [r8+64]
|
|
; A[9] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[9] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+56], r11
|
|
mov r10, r14
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
adcx r10, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+64], r12
|
|
mov QWORD PTR [r8+72], r10
|
|
mov rdx, QWORD PTR [r9+80] ; rdx = A[10]; row 10 adds A[10] * B[0..11] into r[10..22]
|
|
mov r11, QWORD PTR [rbx+80]
|
|
mov r12, QWORD PTR [rbx+88]
|
|
mov r10, QWORD PTR [r8]
|
|
; A[10] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+80], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+88], r12
|
|
mov r11, QWORD PTR [r8+8]
|
|
mov r12, QWORD PTR [r8+16]
|
|
; A[10] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [r8], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, QWORD PTR [r8+24]
|
|
mov r11, QWORD PTR [r8+32]
|
|
; A[10] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[10] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+16], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov r12, QWORD PTR [r8+40]
|
|
mov r10, QWORD PTR [r8+48]
|
|
; A[10] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+32], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r12
|
|
mov r11, QWORD PTR [r8+56]
|
|
mov r12, QWORD PTR [r8+64]
|
|
; A[10] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+48], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+56], r11
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[10] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[10] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+64], r12
|
|
mov r11, r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r13 ; plus the carry left by the previous row
|
|
mov r13, r14 ; r13 = 0, then + OF + CF below: carry for the next row
|
|
adox r13, r14
|
|
adcx r13, r14
|
|
mov QWORD PTR [r8+72], r10
|
|
mov QWORD PTR [r8+80], r11
|
|
mov rdx, QWORD PTR [r9+88] ; rdx = A[11]; last row adds A[11] * B[0..11] into r[11..23]
|
|
mov r12, QWORD PTR [rbx+88]
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[11] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[11] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+88], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[11] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [r8+8], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
mov r12, QWORD PTR [r8+40]
|
|
; A[11] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+24], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[11] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[11] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+40], r12
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+48], r10
|
|
mov r12, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[11] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+56], r11
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+64], r12
|
|
mov r11, QWORD PTR [r8+80]
|
|
; A[11] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+72], r10
|
|
mov r12, r14
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r13 ; top word r[23]: plus the carry left by the previous row
|
|
mov QWORD PTR [r8+80], r11
|
|
mov QWORD PTR [r8+88], r12
|
|
sub r8, 96 ; r8 = r again
|
|
cmp r9, r8 ; a == r ?
|
|
je L_start_3072_mul_avx2_12 ; yes: the low half sits in the scratch buffer
|
|
cmp rbp, r8 ; b == r ?
|
|
jne L_end_3072_mul_avx2_12 ; neither aliases r: the low half was written to r directly
|
|
L_start_3072_mul_avx2_12: ; copy the 96-byte low half from the scratch buffer (rbx == rsp here) into r
|
|
vmovdqu xmm0, OWORD PTR [rbx]
|
|
vmovups OWORD PTR [r8], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+16]
|
|
vmovups OWORD PTR [r8+16], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+32]
|
|
vmovups OWORD PTR [r8+32], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+48]
|
|
vmovups OWORD PTR [r8+48], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+64]
|
|
vmovups OWORD PTR [r8+64], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+80]
|
|
vmovups OWORD PTR [r8+80], xmm0
|
|
L_end_3072_mul_avx2_12:
|
|
add rsp, 96 ; release the scratch buffer
|
|
pop r14 ; restore saved registers in reverse push order
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
sp_3072_mul_avx2_12 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r Result of squaring.
|
|
; * a Number to square.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_sqr_avx2_12 PROC
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
mov r8, rcx
|
|
mov r9, rdx
|
|
sub rsp, 96
|
|
cmp r9, r8
|
|
mov rbp, rsp
|
|
cmovne rbp, r8
|
|
add r8, 96
|
|
xor r12, r12
|
|
; Diagonal 1
|
|
; A[1] x A[0]
|
|
mov rdx, QWORD PTR [r9]
|
|
mulx r11, r10, QWORD PTR [r9+8]
|
|
mov QWORD PTR [rbp+8], r10
|
|
mov r10, r12
|
|
; A[2] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+16]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+16], r11
|
|
mov r11, r12
|
|
; A[3] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+24], r10
|
|
mov r10, r12
|
|
; A[4] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+32], r11
|
|
mov r11, r12
|
|
; A[5] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+40], r10
|
|
mov r10, r12
|
|
; A[6] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+48], r11
|
|
mov r11, r12
|
|
; A[7] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov r14, r10
|
|
mov r10, r12
|
|
; A[8] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov r15, r11
|
|
mov r11, r12
|
|
; A[9] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov rdi, r10
|
|
mov r10, r12
|
|
; A[10] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov rsi, r11
|
|
mov r11, r12
|
|
; A[11] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov rbx, r10
|
|
; Carry
|
|
adcx r11, r12
|
|
mov r13, r12
|
|
adcx r13, r12
|
|
adox r13, r12
|
|
mov QWORD PTR [r8], r11
|
|
; Diagonal 2
|
|
mov r11, QWORD PTR [rbp+24]
|
|
mov r10, QWORD PTR [rbp+32]
|
|
; A[2] x A[1]
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mulx rcx, rax, QWORD PTR [r9+16]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+24], r11
|
|
mov r11, QWORD PTR [rbp+40]
|
|
; A[3] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+32], r10
|
|
mov r10, QWORD PTR [rbp+48]
|
|
; A[4] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+40], r11
|
|
; No load %r12 - %r9
|
|
; A[5] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbp+48], r10
|
|
; No load %r13 - %r8
|
|
; A[6] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r14, rax
|
|
adox r15, rcx
|
|
; No store %r12
|
|
; No load %r14 - %r9
|
|
; A[7] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
; No store %r13
|
|
; No load %r15 - %r8
|
|
; A[8] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; No store %r14
|
|
; No load %rbx - %r9
|
|
; A[9] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r15
|
|
mov r10, QWORD PTR [r8]
|
|
; A[10] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx rbx, rax
|
|
adox r10, rcx
|
|
; No store %rbx
|
|
mov r11, r12
|
|
; A[11] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r10, r12
|
|
; A[11] x A[2]
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
; Carry
|
|
adcx r10, r13
|
|
mov r13, r12
|
|
adcx r13, r12
|
|
adox r13, r12
|
|
mov QWORD PTR [r8+16], r10
|
|
; Diagonal 3
|
|
mov r10, QWORD PTR [rbp+40]
|
|
mov r11, QWORD PTR [rbp+48]
|
|
; A[3] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+40], r10
|
|
; No load %r12 - %r8
|
|
; A[4] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbp+48], r11
|
|
; No load %r13 - %r9
|
|
; A[5] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r14, rax
|
|
adox r15, rcx
|
|
; No store %r12
|
|
; No load %r14 - %r8
|
|
; A[6] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
; No store %r13
|
|
; No load %r15 - %r9
|
|
; A[7] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; No store %r14
|
|
; No load %rbx - %r8
|
|
; A[8] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r15
|
|
mov r11, QWORD PTR [r8]
|
|
; A[9] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; No store %rbx
|
|
mov r10, QWORD PTR [r8+8]
|
|
; A[10] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r11, QWORD PTR [r8+16]
|
|
; A[10] x A[3]
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+8], r10
|
|
mov r10, r12
|
|
; A[10] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+16], r11
|
|
mov r11, r12
|
|
; A[10] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
; Carry
|
|
adcx r11, r13
|
|
mov r13, r12
|
|
adcx r13, r12
|
|
adox r13, r12
|
|
mov QWORD PTR [r8+32], r11
|
|
; Diagonal 4
|
|
; No load %r12 - %r9
|
|
; No load %r13 - %r8
|
|
; A[4] x A[3]
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r14, rax
|
|
adox r15, rcx
|
|
; No store %r12
|
|
; No load %r14 - %r9
|
|
; A[5] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
; No store %r13
|
|
; No load %r15 - %r8
|
|
; A[6] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; No store %r14
|
|
; No load %rbx - %r9
|
|
; A[7] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r15
|
|
mov r10, QWORD PTR [r8]
|
|
; A[8] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rbx, rax
|
|
adox r10, rcx
|
|
; No store %rbx
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[9] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r10, QWORD PTR [r8+16]
|
|
; A[9] x A[4]
|
|
mov rdx, QWORD PTR [r9+72]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[9] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+16], r10
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[9] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r11
|
|
mov r11, r12
|
|
; A[9] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r10, r12
|
|
; A[9] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
; Carry
|
|
adcx r10, r13
|
|
mov r13, r12
|
|
adcx r13, r12
|
|
adox r13, r12
|
|
mov QWORD PTR [r8+48], r10
|
|
; Diagonal 5
|
|
; No load %r14 - %r8
|
|
; No load %r15 - %r9
|
|
; A[5] x A[4]
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; No store %r14
|
|
; No load %rbx - %r8
|
|
; A[6] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r15
|
|
mov r11, QWORD PTR [r8]
|
|
; A[7] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; No store %rbx
|
|
mov r10, QWORD PTR [r8+8]
|
|
; A[8] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r11, QWORD PTR [r8+16]
|
|
; A[8] x A[5]
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+8], r10
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[8] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+16], r11
|
|
mov r11, QWORD PTR [r8+32]
|
|
; A[8] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov r10, QWORD PTR [r8+40]
|
|
; A[10] x A[6]
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r11, QWORD PTR [r8+48]
|
|
; A[10] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+40], r10
|
|
mov r10, r12
|
|
; A[10] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+48], r11
|
|
mov r11, r12
|
|
; A[10] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+56], r10
|
|
; Carry
|
|
adcx r11, r13
|
|
mov r13, r12
|
|
adcx r13, r12
|
|
adox r13, r12
|
|
mov QWORD PTR [r8+64], r11
|
|
; Diagonal 6
|
|
; No load %rbx - %r9
|
|
mov r10, QWORD PTR [r8]
|
|
; A[6] x A[5]
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx rbx, rax
|
|
adox r10, rcx
|
|
; No store %rbx
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[7] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov r10, QWORD PTR [r8+16]
|
|
; A[7] x A[6]
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[11] x A[3]
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+16], r10
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[11] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r11
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[11] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r10, QWORD PTR [r8+48]
|
|
; A[11] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[11] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+48], r10
|
|
mov r10, QWORD PTR [r8+64]
|
|
; A[11] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+56], r11
|
|
mov r11, r12
|
|
; A[11] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+64], r10
|
|
mov r10, r12
|
|
; A[11] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+72], r11
|
|
; Carry
|
|
adcx r10, r13
|
|
mov r13, r12
|
|
adcx r13, r12
|
|
adox r13, r12
|
|
mov QWORD PTR [r8+80], r10
|
|
mov QWORD PTR [r8+88], r13
|
|
; Double and Add in A[i] x A[i]
|
|
mov r11, QWORD PTR [rbp+8]
|
|
; A[0] x A[0]
|
|
mov rdx, QWORD PTR [r9]
|
|
mulx rcx, rax, rdx
|
|
mov QWORD PTR [rbp], rax
|
|
adox r11, r11
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+8], r11
|
|
mov r10, QWORD PTR [rbp+16]
|
|
mov r11, QWORD PTR [rbp+24]
|
|
; A[1] x A[1]
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+16], r10
|
|
mov QWORD PTR [rbp+24], r11
|
|
mov r10, QWORD PTR [rbp+32]
|
|
mov r11, QWORD PTR [rbp+40]
|
|
; A[2] x A[2]
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+32], r10
|
|
mov QWORD PTR [rbp+40], r11
|
|
mov r10, QWORD PTR [rbp+48]
|
|
; A[3] x A[3]
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r14, r14
|
|
adcx r10, rax
|
|
adcx r14, rcx
|
|
mov QWORD PTR [rbp+48], r10
|
|
; A[4] x A[4]
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mulx rcx, rax, rdx
|
|
adox r15, r15
|
|
adox rdi, rdi
|
|
adcx r15, rax
|
|
adcx rdi, rcx
|
|
; A[5] x A[5]
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mulx rcx, rax, rdx
|
|
adox rsi, rsi
|
|
adox rbx, rbx
|
|
adcx rsi, rax
|
|
adcx rbx, rcx
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[6] x A[6]
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[7] x A[7]
|
|
mov rdx, QWORD PTR [r9+56]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+16], r10
|
|
mov QWORD PTR [r8+24], r11
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[8] x A[8]
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[9] x A[9]
|
|
mov rdx, QWORD PTR [r9+72]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+48], r10
|
|
mov QWORD PTR [r8+56], r11
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
; A[10] x A[10]
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+64], r10
|
|
mov QWORD PTR [r8+72], r11
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
; A[11] x A[11]
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+80], r10
|
|
mov QWORD PTR [r8+88], r11
|
|
mov QWORD PTR [r8+-40], r14
|
|
mov QWORD PTR [r8+-32], r15
|
|
mov QWORD PTR [r8+-24], rdi
|
|
mov QWORD PTR [r8+-16], rsi
|
|
mov QWORD PTR [r8+-8], rbx
|
|
sub r8, 96
|
|
cmp r9, r8
|
|
jne L_end_3072_sqr_avx2_12
|
|
vmovdqu xmm0, OWORD PTR [rbp]
|
|
vmovups OWORD PTR [r8], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+16]
|
|
vmovups OWORD PTR [r8+16], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+32]
|
|
vmovups OWORD PTR [r8+32], xmm0
|
|
mov rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+48], rax
|
|
L_end_3072_sqr_avx2_12:
|
|
add rsp, 96
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
ret
|
|
sp_3072_sqr_avx2_12 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Add b to a into r. (r = a + b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
; Register interface as used by the code below (the first three integer
; argument registers of the Microsoft x64 convention):
;   rcx = r  destination, 12 x 64-bit words (byte offsets 0..88)
;   rdx = a  first addend, 12 words
;   r8  = b  second addend, 12 words
; Result:  rax = carry out of the most significant word (0 or 1).
; Scratch: r9, r10 and the flags. No stack is used.
_text SEGMENT READONLY PARA
|
|
sp_3072_add_12 PROC
|
|
; Add
|
|
; The loads and stores interleaved with the add/adc chain are plain mov
; instructions, which leave CF untouched, so the carry ripples through
; all 12 words. r9 and r10 alternate as the working register.
mov r9, QWORD PTR [rdx]
|
|
; rax = 0; it receives the final carry at the end of the chain.
xor rax, rax
|
|
; Word 0 starts the chain with add (no carry in); words 1..11 use adc.
add r9, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [rcx], r9
|
|
adc r10, QWORD PTR [r8+8]
|
|
mov r9, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [rcx+8], r10
|
|
adc r9, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [rcx+16], r9
|
|
adc r10, QWORD PTR [r8+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [rcx+24], r10
|
|
adc r9, QWORD PTR [r8+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [rcx+32], r9
|
|
adc r10, QWORD PTR [r8+40]
|
|
mov r9, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [rcx+40], r10
|
|
adc r9, QWORD PTR [r8+48]
|
|
mov r10, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [rcx+48], r9
|
|
adc r10, QWORD PTR [r8+56]
|
|
mov r9, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [rcx+56], r10
|
|
adc r9, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [rcx+64], r9
|
|
adc r10, QWORD PTR [r8+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [rcx+72], r10
|
|
adc r9, QWORD PTR [r8+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [rcx+80], r9
|
|
adc r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [rcx+88], r10
|
|
; rax = 0 + CF: the carry out of word 11 becomes the return value.
adc rax, 0
|
|
ret
|
|
sp_3072_add_12 ENDP
|
|
_text ENDS
|
|
; /* Sub b from a into a. (a -= b)
|
|
; *
|
|
; * a A single precision integer and result.
|
|
; * b A single precision integer.
|
|
; */
|
|
; Register interface as used by the code below:
;   rcx = a  minuend and destination, 24 x 64-bit words (offsets 0..184)
;   rdx = b  subtrahend, 24 words
; Result:  rax = 0 when no borrow leaves the top word, otherwise all ones
;          (rax is zeroed up front and finished with "sbb rax, 0").
; Scratch: r8, r9 and the flags. No stack is used.
_text SEGMENT READONLY PARA
|
|
sp_3072_sub_in_place_24 PROC
|
|
; Each word of a is loaded, reduced by the matching word of b and
; written back to the same slot. The interleaved mov instructions do
; not change CF, so the borrow ripples through all 24 words.
mov r8, QWORD PTR [rcx]
|
|
; rax = 0; it receives the final borrow at the end of the chain.
xor rax, rax
|
|
; Word 0 starts the chain with sub (no borrow in); words 1..23 use sbb.
sub r8, QWORD PTR [rdx]
|
|
mov r9, QWORD PTR [rcx+8]
|
|
mov QWORD PTR [rcx], r8
|
|
sbb r9, QWORD PTR [rdx+8]
|
|
mov r8, QWORD PTR [rcx+16]
|
|
mov QWORD PTR [rcx+8], r9
|
|
sbb r8, QWORD PTR [rdx+16]
|
|
mov r9, QWORD PTR [rcx+24]
|
|
mov QWORD PTR [rcx+16], r8
|
|
sbb r9, QWORD PTR [rdx+24]
|
|
mov r8, QWORD PTR [rcx+32]
|
|
mov QWORD PTR [rcx+24], r9
|
|
sbb r8, QWORD PTR [rdx+32]
|
|
mov r9, QWORD PTR [rcx+40]
|
|
mov QWORD PTR [rcx+32], r8
|
|
sbb r9, QWORD PTR [rdx+40]
|
|
mov r8, QWORD PTR [rcx+48]
|
|
mov QWORD PTR [rcx+40], r9
|
|
sbb r8, QWORD PTR [rdx+48]
|
|
mov r9, QWORD PTR [rcx+56]
|
|
mov QWORD PTR [rcx+48], r8
|
|
sbb r9, QWORD PTR [rdx+56]
|
|
mov r8, QWORD PTR [rcx+64]
|
|
mov QWORD PTR [rcx+56], r9
|
|
sbb r8, QWORD PTR [rdx+64]
|
|
mov r9, QWORD PTR [rcx+72]
|
|
mov QWORD PTR [rcx+64], r8
|
|
sbb r9, QWORD PTR [rdx+72]
|
|
mov r8, QWORD PTR [rcx+80]
|
|
mov QWORD PTR [rcx+72], r9
|
|
sbb r8, QWORD PTR [rdx+80]
|
|
mov r9, QWORD PTR [rcx+88]
|
|
mov QWORD PTR [rcx+80], r8
|
|
sbb r9, QWORD PTR [rdx+88]
|
|
mov r8, QWORD PTR [rcx+96]
|
|
mov QWORD PTR [rcx+88], r9
|
|
sbb r8, QWORD PTR [rdx+96]
|
|
mov r9, QWORD PTR [rcx+104]
|
|
mov QWORD PTR [rcx+96], r8
|
|
sbb r9, QWORD PTR [rdx+104]
|
|
mov r8, QWORD PTR [rcx+112]
|
|
mov QWORD PTR [rcx+104], r9
|
|
sbb r8, QWORD PTR [rdx+112]
|
|
mov r9, QWORD PTR [rcx+120]
|
|
mov QWORD PTR [rcx+112], r8
|
|
sbb r9, QWORD PTR [rdx+120]
|
|
mov r8, QWORD PTR [rcx+128]
|
|
mov QWORD PTR [rcx+120], r9
|
|
sbb r8, QWORD PTR [rdx+128]
|
|
mov r9, QWORD PTR [rcx+136]
|
|
mov QWORD PTR [rcx+128], r8
|
|
sbb r9, QWORD PTR [rdx+136]
|
|
mov r8, QWORD PTR [rcx+144]
|
|
mov QWORD PTR [rcx+136], r9
|
|
sbb r8, QWORD PTR [rdx+144]
|
|
mov r9, QWORD PTR [rcx+152]
|
|
mov QWORD PTR [rcx+144], r8
|
|
sbb r9, QWORD PTR [rdx+152]
|
|
mov r8, QWORD PTR [rcx+160]
|
|
mov QWORD PTR [rcx+152], r9
|
|
sbb r8, QWORD PTR [rdx+160]
|
|
mov r9, QWORD PTR [rcx+168]
|
|
mov QWORD PTR [rcx+160], r8
|
|
sbb r9, QWORD PTR [rdx+168]
|
|
mov r8, QWORD PTR [rcx+176]
|
|
mov QWORD PTR [rcx+168], r9
|
|
sbb r8, QWORD PTR [rdx+176]
|
|
mov r9, QWORD PTR [rcx+184]
|
|
mov QWORD PTR [rcx+176], r8
|
|
sbb r9, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [rcx+184], r9
|
|
; rax = 0 - CF: 0 if word 23 produced no borrow, all ones if it did.
sbb rax, 0
|
|
ret
|
|
sp_3072_sub_in_place_24 ENDP
|
|
_text ENDS
|
|
; /* Add b to a into r. (r = a + b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
; Register interface as used by the code below:
;   rcx = r  destination, 24 x 64-bit words (byte offsets 0..184)
;   rdx = a  first addend, 24 words
;   r8  = b  second addend, 24 words
; Result:  rax = carry out of the most significant word (0 or 1).
; Scratch: r9, r10 and the flags. No stack is used.
_text SEGMENT READONLY PARA
|
|
sp_3072_add_24 PROC
|
|
; Add
|
|
; The loads and stores interleaved with the add/adc chain are plain mov
; instructions, which leave CF untouched, so the carry ripples through
; all 24 words. r9 and r10 alternate as the working register.
mov r9, QWORD PTR [rdx]
|
|
; rax = 0; it receives the final carry at the end of the chain.
xor rax, rax
|
|
; Word 0 starts the chain with add (no carry in); words 1..23 use adc.
add r9, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [rcx], r9
|
|
adc r10, QWORD PTR [r8+8]
|
|
mov r9, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [rcx+8], r10
|
|
adc r9, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [rcx+16], r9
|
|
adc r10, QWORD PTR [r8+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [rcx+24], r10
|
|
adc r9, QWORD PTR [r8+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [rcx+32], r9
|
|
adc r10, QWORD PTR [r8+40]
|
|
mov r9, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [rcx+40], r10
|
|
adc r9, QWORD PTR [r8+48]
|
|
mov r10, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [rcx+48], r9
|
|
adc r10, QWORD PTR [r8+56]
|
|
mov r9, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [rcx+56], r10
|
|
adc r9, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [rcx+64], r9
|
|
adc r10, QWORD PTR [r8+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [rcx+72], r10
|
|
adc r9, QWORD PTR [r8+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [rcx+80], r9
|
|
adc r10, QWORD PTR [r8+88]
|
|
mov r9, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [rcx+88], r10
|
|
adc r9, QWORD PTR [r8+96]
|
|
mov r10, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [rcx+96], r9
|
|
adc r10, QWORD PTR [r8+104]
|
|
mov r9, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [rcx+104], r10
|
|
adc r9, QWORD PTR [r8+112]
|
|
mov r10, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [rcx+112], r9
|
|
adc r10, QWORD PTR [r8+120]
|
|
mov r9, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [rcx+120], r10
|
|
adc r9, QWORD PTR [r8+128]
|
|
mov r10, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [rcx+128], r9
|
|
adc r10, QWORD PTR [r8+136]
|
|
mov r9, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [rcx+136], r10
|
|
adc r9, QWORD PTR [r8+144]
|
|
mov r10, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [rcx+144], r9
|
|
adc r10, QWORD PTR [r8+152]
|
|
mov r9, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [rcx+152], r10
|
|
adc r9, QWORD PTR [r8+160]
|
|
mov r10, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [rcx+160], r9
|
|
adc r10, QWORD PTR [r8+168]
|
|
mov r9, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [rcx+168], r10
|
|
adc r9, QWORD PTR [r8+176]
|
|
mov r10, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [rcx+176], r9
|
|
adc r10, QWORD PTR [r8+184]
|
|
mov QWORD PTR [rcx+184], r10
|
|
; rax = 0 + CF: the carry out of word 23 becomes the return value.
adc rax, 0
|
|
ret
|
|
sp_3072_add_24 ENDP
|
|
_text ENDS
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
; One Karatsuba step over 12-word halves, as read from the body below.
; With a = a_lo + a_hi*2^768 and b = b_lo + b_hi*2^768 (12 words = 768 bits):
;   z0 = a_lo * b_lo                 -> r[0..23]
;   z2 = a_hi * b_hi                 -> [rsp+192 .. rsp+383]
;   z1 = (a_lo+a_hi) * (b_lo+b_hi)   -> [rsp+0 .. rsp+191] plus carry fix-up
;   r  = z0 + (z1 - z2 - z0)*2^768 + z2*2^1536
;
; Registers on entry: rcx = r (words 0..47 are written), rdx = a (24 words),
; r8 = b (24 words).
; Saved and restored here: r12, r13, r14, r15, rdi, rsi.
; Scratch in this body: rax, r9, r10, r11, flags (plus whatever
; sp_3072_mul_12 clobbers). rax holds no meaningful value at ret.
;
; Stack frame (616 bytes below the six pushes):
;   [rsp+  0..191]  z1, later the middle term z1 - z2 - z0
;   [rsp+192..383]  z2
;   [rsp+384..479]  a_lo + a_hi (12 words), its carry bit at [rsp+600]
;   [rsp+480..575]  b_lo + b_hi (12 words), its carry bit at [rsp+608]
;   [rsp+576], [rsp+584], [rsp+592]  saved r, a, b pointers
; Return address (8) + six pushes (48) + 616 = 672 bytes, a multiple of 16.
;
; sp_3072_mul_12 is defined outside this block; presumably it takes
; (rcx = r, rdx = a, r8 = b) and writes the 24-word product of two
; 12-word operands -- confirm against its definition.
_text SEGMENT READONLY PARA
|
|
sp_3072_mul_24 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 616
|
|
; Park the three argument pointers; they are reloaded after each call.
mov QWORD PTR [rsp+576], rcx
|
|
mov QWORD PTR [rsp+584], rdx
|
|
mov QWORD PTR [rsp+592], r8
|
|
; r12 -> buffer for a_lo + a_hi, r14 -> a_hi (a + 12 words).
lea r12, QWORD PTR [rsp+384]
|
|
lea r14, QWORD PTR [rdx+96]
|
|
; Add
|
|
; [r12] = a_lo + a_hi over 12 words; rax, r9, r10 rotate as the working
; register and the interleaved mov instructions do not disturb CF.
mov rax, QWORD PTR [rdx]
|
|
; r15 = 0; it receives the carry out of this sum.
xor r15, r15
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r12], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov QWORD PTR [r12+88], r10
|
|
; r15 = carry of a_lo + a_hi (0 or 1), saved across the calls.
adc r15, 0
|
|
mov QWORD PTR [rsp+600], r15
|
|
; r13 -> buffer for b_lo + b_hi, r14 -> b_hi (b + 12 words).
lea r13, QWORD PTR [rsp+480]
|
|
lea r14, QWORD PTR [r8+96]
|
|
; Add
|
|
; [r13] = b_lo + b_hi over 12 words, same scheme as above.
mov rax, QWORD PTR [r8]
|
|
; rdi = 0; it receives the carry out of this sum.
xor rdi, rdi
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [r8+8]
|
|
mov QWORD PTR [r13], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov QWORD PTR [r13+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mov QWORD PTR [r13+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [r8+32]
|
|
mov QWORD PTR [r13+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov QWORD PTR [r13+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mov QWORD PTR [r13+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [r8+56]
|
|
mov QWORD PTR [r13+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov QWORD PTR [r13+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mov QWORD PTR [r13+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [r8+80]
|
|
mov QWORD PTR [r13+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [r13+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov QWORD PTR [r13+88], r10
|
|
; rdi = carry of b_lo + b_hi (0 or 1), saved across the calls.
adc rdi, 0
|
|
mov QWORD PTR [rsp+608], rdi
|
|
; Call 1: z1 = (a_lo + a_hi) * (b_lo + b_hi), written to [rsp+0..191].
; NOTE(review): the output pointer is rsp itself, i.e. the region the
; Microsoft x64 convention designates as the callee's home (shadow)
; space -- confirm sp_3072_mul_12 never spills its arguments there.
mov r8, r13
|
|
mov rdx, r12
|
|
mov rcx, rsp
|
|
call sp_3072_mul_12
|
|
; Call 2: z2 = a_hi * b_hi, written to [rsp+192..383].
mov r8, QWORD PTR [rsp+592]
|
|
mov rdx, QWORD PTR [rsp+584]
|
|
lea rcx, QWORD PTR [rsp+192]
|
|
add r8, 96
|
|
add rdx, 96
|
|
call sp_3072_mul_12
|
|
; Call 3: z0 = a_lo * b_lo, written straight into r[0..23].
mov r8, QWORD PTR [rsp+592]
|
|
mov rdx, QWORD PTR [rsp+584]
|
|
mov rcx, QWORD PTR [rsp+576]
|
|
call sp_3072_mul_12
|
|
; rcx is read again below ([rcx+...] and [rcx+288]), so it is reloaded
; after the call when _WIN64 is defined; rdx and r8 are reloaded too but
; are not referenced again in this procedure.
IFDEF _WIN64
|
|
mov r8, QWORD PTR [rsp+592]
|
|
mov rdx, QWORD PTR [rsp+584]
|
|
mov rcx, QWORD PTR [rsp+576]
|
|
ENDIF
|
|
; Carry fix-up for z1. The 12-word sums dropped their carry bits ca
; (r15) and cb (rdi), so the full product still needs
; (ca*(b_lo+b_hi) + cb*(a_lo+a_hi))*2^768 + ca*cb*2^1536.
mov r15, QWORD PTR [rsp+600]
|
|
mov rdi, QWORD PTR [rsp+608]
|
|
mov rsi, QWORD PTR [rsp+576]
|
|
; r11 = ca AND cb: seed of the top-word accumulator (later r[36]).
mov r11, r15
|
|
lea r12, QWORD PTR [rsp+384]
|
|
lea r13, QWORD PTR [rsp+480]
|
|
and r11, rdi
|
|
; neg turns each 0/1 carry into a 0 / all-ones mask, so the selection
; below is branch-free.
neg r15
|
|
neg rdi
|
|
; rsi -> r[24].
add rsi, 192
|
|
; Mask in place, word by word: the a-sum at [r12] is kept only if cb was
; set (mask rdi), the b-sum at [r13] only if ca was set (mask r15).
mov rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [r13]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12], rax
|
|
mov QWORD PTR [r13], r9
|
|
mov rax, QWORD PTR [r12+8]
|
|
mov r9, QWORD PTR [r13+8]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+8], rax
|
|
mov QWORD PTR [r13+8], r9
|
|
mov rax, QWORD PTR [r12+16]
|
|
mov r9, QWORD PTR [r13+16]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+16], rax
|
|
mov QWORD PTR [r13+16], r9
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [r13+24]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+24], rax
|
|
mov QWORD PTR [r13+24], r9
|
|
mov rax, QWORD PTR [r12+32]
|
|
mov r9, QWORD PTR [r13+32]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+32], rax
|
|
mov QWORD PTR [r13+32], r9
|
|
mov rax, QWORD PTR [r12+40]
|
|
mov r9, QWORD PTR [r13+40]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+40], rax
|
|
mov QWORD PTR [r13+40], r9
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [r13+48]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+48], rax
|
|
mov QWORD PTR [r13+48], r9
|
|
mov rax, QWORD PTR [r12+56]
|
|
mov r9, QWORD PTR [r13+56]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+56], rax
|
|
mov QWORD PTR [r13+56], r9
|
|
mov rax, QWORD PTR [r12+64]
|
|
mov r9, QWORD PTR [r13+64]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+64], rax
|
|
mov QWORD PTR [r13+64], r9
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [r13+72]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+72], rax
|
|
mov QWORD PTR [r13+72], r9
|
|
mov rax, QWORD PTR [r12+80]
|
|
mov r9, QWORD PTR [r13+80]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+80], rax
|
|
mov QWORD PTR [r13+80], r9
|
|
mov rax, QWORD PTR [r12+88]
|
|
mov r9, QWORD PTR [r13+88]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+88], rax
|
|
mov QWORD PTR [r13+88], r9
|
|
; r[24..35] = masked a-sum + masked b-sum; the carry goes into r11.
mov rax, QWORD PTR [r12]
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc r11, 0
|
|
; Middle term, step 1: z1 -= z2 over 24 words, in place at [rsp].
; r12 -> z1, r13 -> z2; the borrow is taken out of r11.
lea r13, QWORD PTR [rsp+192]
|
|
mov r12, rsp
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [r13+184]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb r11, 0
|
|
; Middle term, step 2: z1 -= z0, where z0 is r[0..23] read through rcx;
; the borrow is again taken out of r11.
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [rcx+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [rcx+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [rcx+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [rcx+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [rcx+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [rcx+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [rcx+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [rcx+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [rcx+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [rcx+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [rcx+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [rcx+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [rcx+184]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb r11, 0
|
|
; rsi -> r[12]: the middle term is added at a 12-word (2^768) offset.
sub rsi, 96
|
|
; Add
|
|
; r[12..35] += middle term (24 words). r[12..23] holds the upper half of
; z0 and r[24..35] the masked carry sums stored earlier.
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [rsi+184], r10
|
|
; r11 now holds the net of all carries and borrows above word 35; it
; becomes r[36] (byte offset 288).
adc r11, 0
|
|
mov QWORD PTR [rcx+288], r11
|
|
; rsi -> r[24]: z2 is added at a 24-word (2^1536) offset.
add rsi, 96
|
|
; Add
|
|
; r[24..36] += z2[0..12] (13 words, the last one folds in r[36]).
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov QWORD PTR [rsi+96], rax
|
|
; Add to zero
|
|
; r[37..47] were never written before, so they are set to
; z2[13..23] + carry instead of being read and added.
mov rax, QWORD PTR [r13+104]
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+112]
|
|
mov QWORD PTR [rsi+104], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+120]
|
|
mov QWORD PTR [rsi+112], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+128]
|
|
mov QWORD PTR [rsi+120], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+136]
|
|
mov QWORD PTR [rsi+128], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+144]
|
|
mov QWORD PTR [rsi+136], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+152]
|
|
mov QWORD PTR [rsi+144], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+160]
|
|
mov QWORD PTR [rsi+152], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+168]
|
|
mov QWORD PTR [rsi+160], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+176]
|
|
mov QWORD PTR [rsi+168], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+184]
|
|
mov QWORD PTR [rsi+176], rax
|
|
adc r9, 0
|
|
mov QWORD PTR [rsi+184], r9
|
|
; Release the frame and restore the saved registers in reverse push order.
add rsp, 616
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_3072_mul_24 ENDP
|
|
_text ENDS
|
|
; /* Add a to a into r. (r = a + a)
|
|
; *
; * Doubles a 12-word (12 x 64-bit) number with one ADD/ADC carry chain.
; * Words are processed from offset 0 upwards, so offset 0 is the least
; * significant word.
|
|
; * r A single precision integer.
; *   Passed in rcx; 12 words (offsets 0..88) are written.
|
|
; * a A single precision integer.
; *   Passed in rdx; 12 words (offsets 0..88) are read.
; *
; * Returns the carry out of the top word (0 or 1) in rax.
; * Overwrites r8, r9 and the flags; uses no stack.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_dbl_12 PROC
|
|
; Word 0 starts the chain with ADD; every later word uses ADC.
; Each word of a is loaded before the previous result word is stored.
mov r8, QWORD PTR [rdx]
|
|
; Clear the return value now: XOR rewrites CF, so it has to come before
; the ADD that starts the carry chain (MOV leaves the flags alone).
xor rax, rax
|
|
add r8, r8
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [rcx], r8
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [rcx+8], r9
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [rcx+16], r8
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [rcx+24], r9
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [rcx+32], r8
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [rcx+40], r9
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [rcx+48], r8
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [rcx+56], r9
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [rcx+64], r8
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [rcx+72], r9
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [rcx+80], r8
|
|
adc r9, r9
|
|
mov QWORD PTR [rcx+88], r9
|
|
; rax was zeroed above, so this leaves rax = carry out of word 11.
adc rax, 0
|
|
ret
|
|
sp_3072_dbl_12 ENDP
|
|
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
;  *
;  * a is handled as two 12-word (96-byte) halves, a = a0 + a1 * 2^768, and
;  * the result is assembled from three calls to sp_3072_sqr_12:
;  *   z0 = a0^2
;  *   z2 = a1^2
;  *   z1 = (a0 + a1)^2 - z2 - z0
;  *   r  = z0 + z1 * 2^768 + z2 * 2^1536
;  * The sum a0 + a1 can be 12 words plus a carry bit c; only its low 12
;  * words are squared by the callee and the terms involving c are added
;  * here without branching.
;  *
;  * r  A single precision integer.
;  *    Passed in rcx. Bytes r+96 .. r+383 are written by this routine; r
;  *    itself is handed to sp_3072_sqr_12 as the destination for z0.
;  * a  A single precision integer.
;  *    Passed in rdx; 24 words (192 bytes) are read.
;  *
;  * r12 is saved and restored. rax, rcx, rdx, r8, r9, r10, r11 and the
;  * flags are overwritten, in addition to whatever sp_3072_sqr_12 changes.
;  *
;  * Stack frame (504 bytes below the saved r12):
;  *   [rsp+0   .. rsp+191]  square of the low 12 words of a0 + a1, then z1
;  *   [rsp+192 .. rsp+383]  z2
;  *   [rsp+384 .. rsp+479]  low 12 words of a0 + a1
;  *   [rsp+480]             saved r
;  *   [rsp+488]             saved a
;  *   [rsp+496]             c, the carry out of a0 + a1 (0 or 1)
;  *
;  * NOTE(review): with one push and a 504-byte frame, rsp is 8 mod 16 at the
;  * call sites when this routine is entered with the usual post-call stack
;  * alignment, and no separate home area is reserved (the first call's
;  * destination is rsp itself). sp_3072_sqr_12 is not visible here - confirm
;  * it depends on neither.
;  */
_text SEGMENT READONLY PARA
sp_3072_sqr_24 PROC
        push    r12
        sub     rsp, 504
        mov     QWORD PTR [rsp+480], rcx
        mov     QWORD PTR [rsp+488], rdx
        lea     r10, QWORD PTR [rsp+384]
        lea     r11, QWORD PTR [rdx+96]
        ; Add: [r10] = a0 + a1 (low 12 words), r9 = carry out.
        ; r9 is cleared after the first load and before the ADD because XOR
        ; rewrites CF.
        mov     rax, QWORD PTR [rdx]
        xor     r9, r9
        add     rax, QWORD PTR [r11]
        mov     r8, QWORD PTR [rdx+8]
        mov     QWORD PTR [r10], rax
        adc     r8, QWORD PTR [r11+8]
        mov     rax, QWORD PTR [rdx+16]
        mov     QWORD PTR [r10+8], r8
        adc     rax, QWORD PTR [r11+16]
        mov     r8, QWORD PTR [rdx+24]
        mov     QWORD PTR [r10+16], rax
        adc     r8, QWORD PTR [r11+24]
        mov     rax, QWORD PTR [rdx+32]
        mov     QWORD PTR [r10+24], r8
        adc     rax, QWORD PTR [r11+32]
        mov     r8, QWORD PTR [rdx+40]
        mov     QWORD PTR [r10+32], rax
        adc     r8, QWORD PTR [r11+40]
        mov     rax, QWORD PTR [rdx+48]
        mov     QWORD PTR [r10+40], r8
        adc     rax, QWORD PTR [r11+48]
        mov     r8, QWORD PTR [rdx+56]
        mov     QWORD PTR [r10+48], rax
        adc     r8, QWORD PTR [r11+56]
        mov     rax, QWORD PTR [rdx+64]
        mov     QWORD PTR [r10+56], r8
        adc     rax, QWORD PTR [r11+64]
        mov     r8, QWORD PTR [rdx+72]
        mov     QWORD PTR [r10+64], rax
        adc     r8, QWORD PTR [r11+72]
        mov     rax, QWORD PTR [rdx+80]
        mov     QWORD PTR [r10+72], r8
        adc     rax, QWORD PTR [r11+80]
        mov     r8, QWORD PTR [rdx+88]
        mov     QWORD PTR [r10+80], rax
        adc     r8, QWORD PTR [r11+88]
        mov     QWORD PTR [r10+88], r8
        adc     r9, 0
        mov     QWORD PTR [rsp+496], r9

        ; [rsp+0 ..] = (low words of a0 + a1)^2
        mov     rdx, r10
        mov     rcx, rsp
        call    sp_3072_sqr_12
        ; [rsp+192 ..] = z2 = a1^2
        mov     rdx, QWORD PTR [rsp+488]
        lea     rcx, QWORD PTR [rsp+192]
        add     rdx, 96
        call    sp_3072_sqr_12
        ; r[0 ..] = z0 = a0^2
        mov     rdx, QWORD PTR [rsp+488]
        mov     rcx, QWORD PTR [rsp+480]
        call    sp_3072_sqr_12

        ; rcx is read below (z0 operand and the store to r+288). It is a
        ; volatile register across calls in the Microsoft x64 convention, so
        ; r is reloaded unconditionally instead of relying on the callee to
        ; leave it intact. rdx needs no reload: it is overwritten by the LEA
        ; further down before it is read again.
        mov     rcx, QWORD PTR [rsp+480]
        mov     r12, QWORD PTR [rsp+496]
        mov     r11, rcx
        lea     r10, QWORD PTR [rsp+384]
        mov     r9, r12                 ; r9 = c, running top word of z1
        neg     r12                     ; r12 = 0 or all ones, mask from c
        add     r11, 192                ; r11 = r + 192

        ; r[192 .. 287] = (low words of a0 + a1) AND mask
        mov     rax, QWORD PTR [r10]
        mov     r8, QWORD PTR [r10+8]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [r11], rax
        mov     QWORD PTR [r11+8], r8
        mov     rax, QWORD PTR [r10+16]
        mov     r8, QWORD PTR [r10+24]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [r11+16], rax
        mov     QWORD PTR [r11+24], r8
        mov     rax, QWORD PTR [r10+32]
        mov     r8, QWORD PTR [r10+40]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [r11+32], rax
        mov     QWORD PTR [r11+40], r8
        mov     rax, QWORD PTR [r10+48]
        mov     r8, QWORD PTR [r10+56]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [r11+48], rax
        mov     QWORD PTR [r11+56], r8
        mov     rax, QWORD PTR [r10+64]
        mov     r8, QWORD PTR [r10+72]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [r11+64], rax
        mov     QWORD PTR [r11+72], r8
        mov     rax, QWORD PTR [r10+80]
        mov     r8, QWORD PTR [r10+88]
        and     rax, r12
        and     r8, r12
        mov     QWORD PTR [r11+80], rax
        mov     QWORD PTR [r11+88], r8

        ; Double r[192 .. 287] in place; the carry out is added to r9.
        mov     rax, QWORD PTR [r11]
        add     rax, rax
        mov     r8, QWORD PTR [r11+8]
        mov     QWORD PTR [r11], rax
        adc     r8, r8
        mov     rax, QWORD PTR [r11+16]
        mov     QWORD PTR [r11+8], r8
        adc     rax, rax
        mov     r8, QWORD PTR [r11+24]
        mov     QWORD PTR [r11+16], rax
        adc     r8, r8
        mov     rax, QWORD PTR [r11+32]
        mov     QWORD PTR [r11+24], r8
        adc     rax, rax
        mov     r8, QWORD PTR [r11+40]
        mov     QWORD PTR [r11+32], rax
        adc     r8, r8
        mov     rax, QWORD PTR [r11+48]
        mov     QWORD PTR [r11+40], r8
        adc     rax, rax
        mov     r8, QWORD PTR [r11+56]
        mov     QWORD PTR [r11+48], rax
        adc     r8, r8
        mov     rax, QWORD PTR [r11+64]
        mov     QWORD PTR [r11+56], r8
        adc     rax, rax
        mov     r8, QWORD PTR [r11+72]
        mov     QWORD PTR [r11+64], rax
        adc     r8, r8
        mov     rax, QWORD PTR [r11+80]
        mov     QWORD PTR [r11+72], r8
        adc     rax, rax
        mov     r8, QWORD PTR [r11+88]
        mov     QWORD PTR [r11+80], rax
        adc     r8, r8
        mov     QWORD PTR [r11+88], r8
        adc     r9, 0

        lea     rdx, QWORD PTR [rsp+192]
        mov     r10, rsp
        ; [rsp+0 .. 191] -= z2 (24 words); the borrow comes out of r9.
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [r10+8]
        mov     QWORD PTR [r10], rax
        sbb     r8, QWORD PTR [rdx+8]
        mov     rax, QWORD PTR [r10+16]
        mov     QWORD PTR [r10+8], r8
        sbb     rax, QWORD PTR [rdx+16]
        mov     r8, QWORD PTR [r10+24]
        mov     QWORD PTR [r10+16], rax
        sbb     r8, QWORD PTR [rdx+24]
        mov     rax, QWORD PTR [r10+32]
        mov     QWORD PTR [r10+24], r8
        sbb     rax, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [r10+40]
        mov     QWORD PTR [r10+32], rax
        sbb     r8, QWORD PTR [rdx+40]
        mov     rax, QWORD PTR [r10+48]
        mov     QWORD PTR [r10+40], r8
        sbb     rax, QWORD PTR [rdx+48]
        mov     r8, QWORD PTR [r10+56]
        mov     QWORD PTR [r10+48], rax
        sbb     r8, QWORD PTR [rdx+56]
        mov     rax, QWORD PTR [r10+64]
        mov     QWORD PTR [r10+56], r8
        sbb     rax, QWORD PTR [rdx+64]
        mov     r8, QWORD PTR [r10+72]
        mov     QWORD PTR [r10+64], rax
        sbb     r8, QWORD PTR [rdx+72]
        mov     rax, QWORD PTR [r10+80]
        mov     QWORD PTR [r10+72], r8
        sbb     rax, QWORD PTR [rdx+80]
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [r10+80], rax
        sbb     r8, QWORD PTR [rdx+88]
        mov     rax, QWORD PTR [r10+96]
        mov     QWORD PTR [r10+88], r8
        sbb     rax, QWORD PTR [rdx+96]
        mov     r8, QWORD PTR [r10+104]
        mov     QWORD PTR [r10+96], rax
        sbb     r8, QWORD PTR [rdx+104]
        mov     rax, QWORD PTR [r10+112]
        mov     QWORD PTR [r10+104], r8
        sbb     rax, QWORD PTR [rdx+112]
        mov     r8, QWORD PTR [r10+120]
        mov     QWORD PTR [r10+112], rax
        sbb     r8, QWORD PTR [rdx+120]
        mov     rax, QWORD PTR [r10+128]
        mov     QWORD PTR [r10+120], r8
        sbb     rax, QWORD PTR [rdx+128]
        mov     r8, QWORD PTR [r10+136]
        mov     QWORD PTR [r10+128], rax
        sbb     r8, QWORD PTR [rdx+136]
        mov     rax, QWORD PTR [r10+144]
        mov     QWORD PTR [r10+136], r8
        sbb     rax, QWORD PTR [rdx+144]
        mov     r8, QWORD PTR [r10+152]
        mov     QWORD PTR [r10+144], rax
        sbb     r8, QWORD PTR [rdx+152]
        mov     rax, QWORD PTR [r10+160]
        mov     QWORD PTR [r10+152], r8
        sbb     rax, QWORD PTR [rdx+160]
        mov     r8, QWORD PTR [r10+168]
        mov     QWORD PTR [r10+160], rax
        sbb     r8, QWORD PTR [rdx+168]
        mov     rax, QWORD PTR [r10+176]
        mov     QWORD PTR [r10+168], r8
        sbb     rax, QWORD PTR [rdx+176]
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rdx+184]
        mov     QWORD PTR [r10+184], r8
        sbb     r9, 0

        ; [rsp+0 .. 191] -= z0 (r[0 .. 191]); the borrow comes out of r9.
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rcx]
        mov     r8, QWORD PTR [r10+8]
        mov     QWORD PTR [r10], rax
        sbb     r8, QWORD PTR [rcx+8]
        mov     rax, QWORD PTR [r10+16]
        mov     QWORD PTR [r10+8], r8
        sbb     rax, QWORD PTR [rcx+16]
        mov     r8, QWORD PTR [r10+24]
        mov     QWORD PTR [r10+16], rax
        sbb     r8, QWORD PTR [rcx+24]
        mov     rax, QWORD PTR [r10+32]
        mov     QWORD PTR [r10+24], r8
        sbb     rax, QWORD PTR [rcx+32]
        mov     r8, QWORD PTR [r10+40]
        mov     QWORD PTR [r10+32], rax
        sbb     r8, QWORD PTR [rcx+40]
        mov     rax, QWORD PTR [r10+48]
        mov     QWORD PTR [r10+40], r8
        sbb     rax, QWORD PTR [rcx+48]
        mov     r8, QWORD PTR [r10+56]
        mov     QWORD PTR [r10+48], rax
        sbb     r8, QWORD PTR [rcx+56]
        mov     rax, QWORD PTR [r10+64]
        mov     QWORD PTR [r10+56], r8
        sbb     rax, QWORD PTR [rcx+64]
        mov     r8, QWORD PTR [r10+72]
        mov     QWORD PTR [r10+64], rax
        sbb     r8, QWORD PTR [rcx+72]
        mov     rax, QWORD PTR [r10+80]
        mov     QWORD PTR [r10+72], r8
        sbb     rax, QWORD PTR [rcx+80]
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [r10+80], rax
        sbb     r8, QWORD PTR [rcx+88]
        mov     rax, QWORD PTR [r10+96]
        mov     QWORD PTR [r10+88], r8
        sbb     rax, QWORD PTR [rcx+96]
        mov     r8, QWORD PTR [r10+104]
        mov     QWORD PTR [r10+96], rax
        sbb     r8, QWORD PTR [rcx+104]
        mov     rax, QWORD PTR [r10+112]
        mov     QWORD PTR [r10+104], r8
        sbb     rax, QWORD PTR [rcx+112]
        mov     r8, QWORD PTR [r10+120]
        mov     QWORD PTR [r10+112], rax
        sbb     r8, QWORD PTR [rcx+120]
        mov     rax, QWORD PTR [r10+128]
        mov     QWORD PTR [r10+120], r8
        sbb     rax, QWORD PTR [rcx+128]
        mov     r8, QWORD PTR [r10+136]
        mov     QWORD PTR [r10+128], rax
        sbb     r8, QWORD PTR [rcx+136]
        mov     rax, QWORD PTR [r10+144]
        mov     QWORD PTR [r10+136], r8
        sbb     rax, QWORD PTR [rcx+144]
        mov     r8, QWORD PTR [r10+152]
        mov     QWORD PTR [r10+144], rax
        sbb     r8, QWORD PTR [rcx+152]
        mov     rax, QWORD PTR [r10+160]
        mov     QWORD PTR [r10+152], r8
        sbb     rax, QWORD PTR [rcx+160]
        mov     r8, QWORD PTR [r10+168]
        mov     QWORD PTR [r10+160], rax
        sbb     r8, QWORD PTR [rcx+168]
        mov     rax, QWORD PTR [r10+176]
        mov     QWORD PTR [r10+168], r8
        sbb     rax, QWORD PTR [rcx+176]
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rcx+184]
        mov     QWORD PTR [r10+184], r8
        sbb     r9, 0

        sub     r11, 96                 ; r11 = r + 96
        ; Add in place: r[96 .. 287] += [rsp+0 .. 191]; carry goes into r9.
        mov     rax, QWORD PTR [r11]
        add     rax, QWORD PTR [r10]
        mov     r8, QWORD PTR [r11+8]
        mov     QWORD PTR [r11], rax
        adc     r8, QWORD PTR [r10+8]
        mov     rax, QWORD PTR [r11+16]
        mov     QWORD PTR [r11+8], r8
        adc     rax, QWORD PTR [r10+16]
        mov     r8, QWORD PTR [r11+24]
        mov     QWORD PTR [r11+16], rax
        adc     r8, QWORD PTR [r10+24]
        mov     rax, QWORD PTR [r11+32]
        mov     QWORD PTR [r11+24], r8
        adc     rax, QWORD PTR [r10+32]
        mov     r8, QWORD PTR [r11+40]
        mov     QWORD PTR [r11+32], rax
        adc     r8, QWORD PTR [r10+40]
        mov     rax, QWORD PTR [r11+48]
        mov     QWORD PTR [r11+40], r8
        adc     rax, QWORD PTR [r10+48]
        mov     r8, QWORD PTR [r11+56]
        mov     QWORD PTR [r11+48], rax
        adc     r8, QWORD PTR [r10+56]
        mov     rax, QWORD PTR [r11+64]
        mov     QWORD PTR [r11+56], r8
        adc     rax, QWORD PTR [r10+64]
        mov     r8, QWORD PTR [r11+72]
        mov     QWORD PTR [r11+64], rax
        adc     r8, QWORD PTR [r10+72]
        mov     rax, QWORD PTR [r11+80]
        mov     QWORD PTR [r11+72], r8
        adc     rax, QWORD PTR [r10+80]
        mov     r8, QWORD PTR [r11+88]
        mov     QWORD PTR [r11+80], rax
        adc     r8, QWORD PTR [r10+88]
        mov     rax, QWORD PTR [r11+96]
        mov     QWORD PTR [r11+88], r8
        adc     rax, QWORD PTR [r10+96]
        mov     r8, QWORD PTR [r11+104]
        mov     QWORD PTR [r11+96], rax
        adc     r8, QWORD PTR [r10+104]
        mov     rax, QWORD PTR [r11+112]
        mov     QWORD PTR [r11+104], r8
        adc     rax, QWORD PTR [r10+112]
        mov     r8, QWORD PTR [r11+120]
        mov     QWORD PTR [r11+112], rax
        adc     r8, QWORD PTR [r10+120]
        mov     rax, QWORD PTR [r11+128]
        mov     QWORD PTR [r11+120], r8
        adc     rax, QWORD PTR [r10+128]
        mov     r8, QWORD PTR [r11+136]
        mov     QWORD PTR [r11+128], rax
        adc     r8, QWORD PTR [r10+136]
        mov     rax, QWORD PTR [r11+144]
        mov     QWORD PTR [r11+136], r8
        adc     rax, QWORD PTR [r10+144]
        mov     r8, QWORD PTR [r11+152]
        mov     QWORD PTR [r11+144], rax
        adc     r8, QWORD PTR [r10+152]
        mov     rax, QWORD PTR [r11+160]
        mov     QWORD PTR [r11+152], r8
        adc     rax, QWORD PTR [r10+160]
        mov     r8, QWORD PTR [r11+168]
        mov     QWORD PTR [r11+160], rax
        adc     r8, QWORD PTR [r10+168]
        mov     rax, QWORD PTR [r11+176]
        mov     QWORD PTR [r11+168], r8
        adc     rax, QWORD PTR [r10+176]
        mov     r8, QWORD PTR [r11+184]
        mov     QWORD PTR [r11+176], rax
        adc     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r11+184], r8
        adc     r9, 0
        mov     QWORD PTR [rcx+288], r9 ; top word of z1 lands at r + 288

        ; Add in place: r[192 .. 295] += z2[0 .. 103] (13 words)
        mov     rax, QWORD PTR [r11+96]
        add     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [r11+104]
        mov     QWORD PTR [r11+96], rax
        adc     r8, QWORD PTR [rdx+8]
        mov     rax, QWORD PTR [r11+112]
        mov     QWORD PTR [r11+104], r8
        adc     rax, QWORD PTR [rdx+16]
        mov     r8, QWORD PTR [r11+120]
        mov     QWORD PTR [r11+112], rax
        adc     r8, QWORD PTR [rdx+24]
        mov     rax, QWORD PTR [r11+128]
        mov     QWORD PTR [r11+120], r8
        adc     rax, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [r11+136]
        mov     QWORD PTR [r11+128], rax
        adc     r8, QWORD PTR [rdx+40]
        mov     rax, QWORD PTR [r11+144]
        mov     QWORD PTR [r11+136], r8
        adc     rax, QWORD PTR [rdx+48]
        mov     r8, QWORD PTR [r11+152]
        mov     QWORD PTR [r11+144], rax
        adc     r8, QWORD PTR [rdx+56]
        mov     rax, QWORD PTR [r11+160]
        mov     QWORD PTR [r11+152], r8
        adc     rax, QWORD PTR [rdx+64]
        mov     r8, QWORD PTR [r11+168]
        mov     QWORD PTR [r11+160], rax
        adc     r8, QWORD PTR [rdx+72]
        mov     rax, QWORD PTR [r11+176]
        mov     QWORD PTR [r11+168], r8
        adc     rax, QWORD PTR [rdx+80]
        mov     r8, QWORD PTR [r11+184]
        mov     QWORD PTR [r11+176], rax
        adc     r8, QWORD PTR [rdx+88]
        mov     rax, QWORD PTR [r11+192]
        mov     QWORD PTR [r11+184], r8
        adc     rax, QWORD PTR [rdx+96]
        mov     QWORD PTR [r11+192], rax
        ; Add to zero: r[296 .. 383] = z2[104 .. 191] + carry. These result
        ; words have not been written before, so they are stored, not added.
        mov     rax, QWORD PTR [rdx+104]
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+112]
        mov     QWORD PTR [r11+200], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+120]
        mov     QWORD PTR [r11+208], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+128]
        mov     QWORD PTR [r11+216], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+136]
        mov     QWORD PTR [r11+224], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+144]
        mov     QWORD PTR [r11+232], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+152]
        mov     QWORD PTR [r11+240], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+160]
        mov     QWORD PTR [r11+248], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+168]
        mov     QWORD PTR [r11+256], r8
        adc     rax, 0
        mov     r8, QWORD PTR [rdx+176]
        mov     QWORD PTR [r11+264], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+184]
        mov     QWORD PTR [r11+272], r8
        adc     rax, 0
        mov     QWORD PTR [r11+280], rax

        add     rsp, 504
        pop     r12
        ret
sp_3072_sqr_24 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
; /* Multiply a and b into r. (r = a * b)
;  *
;  * a and b are handled as two 12-word (96-byte) halves each,
;  * a = a0 + a1 * 2^768 and b = b0 + b1 * 2^768, and the result is assembled
;  * from three calls to sp_3072_mul_avx2_12:
;  *   z0 = a0 * b0
;  *   z2 = a1 * b1
;  *   z1 = (a0 + a1) * (b0 + b1) - z2 - z0
;  *   r  = z0 + z1 * 2^768 + z2 * 2^1536
;  * Each half-sum can be 12 words plus a carry bit (ca, cb); only the low
;  * 12 words are multiplied by the callee and the terms involving ca and cb
;  * are added here without branching, using PEXT with an all-zero or
;  * all-one mask as a select.
;  *
;  * r  A single precision integer.
;  *    Passed in rcx. Bytes r+96 .. r+383 are written by this routine; r
;  *    itself is handed to sp_3072_mul_avx2_12 as the destination for z0.
;  * a  A single precision integer.
;  *    Passed in rdx; 24 words (192 bytes) are read.
;  * b  A single precision integer.
;  *    Passed in r8; 24 words (192 bytes) are read.
;  *
;  * r12-r15, rdi and rsi are saved and restored. rax, rcx, rdx, r8, r9, r10,
;  * r11 and the flags are overwritten, in addition to whatever
;  * sp_3072_mul_avx2_12 changes.
;  *
;  * Stack frame (616 bytes below the six saved registers):
;  *   [rsp+0   .. rsp+191]  product of the low half-sum words, then z1
;  *   [rsp+192 .. rsp+383]  z2
;  *   [rsp+384 .. rsp+479]  low 12 words of a0 + a1
;  *   [rsp+480 .. rsp+575]  low 12 words of b0 + b1
;  *   [rsp+576]             saved r
;  *   [rsp+584]             saved a
;  *   [rsp+592]             saved b
;  *   [rsp+600]             ca, carry out of a0 + a1 (0 or 1)
;  *   [rsp+608]             cb, carry out of b0 + b1 (0 or 1)
;  */
_text SEGMENT READONLY PARA
sp_3072_mul_avx2_24 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        sub     rsp, 616
        mov     QWORD PTR [rsp+576], rcx
        mov     QWORD PTR [rsp+584], rdx
        mov     QWORD PTR [rsp+592], r8
        lea     r12, QWORD PTR [rsp+384]
        lea     r14, QWORD PTR [rdx+96]
        ; Add: [r12] = a0 + a1 (low 12 words), r15 = carry out.
        ; r15 is cleared after the first load and before the ADD because XOR
        ; rewrites CF.
        mov     rax, QWORD PTR [rdx]
        xor     r15, r15
        add     rax, QWORD PTR [r14]
        mov     r9, QWORD PTR [rdx+8]
        mov     QWORD PTR [r12], rax
        adc     r9, QWORD PTR [r14+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     QWORD PTR [r12+8], r9
        adc     r10, QWORD PTR [r14+16]
        mov     rax, QWORD PTR [rdx+24]
        mov     QWORD PTR [r12+16], r10
        adc     rax, QWORD PTR [r14+24]
        mov     r9, QWORD PTR [rdx+32]
        mov     QWORD PTR [r12+24], rax
        adc     r9, QWORD PTR [r14+32]
        mov     r10, QWORD PTR [rdx+40]
        mov     QWORD PTR [r12+32], r9
        adc     r10, QWORD PTR [r14+40]
        mov     rax, QWORD PTR [rdx+48]
        mov     QWORD PTR [r12+40], r10
        adc     rax, QWORD PTR [r14+48]
        mov     r9, QWORD PTR [rdx+56]
        mov     QWORD PTR [r12+48], rax
        adc     r9, QWORD PTR [r14+56]
        mov     r10, QWORD PTR [rdx+64]
        mov     QWORD PTR [r12+56], r9
        adc     r10, QWORD PTR [r14+64]
        mov     rax, QWORD PTR [rdx+72]
        mov     QWORD PTR [r12+64], r10
        adc     rax, QWORD PTR [r14+72]
        mov     r9, QWORD PTR [rdx+80]
        mov     QWORD PTR [r12+72], rax
        adc     r9, QWORD PTR [r14+80]
        mov     r10, QWORD PTR [rdx+88]
        mov     QWORD PTR [r12+80], r9
        adc     r10, QWORD PTR [r14+88]
        mov     QWORD PTR [r12+88], r10
        adc     r15, 0
        mov     QWORD PTR [rsp+600], r15
        lea     r13, QWORD PTR [rsp+480]
        lea     r14, QWORD PTR [r8+96]
        ; Add: [r13] = b0 + b1 (low 12 words), rdi = carry out.
        mov     rax, QWORD PTR [r8]
        xor     rdi, rdi
        add     rax, QWORD PTR [r14]
        mov     r9, QWORD PTR [r8+8]
        mov     QWORD PTR [r13], rax
        adc     r9, QWORD PTR [r14+8]
        mov     r10, QWORD PTR [r8+16]
        mov     QWORD PTR [r13+8], r9
        adc     r10, QWORD PTR [r14+16]
        mov     rax, QWORD PTR [r8+24]
        mov     QWORD PTR [r13+16], r10
        adc     rax, QWORD PTR [r14+24]
        mov     r9, QWORD PTR [r8+32]
        mov     QWORD PTR [r13+24], rax
        adc     r9, QWORD PTR [r14+32]
        mov     r10, QWORD PTR [r8+40]
        mov     QWORD PTR [r13+32], r9
        adc     r10, QWORD PTR [r14+40]
        mov     rax, QWORD PTR [r8+48]
        mov     QWORD PTR [r13+40], r10
        adc     rax, QWORD PTR [r14+48]
        mov     r9, QWORD PTR [r8+56]
        mov     QWORD PTR [r13+48], rax
        adc     r9, QWORD PTR [r14+56]
        mov     r10, QWORD PTR [r8+64]
        mov     QWORD PTR [r13+56], r9
        adc     r10, QWORD PTR [r14+64]
        mov     rax, QWORD PTR [r8+72]
        mov     QWORD PTR [r13+64], r10
        adc     rax, QWORD PTR [r14+72]
        mov     r9, QWORD PTR [r8+80]
        mov     QWORD PTR [r13+72], rax
        adc     r9, QWORD PTR [r14+80]
        mov     r10, QWORD PTR [r8+88]
        mov     QWORD PTR [r13+80], r9
        adc     r10, QWORD PTR [r14+88]
        mov     QWORD PTR [r13+88], r10
        adc     rdi, 0
        mov     QWORD PTR [rsp+608], rdi

        ; [rsp+0 ..] = (low words of a0 + a1) * (low words of b0 + b1)
        mov     r8, r13
        mov     rdx, r12
        mov     rcx, rsp
        call    sp_3072_mul_avx2_12
        ; [rsp+192 ..] = z2 = a1 * b1
        mov     r8, QWORD PTR [rsp+592]
        mov     rdx, QWORD PTR [rsp+584]
        lea     rcx, QWORD PTR [rsp+192]
        add     r8, 96
        add     rdx, 96
        call    sp_3072_mul_avx2_12
        ; r[0 ..] = z0 = a0 * b0
        mov     r8, QWORD PTR [rsp+592]
        mov     rdx, QWORD PTR [rsp+584]
        mov     rcx, QWORD PTR [rsp+576]
        call    sp_3072_mul_avx2_12

        ; rcx is read below (z0 operand and the store to r+288). It is a
        ; volatile register across calls in the Microsoft x64 convention, so
        ; r is reloaded unconditionally instead of relying on the callee to
        ; leave it intact. rdx and r8 are not read again in this routine and
        ; need no reload.
        mov     rcx, QWORD PTR [rsp+576]
        mov     r15, QWORD PTR [rsp+600]
        mov     rdi, QWORD PTR [rsp+608]
        mov     rsi, QWORD PTR [rsp+576]
        mov     r11, r15
        lea     r12, QWORD PTR [rsp+384]
        lea     r13, QWORD PTR [rsp+480]
        and     r11, rdi                ; r11 = ca AND cb, running top word of z1
        neg     r15                     ; r15 = 0 or all ones, mask from ca
        neg     rdi                     ; rdi = 0 or all ones, mask from cb
        add     rsi, 192                ; rsi = r + 192

        ; r[192 .. 287] = (a-sum selected by cb) + (b-sum selected by ca);
        ; the carry out is added to r11. PEXT does not modify the flags, so
        ; the ADD/ADC chain runs across the interleaved selects.
        mov     rax, QWORD PTR [r12]
        mov     r9, QWORD PTR [r13]
        pext    rax, rax, rdi
        pext    r9, r9, r15
        add     rax, r9
        mov     r9, QWORD PTR [r12+8]
        mov     r10, QWORD PTR [r13+8]
        pext    r9, r9, rdi
        pext    r10, r10, r15
        mov     QWORD PTR [rsi], rax
        adc     r9, r10
        mov     r10, QWORD PTR [r12+16]
        mov     rax, QWORD PTR [r13+16]
        pext    r10, r10, rdi
        pext    rax, rax, r15
        mov     QWORD PTR [rsi+8], r9
        adc     r10, rax
        mov     rax, QWORD PTR [r12+24]
        mov     r9, QWORD PTR [r13+24]
        pext    rax, rax, rdi
        pext    r9, r9, r15
        mov     QWORD PTR [rsi+16], r10
        adc     rax, r9
        mov     r9, QWORD PTR [r12+32]
        mov     r10, QWORD PTR [r13+32]
        pext    r9, r9, rdi
        pext    r10, r10, r15
        mov     QWORD PTR [rsi+24], rax
        adc     r9, r10
        mov     r10, QWORD PTR [r12+40]
        mov     rax, QWORD PTR [r13+40]
        pext    r10, r10, rdi
        pext    rax, rax, r15
        mov     QWORD PTR [rsi+32], r9
        adc     r10, rax
        mov     rax, QWORD PTR [r12+48]
        mov     r9, QWORD PTR [r13+48]
        pext    rax, rax, rdi
        pext    r9, r9, r15
        mov     QWORD PTR [rsi+40], r10
        adc     rax, r9
        mov     r9, QWORD PTR [r12+56]
        mov     r10, QWORD PTR [r13+56]
        pext    r9, r9, rdi
        pext    r10, r10, r15
        mov     QWORD PTR [rsi+48], rax
        adc     r9, r10
        mov     r10, QWORD PTR [r12+64]
        mov     rax, QWORD PTR [r13+64]
        pext    r10, r10, rdi
        pext    rax, rax, r15
        mov     QWORD PTR [rsi+56], r9
        adc     r10, rax
        mov     rax, QWORD PTR [r12+72]
        mov     r9, QWORD PTR [r13+72]
        pext    rax, rax, rdi
        pext    r9, r9, r15
        mov     QWORD PTR [rsi+64], r10
        adc     rax, r9
        mov     r9, QWORD PTR [r12+80]
        mov     r10, QWORD PTR [r13+80]
        pext    r9, r9, rdi
        pext    r10, r10, r15
        mov     QWORD PTR [rsi+72], rax
        adc     r9, r10
        mov     r10, QWORD PTR [r12+88]
        mov     rax, QWORD PTR [r13+88]
        pext    r10, r10, rdi
        pext    rax, rax, r15
        mov     QWORD PTR [rsi+80], r9
        adc     r10, rax
        mov     QWORD PTR [rsi+88], r10
        adc     r11, 0

        lea     r13, QWORD PTR [rsp+192]
        mov     r12, rsp
        ; [rsp+0 .. 191] -= z2 (24 words); the borrow comes out of r11.
        mov     rax, QWORD PTR [r12]
        sub     rax, QWORD PTR [r13]
        mov     r9, QWORD PTR [r12+8]
        mov     QWORD PTR [r12], rax
        sbb     r9, QWORD PTR [r13+8]
        mov     r10, QWORD PTR [r12+16]
        mov     QWORD PTR [r12+8], r9
        sbb     r10, QWORD PTR [r13+16]
        mov     rax, QWORD PTR [r12+24]
        mov     QWORD PTR [r12+16], r10
        sbb     rax, QWORD PTR [r13+24]
        mov     r9, QWORD PTR [r12+32]
        mov     QWORD PTR [r12+24], rax
        sbb     r9, QWORD PTR [r13+32]
        mov     r10, QWORD PTR [r12+40]
        mov     QWORD PTR [r12+32], r9
        sbb     r10, QWORD PTR [r13+40]
        mov     rax, QWORD PTR [r12+48]
        mov     QWORD PTR [r12+40], r10
        sbb     rax, QWORD PTR [r13+48]
        mov     r9, QWORD PTR [r12+56]
        mov     QWORD PTR [r12+48], rax
        sbb     r9, QWORD PTR [r13+56]
        mov     r10, QWORD PTR [r12+64]
        mov     QWORD PTR [r12+56], r9
        sbb     r10, QWORD PTR [r13+64]
        mov     rax, QWORD PTR [r12+72]
        mov     QWORD PTR [r12+64], r10
        sbb     rax, QWORD PTR [r13+72]
        mov     r9, QWORD PTR [r12+80]
        mov     QWORD PTR [r12+72], rax
        sbb     r9, QWORD PTR [r13+80]
        mov     r10, QWORD PTR [r12+88]
        mov     QWORD PTR [r12+80], r9
        sbb     r10, QWORD PTR [r13+88]
        mov     rax, QWORD PTR [r12+96]
        mov     QWORD PTR [r12+88], r10
        sbb     rax, QWORD PTR [r13+96]
        mov     r9, QWORD PTR [r12+104]
        mov     QWORD PTR [r12+96], rax
        sbb     r9, QWORD PTR [r13+104]
        mov     r10, QWORD PTR [r12+112]
        mov     QWORD PTR [r12+104], r9
        sbb     r10, QWORD PTR [r13+112]
        mov     rax, QWORD PTR [r12+120]
        mov     QWORD PTR [r12+112], r10
        sbb     rax, QWORD PTR [r13+120]
        mov     r9, QWORD PTR [r12+128]
        mov     QWORD PTR [r12+120], rax
        sbb     r9, QWORD PTR [r13+128]
        mov     r10, QWORD PTR [r12+136]
        mov     QWORD PTR [r12+128], r9
        sbb     r10, QWORD PTR [r13+136]
        mov     rax, QWORD PTR [r12+144]
        mov     QWORD PTR [r12+136], r10
        sbb     rax, QWORD PTR [r13+144]
        mov     r9, QWORD PTR [r12+152]
        mov     QWORD PTR [r12+144], rax
        sbb     r9, QWORD PTR [r13+152]
        mov     r10, QWORD PTR [r12+160]
        mov     QWORD PTR [r12+152], r9
        sbb     r10, QWORD PTR [r13+160]
        mov     rax, QWORD PTR [r12+168]
        mov     QWORD PTR [r12+160], r10
        sbb     rax, QWORD PTR [r13+168]
        mov     r9, QWORD PTR [r12+176]
        mov     QWORD PTR [r12+168], rax
        sbb     r9, QWORD PTR [r13+176]
        mov     r10, QWORD PTR [r12+184]
        mov     QWORD PTR [r12+176], r9
        sbb     r10, QWORD PTR [r13+184]
        mov     QWORD PTR [r12+184], r10
        sbb     r11, 0

        ; [rsp+0 .. 191] -= z0 (r[0 .. 191]); the borrow comes out of r11.
        mov     rax, QWORD PTR [r12]
        sub     rax, QWORD PTR [rcx]
        mov     r9, QWORD PTR [r12+8]
        mov     QWORD PTR [r12], rax
        sbb     r9, QWORD PTR [rcx+8]
        mov     r10, QWORD PTR [r12+16]
        mov     QWORD PTR [r12+8], r9
        sbb     r10, QWORD PTR [rcx+16]
        mov     rax, QWORD PTR [r12+24]
        mov     QWORD PTR [r12+16], r10
        sbb     rax, QWORD PTR [rcx+24]
        mov     r9, QWORD PTR [r12+32]
        mov     QWORD PTR [r12+24], rax
        sbb     r9, QWORD PTR [rcx+32]
        mov     r10, QWORD PTR [r12+40]
        mov     QWORD PTR [r12+32], r9
        sbb     r10, QWORD PTR [rcx+40]
        mov     rax, QWORD PTR [r12+48]
        mov     QWORD PTR [r12+40], r10
        sbb     rax, QWORD PTR [rcx+48]
        mov     r9, QWORD PTR [r12+56]
        mov     QWORD PTR [r12+48], rax
        sbb     r9, QWORD PTR [rcx+56]
        mov     r10, QWORD PTR [r12+64]
        mov     QWORD PTR [r12+56], r9
        sbb     r10, QWORD PTR [rcx+64]
        mov     rax, QWORD PTR [r12+72]
        mov     QWORD PTR [r12+64], r10
        sbb     rax, QWORD PTR [rcx+72]
        mov     r9, QWORD PTR [r12+80]
        mov     QWORD PTR [r12+72], rax
        sbb     r9, QWORD PTR [rcx+80]
        mov     r10, QWORD PTR [r12+88]
        mov     QWORD PTR [r12+80], r9
        sbb     r10, QWORD PTR [rcx+88]
        mov     rax, QWORD PTR [r12+96]
        mov     QWORD PTR [r12+88], r10
        sbb     rax, QWORD PTR [rcx+96]
        mov     r9, QWORD PTR [r12+104]
        mov     QWORD PTR [r12+96], rax
        sbb     r9, QWORD PTR [rcx+104]
        mov     r10, QWORD PTR [r12+112]
        mov     QWORD PTR [r12+104], r9
        sbb     r10, QWORD PTR [rcx+112]
        mov     rax, QWORD PTR [r12+120]
        mov     QWORD PTR [r12+112], r10
        sbb     rax, QWORD PTR [rcx+120]
        mov     r9, QWORD PTR [r12+128]
        mov     QWORD PTR [r12+120], rax
        sbb     r9, QWORD PTR [rcx+128]
        mov     r10, QWORD PTR [r12+136]
        mov     QWORD PTR [r12+128], r9
        sbb     r10, QWORD PTR [rcx+136]
        mov     rax, QWORD PTR [r12+144]
        mov     QWORD PTR [r12+136], r10
        sbb     rax, QWORD PTR [rcx+144]
        mov     r9, QWORD PTR [r12+152]
        mov     QWORD PTR [r12+144], rax
        sbb     r9, QWORD PTR [rcx+152]
        mov     r10, QWORD PTR [r12+160]
        mov     QWORD PTR [r12+152], r9
        sbb     r10, QWORD PTR [rcx+160]
        mov     rax, QWORD PTR [r12+168]
        mov     QWORD PTR [r12+160], r10
        sbb     rax, QWORD PTR [rcx+168]
        mov     r9, QWORD PTR [r12+176]
        mov     QWORD PTR [r12+168], rax
        sbb     r9, QWORD PTR [rcx+176]
        mov     r10, QWORD PTR [r12+184]
        mov     QWORD PTR [r12+176], r9
        sbb     r10, QWORD PTR [rcx+184]
        mov     QWORD PTR [r12+184], r10
        sbb     r11, 0

        sub     rsi, 96                 ; rsi = r + 96
        ; Add: r[96 .. 287] += [rsp+0 .. 191]; carry goes into r11.
        mov     rax, QWORD PTR [rsi]
        add     rax, QWORD PTR [r12]
        mov     r9, QWORD PTR [rsi+8]
        mov     QWORD PTR [rsi], rax
        adc     r9, QWORD PTR [r12+8]
        mov     r10, QWORD PTR [rsi+16]
        mov     QWORD PTR [rsi+8], r9
        adc     r10, QWORD PTR [r12+16]
        mov     rax, QWORD PTR [rsi+24]
        mov     QWORD PTR [rsi+16], r10
        adc     rax, QWORD PTR [r12+24]
        mov     r9, QWORD PTR [rsi+32]
        mov     QWORD PTR [rsi+24], rax
        adc     r9, QWORD PTR [r12+32]
        mov     r10, QWORD PTR [rsi+40]
        mov     QWORD PTR [rsi+32], r9
        adc     r10, QWORD PTR [r12+40]
        mov     rax, QWORD PTR [rsi+48]
        mov     QWORD PTR [rsi+40], r10
        adc     rax, QWORD PTR [r12+48]
        mov     r9, QWORD PTR [rsi+56]
        mov     QWORD PTR [rsi+48], rax
        adc     r9, QWORD PTR [r12+56]
        mov     r10, QWORD PTR [rsi+64]
        mov     QWORD PTR [rsi+56], r9
        adc     r10, QWORD PTR [r12+64]
        mov     rax, QWORD PTR [rsi+72]
        mov     QWORD PTR [rsi+64], r10
        adc     rax, QWORD PTR [r12+72]
        mov     r9, QWORD PTR [rsi+80]
        mov     QWORD PTR [rsi+72], rax
        adc     r9, QWORD PTR [r12+80]
        mov     r10, QWORD PTR [rsi+88]
        mov     QWORD PTR [rsi+80], r9
        adc     r10, QWORD PTR [r12+88]
        mov     rax, QWORD PTR [rsi+96]
        mov     QWORD PTR [rsi+88], r10
        adc     rax, QWORD PTR [r12+96]
        mov     r9, QWORD PTR [rsi+104]
        mov     QWORD PTR [rsi+96], rax
        adc     r9, QWORD PTR [r12+104]
        mov     r10, QWORD PTR [rsi+112]
        mov     QWORD PTR [rsi+104], r9
        adc     r10, QWORD PTR [r12+112]
        mov     rax, QWORD PTR [rsi+120]
        mov     QWORD PTR [rsi+112], r10
        adc     rax, QWORD PTR [r12+120]
        mov     r9, QWORD PTR [rsi+128]
        mov     QWORD PTR [rsi+120], rax
        adc     r9, QWORD PTR [r12+128]
        mov     r10, QWORD PTR [rsi+136]
        mov     QWORD PTR [rsi+128], r9
        adc     r10, QWORD PTR [r12+136]
        mov     rax, QWORD PTR [rsi+144]
        mov     QWORD PTR [rsi+136], r10
        adc     rax, QWORD PTR [r12+144]
        mov     r9, QWORD PTR [rsi+152]
        mov     QWORD PTR [rsi+144], rax
        adc     r9, QWORD PTR [r12+152]
        mov     r10, QWORD PTR [rsi+160]
        mov     QWORD PTR [rsi+152], r9
        adc     r10, QWORD PTR [r12+160]
        mov     rax, QWORD PTR [rsi+168]
        mov     QWORD PTR [rsi+160], r10
        adc     rax, QWORD PTR [r12+168]
        mov     r9, QWORD PTR [rsi+176]
        mov     QWORD PTR [rsi+168], rax
        adc     r9, QWORD PTR [r12+176]
        mov     r10, QWORD PTR [rsi+184]
        mov     QWORD PTR [rsi+176], r9
        adc     r10, QWORD PTR [r12+184]
        mov     QWORD PTR [rsi+184], r10
        adc     r11, 0
        mov     QWORD PTR [rcx+288], r11        ; top word of z1 lands at r + 288

        add     rsi, 96                 ; rsi = r + 192
        ; Add: r[192 .. 295] += z2[0 .. 103] (13 words)
        mov     rax, QWORD PTR [rsi]
        add     rax, QWORD PTR [r13]
        mov     r9, QWORD PTR [rsi+8]
        mov     QWORD PTR [rsi], rax
        adc     r9, QWORD PTR [r13+8]
        mov     r10, QWORD PTR [rsi+16]
        mov     QWORD PTR [rsi+8], r9
        adc     r10, QWORD PTR [r13+16]
        mov     rax, QWORD PTR [rsi+24]
        mov     QWORD PTR [rsi+16], r10
        adc     rax, QWORD PTR [r13+24]
        mov     r9, QWORD PTR [rsi+32]
        mov     QWORD PTR [rsi+24], rax
        adc     r9, QWORD PTR [r13+32]
        mov     r10, QWORD PTR [rsi+40]
        mov     QWORD PTR [rsi+32], r9
        adc     r10, QWORD PTR [r13+40]
        mov     rax, QWORD PTR [rsi+48]
        mov     QWORD PTR [rsi+40], r10
        adc     rax, QWORD PTR [r13+48]
        mov     r9, QWORD PTR [rsi+56]
        mov     QWORD PTR [rsi+48], rax
        adc     r9, QWORD PTR [r13+56]
        mov     r10, QWORD PTR [rsi+64]
        mov     QWORD PTR [rsi+56], r9
        adc     r10, QWORD PTR [r13+64]
        mov     rax, QWORD PTR [rsi+72]
        mov     QWORD PTR [rsi+64], r10
        adc     rax, QWORD PTR [r13+72]
        mov     r9, QWORD PTR [rsi+80]
        mov     QWORD PTR [rsi+72], rax
        adc     r9, QWORD PTR [r13+80]
        mov     r10, QWORD PTR [rsi+88]
        mov     QWORD PTR [rsi+80], r9
        adc     r10, QWORD PTR [r13+88]
        mov     rax, QWORD PTR [rsi+96]
        mov     QWORD PTR [rsi+88], r10
        adc     rax, QWORD PTR [r13+96]
        mov     QWORD PTR [rsi+96], rax
        ; Add to zero: r[296 .. 383] = z2[104 .. 191] + carry. These result
        ; words have not been written before, so they are stored, not added.
        mov     rax, QWORD PTR [r13+104]
        adc     rax, 0
        mov     r9, QWORD PTR [r13+112]
        mov     QWORD PTR [rsi+104], rax
        adc     r9, 0
        mov     r10, QWORD PTR [r13+120]
        mov     QWORD PTR [rsi+112], r9
        adc     r10, 0
        mov     rax, QWORD PTR [r13+128]
        mov     QWORD PTR [rsi+120], r10
        adc     rax, 0
        mov     r9, QWORD PTR [r13+136]
        mov     QWORD PTR [rsi+128], rax
        adc     r9, 0
        mov     r10, QWORD PTR [r13+144]
        mov     QWORD PTR [rsi+136], r9
        adc     r10, 0
        mov     rax, QWORD PTR [r13+152]
        mov     QWORD PTR [rsi+144], r10
        adc     rax, 0
        mov     r9, QWORD PTR [r13+160]
        mov     QWORD PTR [rsi+152], rax
        adc     r9, 0
        mov     r10, QWORD PTR [r13+168]
        mov     QWORD PTR [rsi+160], r9
        adc     r10, 0
        mov     rax, QWORD PTR [r13+176]
        mov     QWORD PTR [rsi+168], r10
        adc     rax, 0
        mov     r9, QWORD PTR [r13+184]
        mov     QWORD PTR [rsi+176], rax
        adc     r9, 0
        mov     QWORD PTR [rsi+184], r9

        add     rsp, 616
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_3072_mul_avx2_24 ENDP
_text ENDS
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
; /* Square a and put result in r. (r = a * a)
;  *
;  * a is read as two 12-qword halves, lo at a and hi at a+96. Three calls to
;  * sp_3072_sqr_avx2_12 (defined elsewhere; assumed to store the 24-qword
;  * square of the 12 qwords at rdx into rcx - TODO confirm) are combined as
;  *   lo^2 + ((lo + hi)^2 - lo^2 - hi^2) * 2^768 + hi^2 * 2^1536
;  *
;  * rcx  r  A single precision integer (result).
;  * rdx  a  A single precision integer (operand).
;  *
;  * Stack frame (504 bytes):
;  *   [rsp+0   .. rsp+191]  square of the low 12 qwords of (lo + hi)
;  *   [rsp+192 .. rsp+383]  hi^2
;  *   [rsp+384 .. rsp+479]  low 12 qwords of (lo + hi)
;  *   [rsp+480]             saved r
;  *   [rsp+488]             saved a
;  *   [rsp+496]             carry out of (lo + hi), 0 or 1
;  *
;  * Scratch: rax, r8, r9, r10, r11 and flags. r12 is saved and restored.
;  * The unrolled limb runs are emitted with REPT; sqr24_off is an
;  * assembly-time byte offset and produces no instructions itself.
;  */
_text SEGMENT READONLY PARA
sp_3072_sqr_avx2_24 PROC
        push    r12
        sub     rsp, 504
        mov     QWORD PTR [rsp+480], rcx
        mov     QWORD PTR [rsp+488], rdx
        lea     r10, QWORD PTR [rsp+384]
        lea     r11, QWORD PTR [rdx+96]
        ; [rsp+384] = lo + hi (12 limbs); r9 picks up the carry out.
        ; Each store trails the next load by one limb; mov leaves CF intact.
        mov     rax, QWORD PTR [rdx]
        xor     r9, r9
        add     rax, QWORD PTR [r11]
sqr24_off = 8
        REPT    5
        mov     r8, QWORD PTR [rdx+sqr24_off]
        mov     QWORD PTR [r10+sqr24_off-8], rax
        adc     r8, QWORD PTR [r11+sqr24_off]
        mov     rax, QWORD PTR [rdx+sqr24_off+8]
        mov     QWORD PTR [r10+sqr24_off], r8
        adc     rax, QWORD PTR [r11+sqr24_off+8]
sqr24_off = sqr24_off + 16
        ENDM
        mov     r8, QWORD PTR [rdx+88]
        mov     QWORD PTR [r10+80], rax
        adc     r8, QWORD PTR [r11+88]
        mov     QWORD PTR [r10+88], r8
        adc     r9, 0
        mov     QWORD PTR [rsp+496], r9
        ; [rsp+0] = (low 12 limbs of lo + hi)^2
        mov     rdx, r10
        mov     rcx, rsp
        call    sp_3072_sqr_avx2_12
        ; [rsp+192] = hi^2
        mov     rdx, QWORD PTR [rsp+488]
        lea     rcx, QWORD PTR [rsp+192]
        add     rdx, 96
        call    sp_3072_sqr_avx2_12
        ; r[0..23] = lo^2
        mov     rdx, QWORD PTR [rsp+488]
        mov     rcx, QWORD PTR [rsp+480]
        call    sp_3072_sqr_avx2_12
IFDEF _WIN64
        ; rcx/rdx are volatile across the call; reload the saved pointers.
        mov     rdx, QWORD PTR [rsp+488]
        mov     rcx, QWORD PTR [rsp+480]
ENDIF
        ; With c = carry of (lo + hi) and s = its low 12 limbs:
        ;   (c*2^768 + s)^2 = s^2 + 2*c*s*2^768 + c*2^1536
        ; r12 = -c is an all-ones/zero mask, so pext acts as a flag-preserving
        ; AND. r+192 receives 2*(s AND mask); r9 = c + the bit shifted out.
        mov     r12, QWORD PTR [rsp+496]
        mov     r11, rcx
        lea     r10, QWORD PTR [rsp+384]
        mov     r9, r12
        neg     r12
        add     r11, 192
        mov     rax, QWORD PTR [r10]
        pext    rax, rax, r12
        add     rax, rax
sqr24_off = 8
        REPT    5
        mov     r8, QWORD PTR [r10+sqr24_off]
        mov     QWORD PTR [r11+sqr24_off-8], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     rax, QWORD PTR [r10+sqr24_off+8]
        mov     QWORD PTR [r11+sqr24_off], r8
        pext    rax, rax, r12
        adc     rax, rax
sqr24_off = sqr24_off + 16
        ENDM
        mov     r8, QWORD PTR [r10+88]
        mov     QWORD PTR [r11+80], rax
        pext    r8, r8, r12
        adc     r8, r8
        mov     QWORD PTR [r11+88], r8
        adc     r9, 0
        ; [rsp+0] -= [rsp+192] over 24 limbs (subtract hi^2); borrow leaves r9.
        lea     rdx, QWORD PTR [rsp+192]
        mov     r10, rsp
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rdx]
sqr24_off = 8
        REPT    11
        mov     r8, QWORD PTR [r10+sqr24_off]
        mov     QWORD PTR [r10+sqr24_off-8], rax
        sbb     r8, QWORD PTR [rdx+sqr24_off]
        mov     rax, QWORD PTR [r10+sqr24_off+8]
        mov     QWORD PTR [r10+sqr24_off], r8
        sbb     rax, QWORD PTR [rdx+sqr24_off+8]
sqr24_off = sqr24_off + 16
        ENDM
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rdx+184]
        mov     QWORD PTR [r10+184], r8
        sbb     r9, 0
        ; [rsp+0] -= r[0..23] (subtract lo^2); borrow leaves r9.
        mov     rax, QWORD PTR [r10]
        sub     rax, QWORD PTR [rcx]
sqr24_off = 8
        REPT    11
        mov     r8, QWORD PTR [r10+sqr24_off]
        mov     QWORD PTR [r10+sqr24_off-8], rax
        sbb     r8, QWORD PTR [rcx+sqr24_off]
        mov     rax, QWORD PTR [r10+sqr24_off+8]
        mov     QWORD PTR [r10+sqr24_off], r8
        sbb     rax, QWORD PTR [rcx+sqr24_off+8]
sqr24_off = sqr24_off + 16
        ENDM
        mov     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r10+176], rax
        sbb     r8, QWORD PTR [rcx+184]
        mov     QWORD PTR [r10+184], r8
        sbb     r9, 0
        ; r11 = r+96: add the 24-limb middle term in place; carry joins r9,
        ; which is then stored as limb 36 of r (byte offset 288).
        sub     r11, 96
        mov     rax, QWORD PTR [r11]
        add     rax, QWORD PTR [r10]
sqr24_off = 8
        REPT    11
        mov     r8, QWORD PTR [r11+sqr24_off]
        mov     QWORD PTR [r11+sqr24_off-8], rax
        adc     r8, QWORD PTR [r10+sqr24_off]
        mov     rax, QWORD PTR [r11+sqr24_off+8]
        mov     QWORD PTR [r11+sqr24_off], r8
        adc     rax, QWORD PTR [r10+sqr24_off+8]
sqr24_off = sqr24_off + 16
        ENDM
        mov     r8, QWORD PTR [r11+184]
        mov     QWORD PTR [r11+176], rax
        adc     r8, QWORD PTR [r10+184]
        mov     QWORD PTR [r11+184], r8
        adc     r9, 0
        mov     QWORD PTR [rcx+288], r9
        ; r[24..36] += hi^2[0..12] (rdx still points at [rsp+192]).
        mov     rax, QWORD PTR [r11+96]
        add     rax, QWORD PTR [rdx]
sqr24_off = 8
        REPT    6
        mov     r8, QWORD PTR [r11+sqr24_off+96]
        mov     QWORD PTR [r11+sqr24_off+88], rax
        adc     r8, QWORD PTR [rdx+sqr24_off]
        mov     rax, QWORD PTR [r11+sqr24_off+104]
        mov     QWORD PTR [r11+sqr24_off+96], r8
        adc     rax, QWORD PTR [rdx+sqr24_off+8]
sqr24_off = sqr24_off + 16
        ENDM
        mov     QWORD PTR [r11+192], rax
        ; r[37..47] = hi^2[13..23] + carry; these limbs of r are written
        ; here for the first time, so the carry is added to the source only.
        mov     rax, QWORD PTR [rdx+104]
        adc     rax, 0
sqr24_off = 112
        REPT    5
        mov     r8, QWORD PTR [rdx+sqr24_off]
        mov     QWORD PTR [r11+sqr24_off+88], rax
        adc     r8, 0
        mov     rax, QWORD PTR [rdx+sqr24_off+8]
        mov     QWORD PTR [r11+sqr24_off+96], r8
        adc     rax, 0
sqr24_off = sqr24_off + 16
        ENDM
        mov     QWORD PTR [r11+280], rax
        add     rsp, 504
        pop     r12
        ret
sp_3072_sqr_avx2_24 ENDP
_text ENDS
ENDIF
|
|
; /* Sub b from a into a. (a -= b)
;  *
;  * Subtracts 48 qwords at b from the 48 qwords at a, propagating the borrow
;  * from the lowest limb upwards, and stores the difference back into a.
;  *
;  * rcx  a  A single precision integer and result.
;  * rdx  b  A single precision integer.
;  *
;  * Returns in rax: 0 when no borrow leaves the top limb, all ones otherwise
;  * (rax is zeroed up front and the final sbb subtracts the borrow from it).
;  * Scratch: r8, r9 and flags.
;  *
;  * r8 and r9 alternate as the working limb so the store of limb i-1 can
;  * follow the load of limb i; mov does not disturb CF between the sbb's.
;  * sub48_off is an assembly-time byte offset used only to expand REPT.
;  */
_text SEGMENT READONLY PARA
sp_3072_sub_in_place_48 PROC
        ; Limb 0 starts the borrow chain with a plain sub.
        mov     r8, QWORD PTR [rcx]
        xor     rax, rax
        sub     r8, QWORD PTR [rdx]
        ; Limbs 1..46, two per repetition (odd limb in r9, even limb in r8).
sub48_off = 8
        REPT    23
        mov     r9, QWORD PTR [rcx+sub48_off]
        mov     QWORD PTR [rcx+sub48_off-8], r8
        sbb     r9, QWORD PTR [rdx+sub48_off]
        mov     r8, QWORD PTR [rcx+sub48_off+8]
        mov     QWORD PTR [rcx+sub48_off], r9
        sbb     r8, QWORD PTR [rdx+sub48_off+8]
sub48_off = sub48_off + 16
        ENDM
        ; Limb 47, then fold the final borrow into the return value.
        mov     r9, QWORD PTR [rcx+376]
        mov     QWORD PTR [rcx+368], r8
        sbb     r9, QWORD PTR [rdx+376]
        mov     QWORD PTR [rcx+376], r9
        sbb     rax, 0
        ret
sp_3072_sub_in_place_48 ENDP
_text ENDS
|
|
; /* Add b to a into r. (r = a + b)
;  *
;  * Adds the 48 qwords at a to the 48 qwords at b, propagating the carry from
;  * the lowest limb upwards, and stores the 48-qword sum at r.
;  *
;  * rcx  r  A single precision integer (result).
;  * rdx  a  A single precision integer.
;  * r8   b  A single precision integer.
;  *
;  * Returns in rax: the carry out of the top limb (0 or 1).
;  * Scratch: r9, r10 and flags.
;  *
;  * r9 and r10 alternate as the working limb so the store of limb i-1 can
;  * follow the load of limb i; mov does not disturb CF between the adc's.
;  * add48_off is an assembly-time byte offset used only to expand REPT.
;  */
_text SEGMENT READONLY PARA
sp_3072_add_48 PROC
        ; Limb 0 starts the carry chain with a plain add.
        mov     r9, QWORD PTR [rdx]
        xor     rax, rax
        add     r9, QWORD PTR [r8]
        ; Limbs 1..46, two per repetition (odd limb in r10, even limb in r9).
add48_off = 8
        REPT    23
        mov     r10, QWORD PTR [rdx+add48_off]
        mov     QWORD PTR [rcx+add48_off-8], r9
        adc     r10, QWORD PTR [r8+add48_off]
        mov     r9, QWORD PTR [rdx+add48_off+8]
        mov     QWORD PTR [rcx+add48_off], r10
        adc     r9, QWORD PTR [r8+add48_off+8]
add48_off = add48_off + 16
        ENDM
        ; Limb 47, then materialise the final carry as the return value.
        mov     r10, QWORD PTR [rdx+376]
        mov     QWORD PTR [rcx+368], r9
        adc     r10, QWORD PTR [r8+376]
        mov     QWORD PTR [rcx+376], r10
        adc     rax, 0
        ret
sp_3072_add_48 ENDP
_text ENDS
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_mul_48 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 1192
|
|
mov QWORD PTR [rsp+1152], rcx
|
|
mov QWORD PTR [rsp+1160], rdx
|
|
mov QWORD PTR [rsp+1168], r8
|
|
lea r12, QWORD PTR [rsp+768]
|
|
lea r14, QWORD PTR [rdx+192]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r15, r15
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r12], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov QWORD PTR [r12+184], r10
|
|
adc r15, 0
|
|
mov QWORD PTR [rsp+1176], r15
|
|
lea r13, QWORD PTR [rsp+960]
|
|
lea r14, QWORD PTR [r8+192]
|
|
; Add
|
|
mov rax, QWORD PTR [r8]
|
|
xor rdi, rdi
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [r8+8]
|
|
mov QWORD PTR [r13], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov QWORD PTR [r13+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mov QWORD PTR [r13+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [r8+32]
|
|
mov QWORD PTR [r13+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov QWORD PTR [r13+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mov QWORD PTR [r13+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [r8+56]
|
|
mov QWORD PTR [r13+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov QWORD PTR [r13+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mov QWORD PTR [r13+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [r8+80]
|
|
mov QWORD PTR [r13+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [r13+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mov QWORD PTR [r13+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [r8+104]
|
|
mov QWORD PTR [r13+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov QWORD PTR [r13+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mov QWORD PTR [r13+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [r8+128]
|
|
mov QWORD PTR [r13+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [r8+136]
|
|
mov QWORD PTR [r13+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [r8+144]
|
|
mov QWORD PTR [r13+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [r8+152]
|
|
mov QWORD PTR [r13+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [r8+160]
|
|
mov QWORD PTR [r13+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [r8+168]
|
|
mov QWORD PTR [r13+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [r8+176]
|
|
mov QWORD PTR [r13+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [r8+184]
|
|
mov QWORD PTR [r13+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov QWORD PTR [r13+184], r10
|
|
adc rdi, 0
|
|
mov QWORD PTR [rsp+1184], rdi
|
|
mov r8, r13
|
|
mov rdx, r12
|
|
mov rcx, rsp
|
|
call sp_3072_mul_24
|
|
mov r8, QWORD PTR [rsp+1168]
|
|
mov rdx, QWORD PTR [rsp+1160]
|
|
lea rcx, QWORD PTR [rsp+384]
|
|
add r8, 192
|
|
add rdx, 192
|
|
call sp_3072_mul_24
|
|
mov r8, QWORD PTR [rsp+1168]
|
|
mov rdx, QWORD PTR [rsp+1160]
|
|
mov rcx, QWORD PTR [rsp+1152]
|
|
call sp_3072_mul_24
|
|
IFDEF _WIN64
|
|
mov r8, QWORD PTR [rsp+1168]
|
|
mov rdx, QWORD PTR [rsp+1160]
|
|
mov rcx, QWORD PTR [rsp+1152]
|
|
ENDIF
|
|
mov r15, QWORD PTR [rsp+1176]
|
|
mov rdi, QWORD PTR [rsp+1184]
|
|
mov rsi, QWORD PTR [rsp+1152]
|
|
mov r11, r15
|
|
lea r12, QWORD PTR [rsp+768]
|
|
lea r13, QWORD PTR [rsp+960]
|
|
and r11, rdi
|
|
neg r15
|
|
neg rdi
|
|
add rsi, 384
|
|
mov rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [r13]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12], rax
|
|
mov QWORD PTR [r13], r9
|
|
mov rax, QWORD PTR [r12+8]
|
|
mov r9, QWORD PTR [r13+8]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+8], rax
|
|
mov QWORD PTR [r13+8], r9
|
|
mov rax, QWORD PTR [r12+16]
|
|
mov r9, QWORD PTR [r13+16]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+16], rax
|
|
mov QWORD PTR [r13+16], r9
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [r13+24]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+24], rax
|
|
mov QWORD PTR [r13+24], r9
|
|
mov rax, QWORD PTR [r12+32]
|
|
mov r9, QWORD PTR [r13+32]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+32], rax
|
|
mov QWORD PTR [r13+32], r9
|
|
mov rax, QWORD PTR [r12+40]
|
|
mov r9, QWORD PTR [r13+40]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+40], rax
|
|
mov QWORD PTR [r13+40], r9
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [r13+48]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+48], rax
|
|
mov QWORD PTR [r13+48], r9
|
|
mov rax, QWORD PTR [r12+56]
|
|
mov r9, QWORD PTR [r13+56]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+56], rax
|
|
mov QWORD PTR [r13+56], r9
|
|
mov rax, QWORD PTR [r12+64]
|
|
mov r9, QWORD PTR [r13+64]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+64], rax
|
|
mov QWORD PTR [r13+64], r9
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [r13+72]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+72], rax
|
|
mov QWORD PTR [r13+72], r9
|
|
mov rax, QWORD PTR [r12+80]
|
|
mov r9, QWORD PTR [r13+80]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+80], rax
|
|
mov QWORD PTR [r13+80], r9
|
|
mov rax, QWORD PTR [r12+88]
|
|
mov r9, QWORD PTR [r13+88]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+88], rax
|
|
mov QWORD PTR [r13+88], r9
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [r13+96]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+96], rax
|
|
mov QWORD PTR [r13+96], r9
|
|
mov rax, QWORD PTR [r12+104]
|
|
mov r9, QWORD PTR [r13+104]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+104], rax
|
|
mov QWORD PTR [r13+104], r9
|
|
mov rax, QWORD PTR [r12+112]
|
|
mov r9, QWORD PTR [r13+112]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+112], rax
|
|
mov QWORD PTR [r13+112], r9
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [r13+120]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+120], rax
|
|
mov QWORD PTR [r13+120], r9
|
|
mov rax, QWORD PTR [r12+128]
|
|
mov r9, QWORD PTR [r13+128]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+128], rax
|
|
mov QWORD PTR [r13+128], r9
|
|
mov rax, QWORD PTR [r12+136]
|
|
mov r9, QWORD PTR [r13+136]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+136], rax
|
|
mov QWORD PTR [r13+136], r9
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [r13+144]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+144], rax
|
|
mov QWORD PTR [r13+144], r9
|
|
mov rax, QWORD PTR [r12+152]
|
|
mov r9, QWORD PTR [r13+152]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+152], rax
|
|
mov QWORD PTR [r13+152], r9
|
|
mov rax, QWORD PTR [r12+160]
|
|
mov r9, QWORD PTR [r13+160]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+160], rax
|
|
mov QWORD PTR [r13+160], r9
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [r13+168]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+168], rax
|
|
mov QWORD PTR [r13+168], r9
|
|
mov rax, QWORD PTR [r12+176]
|
|
mov r9, QWORD PTR [r13+176]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+176], rax
|
|
mov QWORD PTR [r13+176], r9
|
|
mov rax, QWORD PTR [r12+184]
|
|
mov r9, QWORD PTR [r13+184]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+184], rax
|
|
mov QWORD PTR [r13+184], r9
|
|
mov rax, QWORD PTR [r12]
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r13+184]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc r11, 0
|
|
lea r13, QWORD PTR [rsp+384]
|
|
mov r12, rsp
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [r13+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [r13+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [r13+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [r13+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [r13+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [r13+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [r13+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [r13+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [r13+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [r13+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [r13+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [r13+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [r13+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [r13+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [r13+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [r13+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [r13+376]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb r11, 0
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [rcx+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [rcx+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [rcx+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [rcx+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [rcx+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [rcx+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [rcx+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [rcx+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [rcx+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [rcx+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [rcx+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [rcx+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [rcx+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [rcx+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [rcx+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [rcx+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [rcx+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [rcx+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [rcx+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [rcx+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [rcx+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [rcx+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [rcx+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [rcx+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [rcx+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [rcx+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [rcx+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [rcx+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb r11, 0
|
|
sub rsi, 192
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r12+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r12+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r12+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r12+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r12+248]
|
|
mov r10, QWORD PTR [rsi+256]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r10, QWORD PTR [r12+256]
|
|
mov rax, QWORD PTR [rsi+264]
|
|
mov QWORD PTR [rsi+256], r10
|
|
adc rax, QWORD PTR [r12+264]
|
|
mov r9, QWORD PTR [rsi+272]
|
|
mov QWORD PTR [rsi+264], rax
|
|
adc r9, QWORD PTR [r12+272]
|
|
mov r10, QWORD PTR [rsi+280]
|
|
mov QWORD PTR [rsi+272], r9
|
|
adc r10, QWORD PTR [r12+280]
|
|
mov rax, QWORD PTR [rsi+288]
|
|
mov QWORD PTR [rsi+280], r10
|
|
adc rax, QWORD PTR [r12+288]
|
|
mov r9, QWORD PTR [rsi+296]
|
|
mov QWORD PTR [rsi+288], rax
|
|
adc r9, QWORD PTR [r12+296]
|
|
mov r10, QWORD PTR [rsi+304]
|
|
mov QWORD PTR [rsi+296], r9
|
|
adc r10, QWORD PTR [r12+304]
|
|
mov rax, QWORD PTR [rsi+312]
|
|
mov QWORD PTR [rsi+304], r10
|
|
adc rax, QWORD PTR [r12+312]
|
|
mov r9, QWORD PTR [rsi+320]
|
|
mov QWORD PTR [rsi+312], rax
|
|
adc r9, QWORD PTR [r12+320]
|
|
mov r10, QWORD PTR [rsi+328]
|
|
mov QWORD PTR [rsi+320], r9
|
|
adc r10, QWORD PTR [r12+328]
|
|
mov rax, QWORD PTR [rsi+336]
|
|
mov QWORD PTR [rsi+328], r10
|
|
adc rax, QWORD PTR [r12+336]
|
|
mov r9, QWORD PTR [rsi+344]
|
|
mov QWORD PTR [rsi+336], rax
|
|
adc r9, QWORD PTR [r12+344]
|
|
mov r10, QWORD PTR [rsi+352]
|
|
mov QWORD PTR [rsi+344], r9
|
|
adc r10, QWORD PTR [r12+352]
|
|
mov rax, QWORD PTR [rsi+360]
|
|
mov QWORD PTR [rsi+352], r10
|
|
adc rax, QWORD PTR [r12+360]
|
|
mov r9, QWORD PTR [rsi+368]
|
|
mov QWORD PTR [rsi+360], rax
|
|
adc r9, QWORD PTR [r12+368]
|
|
mov r10, QWORD PTR [rsi+376]
|
|
mov QWORD PTR [rsi+368], r9
|
|
adc r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [rsi+376], r10
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+576], r11
|
|
add rsi, 192
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r13+192]
|
|
mov QWORD PTR [rsi+192], rax
|
|
; Add to zero
|
|
mov rax, QWORD PTR [r13+200]
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+208]
|
|
mov QWORD PTR [rsi+200], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+216]
|
|
mov QWORD PTR [rsi+208], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+224]
|
|
mov QWORD PTR [rsi+216], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+232]
|
|
mov QWORD PTR [rsi+224], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+240]
|
|
mov QWORD PTR [rsi+232], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+248]
|
|
mov QWORD PTR [rsi+240], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+256]
|
|
mov QWORD PTR [rsi+248], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+264]
|
|
mov QWORD PTR [rsi+256], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+272]
|
|
mov QWORD PTR [rsi+264], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+280]
|
|
mov QWORD PTR [rsi+272], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+288]
|
|
mov QWORD PTR [rsi+280], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+296]
|
|
mov QWORD PTR [rsi+288], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+304]
|
|
mov QWORD PTR [rsi+296], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+312]
|
|
mov QWORD PTR [rsi+304], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+320]
|
|
mov QWORD PTR [rsi+312], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+328]
|
|
mov QWORD PTR [rsi+320], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+336]
|
|
mov QWORD PTR [rsi+328], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+344]
|
|
mov QWORD PTR [rsi+336], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+352]
|
|
mov QWORD PTR [rsi+344], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+360]
|
|
mov QWORD PTR [rsi+352], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+368]
|
|
mov QWORD PTR [rsi+360], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+376]
|
|
mov QWORD PTR [rsi+368], rax
|
|
adc r9, 0
|
|
mov QWORD PTR [rsi+376], r9
|
|
add rsp, 1192
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_3072_mul_48 ENDP
|
|
_text ENDS
|
|
; /* Add a to a into r. (r = a + a)
|
|
; *
; * Doubles a 24-word number (24 x 64-bit words = 192 bytes, offsets 0..184).
; * The word at offset 0 is the least significant: it is doubled with a plain
; * ADD and the carry is then propagated upwards through the ADC chain.
; *
; * The carry out of the most significant word is left in rax (0 or 1) when
; * the procedure returns.
; *
; * Registers: r is addressed through rcx and a through rdx (the first two
; * integer argument registers of the Microsoft x64 calling convention).
; * Scratch: rax, r8, r9 and the flags. No stack is used.
; *
; * Each word of a is loaded before the same word of r is stored, so calling
; * with r == a (exact aliasing) appears safe.
; * NOTE(review): partial overlap of r and a is not handled - confirm callers.
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_dbl_24 PROC
|
|
mov r8, QWORD PTR [rdx] ; r8 = a[0]
|
|
; Clear the carry accumulator now: XOR rewrites the flags, so it has to come
; before the ADD that starts the carry chain.
xor rax, rax
|
|
add r8, r8 ; a[0] + a[0], CF = carry out (start of chain)
|
|
; From here on only MOV (which leaves the flags untouched) is placed between
; the ADC instructions, so CF survives from one word to the next. The two
; scratch registers r8/r9 alternate: while one result is being stored the
; next word is already loaded into the other.
mov r9, QWORD PTR [rdx+8] ; r9 = a[1]
|
|
mov QWORD PTR [rcx], r8 ; r[0]
|
|
adc r9, r9 ; a[1] + a[1] + CF
|
|
mov r8, QWORD PTR [rdx+16] ; r8 = a[2]
|
|
mov QWORD PTR [rcx+8], r9 ; r[1]
|
|
adc r8, r8 ; a[2] + a[2] + CF
|
|
mov r9, QWORD PTR [rdx+24] ; r9 = a[3]
|
|
mov QWORD PTR [rcx+16], r8 ; r[2]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+32] ; r8 = a[4]
|
|
mov QWORD PTR [rcx+24], r9 ; r[3]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+40] ; r9 = a[5]
|
|
mov QWORD PTR [rcx+32], r8 ; r[4]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+48] ; r8 = a[6]
|
|
mov QWORD PTR [rcx+40], r9 ; r[5]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+56] ; r9 = a[7]
|
|
mov QWORD PTR [rcx+48], r8 ; r[6]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+64] ; r8 = a[8]
|
|
mov QWORD PTR [rcx+56], r9 ; r[7]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+72] ; r9 = a[9]
|
|
mov QWORD PTR [rcx+64], r8 ; r[8]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+80] ; r8 = a[10]
|
|
mov QWORD PTR [rcx+72], r9 ; r[9]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+88] ; r9 = a[11]
|
|
mov QWORD PTR [rcx+80], r8 ; r[10]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+96] ; r8 = a[12]
|
|
mov QWORD PTR [rcx+88], r9 ; r[11]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+104] ; r9 = a[13]
|
|
mov QWORD PTR [rcx+96], r8 ; r[12]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+112] ; r8 = a[14]
|
|
mov QWORD PTR [rcx+104], r9 ; r[13]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+120] ; r9 = a[15]
|
|
mov QWORD PTR [rcx+112], r8 ; r[14]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+128] ; r8 = a[16]
|
|
mov QWORD PTR [rcx+120], r9 ; r[15]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+136] ; r9 = a[17]
|
|
mov QWORD PTR [rcx+128], r8 ; r[16]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+144] ; r8 = a[18]
|
|
mov QWORD PTR [rcx+136], r9 ; r[17]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+152] ; r9 = a[19]
|
|
mov QWORD PTR [rcx+144], r8 ; r[18]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+160] ; r8 = a[20]
|
|
mov QWORD PTR [rcx+152], r9 ; r[19]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+168] ; r9 = a[21]
|
|
mov QWORD PTR [rcx+160], r8 ; r[20]
|
|
adc r9, r9
|
|
mov r8, QWORD PTR [rdx+176] ; r8 = a[22]
|
|
mov QWORD PTR [rcx+168], r9 ; r[21]
|
|
adc r8, r8
|
|
mov r9, QWORD PTR [rdx+184] ; r9 = a[23], the most significant word
|
|
mov QWORD PTR [rcx+176], r8 ; r[22]
|
|
adc r9, r9 ; CF now holds the carry out of the whole number
|
|
mov QWORD PTR [rcx+184], r9 ; r[23]
|
|
adc rax, 0 ; rax was 0, so rax = final carry (0 or 1)
|
|
ret
|
|
sp_3072_dbl_24 ENDP
|
|
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_sqr_48 PROC
|
|
push r12
|
|
sub rsp, 984
|
|
mov QWORD PTR [rsp+960], rcx
|
|
mov QWORD PTR [rsp+968], rdx
|
|
lea r10, QWORD PTR [rsp+768]
|
|
lea r11, QWORD PTR [rdx+192]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r9, r9
|
|
add rax, QWORD PTR [r11]
|
|
mov r8, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r10], rax
|
|
adc r8, QWORD PTR [r11+8]
|
|
mov rax, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
adc rax, QWORD PTR [r11+16]
|
|
mov r8, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
adc r8, QWORD PTR [r11+24]
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
adc rax, QWORD PTR [r11+32]
|
|
mov r8, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
adc r8, QWORD PTR [r11+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
adc rax, QWORD PTR [r11+48]
|
|
mov r8, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
adc r8, QWORD PTR [r11+56]
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
adc rax, QWORD PTR [r11+64]
|
|
mov r8, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
adc r8, QWORD PTR [r11+72]
|
|
mov rax, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
adc rax, QWORD PTR [r11+80]
|
|
mov r8, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
adc r8, QWORD PTR [r11+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
adc rax, QWORD PTR [r11+96]
|
|
mov r8, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
adc r8, QWORD PTR [r11+104]
|
|
mov rax, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
adc rax, QWORD PTR [r11+112]
|
|
mov r8, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
adc r8, QWORD PTR [r11+120]
|
|
mov rax, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
adc rax, QWORD PTR [r11+128]
|
|
mov r8, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
adc r8, QWORD PTR [r11+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
adc rax, QWORD PTR [r11+144]
|
|
mov r8, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
adc r8, QWORD PTR [r11+152]
|
|
mov rax, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
adc rax, QWORD PTR [r11+160]
|
|
mov r8, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
adc r8, QWORD PTR [r11+168]
|
|
mov rax, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
adc rax, QWORD PTR [r11+176]
|
|
mov r8, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
adc r8, QWORD PTR [r11+184]
|
|
mov QWORD PTR [r10+184], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+976], r9
|
|
mov rdx, r10
|
|
mov rcx, rsp
|
|
call sp_3072_sqr_24
|
|
mov rdx, QWORD PTR [rsp+968]
|
|
lea rcx, QWORD PTR [rsp+384]
|
|
add rdx, 192
|
|
call sp_3072_sqr_24
|
|
mov rdx, QWORD PTR [rsp+968]
|
|
mov rcx, QWORD PTR [rsp+960]
|
|
call sp_3072_sqr_24
|
|
IFDEF _WIN64
|
|
mov rdx, QWORD PTR [rsp+968]
|
|
mov rcx, QWORD PTR [rsp+960]
|
|
ENDIF
|
|
mov r12, QWORD PTR [rsp+976]
|
|
mov r11, rcx
|
|
lea r10, QWORD PTR [rsp+768]
|
|
mov r9, r12
|
|
neg r12
|
|
add r11, 384
|
|
mov rax, QWORD PTR [r10]
|
|
mov r8, QWORD PTR [r10+8]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11], rax
|
|
mov QWORD PTR [r11+8], r8
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+16], rax
|
|
mov QWORD PTR [r11+24], r8
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+32], rax
|
|
mov QWORD PTR [r11+40], r8
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+48], rax
|
|
mov QWORD PTR [r11+56], r8
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+64], rax
|
|
mov QWORD PTR [r11+72], r8
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+80], rax
|
|
mov QWORD PTR [r11+88], r8
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+96], rax
|
|
mov QWORD PTR [r11+104], r8
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+112], rax
|
|
mov QWORD PTR [r11+120], r8
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+128], rax
|
|
mov QWORD PTR [r11+136], r8
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+144], rax
|
|
mov QWORD PTR [r11+152], r8
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+160], rax
|
|
mov QWORD PTR [r11+168], r8
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [r11+176], rax
|
|
mov QWORD PTR [r11+184], r8
|
|
mov rax, QWORD PTR [r11]
|
|
add rax, rax
|
|
mov r8, QWORD PTR [r11+8]
|
|
mov QWORD PTR [r11], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+16]
|
|
mov QWORD PTR [r11+8], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+24]
|
|
mov QWORD PTR [r11+16], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+32]
|
|
mov QWORD PTR [r11+24], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+40]
|
|
mov QWORD PTR [r11+32], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+48]
|
|
mov QWORD PTR [r11+40], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+56]
|
|
mov QWORD PTR [r11+48], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+64]
|
|
mov QWORD PTR [r11+56], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+72]
|
|
mov QWORD PTR [r11+64], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+80]
|
|
mov QWORD PTR [r11+72], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+88]
|
|
mov QWORD PTR [r11+80], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+96]
|
|
mov QWORD PTR [r11+88], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+104]
|
|
mov QWORD PTR [r11+96], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+112]
|
|
mov QWORD PTR [r11+104], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+120]
|
|
mov QWORD PTR [r11+112], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+128]
|
|
mov QWORD PTR [r11+120], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+136]
|
|
mov QWORD PTR [r11+128], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+144]
|
|
mov QWORD PTR [r11+136], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+152]
|
|
mov QWORD PTR [r11+144], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+160]
|
|
mov QWORD PTR [r11+152], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+168]
|
|
mov QWORD PTR [r11+160], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r11+176]
|
|
mov QWORD PTR [r11+168], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r11+184]
|
|
mov QWORD PTR [r11+176], rax
|
|
adc r8, r8
|
|
mov QWORD PTR [r11+184], r8
|
|
adc r9, 0
|
|
lea rdx, QWORD PTR [rsp+384]
|
|
mov r10, rsp
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rdx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rdx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rdx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rdx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rdx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rdx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rdx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rdx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rdx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rdx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rdx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rdx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rdx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rdx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rdx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rdx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rdx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rdx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rdx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rdx+376]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb r9, 0
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rcx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rcx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rcx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rcx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rcx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rcx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rcx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rcx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rcx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rcx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rcx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rcx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rcx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rcx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rcx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rcx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rcx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rcx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rcx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rcx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rcx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rcx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rcx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rcx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rcx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rcx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rcx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rcx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rcx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rcx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rcx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rcx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb r9, 0
|
|
sub r11, 192
|
|
; Add in place
|
|
mov rax, QWORD PTR [r11]
|
|
add rax, QWORD PTR [r10]
|
|
mov r8, QWORD PTR [r11+8]
|
|
mov QWORD PTR [r11], rax
|
|
adc r8, QWORD PTR [r10+8]
|
|
mov rax, QWORD PTR [r11+16]
|
|
mov QWORD PTR [r11+8], r8
|
|
adc rax, QWORD PTR [r10+16]
|
|
mov r8, QWORD PTR [r11+24]
|
|
mov QWORD PTR [r11+16], rax
|
|
adc r8, QWORD PTR [r10+24]
|
|
mov rax, QWORD PTR [r11+32]
|
|
mov QWORD PTR [r11+24], r8
|
|
adc rax, QWORD PTR [r10+32]
|
|
mov r8, QWORD PTR [r11+40]
|
|
mov QWORD PTR [r11+32], rax
|
|
adc r8, QWORD PTR [r10+40]
|
|
mov rax, QWORD PTR [r11+48]
|
|
mov QWORD PTR [r11+40], r8
|
|
adc rax, QWORD PTR [r10+48]
|
|
mov r8, QWORD PTR [r11+56]
|
|
mov QWORD PTR [r11+48], rax
|
|
adc r8, QWORD PTR [r10+56]
|
|
mov rax, QWORD PTR [r11+64]
|
|
mov QWORD PTR [r11+56], r8
|
|
adc rax, QWORD PTR [r10+64]
|
|
mov r8, QWORD PTR [r11+72]
|
|
mov QWORD PTR [r11+64], rax
|
|
adc r8, QWORD PTR [r10+72]
|
|
mov rax, QWORD PTR [r11+80]
|
|
mov QWORD PTR [r11+72], r8
|
|
adc rax, QWORD PTR [r10+80]
|
|
mov r8, QWORD PTR [r11+88]
|
|
mov QWORD PTR [r11+80], rax
|
|
adc r8, QWORD PTR [r10+88]
|
|
mov rax, QWORD PTR [r11+96]
|
|
mov QWORD PTR [r11+88], r8
|
|
adc rax, QWORD PTR [r10+96]
|
|
mov r8, QWORD PTR [r11+104]
|
|
mov QWORD PTR [r11+96], rax
|
|
adc r8, QWORD PTR [r10+104]
|
|
mov rax, QWORD PTR [r11+112]
|
|
mov QWORD PTR [r11+104], r8
|
|
adc rax, QWORD PTR [r10+112]
|
|
mov r8, QWORD PTR [r11+120]
|
|
mov QWORD PTR [r11+112], rax
|
|
adc r8, QWORD PTR [r10+120]
|
|
mov rax, QWORD PTR [r11+128]
|
|
mov QWORD PTR [r11+120], r8
|
|
adc rax, QWORD PTR [r10+128]
|
|
mov r8, QWORD PTR [r11+136]
|
|
mov QWORD PTR [r11+128], rax
|
|
adc r8, QWORD PTR [r10+136]
|
|
mov rax, QWORD PTR [r11+144]
|
|
mov QWORD PTR [r11+136], r8
|
|
adc rax, QWORD PTR [r10+144]
|
|
mov r8, QWORD PTR [r11+152]
|
|
mov QWORD PTR [r11+144], rax
|
|
adc r8, QWORD PTR [r10+152]
|
|
mov rax, QWORD PTR [r11+160]
|
|
mov QWORD PTR [r11+152], r8
|
|
adc rax, QWORD PTR [r10+160]
|
|
mov r8, QWORD PTR [r11+168]
|
|
mov QWORD PTR [r11+160], rax
|
|
adc r8, QWORD PTR [r10+168]
|
|
mov rax, QWORD PTR [r11+176]
|
|
mov QWORD PTR [r11+168], r8
|
|
adc rax, QWORD PTR [r10+176]
|
|
mov r8, QWORD PTR [r11+184]
|
|
mov QWORD PTR [r11+176], rax
|
|
adc r8, QWORD PTR [r10+184]
|
|
mov rax, QWORD PTR [r11+192]
|
|
mov QWORD PTR [r11+184], r8
|
|
adc rax, QWORD PTR [r10+192]
|
|
mov r8, QWORD PTR [r11+200]
|
|
mov QWORD PTR [r11+192], rax
|
|
adc r8, QWORD PTR [r10+200]
|
|
mov rax, QWORD PTR [r11+208]
|
|
mov QWORD PTR [r11+200], r8
|
|
adc rax, QWORD PTR [r10+208]
|
|
mov r8, QWORD PTR [r11+216]
|
|
mov QWORD PTR [r11+208], rax
|
|
adc r8, QWORD PTR [r10+216]
|
|
mov rax, QWORD PTR [r11+224]
|
|
mov QWORD PTR [r11+216], r8
|
|
adc rax, QWORD PTR [r10+224]
|
|
mov r8, QWORD PTR [r11+232]
|
|
mov QWORD PTR [r11+224], rax
|
|
adc r8, QWORD PTR [r10+232]
|
|
mov rax, QWORD PTR [r11+240]
|
|
mov QWORD PTR [r11+232], r8
|
|
adc rax, QWORD PTR [r10+240]
|
|
mov r8, QWORD PTR [r11+248]
|
|
mov QWORD PTR [r11+240], rax
|
|
adc r8, QWORD PTR [r10+248]
|
|
mov rax, QWORD PTR [r11+256]
|
|
mov QWORD PTR [r11+248], r8
|
|
adc rax, QWORD PTR [r10+256]
|
|
mov r8, QWORD PTR [r11+264]
|
|
mov QWORD PTR [r11+256], rax
|
|
adc r8, QWORD PTR [r10+264]
|
|
mov rax, QWORD PTR [r11+272]
|
|
mov QWORD PTR [r11+264], r8
|
|
adc rax, QWORD PTR [r10+272]
|
|
mov r8, QWORD PTR [r11+280]
|
|
mov QWORD PTR [r11+272], rax
|
|
adc r8, QWORD PTR [r10+280]
|
|
mov rax, QWORD PTR [r11+288]
|
|
mov QWORD PTR [r11+280], r8
|
|
adc rax, QWORD PTR [r10+288]
|
|
mov r8, QWORD PTR [r11+296]
|
|
mov QWORD PTR [r11+288], rax
|
|
adc r8, QWORD PTR [r10+296]
|
|
mov rax, QWORD PTR [r11+304]
|
|
mov QWORD PTR [r11+296], r8
|
|
adc rax, QWORD PTR [r10+304]
|
|
mov r8, QWORD PTR [r11+312]
|
|
mov QWORD PTR [r11+304], rax
|
|
adc r8, QWORD PTR [r10+312]
|
|
mov rax, QWORD PTR [r11+320]
|
|
mov QWORD PTR [r11+312], r8
|
|
adc rax, QWORD PTR [r10+320]
|
|
mov r8, QWORD PTR [r11+328]
|
|
mov QWORD PTR [r11+320], rax
|
|
adc r8, QWORD PTR [r10+328]
|
|
mov rax, QWORD PTR [r11+336]
|
|
mov QWORD PTR [r11+328], r8
|
|
adc rax, QWORD PTR [r10+336]
|
|
mov r8, QWORD PTR [r11+344]
|
|
mov QWORD PTR [r11+336], rax
|
|
adc r8, QWORD PTR [r10+344]
|
|
mov rax, QWORD PTR [r11+352]
|
|
mov QWORD PTR [r11+344], r8
|
|
adc rax, QWORD PTR [r10+352]
|
|
mov r8, QWORD PTR [r11+360]
|
|
mov QWORD PTR [r11+352], rax
|
|
adc r8, QWORD PTR [r10+360]
|
|
mov rax, QWORD PTR [r11+368]
|
|
mov QWORD PTR [r11+360], r8
|
|
adc rax, QWORD PTR [r10+368]
|
|
mov r8, QWORD PTR [r11+376]
|
|
mov QWORD PTR [r11+368], rax
|
|
adc r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r11+376], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+576], r9
|
|
; Add in place
|
|
mov rax, QWORD PTR [r11+192]
|
|
add rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [r11+200]
|
|
mov QWORD PTR [r11+192], rax
|
|
adc r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [r11+208]
|
|
mov QWORD PTR [r11+200], r8
|
|
adc rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [r11+216]
|
|
mov QWORD PTR [r11+208], rax
|
|
adc r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [r11+224]
|
|
mov QWORD PTR [r11+216], r8
|
|
adc rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [r11+232]
|
|
mov QWORD PTR [r11+224], rax
|
|
adc r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [r11+240]
|
|
mov QWORD PTR [r11+232], r8
|
|
adc rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [r11+248]
|
|
mov QWORD PTR [r11+240], rax
|
|
adc r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [r11+256]
|
|
mov QWORD PTR [r11+248], r8
|
|
adc rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [r11+264]
|
|
mov QWORD PTR [r11+256], rax
|
|
adc r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [r11+272]
|
|
mov QWORD PTR [r11+264], r8
|
|
adc rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [r11+280]
|
|
mov QWORD PTR [r11+272], rax
|
|
adc r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [r11+288]
|
|
mov QWORD PTR [r11+280], r8
|
|
adc rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [r11+296]
|
|
mov QWORD PTR [r11+288], rax
|
|
adc r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [r11+304]
|
|
mov QWORD PTR [r11+296], r8
|
|
adc rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [r11+312]
|
|
mov QWORD PTR [r11+304], rax
|
|
adc r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [r11+320]
|
|
mov QWORD PTR [r11+312], r8
|
|
adc rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [r11+328]
|
|
mov QWORD PTR [r11+320], rax
|
|
adc r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [r11+336]
|
|
mov QWORD PTR [r11+328], r8
|
|
adc rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [r11+344]
|
|
mov QWORD PTR [r11+336], rax
|
|
adc r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [r11+352]
|
|
mov QWORD PTR [r11+344], r8
|
|
adc rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [r11+360]
|
|
mov QWORD PTR [r11+352], rax
|
|
adc r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [r11+368]
|
|
mov QWORD PTR [r11+360], r8
|
|
adc rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [r11+376]
|
|
mov QWORD PTR [r11+368], rax
|
|
adc r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [r11+384]
|
|
mov QWORD PTR [r11+376], r8
|
|
adc rax, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [r11+384], rax
|
|
; Add to zero
|
|
mov rax, QWORD PTR [rdx+200]
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [r11+392], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [r11+400], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [r11+408], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [r11+416], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [r11+424], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [r11+432], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+256]
|
|
mov QWORD PTR [r11+440], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+264]
|
|
mov QWORD PTR [r11+448], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+272]
|
|
mov QWORD PTR [r11+456], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+280]
|
|
mov QWORD PTR [r11+464], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+288]
|
|
mov QWORD PTR [r11+472], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+296]
|
|
mov QWORD PTR [r11+480], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+304]
|
|
mov QWORD PTR [r11+488], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+312]
|
|
mov QWORD PTR [r11+496], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+320]
|
|
mov QWORD PTR [r11+504], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+328]
|
|
mov QWORD PTR [r11+512], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+336]
|
|
mov QWORD PTR [r11+520], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+344]
|
|
mov QWORD PTR [r11+528], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+352]
|
|
mov QWORD PTR [r11+536], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+360]
|
|
mov QWORD PTR [r11+544], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+368]
|
|
mov QWORD PTR [r11+552], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+376]
|
|
mov QWORD PTR [r11+560], r8
|
|
adc rax, 0
|
|
mov QWORD PTR [r11+568], rax
|
|
add rsp, 984
|
|
pop r12
|
|
ret
|
|
sp_3072_sqr_48 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
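; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r Result of the multiplication (destination pointer, passed in rcx).
|
|
; * a First operand, read as 48 64-bit words (pointer passed in rdx).
|
|
; * b Second operand, read as 48 64-bit words (pointer passed in r8).
|
|
; */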
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_mul_avx2_48 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 1192
|
|
mov QWORD PTR [rsp+1152], rcx
|
|
mov QWORD PTR [rsp+1160], rdx
|
|
mov QWORD PTR [rsp+1168], r8
|
|
lea r12, QWORD PTR [rsp+768]
|
|
lea r14, QWORD PTR [rdx+192]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r15, r15
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r12], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov QWORD PTR [r12+184], r10
|
|
adc r15, 0
|
|
mov QWORD PTR [rsp+1176], r15
|
|
lea r13, QWORD PTR [rsp+960]
|
|
lea r14, QWORD PTR [r8+192]
|
|
; Add
|
|
mov rax, QWORD PTR [r8]
|
|
xor rdi, rdi
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [r8+8]
|
|
mov QWORD PTR [r13], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov QWORD PTR [r13+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mov QWORD PTR [r13+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [r8+32]
|
|
mov QWORD PTR [r13+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov QWORD PTR [r13+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mov QWORD PTR [r13+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [r8+56]
|
|
mov QWORD PTR [r13+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov QWORD PTR [r13+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mov QWORD PTR [r13+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [r8+80]
|
|
mov QWORD PTR [r13+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [r13+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mov QWORD PTR [r13+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [r8+104]
|
|
mov QWORD PTR [r13+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov QWORD PTR [r13+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mov QWORD PTR [r13+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [r8+128]
|
|
mov QWORD PTR [r13+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [r8+136]
|
|
mov QWORD PTR [r13+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [r8+144]
|
|
mov QWORD PTR [r13+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [r8+152]
|
|
mov QWORD PTR [r13+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [r8+160]
|
|
mov QWORD PTR [r13+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [r8+168]
|
|
mov QWORD PTR [r13+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [r8+176]
|
|
mov QWORD PTR [r13+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [r8+184]
|
|
mov QWORD PTR [r13+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov QWORD PTR [r13+184], r10
|
|
adc rdi, 0
|
|
mov QWORD PTR [rsp+1184], rdi
|
|
mov r8, r13
|
|
mov rdx, r12
|
|
mov rcx, rsp
|
|
call sp_3072_mul_avx2_24
|
|
mov r8, QWORD PTR [rsp+1168]
|
|
mov rdx, QWORD PTR [rsp+1160]
|
|
lea rcx, QWORD PTR [rsp+384]
|
|
add r8, 192
|
|
add rdx, 192
|
|
call sp_3072_mul_avx2_24
|
|
mov r8, QWORD PTR [rsp+1168]
|
|
mov rdx, QWORD PTR [rsp+1160]
|
|
mov rcx, QWORD PTR [rsp+1152]
|
|
call sp_3072_mul_avx2_24
|
|
IFDEF _WIN64
|
|
mov r8, QWORD PTR [rsp+1168]
|
|
mov rdx, QWORD PTR [rsp+1160]
|
|
mov rcx, QWORD PTR [rsp+1152]
|
|
ENDIF
|
|
mov r15, QWORD PTR [rsp+1176]
|
|
mov rdi, QWORD PTR [rsp+1184]
|
|
mov rsi, QWORD PTR [rsp+1152]
|
|
mov r11, r15
|
|
lea r12, QWORD PTR [rsp+768]
|
|
lea r13, QWORD PTR [rsp+960]
|
|
and r11, rdi
|
|
neg r15
|
|
neg rdi
|
|
add rsi, 384
|
|
mov rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [r13]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
add rax, r9
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [r13+8]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [r13+16]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [r13+24]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [r13+32]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [r13+40]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [r13+48]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [r13+56]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [r13+64]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [r13+72]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [r13+80]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [r13+88]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [r13+96]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [r13+104]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [r13+112]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [r13+120]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [r13+128]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [r13+136]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [r13+144]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [r13+152]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [r13+160]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [r13+168]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [r13+176]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [r13+184]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, rax
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc r11, 0
|
|
lea r13, QWORD PTR [rsp+384]
|
|
mov r12, rsp
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [r13+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [r13+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [r13+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [r13+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [r13+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [r13+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [r13+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [r13+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [r13+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [r13+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [r13+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [r13+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [r13+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [r13+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [r13+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [r13+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [r13+376]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb r11, 0
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [rcx+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [rcx+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [rcx+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [rcx+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [rcx+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [rcx+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [rcx+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [rcx+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [rcx+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [rcx+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [rcx+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [rcx+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [rcx+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [rcx+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [rcx+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [rcx+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [rcx+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [rcx+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [rcx+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [rcx+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [rcx+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [rcx+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [rcx+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [rcx+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [rcx+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [rcx+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [rcx+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [rcx+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb r11, 0
|
|
sub rsi, 192
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r12+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r12+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r12+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r12+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r12+248]
|
|
mov r10, QWORD PTR [rsi+256]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r10, QWORD PTR [r12+256]
|
|
mov rax, QWORD PTR [rsi+264]
|
|
mov QWORD PTR [rsi+256], r10
|
|
adc rax, QWORD PTR [r12+264]
|
|
mov r9, QWORD PTR [rsi+272]
|
|
mov QWORD PTR [rsi+264], rax
|
|
adc r9, QWORD PTR [r12+272]
|
|
mov r10, QWORD PTR [rsi+280]
|
|
mov QWORD PTR [rsi+272], r9
|
|
adc r10, QWORD PTR [r12+280]
|
|
mov rax, QWORD PTR [rsi+288]
|
|
mov QWORD PTR [rsi+280], r10
|
|
adc rax, QWORD PTR [r12+288]
|
|
mov r9, QWORD PTR [rsi+296]
|
|
mov QWORD PTR [rsi+288], rax
|
|
adc r9, QWORD PTR [r12+296]
|
|
mov r10, QWORD PTR [rsi+304]
|
|
mov QWORD PTR [rsi+296], r9
|
|
adc r10, QWORD PTR [r12+304]
|
|
mov rax, QWORD PTR [rsi+312]
|
|
mov QWORD PTR [rsi+304], r10
|
|
adc rax, QWORD PTR [r12+312]
|
|
mov r9, QWORD PTR [rsi+320]
|
|
mov QWORD PTR [rsi+312], rax
|
|
adc r9, QWORD PTR [r12+320]
|
|
mov r10, QWORD PTR [rsi+328]
|
|
mov QWORD PTR [rsi+320], r9
|
|
adc r10, QWORD PTR [r12+328]
|
|
mov rax, QWORD PTR [rsi+336]
|
|
mov QWORD PTR [rsi+328], r10
|
|
adc rax, QWORD PTR [r12+336]
|
|
mov r9, QWORD PTR [rsi+344]
|
|
mov QWORD PTR [rsi+336], rax
|
|
adc r9, QWORD PTR [r12+344]
|
|
mov r10, QWORD PTR [rsi+352]
|
|
mov QWORD PTR [rsi+344], r9
|
|
adc r10, QWORD PTR [r12+352]
|
|
mov rax, QWORD PTR [rsi+360]
|
|
mov QWORD PTR [rsi+352], r10
|
|
adc rax, QWORD PTR [r12+360]
|
|
mov r9, QWORD PTR [rsi+368]
|
|
mov QWORD PTR [rsi+360], rax
|
|
adc r9, QWORD PTR [r12+368]
|
|
mov r10, QWORD PTR [rsi+376]
|
|
mov QWORD PTR [rsi+368], r9
|
|
adc r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [rsi+376], r10
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+576], r11
|
|
add rsi, 192
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r13+192]
|
|
mov QWORD PTR [rsi+192], rax
|
|
; Add to zero
|
|
mov rax, QWORD PTR [r13+200]
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+208]
|
|
mov QWORD PTR [rsi+200], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+216]
|
|
mov QWORD PTR [rsi+208], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+224]
|
|
mov QWORD PTR [rsi+216], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+232]
|
|
mov QWORD PTR [rsi+224], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+240]
|
|
mov QWORD PTR [rsi+232], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+248]
|
|
mov QWORD PTR [rsi+240], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+256]
|
|
mov QWORD PTR [rsi+248], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+264]
|
|
mov QWORD PTR [rsi+256], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+272]
|
|
mov QWORD PTR [rsi+264], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+280]
|
|
mov QWORD PTR [rsi+272], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+288]
|
|
mov QWORD PTR [rsi+280], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+296]
|
|
mov QWORD PTR [rsi+288], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+304]
|
|
mov QWORD PTR [rsi+296], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+312]
|
|
mov QWORD PTR [rsi+304], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+320]
|
|
mov QWORD PTR [rsi+312], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+328]
|
|
mov QWORD PTR [rsi+320], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+336]
|
|
mov QWORD PTR [rsi+328], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+344]
|
|
mov QWORD PTR [rsi+336], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+352]
|
|
mov QWORD PTR [rsi+344], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+360]
|
|
mov QWORD PTR [rsi+352], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+368]
|
|
mov QWORD PTR [rsi+360], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+376]
|
|
mov QWORD PTR [rsi+368], rax
|
|
adc r9, 0
|
|
mov QWORD PTR [rsi+376], r9
|
|
add rsp, 1192
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_3072_mul_avx2_48 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Square a and put the result in r. (r = a * a)
; *
; * r Destination for the result (first argument, passed in rcx).
; * a Number to square; 48 64-bit words are read (second argument, passed in rdx).
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_sqr_avx2_48 PROC
|
|
push r12
|
|
sub rsp, 984
|
|
mov QWORD PTR [rsp+960], rcx
|
|
mov QWORD PTR [rsp+968], rdx
|
|
lea r10, QWORD PTR [rsp+768]
|
|
lea r11, QWORD PTR [rdx+192]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r9, r9
|
|
add rax, QWORD PTR [r11]
|
|
mov r8, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r10], rax
|
|
adc r8, QWORD PTR [r11+8]
|
|
mov rax, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
adc rax, QWORD PTR [r11+16]
|
|
mov r8, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
adc r8, QWORD PTR [r11+24]
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
adc rax, QWORD PTR [r11+32]
|
|
mov r8, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
adc r8, QWORD PTR [r11+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
adc rax, QWORD PTR [r11+48]
|
|
mov r8, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
adc r8, QWORD PTR [r11+56]
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
adc rax, QWORD PTR [r11+64]
|
|
mov r8, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
adc r8, QWORD PTR [r11+72]
|
|
mov rax, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
adc rax, QWORD PTR [r11+80]
|
|
mov r8, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
adc r8, QWORD PTR [r11+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
adc rax, QWORD PTR [r11+96]
|
|
mov r8, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
adc r8, QWORD PTR [r11+104]
|
|
mov rax, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
adc rax, QWORD PTR [r11+112]
|
|
mov r8, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
adc r8, QWORD PTR [r11+120]
|
|
mov rax, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
adc rax, QWORD PTR [r11+128]
|
|
mov r8, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
adc r8, QWORD PTR [r11+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
adc rax, QWORD PTR [r11+144]
|
|
mov r8, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
adc r8, QWORD PTR [r11+152]
|
|
mov rax, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
adc rax, QWORD PTR [r11+160]
|
|
mov r8, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
adc r8, QWORD PTR [r11+168]
|
|
mov rax, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
adc rax, QWORD PTR [r11+176]
|
|
mov r8, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
adc r8, QWORD PTR [r11+184]
|
|
mov QWORD PTR [r10+184], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+976], r9
|
|
mov rdx, r10
|
|
mov rcx, rsp
|
|
call sp_3072_sqr_avx2_24
|
|
mov rdx, QWORD PTR [rsp+968]
|
|
lea rcx, QWORD PTR [rsp+384]
|
|
add rdx, 192
|
|
call sp_3072_sqr_avx2_24
|
|
mov rdx, QWORD PTR [rsp+968]
|
|
mov rcx, QWORD PTR [rsp+960]
|
|
call sp_3072_sqr_avx2_24
|
|
IFDEF _WIN64
|
|
mov rdx, QWORD PTR [rsp+968]
|
|
mov rcx, QWORD PTR [rsp+960]
|
|
ENDIF
|
|
mov r12, QWORD PTR [rsp+976]
|
|
mov r11, rcx
|
|
lea r10, QWORD PTR [rsp+768]
|
|
mov r9, r12
|
|
neg r12
|
|
add r11, 384
|
|
mov rax, QWORD PTR [r10]
|
|
pext rax, rax, r12
|
|
add rax, rax
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r11], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r11+8], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r11+16], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r11+24], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r11+32], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r11+40], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r11+48], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r11+56], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r11+64], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r11+72], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r11+80], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r11+88], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r11+96], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r11+104], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r11+112], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r11+120], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r11+128], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r11+136], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r11+144], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r11+152], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r11+160], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r11+168], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r11+176], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov QWORD PTR [r11+184], r8
|
|
adc r9, 0
|
|
lea rdx, QWORD PTR [rsp+384]
|
|
mov r10, rsp
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rdx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rdx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rdx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rdx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rdx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rdx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rdx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rdx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rdx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rdx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rdx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rdx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rdx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rdx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rdx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rdx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rdx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rdx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rdx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rdx+376]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb r9, 0
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rcx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rcx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rcx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rcx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rcx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rcx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rcx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rcx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rcx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rcx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rcx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rcx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rcx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rcx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rcx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rcx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rcx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rcx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rcx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rcx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rcx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rcx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rcx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rcx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rcx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rcx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rcx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rcx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rcx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rcx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rcx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rcx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb r9, 0
|
|
sub r11, 192
|
|
; Add in place
|
|
mov rax, QWORD PTR [r11]
|
|
add rax, QWORD PTR [r10]
|
|
mov r8, QWORD PTR [r11+8]
|
|
mov QWORD PTR [r11], rax
|
|
adc r8, QWORD PTR [r10+8]
|
|
mov rax, QWORD PTR [r11+16]
|
|
mov QWORD PTR [r11+8], r8
|
|
adc rax, QWORD PTR [r10+16]
|
|
mov r8, QWORD PTR [r11+24]
|
|
mov QWORD PTR [r11+16], rax
|
|
adc r8, QWORD PTR [r10+24]
|
|
mov rax, QWORD PTR [r11+32]
|
|
mov QWORD PTR [r11+24], r8
|
|
adc rax, QWORD PTR [r10+32]
|
|
mov r8, QWORD PTR [r11+40]
|
|
mov QWORD PTR [r11+32], rax
|
|
adc r8, QWORD PTR [r10+40]
|
|
mov rax, QWORD PTR [r11+48]
|
|
mov QWORD PTR [r11+40], r8
|
|
adc rax, QWORD PTR [r10+48]
|
|
mov r8, QWORD PTR [r11+56]
|
|
mov QWORD PTR [r11+48], rax
|
|
adc r8, QWORD PTR [r10+56]
|
|
mov rax, QWORD PTR [r11+64]
|
|
mov QWORD PTR [r11+56], r8
|
|
adc rax, QWORD PTR [r10+64]
|
|
mov r8, QWORD PTR [r11+72]
|
|
mov QWORD PTR [r11+64], rax
|
|
adc r8, QWORD PTR [r10+72]
|
|
mov rax, QWORD PTR [r11+80]
|
|
mov QWORD PTR [r11+72], r8
|
|
adc rax, QWORD PTR [r10+80]
|
|
mov r8, QWORD PTR [r11+88]
|
|
mov QWORD PTR [r11+80], rax
|
|
adc r8, QWORD PTR [r10+88]
|
|
mov rax, QWORD PTR [r11+96]
|
|
mov QWORD PTR [r11+88], r8
|
|
adc rax, QWORD PTR [r10+96]
|
|
mov r8, QWORD PTR [r11+104]
|
|
mov QWORD PTR [r11+96], rax
|
|
adc r8, QWORD PTR [r10+104]
|
|
mov rax, QWORD PTR [r11+112]
|
|
mov QWORD PTR [r11+104], r8
|
|
adc rax, QWORD PTR [r10+112]
|
|
mov r8, QWORD PTR [r11+120]
|
|
mov QWORD PTR [r11+112], rax
|
|
adc r8, QWORD PTR [r10+120]
|
|
mov rax, QWORD PTR [r11+128]
|
|
mov QWORD PTR [r11+120], r8
|
|
adc rax, QWORD PTR [r10+128]
|
|
mov r8, QWORD PTR [r11+136]
|
|
mov QWORD PTR [r11+128], rax
|
|
adc r8, QWORD PTR [r10+136]
|
|
mov rax, QWORD PTR [r11+144]
|
|
mov QWORD PTR [r11+136], r8
|
|
adc rax, QWORD PTR [r10+144]
|
|
mov r8, QWORD PTR [r11+152]
|
|
mov QWORD PTR [r11+144], rax
|
|
adc r8, QWORD PTR [r10+152]
|
|
mov rax, QWORD PTR [r11+160]
|
|
mov QWORD PTR [r11+152], r8
|
|
adc rax, QWORD PTR [r10+160]
|
|
mov r8, QWORD PTR [r11+168]
|
|
mov QWORD PTR [r11+160], rax
|
|
adc r8, QWORD PTR [r10+168]
|
|
mov rax, QWORD PTR [r11+176]
|
|
mov QWORD PTR [r11+168], r8
|
|
adc rax, QWORD PTR [r10+176]
|
|
mov r8, QWORD PTR [r11+184]
|
|
mov QWORD PTR [r11+176], rax
|
|
adc r8, QWORD PTR [r10+184]
|
|
mov rax, QWORD PTR [r11+192]
|
|
mov QWORD PTR [r11+184], r8
|
|
adc rax, QWORD PTR [r10+192]
|
|
mov r8, QWORD PTR [r11+200]
|
|
mov QWORD PTR [r11+192], rax
|
|
adc r8, QWORD PTR [r10+200]
|
|
mov rax, QWORD PTR [r11+208]
|
|
mov QWORD PTR [r11+200], r8
|
|
adc rax, QWORD PTR [r10+208]
|
|
mov r8, QWORD PTR [r11+216]
|
|
mov QWORD PTR [r11+208], rax
|
|
adc r8, QWORD PTR [r10+216]
|
|
mov rax, QWORD PTR [r11+224]
|
|
mov QWORD PTR [r11+216], r8
|
|
adc rax, QWORD PTR [r10+224]
|
|
mov r8, QWORD PTR [r11+232]
|
|
mov QWORD PTR [r11+224], rax
|
|
adc r8, QWORD PTR [r10+232]
|
|
mov rax, QWORD PTR [r11+240]
|
|
mov QWORD PTR [r11+232], r8
|
|
adc rax, QWORD PTR [r10+240]
|
|
mov r8, QWORD PTR [r11+248]
|
|
mov QWORD PTR [r11+240], rax
|
|
adc r8, QWORD PTR [r10+248]
|
|
mov rax, QWORD PTR [r11+256]
|
|
mov QWORD PTR [r11+248], r8
|
|
adc rax, QWORD PTR [r10+256]
|
|
mov r8, QWORD PTR [r11+264]
|
|
mov QWORD PTR [r11+256], rax
|
|
adc r8, QWORD PTR [r10+264]
|
|
mov rax, QWORD PTR [r11+272]
|
|
mov QWORD PTR [r11+264], r8
|
|
adc rax, QWORD PTR [r10+272]
|
|
mov r8, QWORD PTR [r11+280]
|
|
mov QWORD PTR [r11+272], rax
|
|
adc r8, QWORD PTR [r10+280]
|
|
mov rax, QWORD PTR [r11+288]
|
|
mov QWORD PTR [r11+280], r8
|
|
adc rax, QWORD PTR [r10+288]
|
|
mov r8, QWORD PTR [r11+296]
|
|
mov QWORD PTR [r11+288], rax
|
|
adc r8, QWORD PTR [r10+296]
|
|
mov rax, QWORD PTR [r11+304]
|
|
mov QWORD PTR [r11+296], r8
|
|
adc rax, QWORD PTR [r10+304]
|
|
mov r8, QWORD PTR [r11+312]
|
|
mov QWORD PTR [r11+304], rax
|
|
adc r8, QWORD PTR [r10+312]
|
|
mov rax, QWORD PTR [r11+320]
|
|
mov QWORD PTR [r11+312], r8
|
|
adc rax, QWORD PTR [r10+320]
|
|
mov r8, QWORD PTR [r11+328]
|
|
mov QWORD PTR [r11+320], rax
|
|
adc r8, QWORD PTR [r10+328]
|
|
mov rax, QWORD PTR [r11+336]
|
|
mov QWORD PTR [r11+328], r8
|
|
adc rax, QWORD PTR [r10+336]
|
|
mov r8, QWORD PTR [r11+344]
|
|
mov QWORD PTR [r11+336], rax
|
|
adc r8, QWORD PTR [r10+344]
|
|
mov rax, QWORD PTR [r11+352]
|
|
mov QWORD PTR [r11+344], r8
|
|
adc rax, QWORD PTR [r10+352]
|
|
mov r8, QWORD PTR [r11+360]
|
|
mov QWORD PTR [r11+352], rax
|
|
adc r8, QWORD PTR [r10+360]
|
|
mov rax, QWORD PTR [r11+368]
|
|
mov QWORD PTR [r11+360], r8
|
|
adc rax, QWORD PTR [r10+368]
|
|
mov r8, QWORD PTR [r11+376]
|
|
mov QWORD PTR [r11+368], rax
|
|
adc r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r11+376], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+576], r9
|
|
; Add in place
|
|
mov rax, QWORD PTR [r11+192]
|
|
add rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [r11+200]
|
|
mov QWORD PTR [r11+192], rax
|
|
adc r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [r11+208]
|
|
mov QWORD PTR [r11+200], r8
|
|
adc rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [r11+216]
|
|
mov QWORD PTR [r11+208], rax
|
|
adc r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [r11+224]
|
|
mov QWORD PTR [r11+216], r8
|
|
adc rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [r11+232]
|
|
mov QWORD PTR [r11+224], rax
|
|
adc r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [r11+240]
|
|
mov QWORD PTR [r11+232], r8
|
|
adc rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [r11+248]
|
|
mov QWORD PTR [r11+240], rax
|
|
adc r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [r11+256]
|
|
mov QWORD PTR [r11+248], r8
|
|
adc rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [r11+264]
|
|
mov QWORD PTR [r11+256], rax
|
|
adc r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [r11+272]
|
|
mov QWORD PTR [r11+264], r8
|
|
adc rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [r11+280]
|
|
mov QWORD PTR [r11+272], rax
|
|
adc r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [r11+288]
|
|
mov QWORD PTR [r11+280], r8
|
|
adc rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [r11+296]
|
|
mov QWORD PTR [r11+288], rax
|
|
adc r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [r11+304]
|
|
mov QWORD PTR [r11+296], r8
|
|
adc rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [r11+312]
|
|
mov QWORD PTR [r11+304], rax
|
|
adc r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [r11+320]
|
|
mov QWORD PTR [r11+312], r8
|
|
adc rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [r11+328]
|
|
mov QWORD PTR [r11+320], rax
|
|
adc r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [r11+336]
|
|
mov QWORD PTR [r11+328], r8
|
|
adc rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [r11+344]
|
|
mov QWORD PTR [r11+336], rax
|
|
adc r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [r11+352]
|
|
mov QWORD PTR [r11+344], r8
|
|
adc rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [r11+360]
|
|
mov QWORD PTR [r11+352], rax
|
|
adc r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [r11+368]
|
|
mov QWORD PTR [r11+360], r8
|
|
adc rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [r11+376]
|
|
mov QWORD PTR [r11+368], rax
|
|
adc r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [r11+384]
|
|
mov QWORD PTR [r11+376], r8
|
|
adc rax, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [r11+384], rax
|
|
; Add to zero
|
|
mov rax, QWORD PTR [rdx+200]
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [r11+392], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [r11+400], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [r11+408], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [r11+416], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [r11+424], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [r11+432], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+256]
|
|
mov QWORD PTR [r11+440], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+264]
|
|
mov QWORD PTR [r11+448], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+272]
|
|
mov QWORD PTR [r11+456], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+280]
|
|
mov QWORD PTR [r11+464], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+288]
|
|
mov QWORD PTR [r11+472], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+296]
|
|
mov QWORD PTR [r11+480], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+304]
|
|
mov QWORD PTR [r11+488], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+312]
|
|
mov QWORD PTR [r11+496], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+320]
|
|
mov QWORD PTR [r11+504], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+328]
|
|
mov QWORD PTR [r11+512], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+336]
|
|
mov QWORD PTR [r11+520], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+344]
|
|
mov QWORD PTR [r11+528], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+352]
|
|
mov QWORD PTR [r11+536], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+360]
|
|
mov QWORD PTR [r11+544], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+368]
|
|
mov QWORD PTR [r11+552], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+376]
|
|
mov QWORD PTR [r11+560], r8
|
|
adc rax, 0
|
|
mov QWORD PTR [r11+568], rax
|
|
add rsp, 984
|
|
pop r12
|
|
ret
|
|
sp_3072_sqr_avx2_48 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Mul a by digit b into r. (r = a * b)
; *
; * r A single precision integer. 49 limbs are written: the 48 product
; *   limbs followed by the final carry limb.
; * a A single precision integer. 48 limbs are read.
; * b A single precision digit.
; *
; * Register use: rcx = r, rdx = a (copied to r9 because mul clobbers rdx),
; * r8 = b. r10/r11/r12 form a rotating three-limb accumulator; r12 is
; * saved and restored, rax/rdx/r9/r10/r11 are overwritten.
; */
_text SEGMENT READONLY PARA

; One interior limb of the unrolled multiply.
;   limbOff - byte offset of the limb in both a and r
;   accLo   - accumulator limb completed and stored by this step
;   accMid  - next accumulator limb, receives the high product half
;   accHi   - limb after that, zeroed here and given the carry out
; The store sits between add and adc on purpose: mov does not modify
; flags, so the carry produced by add is still live for the first adc.
MUL_D_48_STEP MACRO limbOff, accLo, accMid, accHi
        mov     rax, r8
        xor     accHi, accHi
        mul     QWORD PTR [r9+limbOff]
        add     accLo, rax
        mov     QWORD PTR [rcx+limbOff], accLo
        adc     accMid, rdx
        adc     accHi, 0
ENDM

sp_3072_mul_d_48 PROC
        push    r12
        mov     r9, rdx
        ; A[0] * B : seeds the accumulator, no carry-in yet
        mov     rax, r8
        xor     r12, r12
        mul     QWORD PTR [r9]
        mov     r10, rax
        mov     r11, rdx
        mov     QWORD PTR [rcx], r10
        ; A[1] * B .. A[46] * B : accumulator registers rotate every limb
        MUL_D_48_STEP   8, r11, r12, r10
        MUL_D_48_STEP   16, r12, r10, r11
        MUL_D_48_STEP   24, r10, r11, r12
        MUL_D_48_STEP   32, r11, r12, r10
        MUL_D_48_STEP   40, r12, r10, r11
        MUL_D_48_STEP   48, r10, r11, r12
        MUL_D_48_STEP   56, r11, r12, r10
        MUL_D_48_STEP   64, r12, r10, r11
        MUL_D_48_STEP   72, r10, r11, r12
        MUL_D_48_STEP   80, r11, r12, r10
        MUL_D_48_STEP   88, r12, r10, r11
        MUL_D_48_STEP   96, r10, r11, r12
        MUL_D_48_STEP   104, r11, r12, r10
        MUL_D_48_STEP   112, r12, r10, r11
        MUL_D_48_STEP   120, r10, r11, r12
        MUL_D_48_STEP   128, r11, r12, r10
        MUL_D_48_STEP   136, r12, r10, r11
        MUL_D_48_STEP   144, r10, r11, r12
        MUL_D_48_STEP   152, r11, r12, r10
        MUL_D_48_STEP   160, r12, r10, r11
        MUL_D_48_STEP   168, r10, r11, r12
        MUL_D_48_STEP   176, r11, r12, r10
        MUL_D_48_STEP   184, r12, r10, r11
        MUL_D_48_STEP   192, r10, r11, r12
        MUL_D_48_STEP   200, r11, r12, r10
        MUL_D_48_STEP   208, r12, r10, r11
        MUL_D_48_STEP   216, r10, r11, r12
        MUL_D_48_STEP   224, r11, r12, r10
        MUL_D_48_STEP   232, r12, r10, r11
        MUL_D_48_STEP   240, r10, r11, r12
        MUL_D_48_STEP   248, r11, r12, r10
        MUL_D_48_STEP   256, r12, r10, r11
        MUL_D_48_STEP   264, r10, r11, r12
        MUL_D_48_STEP   272, r11, r12, r10
        MUL_D_48_STEP   280, r12, r10, r11
        MUL_D_48_STEP   288, r10, r11, r12
        MUL_D_48_STEP   296, r11, r12, r10
        MUL_D_48_STEP   304, r12, r10, r11
        MUL_D_48_STEP   312, r10, r11, r12
        MUL_D_48_STEP   320, r11, r12, r10
        MUL_D_48_STEP   328, r12, r10, r11
        MUL_D_48_STEP   336, r10, r11, r12
        MUL_D_48_STEP   344, r11, r12, r10
        MUL_D_48_STEP   352, r12, r10, r11
        MUL_D_48_STEP   360, r10, r11, r12
        MUL_D_48_STEP   368, r11, r12, r10
        ; A[47] * B : last limb, the high half becomes r[48]
        mov     rax, r8
        mul     QWORD PTR [r9+376]
        add     r12, rax
        adc     r10, rdx
        mov     QWORD PTR [rcx+376], r12
        mov     QWORD PTR [rcx+384], r10
        pop     r12
        ret
sp_3072_mul_d_48 ENDP
_text ENDS
|
|
; /* Conditionally subtract b from a using the mask m.
; * m is -1 to subtract and 0 when not subtracting.
; *
; * r A single precision number representing conditional subtract result.
; * a A single precision number to subtract from.
; * b A single precision number to subtract.
; * m Mask value to apply.
; *
; * 24 limbs of a, b and r are processed. rax returns 0 when the
; * subtraction did not borrow and -1 when it did.
; * Register use: rcx = r, rdx = a, r8 = b, r9 = m. A masked copy of b is
; * built in 192 bytes of stack first; r8 is then reused as scratch.
; */
_text SEGMENT READONLY PARA

; Copy two limbs of b, ANDed with the mask, into the stack buffer.
;   limbOff - byte offset of the first limb of the pair
COND_SUB_24_MASK2 MACRO limbOff
        mov     r10, QWORD PTR [r8+limbOff]
        mov     r11, QWORD PTR [r8+limbOff+8]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+limbOff], r10
        mov     QWORD PTR [rsp+limbOff+8], r11
ENDM

; Subtract one masked limb with borrow, then store the previous result.
;   limbOff - byte offset of the limb being subtracted
;   curReg  - register receiving this limb's difference
;   prevReg - register still holding the previous limb's difference
; The store is a mov, which leaves the borrow flag intact for the next sbb.
COND_SUB_24_LIMB MACRO limbOff, curReg, prevReg
        mov     curReg, QWORD PTR [rdx+limbOff]
        mov     r8, QWORD PTR [rsp+limbOff]
        sbb     curReg, r8
        mov     QWORD PTR [rcx+limbOff-8], prevReg
ENDM

sp_3072_cond_sub_24 PROC
        sub     rsp, 192
        mov     rax, 0
        ; Phase 1: stack[0..23] = b[0..23] & m
        COND_SUB_24_MASK2       0
        COND_SUB_24_MASK2       16
        COND_SUB_24_MASK2       32
        COND_SUB_24_MASK2       48
        COND_SUB_24_MASK2       64
        COND_SUB_24_MASK2       80
        COND_SUB_24_MASK2       96
        COND_SUB_24_MASK2       112
        COND_SUB_24_MASK2       128
        COND_SUB_24_MASK2       144
        COND_SUB_24_MASK2       160
        COND_SUB_24_MASK2       176
        ; Phase 2: r = a - stack, limb 0 starts the borrow chain
        mov     r10, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rsp]
        sub     r10, r8
        COND_SUB_24_LIMB        8, r11, r10
        COND_SUB_24_LIMB        16, r10, r11
        COND_SUB_24_LIMB        24, r11, r10
        COND_SUB_24_LIMB        32, r10, r11
        COND_SUB_24_LIMB        40, r11, r10
        COND_SUB_24_LIMB        48, r10, r11
        COND_SUB_24_LIMB        56, r11, r10
        COND_SUB_24_LIMB        64, r10, r11
        COND_SUB_24_LIMB        72, r11, r10
        COND_SUB_24_LIMB        80, r10, r11
        COND_SUB_24_LIMB        88, r11, r10
        COND_SUB_24_LIMB        96, r10, r11
        COND_SUB_24_LIMB        104, r11, r10
        COND_SUB_24_LIMB        112, r10, r11
        COND_SUB_24_LIMB        120, r11, r10
        COND_SUB_24_LIMB        128, r10, r11
        COND_SUB_24_LIMB        136, r11, r10
        COND_SUB_24_LIMB        144, r10, r11
        COND_SUB_24_LIMB        152, r11, r10
        COND_SUB_24_LIMB        160, r10, r11
        COND_SUB_24_LIMB        168, r11, r10
        COND_SUB_24_LIMB        176, r10, r11
        COND_SUB_24_LIMB        184, r11, r10
        mov     QWORD PTR [rcx+184], r11
        ; rax = 0 - borrow, i.e. -1 when the subtraction underflowed
        sbb     rax, 0
        add     rsp, 192
        ret
sp_3072_cond_sub_24 ENDP
_text ENDS
|
|
; /* Reduce the number back to 3072 bits using Montgomery reduction.
; *
; * a A single precision number to reduce in place. Limbs a[0..47] are
; *   consumed; the reduced value is written to a[0..23].
; * m The single precision number representing the modulus (24 limbs).
; * mp The digit representing the negative inverse of m mod 2^n.
; *
; * Register use: rcx = a (advanced one limb per iteration), rdx = m
; * (copied to r9 because mul clobbers rdx), r8 = mp.
; *   r10 loop counter          r13 mu for the current iteration
; *   r15 cached a[i]           rdi cached a[i+1]
; *   r11/r12 alternating carry limbs, r14 scratch limb
; *   rsi carry out of the top limb, turned into the subtract mask at the end
; * r12-r15, rdi and rsi are saved and restored.
; */
_text SEGMENT READONLY PARA

; a[i+k] += m[k] * mu for an interior limb that lives in memory.
;   limbOff  - byte offset of limb k in both m and the current a window
;   carryIn  - carry limb produced by the previous step
;   carryOut - carry limb produced by this step (zeroed first)
MONT_RED_24_STEP MACRO limbOff, carryIn, carryOut
        mov     rax, r13
        xor     carryOut, carryOut
        mul     QWORD PTR [r9+limbOff]
        mov     r14, QWORD PTR [rcx+limbOff]
        add     r14, rax
        adc     carryOut, rdx
        add     r14, carryIn
        mov     QWORD PTR [rcx+limbOff], r14
        adc     carryOut, 0
ENDM

sp_3072_mont_reduce_24 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        mov     r9, rdx
        xor     rsi, rsi
        ; i = 24
        mov     r10, 24
        mov     r15, QWORD PTR [rcx]
        mov     rdi, QWORD PTR [rcx+8]
L_3072_mont_loop_24:
        ; mu = a[i] * mp
        mov     r13, r15
        imul    r13, r8
        ; a[i+0] += m[0] * mu  (only the carry out is kept; the limb is not stored)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9]
        add     r15, rax
        adc     r12, rdx
        ; a[i+1] += m[1] * mu  (result stays in r15: it is a[i] of the next pass)
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+8]
        mov     r15, rdi
        add     r15, rax
        adc     r11, rdx
        add     r15, r12
        adc     r11, 0
        ; a[i+2] += m[2] * mu  (result stays in rdi: it is a[i+1] of the next pass)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+16]
        mov     rdi, QWORD PTR [rcx+16]
        add     rdi, rax
        adc     r12, rdx
        add     rdi, r11
        adc     r12, 0
        ; a[i+3] .. a[i+22] += m[3] .. m[22] * mu
        MONT_RED_24_STEP        24, r12, r11
        MONT_RED_24_STEP        32, r11, r12
        MONT_RED_24_STEP        40, r12, r11
        MONT_RED_24_STEP        48, r11, r12
        MONT_RED_24_STEP        56, r12, r11
        MONT_RED_24_STEP        64, r11, r12
        MONT_RED_24_STEP        72, r12, r11
        MONT_RED_24_STEP        80, r11, r12
        MONT_RED_24_STEP        88, r12, r11
        MONT_RED_24_STEP        96, r11, r12
        MONT_RED_24_STEP        104, r12, r11
        MONT_RED_24_STEP        112, r11, r12
        MONT_RED_24_STEP        120, r12, r11
        MONT_RED_24_STEP        128, r11, r12
        MONT_RED_24_STEP        136, r12, r11
        MONT_RED_24_STEP        144, r11, r12
        MONT_RED_24_STEP        152, r12, r11
        MONT_RED_24_STEP        160, r11, r12
        MONT_RED_24_STEP        168, r12, r11
        MONT_RED_24_STEP        176, r11, r12
        ; a[i+23] += m[23] * mu, then fold the running top carry into a[i+24]
        mov     rax, r13
        mul     QWORD PTR [r9+184]
        mov     r14, QWORD PTR [rcx+184]
        add     r12, rax
        adc     rdx, rsi
        ; must be mov, not xor: the carry from the adc above is consumed next
        mov     rsi, 0
        adc     rsi, 0
        add     r14, r12
        mov     QWORD PTR [rcx+184], r14
        adc     QWORD PTR [rcx+192], rdx
        adc     rsi, 0
        ; i -= 1
        add     rcx, 8
        dec     r10
        jnz     L_3072_mont_loop_24
        ; rcx now points at a[24]; write back the two limbs held in registers
        mov     QWORD PTR [rcx], r15
        mov     QWORD PTR [rcx+8], rdi
        ; rsi: carry (0/1) -> mask (0/-1)
        neg     rsi
        ; Arguments for sp_3072_cond_sub_24(r, a, b, m).
        ; r9 still holds the modulus pointer, so it has to be copied to r8
        ; BEFORE r9 is overwritten with the mask. The previous non-_WIN64
        ; arm did this in the opposite order and passed the mask as b.
        mov     r8, r9                  ; b = modulus
        mov     r9, rsi                 ; m = mask
        mov     rdx, rcx                ; a = upper half (a[24..47])
        sub     rcx, 192                ; r = start of the caller's buffer
        ; NOTE(review): as in the original, no shadow space is reserved and
        ; rsp is not re-aligned before this call - confirm the callee never
        ; depends on either.
        call    sp_3072_cond_sub_24
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_3072_mont_reduce_24 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
; /* Conditionally subtract b from a using the mask m.
; * m is -1 to subtract and 0 when not subtracting.
; *
; * r A single precision number representing conditional subtract result.
; * a A single precision number to subtract from.
; * b A single precision number to subtract.
; * m Mask value to apply.
; *
; * 24 limbs of a, b and r are processed. rax returns 0 when the
; * subtraction did not borrow and -1 when it did.
; * Register use: rcx = r, rdx = a, r8 = b, r9 = m. r10/r11/r12 rotate
; * between "masked b limb", "a limb / difference" and "difference waiting
; * to be stored"; r12 is saved and restored.
; */
_text SEGMENT READONLY PARA

; One limb of the borrow chain.
;   limbOff - byte offset of the limb being subtracted
;   bReg    - receives b[k], then b[k] filtered through the mask with pext
;   aReg    - receives a[k] and ends up holding the difference
;   prevReg - difference of limb k-1, stored to r by this step
; pext and mov leave the carry flag alone, so the borrow from the previous
; limb is still live when sbb executes.
COND_SUB_AVX2_24_LIMB MACRO limbOff, bReg, aReg, prevReg
        mov     bReg, QWORD PTR [r8+limbOff]
        mov     aReg, QWORD PTR [rdx+limbOff]
        pext    bReg, bReg, r9
        mov     QWORD PTR [rcx+limbOff-8], prevReg
        sbb     aReg, bReg
ENDM

sp_3072_cond_sub_avx2_24 PROC
        push    r12
        mov     rax, 0
        ; limb 0 starts the borrow chain with a plain sub
        mov     r12, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    r12, r12, r9
        sub     r10, r12
        ; limbs 1 .. 23
        COND_SUB_AVX2_24_LIMB   8, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   16, r10, r12, r11
        COND_SUB_AVX2_24_LIMB   24, r11, r10, r12
        COND_SUB_AVX2_24_LIMB   32, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   40, r10, r12, r11
        COND_SUB_AVX2_24_LIMB   48, r11, r10, r12
        COND_SUB_AVX2_24_LIMB   56, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   64, r10, r12, r11
        COND_SUB_AVX2_24_LIMB   72, r11, r10, r12
        COND_SUB_AVX2_24_LIMB   80, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   88, r10, r12, r11
        COND_SUB_AVX2_24_LIMB   96, r11, r10, r12
        COND_SUB_AVX2_24_LIMB   104, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   112, r10, r12, r11
        COND_SUB_AVX2_24_LIMB   120, r11, r10, r12
        COND_SUB_AVX2_24_LIMB   128, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   136, r10, r12, r11
        COND_SUB_AVX2_24_LIMB   144, r11, r10, r12
        COND_SUB_AVX2_24_LIMB   152, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   160, r10, r12, r11
        COND_SUB_AVX2_24_LIMB   168, r11, r10, r12
        COND_SUB_AVX2_24_LIMB   176, r12, r11, r10
        COND_SUB_AVX2_24_LIMB   184, r10, r12, r11
        mov     QWORD PTR [rcx+184], r12
        ; rax = 0 - borrow, i.e. -1 when the subtraction underflowed
        sbb     rax, 0
        pop     r12
        ret
sp_3072_cond_sub_avx2_24 ENDP
_text ENDS
ENDIF
|
|
; /* Mul a by digit b into r. (r = a * b)
; *
; * r A single precision integer. 25 limbs are written: the 24 product
; *   limbs followed by the final carry limb.
; * a A single precision integer. 24 limbs are read.
; * b A single precision digit.
; *
; * Register use: rcx = r, rdx = a (copied to r9 because mul clobbers rdx),
; * r8 = b. r10/r11/r12 form a rotating three-limb accumulator; r12 is
; * saved and restored, rax/rdx/r9/r10/r11 are overwritten.
; */
_text SEGMENT READONLY PARA

; One interior limb of the unrolled multiply.
;   limbOff - byte offset of the limb in both a and r
;   accLo   - accumulator limb completed and stored by this step
;   accMid  - next accumulator limb, receives the high product half
;   accHi   - limb after that, zeroed here and given the carry out
; The store sits between add and adc on purpose: mov does not modify
; flags, so the carry produced by add is still live for the first adc.
MUL_D_24_STEP MACRO limbOff, accLo, accMid, accHi
        mov     rax, r8
        xor     accHi, accHi
        mul     QWORD PTR [r9+limbOff]
        add     accLo, rax
        mov     QWORD PTR [rcx+limbOff], accLo
        adc     accMid, rdx
        adc     accHi, 0
ENDM

sp_3072_mul_d_24 PROC
        push    r12
        mov     r9, rdx
        ; A[0] * B : seeds the accumulator, no carry-in yet
        mov     rax, r8
        xor     r12, r12
        mul     QWORD PTR [r9]
        mov     r10, rax
        mov     r11, rdx
        mov     QWORD PTR [rcx], r10
        ; A[1] * B .. A[22] * B : accumulator registers rotate every limb
        MUL_D_24_STEP   8, r11, r12, r10
        MUL_D_24_STEP   16, r12, r10, r11
        MUL_D_24_STEP   24, r10, r11, r12
        MUL_D_24_STEP   32, r11, r12, r10
        MUL_D_24_STEP   40, r12, r10, r11
        MUL_D_24_STEP   48, r10, r11, r12
        MUL_D_24_STEP   56, r11, r12, r10
        MUL_D_24_STEP   64, r12, r10, r11
        MUL_D_24_STEP   72, r10, r11, r12
        MUL_D_24_STEP   80, r11, r12, r10
        MUL_D_24_STEP   88, r12, r10, r11
        MUL_D_24_STEP   96, r10, r11, r12
        MUL_D_24_STEP   104, r11, r12, r10
        MUL_D_24_STEP   112, r12, r10, r11
        MUL_D_24_STEP   120, r10, r11, r12
        MUL_D_24_STEP   128, r11, r12, r10
        MUL_D_24_STEP   136, r12, r10, r11
        MUL_D_24_STEP   144, r10, r11, r12
        MUL_D_24_STEP   152, r11, r12, r10
        MUL_D_24_STEP   160, r12, r10, r11
        MUL_D_24_STEP   168, r10, r11, r12
        MUL_D_24_STEP   176, r11, r12, r10
        ; A[23] * B : last limb, the high half becomes r[24]
        mov     rax, r8
        mul     QWORD PTR [r9+184]
        add     r12, rax
        adc     r10, rdx
        mov     QWORD PTR [rcx+184], r12
        mov     QWORD PTR [rcx+192], r10
        pop     r12
        ret
sp_3072_mul_d_24 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Mul a by digit b into r. (r = a * b)
;  *
;  * Registers on entry, as used below: rcx = r, rdx = a, r8 = b.
;  * 24 words of a are read (byte offsets 0..184) and 25 words of r are
;  * written (byte offsets 0..192); the extra top word is the carry out.
;  * Uses mulx (BMI2) and adcx/adox (ADX). r12 and r13 are saved and
;  * restored; rax, rdx, r9, r10 and r11 are overwritten.
;  *
;  * r  A single precision integer.
;  * a  A single precision integer.
;  * b  A single precision digit.
;  */
_text SEGMENT READONLY PARA
sp_3072_mul_d_avx2_24 PROC
        push    r12
        push    r13
        ; mulx takes one factor implicitly from rdx, so a moves to rax and
        ; b is placed in rdx for the whole routine.
        mov     rax, rdx
        ; A[0] * B
        mov     rdx, r8
        ; r13 is the constant zero; this xor also clears CF and OF before
        ; the adcx (CF) and adox (OF) carry chains start.
        xor     r13, r13
        mulx    r12, r11, QWORD PTR [rax]
        mov     QWORD PTR [rcx], r11
        ; A[1] * B .. A[22] * B
        ; Assembly-time repetition only: the FOR block below expands to
        ; straight-line code, two words per list entry. r12 and r11 swap
        ; roles on every word: one collects the low product half through
        ; CF while the other is reset to zero and takes the high half
        ; through OF.
        FOR ofs, <8,24,40,56,72,88,104,120,136,152,168>
        ; odd word: r12 is completed and stored, r11 starts the next word
        mulx    r10, r9, QWORD PTR [rax+ofs]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+ofs], r12
        ; even word: r11 is completed and stored, r12 starts the next word
        mulx    r10, r9, QWORD PTR [rax+ofs+8]
        mov     r12, r13
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+ofs+8], r11
        ENDM
        ; A[23] * B
        mulx    r10, r9, QWORD PTR [rax+184]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        ; fold the last CF into the top word
        adcx    r11, r13
        mov     QWORD PTR [rcx+184], r12
        mov     QWORD PTR [rcx+192], r11
        pop     r13
        pop     r12
        ret
sp_3072_mul_d_avx2_24 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF _WIN64
|
|
; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
;  *
;  * Registers on entry, as used below: rcx = d1, rdx = d0, r8 = div.
;  * The 64-bit quotient is returned in rax. The remainder that the div
;  * instruction leaves in rdx is not returned.
;  * NOTE: the div instruction raises a divide error when div is zero or
;  * when the quotient does not fit in 64 bits, so callers must make sure
;  * that d1 < div.
;  *
;  * d1   The high order half of the number to divide.
;  * d0   The low order half of the number to divide.
;  * div  The divisor.
;  * returns the result of the division.
;  */
_text SEGMENT READONLY PARA
div_3072_word_asm_24 PROC
        ; div divides rdx:rax, so d0 goes to rax first (it arrives in rdx)
        ; and only then is rdx overwritten with d1.
        mov     rax, rdx
        mov     rdx, rcx
        div     r8
        ret
div_3072_word_asm_24 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Compare a with b in constant time.
;  *
;  * Registers on entry, as used below: rcx = a, rdx = b.
;  * All 24 words of both numbers are always read, from the most
;  * significant (byte offset 184) down to the least significant
;  * (byte offset 0), and there is no branch on the data.
;  *
;  * Working registers:
;  *   r8   mask: all ones until the first differing word, zero after it
;  *   r9   constant 0
;  *   r10  constant 1
;  *   rax  running result, starts at -1
;  *   r11, r12  masked words of a and b (r12 is saved and restored)
;  *
;  * a  A single precision integer.
;  * b  A single precision integer.
;  * return -1, 0 or 1 (in rax) if a is less than, equal to or greater
;  *        than b respectively.
;  */
_text SEGMENT READONLY PARA
sp_3072_cmp_24 PROC
        push    r12
        xor     r9, r9
        mov     r8, -1
        mov     rax, -1
        mov     r10, 1
        ; Assembly-time repetition only: this FOR block expands to 24
        ; straight-line copies of the word step, one per listed offset.
        ; Once r8 has dropped to zero both masked words are zero, the
        ; subtraction yields zero with no borrow and rax is left alone.
        FOR ofs, <184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,0>
        mov     r11, QWORD PTR [rcx+ofs]
        mov     r12, QWORD PTR [rdx+ofs]
        and     r11, r8
        and     r12, r8
        sub     r11, r12
        ; a word above b word: result becomes 1
        cmova   rax, r10
        ; a word below b word: result becomes r8, which is still -1 here
        cmovc   rax, r8
        ; words differ: clear the mask so lower words no longer count
        cmovnz  r8, r9
        ENDM
        ; No difference found: r8 is still -1 and flips the initial -1 in
        ; rax to 0. Otherwise r8 is 0 and rax keeps its 1 or -1.
        xor     rax, r8
        pop     r12
        ret
sp_3072_cmp_24 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Reduce the number back to 3072 bits using Montgomery reduction.
;  *
;  * Registers on entry, as used below: rcx = a, rdx = m, r8 = mp.
;  * The loop runs 24 times. Pass i computes mu = a[i] * mp and adds
;  * m * mu into a[i..i+24]. The four lowest live words stay in r15,
;  * rdi, rsi and rbx instead of memory, and rbp carries the overflow of
;  * the top word from one pass into the next.
;  * After the loop the carry in rbp is negated into a mask, and
;  * (m AND mask) is subtracted from the upper 24 words. That 24 word
;  * result is stored at a[0..23].
;  * Uses mulx and pext (BMI2) and adcx/adox (ADX). r12-r15, rdi, rsi,
;  * rbx and rbp are saved and restored.
;  *
;  * a   A single precision number to reduce in place.
;  * m   The single precision number representing the modulus.
;  * mp  The digit representing the negative inverse of m mod 2^n.
;  */
_text SEGMENT READONLY PARA
sp_3072_mont_reduce_avx2_24 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        push    rbp
        ; rcx and rdx are needed by mulx, so a moves to r9 and m to r10.
        mov     r9, rcx
        mov     r10, rdx
        xor     rbp, rbp
        ; i = 24
        mov     r11, 24
        ; preload a[0..3] into the register window
        mov     r15, QWORD PTR [r9]
        mov     rdi, QWORD PTR [r9+8]
        mov     rsi, QWORD PTR [r9+16]
        mov     rbx, QWORD PTR [r9+24]
        ; r9 is biased by 96 bytes; inside the loop a[i+k] is [r9+8*k-96]
        add     r9, 96
        ; rbp is already zero here; this second xor leaves it zero
        xor     rbp, rbp
L_3072_mont_loop_avx2_24:
        ; mu = a[i] * mp
        mov     rdx, r15
        mov     r12, r15
        imul    rdx, r8
        ; r14 is the constant zero for this pass; the xor also clears CF
        ; and OF for the adcx/adox chains below
        xor     r14, r14
        ; a[i+0] += m[0] * mu
        mulx    rcx, rax, QWORD PTR [r10]
        mov     r15, rdi
        adcx    r12, rax
        adox    r15, rcx
        ; a[i+1] += m[1] * mu
        mulx    rcx, rax, QWORD PTR [r10+8]
        mov     rdi, rsi
        adcx    r15, rax
        adox    rdi, rcx
        ; a[i+2] += m[2] * mu
        mulx    rcx, rax, QWORD PTR [r10+16]
        mov     rsi, rbx
        adcx    rdi, rax
        adox    rsi, rcx
        ; a[i+3] += m[3] * mu
        mulx    rcx, rax, QWORD PTR [r10+24]
        mov     rbx, QWORD PTR [r9-64]
        adcx    rsi, rax
        adox    rbx, rcx
        ; a[i+4] += m[4] * mu
        mulx    rcx, rax, QWORD PTR [r10+32]
        mov     r13, QWORD PTR [r9-56]
        adcx    rbx, rax
        adox    r13, rcx
        ; a[i+5] += m[5] * mu .. a[i+22] += m[22] * mu
        ; Assembly-time repetition only: the FOR block expands to
        ; straight-line code, two words per list entry. ofs is the byte
        ; offset of the odd word of m. r13 and r12 swap roles on every
        ; word: one is completed through CF and stored, the other is
        ; loaded with the next word of a and takes the high half via OF.
        FOR ofs, <40,56,72,88,104,120,136,152,168>
        mulx    rcx, rax, QWORD PTR [r10+ofs]
        mov     r12, QWORD PTR [r9+ofs-88]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+ofs-96], r13
        mulx    rcx, rax, QWORD PTR [r10+ofs+8]
        mov     r13, QWORD PTR [r9+ofs-80]
        adcx    r12, rax
        adox    r13, rcx
        mov     QWORD PTR [r9+ofs-88], r12
        ENDM
        ; a[i+23] += m[23] * mu
        mulx    rcx, rax, QWORD PTR [r10+184]
        mov     r12, QWORD PTR [r9+96]
        adcx    r13, rax
        adox    r12, rcx
        mov     QWORD PTR [r9+88], r13
        ; add the carry saved by the previous pass into a[i+24], then
        ; collect this pass's OF and CF into rbp for the next one
        adcx    r12, rbp
        mov     rbp, r14
        mov     QWORD PTR [r9+96], r12
        adox    rbp, r14
        adcx    rbp, r14
        ; a += 1
        add     r9, 8
        ; i -= 1
        sub     r11, 1
        jnz     L_3072_mont_loop_avx2_24
        ; r9 is now a + 192 bytes, the start of the upper half
        sub     r9, 96
        ; carry to mask: 0 stays 0, 1 becomes all ones
        neg     rbp
        ; r8 = upper half (subtraction source), r9 = a (destination)
        mov     r8, r9
        sub     r9, 192
        ; pext with rbp as the bit mask yields the m word when rbp is all
        ; ones and zero when rbp is zero. The borrow chain runs through
        ; sub/sbb only; the mov and pext in between leave CF untouched.
        ; The first four words come from the register window.
        mov     rcx, QWORD PTR [r10]
        mov     rdx, r15
        pext    rcx, rcx, rbp
        sub     rdx, rcx
        mov     rcx, QWORD PTR [r10+8]
        mov     rax, rdi
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+16]
        mov     rcx, rsi
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+8], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+24]
        mov     rdx, rbx
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+16], rcx
        sbb     rdx, rax
        ; Words 4..21, three per list entry (assembly-time expansion).
        ; rcx, rdx and rax rotate through three roles: masked m word,
        ; a word being reduced, and previous result waiting to be stored.
        FOR ofs, <32,56,80,104,128,152>
        mov     rcx, QWORD PTR [r10+ofs]
        mov     rax, QWORD PTR [r8+ofs]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+ofs-8], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+ofs+8]
        mov     rcx, QWORD PTR [r8+ofs+8]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+ofs], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+ofs+16]
        mov     rdx, QWORD PTR [r8+ofs+16]
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+ofs+8], rcx
        sbb     rdx, rax
        ENDM
        ; words 22 and 23
        mov     rcx, QWORD PTR [r10+176]
        mov     rax, QWORD PTR [r8+176]
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9+168], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+184]
        mov     rcx, QWORD PTR [r8+184]
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+176], rax
        sbb     rcx, rdx
        mov     QWORD PTR [r9+184], rcx
        pop     rbp
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_3072_mont_reduce_avx2_24 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Conditionally subtract b from a using the mask m.
;  * m is -1 to subtract and 0 to leave a unchanged (r = a).
;  *
;  * Registers on entry, as used below: rcx = r, rdx = a, r8 = b,
;  * r9 = m. All numbers are 48 words (384 bytes).
;  * The routine first builds (b AND m) in a 384 byte stack buffer and
;  * then computes r = a - buffer with a single sub/sbb borrow chain, so
;  * the same instructions run whatever the value of m.
;  * b is read completely before the first word of r is stored.
;  * The stack buffer is not wiped before returning.
;  *
;  * r  A single precision number representing condition subtract result.
;  * a  A single precision number to subtract from.
;  * b  A single precision number to subtract.
;  * m  Mask value to apply.
;  * returns (in rax) 0 when the subtraction did not borrow and -1 (all
;  *         ones) when it did.
;  */
_text SEGMENT READONLY PARA
sp_3072_cond_sub_48 PROC
        ; 48 words of scratch for the masked copy of b
        sub     rsp, 384
        ; rax is only touched again by the final sbb
        mov     rax, 0
        ; Stage 1: stack buffer = b AND m, two words per list entry.
        ; Assembly-time repetition only: expands to straight-line code.
        FOR ofs, <0,16,32,48,64,80,96,112,128,144,160,176,192,208,224,240,256,272,288,304,320,336,352,368>
        mov     r10, QWORD PTR [r8+ofs]
        mov     r11, QWORD PTR [r8+ofs+8]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+ofs], r10
        mov     QWORD PTR [rsp+ofs+8], r11
        ENDM
        ; Stage 2: r = a - buffer. b is no longer needed, so r8 is reused
        ; as scratch for the buffer word. r10 and r11 alternate: while
        ; one word is being subtracted the previous result is stored.
        ; Only sub/sbb change CF in this stage, so the borrow survives
        ; the interleaved loads and stores.
        mov     r10, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rsp]
        sub     r10, r8
        ; words 1..46, two per list entry (assembly-time expansion);
        ; ofs is the byte offset of the odd word
        FOR ofs, <8,24,40,56,72,88,104,120,136,152,168,184,200,216,232,248,264,280,296,312,328,344,360>
        mov     r11, QWORD PTR [rdx+ofs]
        mov     r8, QWORD PTR [rsp+ofs]
        sbb     r11, r8
        mov     QWORD PTR [rcx+ofs-8], r10
        mov     r10, QWORD PTR [rdx+ofs+8]
        mov     r8, QWORD PTR [rsp+ofs+8]
        sbb     r10, r8
        mov     QWORD PTR [rcx+ofs], r11
        ENDM
        ; word 47, then flush the two pending results
        mov     r11, QWORD PTR [rdx+376]
        mov     r8, QWORD PTR [rsp+376]
        sbb     r11, r8
        mov     QWORD PTR [rcx+368], r10
        mov     QWORD PTR [rcx+376], r11
        ; rax = 0 - borrow
        sbb     rax, 0
        add     rsp, 384
        ret
sp_3072_cond_sub_48 ENDP
_text ENDS
|
|
; /* Reduce the number back to 3072 bits using Montgomery reduction.
|
|
; *
|
|
; * a A single precision number to reduce in place.
|
|
; * m The single precision number representing the modulus.
|
|
; * mp The digit representing the negative inverse of m mod 2^n.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_mont_reduce_48 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
mov r9, rdx
|
|
xor rsi, rsi
|
|
; i = 48
|
|
mov r10, 48
|
|
mov r15, QWORD PTR [rcx]
|
|
mov rdi, QWORD PTR [rcx+8]
|
|
L_3072_mont_loop_48:
|
|
; mu = a[i] * mp
|
|
mov r13, r15
|
|
imul r13, r8
|
|
; a[i+0] += m[0] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9]
|
|
add r15, rax
|
|
adc r12, rdx
|
|
; a[i+1] += m[1] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+8]
|
|
mov r15, rdi
|
|
add r15, rax
|
|
adc r11, rdx
|
|
add r15, r12
|
|
adc r11, 0
|
|
; a[i+2] += m[2] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+16]
|
|
mov rdi, QWORD PTR [rcx+16]
|
|
add rdi, rax
|
|
adc r12, rdx
|
|
add rdi, r11
|
|
adc r12, 0
|
|
; a[i+3] += m[3] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+24]
|
|
mov r14, QWORD PTR [rcx+24]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+24], r14
|
|
adc r11, 0
|
|
; a[i+4] += m[4] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+32]
|
|
mov r14, QWORD PTR [rcx+32]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+32], r14
|
|
adc r12, 0
|
|
; a[i+5] += m[5] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+40]
|
|
mov r14, QWORD PTR [rcx+40]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+40], r14
|
|
adc r11, 0
|
|
; a[i+6] += m[6] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+48]
|
|
mov r14, QWORD PTR [rcx+48]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+48], r14
|
|
adc r12, 0
|
|
; a[i+7] += m[7] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+56]
|
|
mov r14, QWORD PTR [rcx+56]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+56], r14
|
|
adc r11, 0
|
|
; a[i+8] += m[8] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+64]
|
|
mov r14, QWORD PTR [rcx+64]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+64], r14
|
|
adc r12, 0
|
|
; a[i+9] += m[9] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+72]
|
|
mov r14, QWORD PTR [rcx+72]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+72], r14
|
|
adc r11, 0
|
|
; a[i+10] += m[10] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+80]
|
|
mov r14, QWORD PTR [rcx+80]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+80], r14
|
|
adc r12, 0
|
|
; a[i+11] += m[11] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+88]
|
|
mov r14, QWORD PTR [rcx+88]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+88], r14
|
|
adc r11, 0
|
|
; a[i+12] += m[12] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+96]
|
|
mov r14, QWORD PTR [rcx+96]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+96], r14
|
|
adc r12, 0
|
|
; a[i+13] += m[13] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+104]
|
|
mov r14, QWORD PTR [rcx+104]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+104], r14
|
|
adc r11, 0
|
|
; a[i+14] += m[14] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+112]
|
|
mov r14, QWORD PTR [rcx+112]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+112], r14
|
|
adc r12, 0
|
|
; a[i+15] += m[15] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+120]
|
|
mov r14, QWORD PTR [rcx+120]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+120], r14
|
|
adc r11, 0
|
|
; a[i+16] += m[16] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+128]
|
|
mov r14, QWORD PTR [rcx+128]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+128], r14
|
|
adc r12, 0
|
|
; a[i+17] += m[17] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+136]
|
|
mov r14, QWORD PTR [rcx+136]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+136], r14
|
|
adc r11, 0
|
|
; a[i+18] += m[18] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+144]
|
|
mov r14, QWORD PTR [rcx+144]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+144], r14
|
|
adc r12, 0
|
|
; a[i+19] += m[19] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+152]
|
|
mov r14, QWORD PTR [rcx+152]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+152], r14
|
|
adc r11, 0
|
|
; a[i+20] += m[20] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+160]
|
|
mov r14, QWORD PTR [rcx+160]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+160], r14
|
|
adc r12, 0
|
|
; a[i+21] += m[21] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+168]
|
|
mov r14, QWORD PTR [rcx+168]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+168], r14
|
|
adc r11, 0
|
|
; a[i+22] += m[22] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+176]
|
|
mov r14, QWORD PTR [rcx+176]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+176], r14
|
|
adc r12, 0
|
|
; a[i+23] += m[23] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+184]
|
|
mov r14, QWORD PTR [rcx+184]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+184], r14
|
|
adc r11, 0
|
|
; a[i+24] += m[24] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+192]
|
|
mov r14, QWORD PTR [rcx+192]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+192], r14
|
|
adc r12, 0
|
|
; a[i+25] += m[25] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+200]
|
|
mov r14, QWORD PTR [rcx+200]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+200], r14
|
|
adc r11, 0
|
|
; a[i+26] += m[26] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+208]
|
|
mov r14, QWORD PTR [rcx+208]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+208], r14
|
|
adc r12, 0
|
|
; a[i+27] += m[27] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+216]
|
|
mov r14, QWORD PTR [rcx+216]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+216], r14
|
|
adc r11, 0
|
|
; a[i+28] += m[28] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+224]
|
|
mov r14, QWORD PTR [rcx+224]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+224], r14
|
|
adc r12, 0
|
|
; a[i+29] += m[29] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+232]
|
|
mov r14, QWORD PTR [rcx+232]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+232], r14
|
|
adc r11, 0
|
|
; a[i+30] += m[30] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+240]
|
|
mov r14, QWORD PTR [rcx+240]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+240], r14
|
|
adc r12, 0
|
|
; a[i+31] += m[31] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+248]
|
|
mov r14, QWORD PTR [rcx+248]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+248], r14
|
|
adc r11, 0
|
|
; a[i+32] += m[32] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+256]
|
|
mov r14, QWORD PTR [rcx+256]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+256], r14
|
|
adc r12, 0
|
|
; a[i+33] += m[33] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+264]
|
|
mov r14, QWORD PTR [rcx+264]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+264], r14
|
|
adc r11, 0
|
|
; a[i+34] += m[34] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+272]
|
|
mov r14, QWORD PTR [rcx+272]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+272], r14
|
|
adc r12, 0
|
|
; a[i+35] += m[35] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+280]
|
|
mov r14, QWORD PTR [rcx+280]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+280], r14
|
|
adc r11, 0
|
|
; a[i+36] += m[36] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+288]
|
|
mov r14, QWORD PTR [rcx+288]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+288], r14
|
|
adc r12, 0
|
|
; a[i+37] += m[37] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+296]
|
|
mov r14, QWORD PTR [rcx+296]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+296], r14
|
|
adc r11, 0
|
|
; a[i+38] += m[38] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+304]
|
|
mov r14, QWORD PTR [rcx+304]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+304], r14
|
|
adc r12, 0
|
|
; a[i+39] += m[39] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+312]
|
|
mov r14, QWORD PTR [rcx+312]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+312], r14
|
|
adc r11, 0
|
|
; a[i+40] += m[40] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+320]
|
|
mov r14, QWORD PTR [rcx+320]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+320], r14
|
|
adc r12, 0
|
|
; a[i+41] += m[41] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+328]
|
|
mov r14, QWORD PTR [rcx+328]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+328], r14
|
|
adc r11, 0
|
|
; a[i+42] += m[42] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+336]
|
|
mov r14, QWORD PTR [rcx+336]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+336], r14
|
|
adc r12, 0
|
|
; a[i+43] += m[43] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+344]
|
|
mov r14, QWORD PTR [rcx+344]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+344], r14
|
|
adc r11, 0
|
|
; a[i+44] += m[44] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+352]
|
|
mov r14, QWORD PTR [rcx+352]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+352], r14
|
|
adc r12, 0
|
|
; a[i+45] += m[45] * mu
|
|
mov rax, r13
|
|
xor r11, r11
|
|
mul QWORD PTR [r9+360]
|
|
mov r14, QWORD PTR [rcx+360]
|
|
add r14, rax
|
|
adc r11, rdx
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+360], r14
|
|
adc r11, 0
|
|
; a[i+46] += m[46] * mu
|
|
mov rax, r13
|
|
xor r12, r12
|
|
mul QWORD PTR [r9+368]
|
|
mov r14, QWORD PTR [rcx+368]
|
|
add r14, rax
|
|
adc r12, rdx
|
|
add r14, r11
|
|
mov QWORD PTR [rcx+368], r14
|
|
adc r12, 0
|
|
; a[i+47] += m[47] * mu
|
|
mov rax, r13
|
|
mul QWORD PTR [r9+376]
|
|
mov r14, QWORD PTR [rcx+376]
|
|
add r12, rax
|
|
adc rdx, rsi
|
|
mov rsi, 0
|
|
adc rsi, 0
|
|
add r14, r12
|
|
mov QWORD PTR [rcx+376], r14
|
|
adc QWORD PTR [rcx+384], rdx
|
|
adc rsi, 0
|
|
; i -= 1
|
|
add rcx, 8
|
|
dec r10
|
|
jnz L_3072_mont_loop_48
|
|
mov QWORD PTR [rcx], r15
|
|
mov QWORD PTR [rcx+8], rdi
|
|
neg rsi
|
|
IFDEF _WIN64
|
|
mov r8, r9
|
|
mov r9, rsi
|
|
ELSE
|
|
mov r9, rsi
|
|
mov r8, r9
|
|
ENDIF
|
|
mov rdx, rcx
|
|
mov rcx, rcx
|
|
sub rcx, 384
|
|
call sp_3072_cond_sub_48
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_3072_mont_reduce_48 ENDP
|
|
_text ENDS
|
|
; /* Sub b from a into r. (r = a - b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_sub_48 PROC
|
|
; Register use, as read from the code below (it matches the Microsoft x64
; argument order for r, a, b):
;   rcx    = r, destination; 48 64-bit words are written at offsets 0..376
;   rdx    = a, minuend; only read
;   r8     = b, subtrahend; only read
;   r9/r10 = alternating scratch words
;   rax    = return value: 0 if the subtraction did not borrow out of the
;            top word, -1 (all ones) if it did
; The borrow is carried from word to word in CF. Only mov instructions sit
; between the sub/sbb instructions, and mov does not modify the flags.
mov r9, QWORD PTR [rdx]
|
|
; Zero the return value now: xor rewrites the flags, so it must happen
; before the borrow chain starts.
xor rax, rax
|
|
sub r9, QWORD PTR [r8]
|
|
; Each following group of three handles one word k: load a[k], store the
; already finished r[k-1], then subtract b[k] together with the borrow.
mov r10, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [rcx], r9
|
|
sbb r10, QWORD PTR [r8+8]
|
|
mov r9, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [rcx+8], r10
|
|
sbb r9, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [rcx+16], r9
|
|
sbb r10, QWORD PTR [r8+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [rcx+24], r10
|
|
sbb r9, QWORD PTR [r8+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [rcx+32], r9
|
|
sbb r10, QWORD PTR [r8+40]
|
|
mov r9, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [rcx+40], r10
|
|
sbb r9, QWORD PTR [r8+48]
|
|
mov r10, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [rcx+48], r9
|
|
sbb r10, QWORD PTR [r8+56]
|
|
mov r9, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [rcx+56], r10
|
|
sbb r9, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [rcx+64], r9
|
|
sbb r10, QWORD PTR [r8+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [rcx+72], r10
|
|
sbb r9, QWORD PTR [r8+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [rcx+80], r9
|
|
sbb r10, QWORD PTR [r8+88]
|
|
mov r9, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [rcx+88], r10
|
|
sbb r9, QWORD PTR [r8+96]
|
|
mov r10, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [rcx+96], r9
|
|
sbb r10, QWORD PTR [r8+104]
|
|
mov r9, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [rcx+104], r10
|
|
sbb r9, QWORD PTR [r8+112]
|
|
mov r10, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [rcx+112], r9
|
|
sbb r10, QWORD PTR [r8+120]
|
|
mov r9, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [rcx+120], r10
|
|
sbb r9, QWORD PTR [r8+128]
|
|
mov r10, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [rcx+128], r9
|
|
sbb r10, QWORD PTR [r8+136]
|
|
mov r9, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [rcx+136], r10
|
|
sbb r9, QWORD PTR [r8+144]
|
|
mov r10, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [rcx+144], r9
|
|
sbb r10, QWORD PTR [r8+152]
|
|
mov r9, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [rcx+152], r10
|
|
sbb r9, QWORD PTR [r8+160]
|
|
mov r10, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [rcx+160], r9
|
|
sbb r10, QWORD PTR [r8+168]
|
|
mov r9, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [rcx+168], r10
|
|
sbb r9, QWORD PTR [r8+176]
|
|
mov r10, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [rcx+176], r9
|
|
sbb r10, QWORD PTR [r8+184]
|
|
mov r9, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [rcx+184], r10
|
|
sbb r9, QWORD PTR [r8+192]
|
|
mov r10, QWORD PTR [rdx+200]
|
|
mov QWORD PTR [rcx+192], r9
|
|
sbb r10, QWORD PTR [r8+200]
|
|
mov r9, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [rcx+200], r10
|
|
sbb r9, QWORD PTR [r8+208]
|
|
mov r10, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [rcx+208], r9
|
|
sbb r10, QWORD PTR [r8+216]
|
|
mov r9, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [rcx+216], r10
|
|
sbb r9, QWORD PTR [r8+224]
|
|
mov r10, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [rcx+224], r9
|
|
sbb r10, QWORD PTR [r8+232]
|
|
mov r9, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [rcx+232], r10
|
|
sbb r9, QWORD PTR [r8+240]
|
|
mov r10, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [rcx+240], r9
|
|
sbb r10, QWORD PTR [r8+248]
|
|
mov r9, QWORD PTR [rdx+256]
|
|
mov QWORD PTR [rcx+248], r10
|
|
sbb r9, QWORD PTR [r8+256]
|
|
mov r10, QWORD PTR [rdx+264]
|
|
mov QWORD PTR [rcx+256], r9
|
|
sbb r10, QWORD PTR [r8+264]
|
|
mov r9, QWORD PTR [rdx+272]
|
|
mov QWORD PTR [rcx+264], r10
|
|
sbb r9, QWORD PTR [r8+272]
|
|
mov r10, QWORD PTR [rdx+280]
|
|
mov QWORD PTR [rcx+272], r9
|
|
sbb r10, QWORD PTR [r8+280]
|
|
mov r9, QWORD PTR [rdx+288]
|
|
mov QWORD PTR [rcx+280], r10
|
|
sbb r9, QWORD PTR [r8+288]
|
|
mov r10, QWORD PTR [rdx+296]
|
|
mov QWORD PTR [rcx+288], r9
|
|
sbb r10, QWORD PTR [r8+296]
|
|
mov r9, QWORD PTR [rdx+304]
|
|
mov QWORD PTR [rcx+296], r10
|
|
sbb r9, QWORD PTR [r8+304]
|
|
mov r10, QWORD PTR [rdx+312]
|
|
mov QWORD PTR [rcx+304], r9
|
|
sbb r10, QWORD PTR [r8+312]
|
|
mov r9, QWORD PTR [rdx+320]
|
|
mov QWORD PTR [rcx+312], r10
|
|
sbb r9, QWORD PTR [r8+320]
|
|
mov r10, QWORD PTR [rdx+328]
|
|
mov QWORD PTR [rcx+320], r9
|
|
sbb r10, QWORD PTR [r8+328]
|
|
mov r9, QWORD PTR [rdx+336]
|
|
mov QWORD PTR [rcx+328], r10
|
|
sbb r9, QWORD PTR [r8+336]
|
|
mov r10, QWORD PTR [rdx+344]
|
|
mov QWORD PTR [rcx+336], r9
|
|
sbb r10, QWORD PTR [r8+344]
|
|
mov r9, QWORD PTR [rdx+352]
|
|
mov QWORD PTR [rcx+344], r10
|
|
sbb r9, QWORD PTR [r8+352]
|
|
mov r10, QWORD PTR [rdx+360]
|
|
mov QWORD PTR [rcx+352], r9
|
|
sbb r10, QWORD PTR [r8+360]
|
|
mov r9, QWORD PTR [rdx+368]
|
|
mov QWORD PTR [rcx+360], r10
|
|
sbb r9, QWORD PTR [r8+368]
|
|
mov r10, QWORD PTR [rdx+376]
|
|
mov QWORD PTR [rcx+368], r9
|
|
sbb r10, QWORD PTR [r8+376]
|
|
mov QWORD PTR [rcx+376], r10
|
|
; rax = 0 - 0 - CF: turns the borrow out of the top word into 0 or -1.
sbb rax, 0
|
|
ret
|
|
sp_3072_sub_48 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Mul a by digit b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision digit.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_mul_d_avx2_48 PROC
|
|
; Register use, as read from the code below (it matches the Microsoft x64
; argument order for r, a, b):
;   rcx     = r, destination; 49 64-bit words are written at offsets 0..384
;   rdx     = a on entry (copied to rax), then the multiplier b for mulx
;   r8      = b, the single digit, on entry
;   rax     = a, 48 words read at offsets 0..376
;   r9/r10  = low/high halves of the current product a[k] * b
;   r11/r12 = alternating accumulators for the current and next result word
;   r13     = constant zero
; Two independent carry chains are used: adcx adds the low product half into
; the current word and carries through CF; adox adds the high half into the
; next word and carries through OF. mulx and mov do not modify the flags, so
; both chains survive between steps.
; r12 and r13 are saved and restored because they are modified here.
push r12
|
|
push r13
|
|
; Move a out of rdx: mulx takes its implicit multiplicand from rdx.
mov rax, rdx
|
|
; A[0] * B
|
|
mov rdx, r8
|
|
; r13 = 0; xor also clears CF and OF so both carry chains start at zero.
xor r13, r13
|
|
mulx r12, r11, QWORD PTR [rax]
|
|
mov QWORD PTR [rcx], r11
|
|
; A[1] * B
|
|
mulx r10, r9, QWORD PTR [rax+8]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+8], r12
|
|
; A[2] * B
|
|
mulx r10, r9, QWORD PTR [rax+16]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+16], r11
|
|
; A[3] * B
|
|
mulx r10, r9, QWORD PTR [rax+24]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+24], r12
|
|
; A[4] * B
|
|
mulx r10, r9, QWORD PTR [rax+32]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+32], r11
|
|
; A[5] * B
|
|
mulx r10, r9, QWORD PTR [rax+40]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+40], r12
|
|
; A[6] * B
|
|
mulx r10, r9, QWORD PTR [rax+48]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+48], r11
|
|
; A[7] * B
|
|
mulx r10, r9, QWORD PTR [rax+56]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+56], r12
|
|
; A[8] * B
|
|
mulx r10, r9, QWORD PTR [rax+64]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+64], r11
|
|
; A[9] * B
|
|
mulx r10, r9, QWORD PTR [rax+72]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+72], r12
|
|
; A[10] * B
|
|
mulx r10, r9, QWORD PTR [rax+80]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+80], r11
|
|
; A[11] * B
|
|
mulx r10, r9, QWORD PTR [rax+88]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+88], r12
|
|
; A[12] * B
|
|
mulx r10, r9, QWORD PTR [rax+96]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+96], r11
|
|
; A[13] * B
|
|
mulx r10, r9, QWORD PTR [rax+104]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+104], r12
|
|
; A[14] * B
|
|
mulx r10, r9, QWORD PTR [rax+112]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+112], r11
|
|
; A[15] * B
|
|
mulx r10, r9, QWORD PTR [rax+120]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+120], r12
|
|
; A[16] * B
|
|
mulx r10, r9, QWORD PTR [rax+128]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+128], r11
|
|
; A[17] * B
|
|
mulx r10, r9, QWORD PTR [rax+136]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+136], r12
|
|
; A[18] * B
|
|
mulx r10, r9, QWORD PTR [rax+144]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+144], r11
|
|
; A[19] * B
|
|
mulx r10, r9, QWORD PTR [rax+152]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+152], r12
|
|
; A[20] * B
|
|
mulx r10, r9, QWORD PTR [rax+160]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+160], r11
|
|
; A[21] * B
|
|
mulx r10, r9, QWORD PTR [rax+168]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+168], r12
|
|
; A[22] * B
|
|
mulx r10, r9, QWORD PTR [rax+176]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+176], r11
|
|
; A[23] * B
|
|
mulx r10, r9, QWORD PTR [rax+184]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+184], r12
|
|
; A[24] * B
|
|
mulx r10, r9, QWORD PTR [rax+192]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+192], r11
|
|
; A[25] * B
|
|
mulx r10, r9, QWORD PTR [rax+200]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+200], r12
|
|
; A[26] * B
|
|
mulx r10, r9, QWORD PTR [rax+208]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+208], r11
|
|
; A[27] * B
|
|
mulx r10, r9, QWORD PTR [rax+216]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+216], r12
|
|
; A[28] * B
|
|
mulx r10, r9, QWORD PTR [rax+224]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+224], r11
|
|
; A[29] * B
|
|
mulx r10, r9, QWORD PTR [rax+232]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+232], r12
|
|
; A[30] * B
|
|
mulx r10, r9, QWORD PTR [rax+240]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+240], r11
|
|
; A[31] * B
|
|
mulx r10, r9, QWORD PTR [rax+248]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+248], r12
|
|
; A[32] * B
|
|
mulx r10, r9, QWORD PTR [rax+256]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+256], r11
|
|
; A[33] * B
|
|
mulx r10, r9, QWORD PTR [rax+264]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+264], r12
|
|
; A[34] * B
|
|
mulx r10, r9, QWORD PTR [rax+272]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+272], r11
|
|
; A[35] * B
|
|
mulx r10, r9, QWORD PTR [rax+280]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+280], r12
|
|
; A[36] * B
|
|
mulx r10, r9, QWORD PTR [rax+288]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+288], r11
|
|
; A[37] * B
|
|
mulx r10, r9, QWORD PTR [rax+296]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+296], r12
|
|
; A[38] * B
|
|
mulx r10, r9, QWORD PTR [rax+304]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+304], r11
|
|
; A[39] * B
|
|
mulx r10, r9, QWORD PTR [rax+312]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+312], r12
|
|
; A[40] * B
|
|
mulx r10, r9, QWORD PTR [rax+320]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+320], r11
|
|
; A[41] * B
|
|
mulx r10, r9, QWORD PTR [rax+328]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+328], r12
|
|
; A[42] * B
|
|
mulx r10, r9, QWORD PTR [rax+336]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+336], r11
|
|
; A[43] * B
|
|
mulx r10, r9, QWORD PTR [rax+344]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+344], r12
|
|
; A[44] * B
|
|
mulx r10, r9, QWORD PTR [rax+352]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+352], r11
|
|
; A[45] * B
|
|
mulx r10, r9, QWORD PTR [rax+360]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
mov QWORD PTR [rcx+360], r12
|
|
; A[46] * B
|
|
mulx r10, r9, QWORD PTR [rax+368]
|
|
mov r12, r13
|
|
adcx r11, r9
|
|
adox r12, r10
|
|
mov QWORD PTR [rcx+368], r11
|
|
; A[47] * B
|
|
mulx r10, r9, QWORD PTR [rax+376]
|
|
mov r11, r13
|
|
adcx r12, r9
|
|
adox r11, r10
|
|
; Fold the last CF carry into the top word (r13 is zero).
adcx r11, r13
|
|
mov QWORD PTR [rcx+376], r12
|
|
; r[48]: the extra high word of the 48-word by 1-word product.
mov QWORD PTR [rcx+384], r11
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_3072_mul_d_avx2_48 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF _WIN64
|
|
; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
|
|
; *
|
|
; * d1 The high order half of the number to divide.
|
|
; * d0 The low order half of the number to divide.
|
|
; * div The divisor.
|
|
; * returns the result of the division.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
div_3072_word_asm_48 PROC
        ; Arguments as used below: rcx = d1 (high word), rdx = d0 (low word),
        ; r8 = div (the divisor).
        ; DIV takes its 128-bit dividend in rdx:rax, so the low half is moved
        ; to rax first, before rdx is overwritten with the high half.
        ; DIV faults if r8 is zero or the quotient does not fit in 64 bits;
        ; presumably callers guarantee d1 < div - confirm at the call sites.
        mov     rax, rdx                ; rax = d0
        mov     rdx, rcx                ; rdx = d1
        div     r8                      ; rax = quotient, rdx = remainder
        ret
div_3072_word_asm_48 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally subtract b from a using the mask m.
|
|
; * m is -1 to subtract and 0 when not subtracting.
|
|
; *
|
|
; * r A single precision number representing condition subtract result.
|
|
; * a A single precision number to subtract from.
|
|
; * b A single precision number to subtract.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_cond_sub_avx2_48 PROC
|
|
; Register use, as read from the code below (it matches the Microsoft x64
; argument order for r, a, b, m):
;   rcx         = r, destination; 48 64-bit words written at offsets 0..376
;   rdx         = a, value to subtract from; only read
;   r8          = b, value to subtract; only read
;   r9          = m, the mask
;   r10/r11/r12 = three rotating scratch registers
;   rax         = return value: 0 if no borrow out of the top word, -1 if so
; Each word of b is filtered with pext using m as the bit mask: with m = -1
; the word passes through unchanged, with m = 0 it becomes 0, so a is then
; copied to r unchanged. The same instructions run for either mask value.
; The borrow is carried in CF; mov and pext do not modify CF, so the chain
; survives between the sub/sbb instructions.
; r12 is saved and restored because it is modified here.
push r12
|
|
; mov (not xor) leaves the flags alone; rax collects the final borrow.
mov rax, 0
|
|
mov r12, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [rdx]
|
|
pext r12, r12, r9
|
|
sub r10, r12
|
|
; Each following group of five handles one word k: load b[k] and a[k], mask
; b[k], store the already finished r[k-1], then subtract with borrow.
mov r12, QWORD PTR [r8+8]
|
|
mov r11, QWORD PTR [rdx+8]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r12, QWORD PTR [rdx+16]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+8], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [rdx+24]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+16], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [rdx+32]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+24], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [rdx+40]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+32], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+48]
|
|
mov r10, QWORD PTR [rdx+48]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+40], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+56]
|
|
mov r11, QWORD PTR [rdx+56]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+48], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r12, QWORD PTR [rdx+64]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+56], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+72]
|
|
mov r10, QWORD PTR [rdx+72]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+64], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [rdx+80]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+72], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov r12, QWORD PTR [rdx+88]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+80], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+96]
|
|
mov r10, QWORD PTR [rdx+96]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+88], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+104]
|
|
mov r11, QWORD PTR [rdx+104]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+96], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r12, QWORD PTR [rdx+112]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+104], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+120]
|
|
mov r10, QWORD PTR [rdx+120]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+112], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+128]
|
|
mov r11, QWORD PTR [rdx+128]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+120], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+136]
|
|
mov r12, QWORD PTR [rdx+136]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+128], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+144]
|
|
mov r10, QWORD PTR [rdx+144]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+136], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+152]
|
|
mov r11, QWORD PTR [rdx+152]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+144], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+160]
|
|
mov r12, QWORD PTR [rdx+160]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+152], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+168]
|
|
mov r10, QWORD PTR [rdx+168]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+160], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+176]
|
|
mov r11, QWORD PTR [rdx+176]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+168], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+184]
|
|
mov r12, QWORD PTR [rdx+184]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+176], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+192]
|
|
mov r10, QWORD PTR [rdx+192]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+184], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+200]
|
|
mov r11, QWORD PTR [rdx+200]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+192], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+208]
|
|
mov r12, QWORD PTR [rdx+208]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+200], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+216]
|
|
mov r10, QWORD PTR [rdx+216]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+208], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+224]
|
|
mov r11, QWORD PTR [rdx+224]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+216], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+232]
|
|
mov r12, QWORD PTR [rdx+232]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+224], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+240]
|
|
mov r10, QWORD PTR [rdx+240]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+232], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+248]
|
|
mov r11, QWORD PTR [rdx+248]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+240], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+256]
|
|
mov r12, QWORD PTR [rdx+256]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+248], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+264]
|
|
mov r10, QWORD PTR [rdx+264]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+256], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+272]
|
|
mov r11, QWORD PTR [rdx+272]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+264], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+280]
|
|
mov r12, QWORD PTR [rdx+280]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+272], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+288]
|
|
mov r10, QWORD PTR [rdx+288]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+280], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+296]
|
|
mov r11, QWORD PTR [rdx+296]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+288], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+304]
|
|
mov r12, QWORD PTR [rdx+304]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+296], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+312]
|
|
mov r10, QWORD PTR [rdx+312]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+304], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+320]
|
|
mov r11, QWORD PTR [rdx+320]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+312], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+328]
|
|
mov r12, QWORD PTR [rdx+328]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+320], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+336]
|
|
mov r10, QWORD PTR [rdx+336]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+328], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+344]
|
|
mov r11, QWORD PTR [rdx+344]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+336], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+352]
|
|
mov r12, QWORD PTR [rdx+352]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+344], r11
|
|
sbb r12, r10
|
|
mov r11, QWORD PTR [r8+360]
|
|
mov r10, QWORD PTR [rdx+360]
|
|
pext r11, r11, r9
|
|
mov QWORD PTR [rcx+352], r12
|
|
sbb r10, r11
|
|
mov r12, QWORD PTR [r8+368]
|
|
mov r11, QWORD PTR [rdx+368]
|
|
pext r12, r12, r9
|
|
mov QWORD PTR [rcx+360], r10
|
|
sbb r11, r12
|
|
mov r10, QWORD PTR [r8+376]
|
|
mov r12, QWORD PTR [rdx+376]
|
|
pext r10, r10, r9
|
|
mov QWORD PTR [rcx+368], r11
|
|
sbb r12, r10
|
|
mov QWORD PTR [rcx+376], r12
|
|
; rax = 0 - 0 - CF: turns the borrow out of the top word into 0 or -1.
sbb rax, 0
|
|
pop r12
|
|
ret
|
|
sp_3072_cond_sub_avx2_48 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Compare a with b in constant time.
|
|
; *
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; * return -ve, 0 or +ve if a is less than, equal to or greater than b
|
|
; * respectively.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_3072_cmp_48 PROC
|
|
; Register use, as read from the code below (it matches the Microsoft x64
; argument order for a, b):
;   rcx     = a, 48 64-bit words read at offsets 376 down to 0
;   rdx     = b, 48 64-bit words read at offsets 376 down to 0
;   r8      = "still equal" mask: -1 until the first differing word, then 0
;   r9      = constant 0
;   r10     = constant 1
;   r11/r12 = current words of a and b after masking with r8
;   rax     = result: 1 if a > b, -1 if a < b, 0 if equal
; The code contains no branches and loads every word of both inputs
; regardless of earlier results; selection is done with cmov only.
; Per word, starting from the most significant one:
;   - both words are ANDed with r8, so after the first difference they are
;     both 0 and the sub below yields ZF = 1, CF = 0 (no cmov fires);
;   - cmova sets rax = 1 when a's word is larger (unsigned);
;   - cmovc sets rax = r8 (= -1 while the mask is live) when b's is larger;
;   - cmovnz clears the mask once the words differ.
; r12 is saved and restored because it is modified here.
push r12
|
|
xor r9, r9
|
|
mov r8, -1
|
|
; rax starts at -1 so the final xor with a still-set mask produces 0.
mov rax, -1
|
|
mov r10, 1
|
|
mov r11, QWORD PTR [rcx+376]
|
|
mov r12, QWORD PTR [rdx+376]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+368]
|
|
mov r12, QWORD PTR [rdx+368]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+360]
|
|
mov r12, QWORD PTR [rdx+360]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+352]
|
|
mov r12, QWORD PTR [rdx+352]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+344]
|
|
mov r12, QWORD PTR [rdx+344]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+336]
|
|
mov r12, QWORD PTR [rdx+336]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+328]
|
|
mov r12, QWORD PTR [rdx+328]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+320]
|
|
mov r12, QWORD PTR [rdx+320]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+312]
|
|
mov r12, QWORD PTR [rdx+312]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+304]
|
|
mov r12, QWORD PTR [rdx+304]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+296]
|
|
mov r12, QWORD PTR [rdx+296]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+288]
|
|
mov r12, QWORD PTR [rdx+288]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+280]
|
|
mov r12, QWORD PTR [rdx+280]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+272]
|
|
mov r12, QWORD PTR [rdx+272]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+264]
|
|
mov r12, QWORD PTR [rdx+264]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+256]
|
|
mov r12, QWORD PTR [rdx+256]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+248]
|
|
mov r12, QWORD PTR [rdx+248]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+240]
|
|
mov r12, QWORD PTR [rdx+240]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+232]
|
|
mov r12, QWORD PTR [rdx+232]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+224]
|
|
mov r12, QWORD PTR [rdx+224]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+216]
|
|
mov r12, QWORD PTR [rdx+216]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+208]
|
|
mov r12, QWORD PTR [rdx+208]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+200]
|
|
mov r12, QWORD PTR [rdx+200]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+192]
|
|
mov r12, QWORD PTR [rdx+192]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+184]
|
|
mov r12, QWORD PTR [rdx+184]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+176]
|
|
mov r12, QWORD PTR [rdx+176]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+168]
|
|
mov r12, QWORD PTR [rdx+168]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+160]
|
|
mov r12, QWORD PTR [rdx+160]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+152]
|
|
mov r12, QWORD PTR [rdx+152]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+144]
|
|
mov r12, QWORD PTR [rdx+144]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+136]
|
|
mov r12, QWORD PTR [rdx+136]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+128]
|
|
mov r12, QWORD PTR [rdx+128]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+120]
|
|
mov r12, QWORD PTR [rdx+120]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+112]
|
|
mov r12, QWORD PTR [rdx+112]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+104]
|
|
mov r12, QWORD PTR [rdx+104]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+96]
|
|
mov r12, QWORD PTR [rdx+96]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+88]
|
|
mov r12, QWORD PTR [rdx+88]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+80]
|
|
mov r12, QWORD PTR [rdx+80]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+72]
|
|
mov r12, QWORD PTR [rdx+72]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+64]
|
|
mov r12, QWORD PTR [rdx+64]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+56]
|
|
mov r12, QWORD PTR [rdx+56]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+48]
|
|
mov r12, QWORD PTR [rdx+48]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+40]
|
|
mov r12, QWORD PTR [rdx+40]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+32]
|
|
mov r12, QWORD PTR [rdx+32]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+24]
|
|
mov r12, QWORD PTR [rdx+24]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+16]
|
|
mov r12, QWORD PTR [rdx+16]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx+8]
|
|
mov r12, QWORD PTR [rdx+8]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
mov r11, QWORD PTR [rcx]
|
|
mov r12, QWORD PTR [rdx]
|
|
and r11, r8
|
|
and r12, r8
|
|
sub r11, r12
|
|
cmova rax, r10
|
|
cmovc rax, r8
|
|
cmovnz r8, r9
|
|
; If no word differed, rax and r8 are both still -1 and the xor gives 0.
; Otherwise r8 is 0 and rax keeps the 1 or -1 chosen at the first difference.
xor rax, r8
|
|
pop r12
|
|
ret
|
|
sp_3072_cmp_48 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Reduce the number back to 3072 bits using Montgomery reduction.
|
|
; *
|
|
; * a A single precision number to reduce in place.
|
|
; * m The single precision number representing the modulus.
|
|
; * mp The digit representing the negative inverse of m mod 2^n.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; /* sp_3072_mont_reduce_avx2_48
;  *
;  * Montgomery-reduce the value in a, in place, using mulx / adcx / adox / pext.
;  * Words a[0]..a[95] are accessed; the reduced 48 words end up in a[0]..a[47].
;  *
;  * On entry (Microsoft x64):  rcx = a,  rdx = m,  r8 = mp.
;  *
;  * Register roles inside the row loop:
;  *   r9            a + 192 bytes, advanced by one word per row
;  *   r10           m
;  *   r8            mp
;  *   r11           rows remaining (48 down to 0)
;  *   r15,rdi,rsi,rbx  the four lowest live words a[i]..a[i+3], kept in registers
;  *   r12, r13      alternating accumulators for the remaining columns
;  *   r14           zero (its xor also clears CF and OF for the two carry chains)
;  *   rbp           carry handed from one row to the next
;  *   rdx           mu (implicit mulx multiplicand);  rcx:rax  mulx product
;  *
;  * The two macros below expand to exactly the instruction groups that were
;  * previously written out by hand, in the same order.
;  */

;; Column k (5..47) of one row:  a[i+k] += m[k] * mu.
;; cur holds a[i+k] (already carrying the high half of column k-1),
;; nxt receives a[i+k+1]; the finished word is written back to memory.
MR3072X_COL MACRO k, cur, nxt
        mulx    rcx, rax, QWORD PTR [r10+(k)*8]
        mov     nxt, QWORD PTR [r9+(k)*8-184]
        adcx    cur, rax
        adox    nxt, rcx
        mov     QWORD PTR [r9+(k)*8-192], cur
ENDM

;; Word j (4..47) of the final masked subtract.
;; msk = m[j] filtered through the rbp mask, acc = upper-half word j,
;; prev = result word j-1, stored before the borrow chain continues.
MR3072X_SUB MACRO j, msk, acc, prev
        mov     msk, QWORD PTR [r10+(j)*8]
        mov     acc, QWORD PTR [r8+(j)*8]
        pext    msk, msk, rbp
        mov     QWORD PTR [r9+(j)*8-8], prev
        sbb     acc, msk
ENDM

sp_3072_mont_reduce_avx2_48 PROC
        ; Save every non-volatile register this routine uses.
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        push    rbp
        mov     r9, rcx                         ; r9  = a
        mov     r10, rdx                        ; r10 = m
        xor     rbp, rbp                        ; no carry into the first row
        ; i = 48
        mov     r11, 48
        ; Keep a[0..3] in registers across rows.
        mov     r15, QWORD PTR [r9]
        mov     rdi, QWORD PTR [r9+8]
        mov     rsi, QWORD PTR [r9+16]
        mov     rbx, QWORD PTR [r9+24]
        add     r9, 192                         ; bias the pointer so displacements stay small
        xor     rbp, rbp
L_3072_mont_avx2_48_next_row:
        ; mu = a[i] * mp  (low 64 bits)
        mov     rdx, r15
        mov     r12, r15
        imul    rdx, r8
        xor     r14, r14                        ; r14 = 0, CF = OF = 0
        ; a[i+0] += m[0] * mu
        mulx    rcx, rax, QWORD PTR [r10]
        mov     r15, rdi
        adcx    r12, rax
        adox    r15, rcx
        ; a[i+1] += m[1] * mu
        mulx    rcx, rax, QWORD PTR [r10+8]
        mov     rdi, rsi
        adcx    r15, rax
        adox    rdi, rcx
        ; a[i+2] += m[2] * mu
        mulx    rcx, rax, QWORD PTR [r10+16]
        mov     rsi, rbx
        adcx    rdi, rax
        adox    rsi, rcx
        ; a[i+3] += m[3] * mu
        mulx    rcx, rax, QWORD PTR [r10+24]
        mov     rbx, QWORD PTR [r9-160]
        adcx    rsi, rax
        adox    rbx, rcx
        ; a[i+4] += m[4] * mu   (result stays in rbx, nothing stored yet)
        mulx    rcx, rax, QWORD PTR [r10+32]
        mov     r13, QWORD PTR [r9-152]
        adcx    rbx, rax
        adox    r13, rcx
        ; a[i+5] .. a[i+47] += m[5..47] * mu, accumulators alternate r13 / r12
        MR3072X_COL 5,  r13, r12
        MR3072X_COL 6,  r12, r13
        MR3072X_COL 7,  r13, r12
        MR3072X_COL 8,  r12, r13
        MR3072X_COL 9,  r13, r12
        MR3072X_COL 10, r12, r13
        MR3072X_COL 11, r13, r12
        MR3072X_COL 12, r12, r13
        MR3072X_COL 13, r13, r12
        MR3072X_COL 14, r12, r13
        MR3072X_COL 15, r13, r12
        MR3072X_COL 16, r12, r13
        MR3072X_COL 17, r13, r12
        MR3072X_COL 18, r12, r13
        MR3072X_COL 19, r13, r12
        MR3072X_COL 20, r12, r13
        MR3072X_COL 21, r13, r12
        MR3072X_COL 22, r12, r13
        MR3072X_COL 23, r13, r12
        MR3072X_COL 24, r12, r13
        MR3072X_COL 25, r13, r12
        MR3072X_COL 26, r12, r13
        MR3072X_COL 27, r13, r12
        MR3072X_COL 28, r12, r13
        MR3072X_COL 29, r13, r12
        MR3072X_COL 30, r12, r13
        MR3072X_COL 31, r13, r12
        MR3072X_COL 32, r12, r13
        MR3072X_COL 33, r13, r12
        MR3072X_COL 34, r12, r13
        MR3072X_COL 35, r13, r12
        MR3072X_COL 36, r12, r13
        MR3072X_COL 37, r13, r12
        MR3072X_COL 38, r12, r13
        MR3072X_COL 39, r13, r12
        MR3072X_COL 40, r12, r13
        MR3072X_COL 41, r13, r12
        MR3072X_COL 42, r12, r13
        MR3072X_COL 43, r13, r12
        MR3072X_COL 44, r12, r13
        MR3072X_COL 45, r13, r12
        MR3072X_COL 46, r12, r13
        MR3072X_COL 47, r13, r12
        ; Fold the previous row's carry into a[i+48], then collect this row's
        ; carry: rbp = OF + CF.
        adcx    r12, rbp
        mov     rbp, r14
        mov     QWORD PTR [r9+192], r12
        adox    rbp, r14
        adcx    rbp, r14
        ; a += 1
        add     r9, 8
        ; i -= 1
        sub     r11, 1
        jnz     L_3072_mont_avx2_48_next_row
        ; r8 = &a[48] (source of the upper half), r9 = &a[0] (destination).
        sub     r9, 192
        neg     rbp                             ; final carry -> pext mask for m
        mov     r8, r9
        sub     r9, 384
        ; a[0..47] = upper half - (m filtered by the mask).
        ; Words 0..3 of the upper half are still in r15, rdi, rsi, rbx.
        mov     rcx, QWORD PTR [r10]
        mov     rdx, r15
        pext    rcx, rcx, rbp
        sub     rdx, rcx
        mov     rcx, QWORD PTR [r10+8]
        mov     rax, rdi
        pext    rcx, rcx, rbp
        mov     QWORD PTR [r9], rdx
        sbb     rax, rcx
        mov     rdx, QWORD PTR [r10+16]
        mov     rcx, rsi
        pext    rdx, rdx, rbp
        mov     QWORD PTR [r9+8], rax
        sbb     rcx, rdx
        mov     rax, QWORD PTR [r10+24]
        mov     rdx, rbx
        pext    rax, rax, rbp
        mov     QWORD PTR [r9+16], rcx
        sbb     rdx, rax
        ; Words 4..47 come from memory; registers rotate rcx -> rdx -> rax.
        MR3072X_SUB 4,  rcx, rax, rdx
        MR3072X_SUB 5,  rdx, rcx, rax
        MR3072X_SUB 6,  rax, rdx, rcx
        MR3072X_SUB 7,  rcx, rax, rdx
        MR3072X_SUB 8,  rdx, rcx, rax
        MR3072X_SUB 9,  rax, rdx, rcx
        MR3072X_SUB 10, rcx, rax, rdx
        MR3072X_SUB 11, rdx, rcx, rax
        MR3072X_SUB 12, rax, rdx, rcx
        MR3072X_SUB 13, rcx, rax, rdx
        MR3072X_SUB 14, rdx, rcx, rax
        MR3072X_SUB 15, rax, rdx, rcx
        MR3072X_SUB 16, rcx, rax, rdx
        MR3072X_SUB 17, rdx, rcx, rax
        MR3072X_SUB 18, rax, rdx, rcx
        MR3072X_SUB 19, rcx, rax, rdx
        MR3072X_SUB 20, rdx, rcx, rax
        MR3072X_SUB 21, rax, rdx, rcx
        MR3072X_SUB 22, rcx, rax, rdx
        MR3072X_SUB 23, rdx, rcx, rax
        MR3072X_SUB 24, rax, rdx, rcx
        MR3072X_SUB 25, rcx, rax, rdx
        MR3072X_SUB 26, rdx, rcx, rax
        MR3072X_SUB 27, rax, rdx, rcx
        MR3072X_SUB 28, rcx, rax, rdx
        MR3072X_SUB 29, rdx, rcx, rax
        MR3072X_SUB 30, rax, rdx, rcx
        MR3072X_SUB 31, rcx, rax, rdx
        MR3072X_SUB 32, rdx, rcx, rax
        MR3072X_SUB 33, rax, rdx, rcx
        MR3072X_SUB 34, rcx, rax, rdx
        MR3072X_SUB 35, rdx, rcx, rax
        MR3072X_SUB 36, rax, rdx, rcx
        MR3072X_SUB 37, rcx, rax, rdx
        MR3072X_SUB 38, rdx, rcx, rax
        MR3072X_SUB 39, rax, rdx, rcx
        MR3072X_SUB 40, rcx, rax, rdx
        MR3072X_SUB 41, rdx, rcx, rax
        MR3072X_SUB 42, rax, rdx, rcx
        MR3072X_SUB 43, rcx, rax, rdx
        MR3072X_SUB 44, rdx, rcx, rax
        MR3072X_SUB 45, rax, rdx, rcx
        MR3072X_SUB 46, rcx, rax, rdx
        MR3072X_SUB 47, rdx, rcx, rax
        mov     QWORD PTR [r9+376], rcx         ; last result word
        ; Restore non-volatile registers in reverse order.
        pop     rbp
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_3072_mont_reduce_avx2_48 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Conditionally add a and b using the mask m.
|
|
; * m is -1 to add and 0 when not.
|
|
; *
|
|
; * r A single precision number representing conditional add result.
|
|
; * a A single precision number to add with.
|
|
; * b A single precision number to add.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; /* sp_3072_cond_add_24
;  *
;  * r = a + (b AND m) over 24 words; the carry out of the top word is
;  * returned in rax (0 or 1).
;  *
;  * On entry (Microsoft x64):  rcx = r,  rdx = a,  r8 = b,  r9 = m.
;  *
;  * AND rewrites the carry flag, so the masked copy of b is first built in a
;  * 192-byte stack buffer and only then added to a with one add/adc chain.
;  * r8 is reused as scratch once b has been copied.
;  * The macros expand to the same instruction groups, in the same order, as
;  * the hand-unrolled form.
;  */

;; Mask two consecutive words of b (byte offset woff) into the stack buffer.
CADD3072_MASK2 MACRO woff
        mov     r10, QWORD PTR [r8+woff]
        mov     r11, QWORD PTR [r8+woff+8]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+woff], r10
        mov     QWORD PTR [rsp+woff+8], r11
ENDM

;; acc = a[woff] + masked b[woff] + CF, then store the previous result word.
;; The store comes after the load of a so the sequence matches the original
;; ordering word for word.
CADD3072_ADC MACRO acc, prev, woff
        mov     acc, QWORD PTR [rdx+woff]
        mov     r8, QWORD PTR [rsp+woff]
        adc     acc, r8
        mov     QWORD PTR [rcx+woff-8], prev
ENDM

sp_3072_cond_add_24 PROC
        sub     rsp, 192                        ; scratch for 24 masked words
        mov     rax, 0                          ; becomes the carry result
        ; Pass 1: stack[0..23] = b[0..23] AND m
        CADD3072_MASK2 0
        CADD3072_MASK2 16
        CADD3072_MASK2 32
        CADD3072_MASK2 48
        CADD3072_MASK2 64
        CADD3072_MASK2 80
        CADD3072_MASK2 96
        CADD3072_MASK2 112
        CADD3072_MASK2 128
        CADD3072_MASK2 144
        CADD3072_MASK2 160
        CADD3072_MASK2 176
        ; Pass 2: r = a + stack, carry rippling through all 24 words.
        mov     r10, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rsp]
        add     r10, r8                         ; word 0 starts the chain
        CADD3072_ADC r11, r10, 8
        CADD3072_ADC r10, r11, 16
        CADD3072_ADC r11, r10, 24
        CADD3072_ADC r10, r11, 32
        CADD3072_ADC r11, r10, 40
        CADD3072_ADC r10, r11, 48
        CADD3072_ADC r11, r10, 56
        CADD3072_ADC r10, r11, 64
        CADD3072_ADC r11, r10, 72
        CADD3072_ADC r10, r11, 80
        CADD3072_ADC r11, r10, 88
        CADD3072_ADC r10, r11, 96
        CADD3072_ADC r11, r10, 104
        CADD3072_ADC r10, r11, 112
        CADD3072_ADC r11, r10, 120
        CADD3072_ADC r10, r11, 128
        CADD3072_ADC r11, r10, 136
        CADD3072_ADC r10, r11, 144
        CADD3072_ADC r11, r10, 152
        CADD3072_ADC r10, r11, 160
        CADD3072_ADC r11, r10, 168
        CADD3072_ADC r10, r11, 176
        CADD3072_ADC r11, r10, 184
        mov     QWORD PTR [rcx+184], r11        ; top result word
        adc     rax, 0                          ; rax = final carry
        add     rsp, 192
        ret
sp_3072_cond_add_24 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally add a and b using the mask m.
|
|
; * m is -1 to add and 0 when not.
|
|
; *
|
|
; * r A single precision number representing conditional add result.
|
|
; * a A single precision number to add with.
|
|
; * b A single precision number to add.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; /* sp_3072_cond_add_avx2_24
;  *
;  * r = a + (b filtered by m) over 24 words; the carry out of the top word
;  * is returned in rax (0 or 1).
;  *
;  * On entry (Microsoft x64):  rcx = r,  rdx = a,  r8 = b,  r9 = m.
;  *
;  * pext is used to apply the mask: with m = all ones it returns the word
;  * unchanged, with m = 0 it returns 0, and it leaves the flags alone, so
;  * masking can sit inside the adc chain without a stack buffer.
;  * r10, r11 and r12 rotate through three roles (masked b, accumulator,
;  * previous result); r12 is non-volatile and is saved/restored.
;  */

;; One word of the chain at byte offset woff:
;;   bm   = b[woff] filtered by the mask in r9
;;   acc  = a[woff] + bm + CF
;;   prev = result of the previous word, stored before the adc
CADDX3072_STEP MACRO bm, acc, prev, woff
        mov     bm, QWORD PTR [r8+woff]
        mov     acc, QWORD PTR [rdx+woff]
        pext    bm, bm, r9
        mov     QWORD PTR [rcx+woff-8], prev
        adc     acc, bm
ENDM

sp_3072_cond_add_avx2_24 PROC
        push    r12
        mov     rax, 0                          ; becomes the carry result
        ; Word 0 opens the carry chain with a plain add.
        mov     r12, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    r12, r12, r9
        add     r10, r12
        ; Words 1..23.
        CADDX3072_STEP r12, r11, r10, 8
        CADDX3072_STEP r10, r12, r11, 16
        CADDX3072_STEP r11, r10, r12, 24
        CADDX3072_STEP r12, r11, r10, 32
        CADDX3072_STEP r10, r12, r11, 40
        CADDX3072_STEP r11, r10, r12, 48
        CADDX3072_STEP r12, r11, r10, 56
        CADDX3072_STEP r10, r12, r11, 64
        CADDX3072_STEP r11, r10, r12, 72
        CADDX3072_STEP r12, r11, r10, 80
        CADDX3072_STEP r10, r12, r11, 88
        CADDX3072_STEP r11, r10, r12, 96
        CADDX3072_STEP r12, r11, r10, 104
        CADDX3072_STEP r10, r12, r11, 112
        CADDX3072_STEP r11, r10, r12, 120
        CADDX3072_STEP r12, r11, r10, 128
        CADDX3072_STEP r10, r12, r11, 136
        CADDX3072_STEP r11, r10, r12, 144
        CADDX3072_STEP r12, r11, r10, 152
        CADDX3072_STEP r10, r12, r11, 160
        CADDX3072_STEP r11, r10, r12, 168
        CADDX3072_STEP r12, r11, r10, 176
        CADDX3072_STEP r10, r12, r11, 184
        mov     QWORD PTR [rcx+184], r12        ; top result word
        adc     rax, 0                          ; rax = final carry
        pop     r12
        ret
sp_3072_cond_add_avx2_24 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Shift number left by n bit. (r = a << n)
|
|
; *
|
|
; * r Result of left shift by n.
|
|
; * a Number to shift.
|
|
; * n Amount to shift.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; /* sp_3072_lshift_48
;  *
;  * r = a << n for a 48-word a.  49 words are written: r[48] receives the
;  * bits shifted out of a[47].
;  *
;  * On entry (Microsoft x64):  rcx = r,  rdx = a,  r8 = n.
;  *
;  * The shift count is consumed through cl, so r is first moved to r9 and n
;  * to rcx.  Words are processed from the most significant end downwards in
;  * groups of four; r11 and r13 take turns carrying the lowest word of one
;  * group into the next so every source word is loaded exactly once.
;  * r12 and r13 are non-volatile and are saved/restored.
;  */

;; One four-word group whose lowest source word sits at byte offset woff.
;;   wlo : receives a[woff]; it is only read here and is finished by the
;;         next (lower) group
;;   whi : a[woff+32], loaded by the previous (higher) group; finished here
LSH3072_GROUP MACRO woff, wlo, whi
        mov     wlo, QWORD PTR [rdx+woff]
        mov     rax, QWORD PTR [rdx+woff+8]
        mov     r8, QWORD PTR [rdx+woff+16]
        mov     r10, QWORD PTR [rdx+woff+24]
        shld    whi, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, wlo, cl
        mov     QWORD PTR [r9+woff+8], rax
        mov     QWORD PTR [r9+woff+16], r8
        mov     QWORD PTR [r9+woff+24], r10
        mov     QWORD PTR [r9+woff+32], whi
ENDM

sp_3072_lshift_48 PROC
        push    r12
        push    r13
        mov     r9, rcx                         ; r9 = r
        mov     rcx, r8                         ; cl = n
        mov     r12, 0                          ; will collect the overflow word
        ; Top group: a[43..47] -> r[44..48]
        mov     r13, QWORD PTR [rdx+344]
        mov     rax, QWORD PTR [rdx+352]
        mov     r8, QWORD PTR [rdx+360]
        mov     r10, QWORD PTR [rdx+368]
        mov     r11, QWORD PTR [rdx+376]
        shld    r12, r11, cl
        shld    r11, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r13, cl
        mov     QWORD PTR [r9+352], rax
        mov     QWORD PTR [r9+360], r8
        mov     QWORD PTR [r9+368], r10
        mov     QWORD PTR [r9+376], r11
        mov     QWORD PTR [r9+384], r12
        ; Ten regular groups covering r[4..43].
        LSH3072_GROUP 312, r11, r13
        LSH3072_GROUP 280, r13, r11
        LSH3072_GROUP 248, r11, r13
        LSH3072_GROUP 216, r13, r11
        LSH3072_GROUP 184, r11, r13
        LSH3072_GROUP 152, r13, r11
        LSH3072_GROUP 120, r11, r13
        LSH3072_GROUP 88,  r13, r11
        LSH3072_GROUP 56,  r11, r13
        LSH3072_GROUP 24,  r13, r11
        ; Bottom group: a[0..2] plus the carried a[3]; the lowest word takes
        ; a plain shl so zeros are shifted in.
        mov     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        shld    r13, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shl     rax, cl
        mov     QWORD PTR [r9], rax
        mov     QWORD PTR [r9+8], r8
        mov     QWORD PTR [r9+16], r10
        mov     QWORD PTR [r9+24], r13
        pop     r13
        pop     r12
        ret
sp_3072_lshift_48 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFDEF WOLFSSL_SP_4096
|
|
IFDEF WOLFSSL_SP_4096
|
|
; /* Read big endian unsigned byte array into r.
|
|
; * Uses the bswap instruction.
|
|
; *
|
|
; * r A single precision integer.
|
|
; * size Maximum number of bytes to convert
|
|
; * a Byte array.
|
|
; * n Number of bytes in array to read.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; /* sp_4096_from_bin_bswap
;  *
;  * Convert the n-byte big-endian array a into little-endian 64-bit words
;  * in r, then zero-fill r up to 512 bytes.  Byte order is reversed with
;  * bswap.
;  *
;  * On entry (Microsoft x64):  rcx = r,  rdx = size,  r8 = a,  r9 = n.
;  * rdx (size) is not read by this routine; the end of the output is fixed
;  * at r + 512.
;  *
;  * Register roles:
;  *   rcx  write cursor in r          r12  r + 512 (end of output)
;  *   r11  read cursor, starts at a+n and moves towards a
;  *   r9   bytes still to convert     r13  constant zero
;  *   r8   stays at a; used to read the leftover leading bytes
;  *
;  * NOTE(review): n is used as a full 64-bit signed value (add r11, r9 and
;  * the jg tests).  If the C prototype declares n as a 32-bit int, confirm
;  * that callers always pass it extended to 64 bits.
;  * NOTE(review): the zero-fill loop compares addresses with a signed
;  * condition (jl) - harmless for ordinary buffers, but worth confirming.
;  */

;; Convert 16 source bytes into two output words at r + doff.
;; The source is read back to front, hence the mirrored offsets.
FBB4096_PAIR MACRO doff
        mov     rax, QWORD PTR [r11+56-doff]
        mov     r10, QWORD PTR [r11+48-doff]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+doff], rax
        mov     QWORD PTR [rcx+doff+8], r10
ENDM

sp_4096_from_bin_bswap PROC
        push    r12
        push    r13
        mov     r11, r8
        mov     r12, rcx
        add     r11, r9                         ; r11 = a + n
        add     r12, 512                        ; r12 = end of r
        xor     r13, r13
        jmp     L_4096_from_bin_bswap_blk64_test
        ; ---- 64 bytes (eight words) per iteration -------------------------
L_4096_from_bin_bswap_blk64_body:
        sub     r11, 64
        FBB4096_PAIR 0
        FBB4096_PAIR 16
        FBB4096_PAIR 32
        FBB4096_PAIR 48
        add     rcx, 64
        sub     r9, 64
L_4096_from_bin_bswap_blk64_test:
        cmp     r9, 63
        jg      L_4096_from_bin_bswap_blk64_body
        jmp     L_4096_from_bin_bswap_word_test
        ; ---- one word per iteration ---------------------------------------
L_4096_from_bin_bswap_word_body:
        sub     r11, 8
        mov     rax, QWORD PTR [r11]
        bswap   rax
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_4096_from_bin_bswap_word_test:
        cmp     r9, 7
        jg      L_4096_from_bin_bswap_word_body
        ; ---- 1..7 leftover bytes at the very start of a --------------------
        cmp     r9, r13
        je      L_4096_from_bin_bswap_tail_done
        mov     r10, r13                        ; partial word accumulator
        mov     rax, r13                        ; clear rax so only al is live
L_4096_from_bin_bswap_tail_byte:
        mov     al, BYTE PTR [r8]
        shl     r10, 8
        inc     r8
        add     r10, rax                        ; r10 = (r10 << 8) + byte
        dec     r9
        jg      L_4096_from_bin_bswap_tail_byte
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_4096_from_bin_bswap_tail_done:
        ; ---- zero-fill the rest of r ---------------------------------------
        cmp     rcx, r12
        je      L_4096_from_bin_bswap_fill_done
L_4096_from_bin_bswap_fill_word:
        mov     QWORD PTR [rcx], r13
        add     rcx, 8
        cmp     rcx, r12
        jl      L_4096_from_bin_bswap_fill_word
L_4096_from_bin_bswap_fill_done:
        pop     r13
        pop     r12
        ret
sp_4096_from_bin_bswap ENDP
|
|
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
; /* Read a big endian unsigned byte array into r.
;  * Uses the movbe instruction, which is optional on x86-64; the routine is
;  * only assembled when NO_MOVBE_SUPPORT is not defined.
;  *
;  * rcx  r     Destination. 512 bytes (64 words of 64 bits) are always
;  *            written: converted words first, least significant word at
;  *            offset 0, then zero words up to r + 512.
;  * rdx  size  Maximum number of bytes to convert. This register is never
;  *            read here; the capacity is the fixed 512 bytes.
;  * r8   a     Byte array, most significant byte first.
;  * r9   n     Number of bytes in array to read. Compared as a signed
;  *            64-bit value.
;  *
;  * NOTE(review): nothing in this routine checks n against 512 - callers
;  * presumably guarantee 0 <= n <= 512; confirm against the C wrapper.
;  * NOTE(review): all 64 bits of r9 are used; if the C prototype declares n
;  * as a 32-bit int, confirm the caller extends it.
;  *
;  * Scratch: rax, rcx, r8, r9, r10, r11, flags. r12 and r13 are saved on
;  * entry and restored before returning.
;  */
_text SEGMENT READONLY PARA
sp_4096_from_bin_movbe PROC
        push    r12
        push    r13
        mov     r11, r8
        mov     r12, rcx
        add     r11, r9                         ; r11 = a + n, one past the last input byte
        add     r12, 512                        ; r12 = r + 512, one past the last output word
        xor     r13, r13                        ; r13 = 0, reused as the zero constant below
        jmp     L_4096_from_bin_movbe_64_end
L_4096_from_bin_movbe_64_start:
        ; Take 64 bytes from the tail of the input and emit 8 words.
        ; movbe byte-swaps on load, so the last 8 input bytes become the
        ; lowest of the 8 words.
        sub     r11, 64
        movbe   rax, QWORD PTR [r11+56]
        movbe   r10, QWORD PTR [r11+48]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        movbe   rax, QWORD PTR [r11+40]
        movbe   r10, QWORD PTR [r11+32]
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        movbe   rax, QWORD PTR [r11+24]
        movbe   r10, QWORD PTR [r11+16]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        movbe   rax, QWORD PTR [r11+8]
        movbe   r10, QWORD PTR [r11]
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_4096_from_bin_movbe_64_end:
        cmp     r9, 63
        jg      L_4096_from_bin_movbe_64_start  ; at least 64 bytes left
        jmp     L_4096_from_bin_movbe_8_end
L_4096_from_bin_movbe_8_start:
        ; Same walk, one 8-byte word at a time.
        sub     r11, 8
        movbe   rax, QWORD PTR [r11]
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_4096_from_bin_movbe_8_end:
        cmp     r9, 7
        jg      L_4096_from_bin_movbe_8_start   ; at least 8 bytes left
        cmp     r9, r13
        je      L_4096_from_bin_movbe_hi_end    ; no partial word remains
        ; The remaining bytes are the leading (most significant) bytes of
        ; the array. Fold them, in order from a[0], into one word.
        mov     r10, r13
L_4096_from_bin_movbe_hi_start:
        movzx   eax, BYTE PTR [r8]              ; zero-extended load, no partial-register merge
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_4096_from_bin_movbe_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_4096_from_bin_movbe_hi_end:
        ; Zero the rest of r. rcx and r12 are addresses, so they are
        ; compared unsigned; jae also skips the loop if rcx is already at
        ; or past the end.
        cmp     rcx, r12
        jae     L_4096_from_bin_movbe_zero_end
L_4096_from_bin_movbe_zero_start:
        mov     QWORD PTR [rcx], r13
        add     rcx, 8
        cmp     rcx, r12
        jb      L_4096_from_bin_movbe_zero_start
L_4096_from_bin_movbe_zero_end:
        pop     r13
        pop     r12
        ret
sp_4096_from_bin_movbe ENDP
_text ENDS
ENDIF
|
|
; /* Write r to a byte array as a big endian number.
;  * Exactly 512 bytes are written. Byte order within each word is reversed
;  * with the bswap instruction.
;  *
;  * rcx  r  A single precision integer: 64 words of 64 bits, read from
;  *         offset 504 down to offset 0.
;  * rdx  a  Byte array receiving the 512 output bytes; the word at r + 504
;  *         lands at a[0..7] and the word at r + 0 lands at a[504..511].
;  *
;  * Scratch: rax, r8. rcx and rdx are not modified; no stack is used.
;  */
_text SEGMENT READONLY PARA
sp_4096_to_bin_bswap_64 PROC
        ; Assembly-time cursors. The REPT block is unrolled by the assembler
        ; into 32 straight-line groups, each moving two words.
        sp_4096_tbb_src = 504                   ; offset in r of the higher word of the pair
        sp_4096_tbb_dst = 0                     ; offset in a of the first output slot
        REPT 32
        mov     rax, QWORD PTR [rcx+sp_4096_tbb_src]
        mov     r8, QWORD PTR [rcx+sp_4096_tbb_src-8]
        bswap   rax
        bswap   r8
        mov     QWORD PTR [rdx+sp_4096_tbb_dst], rax
        mov     QWORD PTR [rdx+sp_4096_tbb_dst+8], r8
        sp_4096_tbb_src = sp_4096_tbb_src - 16
        sp_4096_tbb_dst = sp_4096_tbb_dst + 16
        ENDM
        ret
sp_4096_to_bin_bswap_64 ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
; /* Write r to a byte array as a big endian number.
;  * Exactly 512 bytes are written. Uses the movbe instruction, which is
;  * optional; the routine is only assembled when NO_MOVBE_SUPPORT is not
;  * defined.
;  *
;  * rcx  r  A single precision integer: 64 words of 64 bits, read from
;  *         offset 504 down to offset 0.
;  * rdx  a  Byte array receiving the 512 output bytes; the word at r + 504
;  *         lands at a[0..7] and the word at r + 0 lands at a[504..511].
;  *
;  * Scratch: rax, r8. rcx and rdx are not modified; no stack is used.
;  */
_text SEGMENT READONLY PARA
sp_4096_to_bin_movbe_64 PROC
        ; Assembly-time cursors. The REPT block is unrolled by the assembler
        ; into 32 straight-line groups; movbe swaps the bytes on load, so
        ; each group is two loads followed by two plain stores.
        sp_4096_tbm_src = 504                   ; offset in r of the higher word of the pair
        sp_4096_tbm_dst = 0                     ; offset in a of the first output slot
        REPT 32
        movbe   rax, QWORD PTR [rcx+sp_4096_tbm_src]
        movbe   r8, QWORD PTR [rcx+sp_4096_tbm_src-8]
        mov     QWORD PTR [rdx+sp_4096_tbm_dst], rax
        mov     QWORD PTR [rdx+sp_4096_tbm_dst+8], r8
        sp_4096_tbm_src = sp_4096_tbm_src - 16
        sp_4096_tbm_dst = sp_4096_tbm_dst + 16
        ENDM
        ret
sp_4096_to_bin_movbe_64 ENDP
_text ENDS
ENDIF
|
|
; /* Subtract b from a, storing the difference back into a. (a -= b)
;  * Operates on 64 words of 64 bits, lowest word first.
;  *
;  * rcx  a  A single precision integer; minuend and destination.
;  * rdx  b  A single precision integer; subtrahend.
;  *
;  * Returns rax = 0 when no borrow comes out of the top word, or all ones
;  * (-1) when one does.
;  * Scratch: r8, r9, flags. rcx and rdx are not modified.
;  */
_text SEGMENT READONLY PARA
sp_4096_sub_in_place_64 PROC
        ; Word 0 opens the borrow chain. Only mov sits between the sub/sbb
        ; steps below, and mov leaves CF untouched, so the borrow survives.
        mov     r8, QWORD PTR [rcx]
        xor     rax, rax
        sub     r8, QWORD PTR [rdx]
        ; Words 1..62, two per repetition, unrolled at assembly time. Each
        ; step loads the next word of a before storing the previous result,
        ; alternating r9 and r8 as the working register.
        sp_4096_sip_off = 8                     ; offset of the odd-numbered word of the pair
        REPT 31
        mov     r9, QWORD PTR [rcx+sp_4096_sip_off]
        mov     QWORD PTR [rcx+sp_4096_sip_off-8], r8
        sbb     r9, QWORD PTR [rdx+sp_4096_sip_off]
        mov     r8, QWORD PTR [rcx+sp_4096_sip_off+8]
        mov     QWORD PTR [rcx+sp_4096_sip_off], r9
        sbb     r8, QWORD PTR [rdx+sp_4096_sip_off+8]
        sp_4096_sip_off = sp_4096_sip_off + 16
        ENDM
        ; Word 63 closes the chain; then fold the final borrow into rax.
        mov     r9, QWORD PTR [rcx+504]
        mov     QWORD PTR [rcx+496], r8
        sbb     r9, QWORD PTR [rdx+504]
        mov     QWORD PTR [rcx+504], r9
        sbb     rax, 0
        ret
sp_4096_sub_in_place_64 ENDP
_text ENDS
|
|
; /* Add a and b, storing the sum in r. (r = a + b)
;  * Operates on 64 words of 64 bits, lowest word first.
;  *
;  * rcx  r  A single precision integer; receives the 64-word sum.
;  * rdx  a  A single precision integer; first addend.
;  * r8   b  A single precision integer; second addend.
;  *
;  * Returns rax = carry out of the top word (0 or 1).
;  * Scratch: r9, r10, flags. rcx, rdx and r8 are not modified.
;  */
_text SEGMENT READONLY PARA
sp_4096_add_64 PROC
        ; Word 0 opens the carry chain. Only mov sits between the add/adc
        ; steps below, and mov leaves CF untouched, so the carry survives.
        mov     r9, QWORD PTR [rdx]
        xor     rax, rax
        add     r9, QWORD PTR [r8]
        ; Words 1..62, two per repetition, unrolled at assembly time. Each
        ; step loads the next word of a before storing the previous sum,
        ; alternating r10 and r9 as the working register.
        sp_4096_add_off = 8                     ; offset of the odd-numbered word of the pair
        REPT 31
        mov     r10, QWORD PTR [rdx+sp_4096_add_off]
        mov     QWORD PTR [rcx+sp_4096_add_off-8], r9
        adc     r10, QWORD PTR [r8+sp_4096_add_off]
        mov     r9, QWORD PTR [rdx+sp_4096_add_off+8]
        mov     QWORD PTR [rcx+sp_4096_add_off], r10
        adc     r9, QWORD PTR [r8+sp_4096_add_off+8]
        sp_4096_add_off = sp_4096_add_off + 16
        ENDM
        ; Word 63 closes the chain; then capture the final carry in rax.
        mov     r10, QWORD PTR [rdx+504]
        mov     QWORD PTR [rcx+496], r9
        adc     r10, QWORD PTR [r8+504]
        mov     QWORD PTR [rcx+504], r10
        adc     rax, 0
        ret
sp_4096_add_64 ENDP
_text ENDS
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_4096_mul_64 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 1576
|
|
mov QWORD PTR [rsp+1536], rcx
|
|
mov QWORD PTR [rsp+1544], rdx
|
|
mov QWORD PTR [rsp+1552], r8
|
|
lea r12, QWORD PTR [rsp+1024]
|
|
lea r14, QWORD PTR [rdx+256]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r15, r15
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r12], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov rax, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
adc rax, QWORD PTR [r14+192]
|
|
mov r9, QWORD PTR [rdx+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
adc r9, QWORD PTR [r14+200]
|
|
mov r10, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
adc r10, QWORD PTR [r14+208]
|
|
mov rax, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
adc rax, QWORD PTR [r14+216]
|
|
mov r9, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
adc r9, QWORD PTR [r14+224]
|
|
mov r10, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
adc r10, QWORD PTR [r14+232]
|
|
mov rax, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
adc rax, QWORD PTR [r14+240]
|
|
mov r9, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
adc r9, QWORD PTR [r14+248]
|
|
mov QWORD PTR [r12+248], r9
|
|
adc r15, 0
|
|
mov QWORD PTR [rsp+1560], r15
|
|
lea r13, QWORD PTR [rsp+1280]
|
|
lea r14, QWORD PTR [r8+256]
|
|
; Add
|
|
mov rax, QWORD PTR [r8]
|
|
xor rdi, rdi
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [r8+8]
|
|
mov QWORD PTR [r13], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov QWORD PTR [r13+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mov QWORD PTR [r13+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [r8+32]
|
|
mov QWORD PTR [r13+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov QWORD PTR [r13+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mov QWORD PTR [r13+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [r8+56]
|
|
mov QWORD PTR [r13+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov QWORD PTR [r13+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mov QWORD PTR [r13+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [r8+80]
|
|
mov QWORD PTR [r13+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [r13+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mov QWORD PTR [r13+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [r8+104]
|
|
mov QWORD PTR [r13+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov QWORD PTR [r13+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mov QWORD PTR [r13+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [r8+128]
|
|
mov QWORD PTR [r13+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [r8+136]
|
|
mov QWORD PTR [r13+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [r8+144]
|
|
mov QWORD PTR [r13+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [r8+152]
|
|
mov QWORD PTR [r13+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [r8+160]
|
|
mov QWORD PTR [r13+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [r8+168]
|
|
mov QWORD PTR [r13+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [r8+176]
|
|
mov QWORD PTR [r13+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [r8+184]
|
|
mov QWORD PTR [r13+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov rax, QWORD PTR [r8+192]
|
|
mov QWORD PTR [r13+184], r10
|
|
adc rax, QWORD PTR [r14+192]
|
|
mov r9, QWORD PTR [r8+200]
|
|
mov QWORD PTR [r13+192], rax
|
|
adc r9, QWORD PTR [r14+200]
|
|
mov r10, QWORD PTR [r8+208]
|
|
mov QWORD PTR [r13+200], r9
|
|
adc r10, QWORD PTR [r14+208]
|
|
mov rax, QWORD PTR [r8+216]
|
|
mov QWORD PTR [r13+208], r10
|
|
adc rax, QWORD PTR [r14+216]
|
|
mov r9, QWORD PTR [r8+224]
|
|
mov QWORD PTR [r13+216], rax
|
|
adc r9, QWORD PTR [r14+224]
|
|
mov r10, QWORD PTR [r8+232]
|
|
mov QWORD PTR [r13+224], r9
|
|
adc r10, QWORD PTR [r14+232]
|
|
mov rax, QWORD PTR [r8+240]
|
|
mov QWORD PTR [r13+232], r10
|
|
adc rax, QWORD PTR [r14+240]
|
|
mov r9, QWORD PTR [r8+248]
|
|
mov QWORD PTR [r13+240], rax
|
|
adc r9, QWORD PTR [r14+248]
|
|
mov QWORD PTR [r13+248], r9
|
|
adc rdi, 0
|
|
mov QWORD PTR [rsp+1568], rdi
|
|
mov r8, r13
|
|
mov rdx, r12
|
|
mov rcx, rsp
|
|
call sp_2048_mul_32
|
|
mov r8, QWORD PTR [rsp+1552]
|
|
mov rdx, QWORD PTR [rsp+1544]
|
|
lea rcx, QWORD PTR [rsp+512]
|
|
add r8, 256
|
|
add rdx, 256
|
|
call sp_2048_mul_32
|
|
mov r8, QWORD PTR [rsp+1552]
|
|
mov rdx, QWORD PTR [rsp+1544]
|
|
mov rcx, QWORD PTR [rsp+1536]
|
|
call sp_2048_mul_32
|
|
IFDEF _WIN64
|
|
mov r8, QWORD PTR [rsp+1552]
|
|
mov rdx, QWORD PTR [rsp+1544]
|
|
mov rcx, QWORD PTR [rsp+1536]
|
|
ENDIF
|
|
mov r15, QWORD PTR [rsp+1560]
|
|
mov rdi, QWORD PTR [rsp+1568]
|
|
mov rsi, QWORD PTR [rsp+1536]
|
|
mov r11, r15
|
|
lea r12, QWORD PTR [rsp+1024]
|
|
lea r13, QWORD PTR [rsp+1280]
|
|
and r11, rdi
|
|
neg r15
|
|
neg rdi
|
|
add rsi, 512
|
|
mov rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [r13]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12], rax
|
|
mov QWORD PTR [r13], r9
|
|
mov rax, QWORD PTR [r12+8]
|
|
mov r9, QWORD PTR [r13+8]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+8], rax
|
|
mov QWORD PTR [r13+8], r9
|
|
mov rax, QWORD PTR [r12+16]
|
|
mov r9, QWORD PTR [r13+16]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+16], rax
|
|
mov QWORD PTR [r13+16], r9
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [r13+24]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+24], rax
|
|
mov QWORD PTR [r13+24], r9
|
|
mov rax, QWORD PTR [r12+32]
|
|
mov r9, QWORD PTR [r13+32]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+32], rax
|
|
mov QWORD PTR [r13+32], r9
|
|
mov rax, QWORD PTR [r12+40]
|
|
mov r9, QWORD PTR [r13+40]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+40], rax
|
|
mov QWORD PTR [r13+40], r9
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [r13+48]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+48], rax
|
|
mov QWORD PTR [r13+48], r9
|
|
mov rax, QWORD PTR [r12+56]
|
|
mov r9, QWORD PTR [r13+56]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+56], rax
|
|
mov QWORD PTR [r13+56], r9
|
|
mov rax, QWORD PTR [r12+64]
|
|
mov r9, QWORD PTR [r13+64]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+64], rax
|
|
mov QWORD PTR [r13+64], r9
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [r13+72]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+72], rax
|
|
mov QWORD PTR [r13+72], r9
|
|
mov rax, QWORD PTR [r12+80]
|
|
mov r9, QWORD PTR [r13+80]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+80], rax
|
|
mov QWORD PTR [r13+80], r9
|
|
mov rax, QWORD PTR [r12+88]
|
|
mov r9, QWORD PTR [r13+88]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+88], rax
|
|
mov QWORD PTR [r13+88], r9
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [r13+96]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+96], rax
|
|
mov QWORD PTR [r13+96], r9
|
|
mov rax, QWORD PTR [r12+104]
|
|
mov r9, QWORD PTR [r13+104]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+104], rax
|
|
mov QWORD PTR [r13+104], r9
|
|
mov rax, QWORD PTR [r12+112]
|
|
mov r9, QWORD PTR [r13+112]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+112], rax
|
|
mov QWORD PTR [r13+112], r9
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [r13+120]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+120], rax
|
|
mov QWORD PTR [r13+120], r9
|
|
mov rax, QWORD PTR [r12+128]
|
|
mov r9, QWORD PTR [r13+128]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+128], rax
|
|
mov QWORD PTR [r13+128], r9
|
|
mov rax, QWORD PTR [r12+136]
|
|
mov r9, QWORD PTR [r13+136]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+136], rax
|
|
mov QWORD PTR [r13+136], r9
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [r13+144]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+144], rax
|
|
mov QWORD PTR [r13+144], r9
|
|
mov rax, QWORD PTR [r12+152]
|
|
mov r9, QWORD PTR [r13+152]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+152], rax
|
|
mov QWORD PTR [r13+152], r9
|
|
mov rax, QWORD PTR [r12+160]
|
|
mov r9, QWORD PTR [r13+160]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+160], rax
|
|
mov QWORD PTR [r13+160], r9
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [r13+168]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+168], rax
|
|
mov QWORD PTR [r13+168], r9
|
|
mov rax, QWORD PTR [r12+176]
|
|
mov r9, QWORD PTR [r13+176]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+176], rax
|
|
mov QWORD PTR [r13+176], r9
|
|
mov rax, QWORD PTR [r12+184]
|
|
mov r9, QWORD PTR [r13+184]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+184], rax
|
|
mov QWORD PTR [r13+184], r9
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [r13+192]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+192], rax
|
|
mov QWORD PTR [r13+192], r9
|
|
mov rax, QWORD PTR [r12+200]
|
|
mov r9, QWORD PTR [r13+200]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+200], rax
|
|
mov QWORD PTR [r13+200], r9
|
|
mov rax, QWORD PTR [r12+208]
|
|
mov r9, QWORD PTR [r13+208]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+208], rax
|
|
mov QWORD PTR [r13+208], r9
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [r13+216]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+216], rax
|
|
mov QWORD PTR [r13+216], r9
|
|
mov rax, QWORD PTR [r12+224]
|
|
mov r9, QWORD PTR [r13+224]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+224], rax
|
|
mov QWORD PTR [r13+224], r9
|
|
mov rax, QWORD PTR [r12+232]
|
|
mov r9, QWORD PTR [r13+232]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+232], rax
|
|
mov QWORD PTR [r13+232], r9
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [r13+240]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+240], rax
|
|
mov QWORD PTR [r13+240], r9
|
|
mov rax, QWORD PTR [r12+248]
|
|
mov r9, QWORD PTR [r13+248]
|
|
and rax, rdi
|
|
and r9, r15
|
|
mov QWORD PTR [r12+248], rax
|
|
mov QWORD PTR [r13+248], r9
|
|
mov rax, QWORD PTR [r12]
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r13+248]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r11, 0
|
|
lea r13, QWORD PTR [rsp+512]
|
|
mov r12, rsp
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [r13+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [r13+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [r13+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [r13+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [r13+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [r13+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [r13+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [r13+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [r13+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [r13+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [r13+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [r13+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [r13+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [r13+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [r13+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [r13+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [r13+376]
|
|
mov rax, QWORD PTR [r12+384]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb rax, QWORD PTR [r13+384]
|
|
mov r9, QWORD PTR [r12+392]
|
|
mov QWORD PTR [r12+384], rax
|
|
sbb r9, QWORD PTR [r13+392]
|
|
mov r10, QWORD PTR [r12+400]
|
|
mov QWORD PTR [r12+392], r9
|
|
sbb r10, QWORD PTR [r13+400]
|
|
mov rax, QWORD PTR [r12+408]
|
|
mov QWORD PTR [r12+400], r10
|
|
sbb rax, QWORD PTR [r13+408]
|
|
mov r9, QWORD PTR [r12+416]
|
|
mov QWORD PTR [r12+408], rax
|
|
sbb r9, QWORD PTR [r13+416]
|
|
mov r10, QWORD PTR [r12+424]
|
|
mov QWORD PTR [r12+416], r9
|
|
sbb r10, QWORD PTR [r13+424]
|
|
mov rax, QWORD PTR [r12+432]
|
|
mov QWORD PTR [r12+424], r10
|
|
sbb rax, QWORD PTR [r13+432]
|
|
mov r9, QWORD PTR [r12+440]
|
|
mov QWORD PTR [r12+432], rax
|
|
sbb r9, QWORD PTR [r13+440]
|
|
mov r10, QWORD PTR [r12+448]
|
|
mov QWORD PTR [r12+440], r9
|
|
sbb r10, QWORD PTR [r13+448]
|
|
mov rax, QWORD PTR [r12+456]
|
|
mov QWORD PTR [r12+448], r10
|
|
sbb rax, QWORD PTR [r13+456]
|
|
mov r9, QWORD PTR [r12+464]
|
|
mov QWORD PTR [r12+456], rax
|
|
sbb r9, QWORD PTR [r13+464]
|
|
mov r10, QWORD PTR [r12+472]
|
|
mov QWORD PTR [r12+464], r9
|
|
sbb r10, QWORD PTR [r13+472]
|
|
mov rax, QWORD PTR [r12+480]
|
|
mov QWORD PTR [r12+472], r10
|
|
sbb rax, QWORD PTR [r13+480]
|
|
mov r9, QWORD PTR [r12+488]
|
|
mov QWORD PTR [r12+480], rax
|
|
sbb r9, QWORD PTR [r13+488]
|
|
mov r10, QWORD PTR [r12+496]
|
|
mov QWORD PTR [r12+488], r9
|
|
sbb r10, QWORD PTR [r13+496]
|
|
mov rax, QWORD PTR [r12+504]
|
|
mov QWORD PTR [r12+496], r10
|
|
sbb rax, QWORD PTR [r13+504]
|
|
mov QWORD PTR [r12+504], rax
|
|
sbb r11, 0
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [rcx+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [rcx+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [rcx+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [rcx+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [rcx+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [rcx+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [rcx+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [rcx+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [rcx+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [rcx+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [rcx+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [rcx+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [rcx+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [rcx+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [rcx+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [rcx+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [rcx+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [rcx+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [rcx+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [rcx+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [rcx+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [rcx+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [rcx+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [rcx+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [rcx+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [rcx+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [rcx+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [rcx+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [rcx+376]
|
|
mov rax, QWORD PTR [r12+384]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb rax, QWORD PTR [rcx+384]
|
|
mov r9, QWORD PTR [r12+392]
|
|
mov QWORD PTR [r12+384], rax
|
|
sbb r9, QWORD PTR [rcx+392]
|
|
mov r10, QWORD PTR [r12+400]
|
|
mov QWORD PTR [r12+392], r9
|
|
sbb r10, QWORD PTR [rcx+400]
|
|
mov rax, QWORD PTR [r12+408]
|
|
mov QWORD PTR [r12+400], r10
|
|
sbb rax, QWORD PTR [rcx+408]
|
|
mov r9, QWORD PTR [r12+416]
|
|
mov QWORD PTR [r12+408], rax
|
|
sbb r9, QWORD PTR [rcx+416]
|
|
mov r10, QWORD PTR [r12+424]
|
|
mov QWORD PTR [r12+416], r9
|
|
sbb r10, QWORD PTR [rcx+424]
|
|
mov rax, QWORD PTR [r12+432]
|
|
mov QWORD PTR [r12+424], r10
|
|
sbb rax, QWORD PTR [rcx+432]
|
|
mov r9, QWORD PTR [r12+440]
|
|
mov QWORD PTR [r12+432], rax
|
|
sbb r9, QWORD PTR [rcx+440]
|
|
mov r10, QWORD PTR [r12+448]
|
|
mov QWORD PTR [r12+440], r9
|
|
sbb r10, QWORD PTR [rcx+448]
|
|
mov rax, QWORD PTR [r12+456]
|
|
mov QWORD PTR [r12+448], r10
|
|
sbb rax, QWORD PTR [rcx+456]
|
|
mov r9, QWORD PTR [r12+464]
|
|
mov QWORD PTR [r12+456], rax
|
|
sbb r9, QWORD PTR [rcx+464]
|
|
mov r10, QWORD PTR [r12+472]
|
|
mov QWORD PTR [r12+464], r9
|
|
sbb r10, QWORD PTR [rcx+472]
|
|
mov rax, QWORD PTR [r12+480]
|
|
mov QWORD PTR [r12+472], r10
|
|
sbb rax, QWORD PTR [rcx+480]
|
|
mov r9, QWORD PTR [r12+488]
|
|
mov QWORD PTR [r12+480], rax
|
|
sbb r9, QWORD PTR [rcx+488]
|
|
mov r10, QWORD PTR [r12+496]
|
|
mov QWORD PTR [r12+488], r9
|
|
sbb r10, QWORD PTR [rcx+496]
|
|
mov rax, QWORD PTR [r12+504]
|
|
mov QWORD PTR [r12+496], r10
|
|
sbb rax, QWORD PTR [rcx+504]
|
|
mov QWORD PTR [r12+504], rax
|
|
sbb r11, 0
|
|
sub rsi, 256
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r12+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r12+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r12+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r12+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r12+248]
|
|
mov r10, QWORD PTR [rsi+256]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r10, QWORD PTR [r12+256]
|
|
mov rax, QWORD PTR [rsi+264]
|
|
mov QWORD PTR [rsi+256], r10
|
|
adc rax, QWORD PTR [r12+264]
|
|
mov r9, QWORD PTR [rsi+272]
|
|
mov QWORD PTR [rsi+264], rax
|
|
adc r9, QWORD PTR [r12+272]
|
|
mov r10, QWORD PTR [rsi+280]
|
|
mov QWORD PTR [rsi+272], r9
|
|
adc r10, QWORD PTR [r12+280]
|
|
mov rax, QWORD PTR [rsi+288]
|
|
mov QWORD PTR [rsi+280], r10
|
|
adc rax, QWORD PTR [r12+288]
|
|
mov r9, QWORD PTR [rsi+296]
|
|
mov QWORD PTR [rsi+288], rax
|
|
adc r9, QWORD PTR [r12+296]
|
|
mov r10, QWORD PTR [rsi+304]
|
|
mov QWORD PTR [rsi+296], r9
|
|
adc r10, QWORD PTR [r12+304]
|
|
mov rax, QWORD PTR [rsi+312]
|
|
mov QWORD PTR [rsi+304], r10
|
|
adc rax, QWORD PTR [r12+312]
|
|
mov r9, QWORD PTR [rsi+320]
|
|
mov QWORD PTR [rsi+312], rax
|
|
adc r9, QWORD PTR [r12+320]
|
|
mov r10, QWORD PTR [rsi+328]
|
|
mov QWORD PTR [rsi+320], r9
|
|
adc r10, QWORD PTR [r12+328]
|
|
mov rax, QWORD PTR [rsi+336]
|
|
mov QWORD PTR [rsi+328], r10
|
|
adc rax, QWORD PTR [r12+336]
|
|
mov r9, QWORD PTR [rsi+344]
|
|
mov QWORD PTR [rsi+336], rax
|
|
adc r9, QWORD PTR [r12+344]
|
|
mov r10, QWORD PTR [rsi+352]
|
|
mov QWORD PTR [rsi+344], r9
|
|
adc r10, QWORD PTR [r12+352]
|
|
mov rax, QWORD PTR [rsi+360]
|
|
mov QWORD PTR [rsi+352], r10
|
|
adc rax, QWORD PTR [r12+360]
|
|
mov r9, QWORD PTR [rsi+368]
|
|
mov QWORD PTR [rsi+360], rax
|
|
adc r9, QWORD PTR [r12+368]
|
|
mov r10, QWORD PTR [rsi+376]
|
|
mov QWORD PTR [rsi+368], r9
|
|
adc r10, QWORD PTR [r12+376]
|
|
mov rax, QWORD PTR [rsi+384]
|
|
mov QWORD PTR [rsi+376], r10
|
|
adc rax, QWORD PTR [r12+384]
|
|
mov r9, QWORD PTR [rsi+392]
|
|
mov QWORD PTR [rsi+384], rax
|
|
adc r9, QWORD PTR [r12+392]
|
|
mov r10, QWORD PTR [rsi+400]
|
|
mov QWORD PTR [rsi+392], r9
|
|
adc r10, QWORD PTR [r12+400]
|
|
mov rax, QWORD PTR [rsi+408]
|
|
mov QWORD PTR [rsi+400], r10
|
|
adc rax, QWORD PTR [r12+408]
|
|
mov r9, QWORD PTR [rsi+416]
|
|
mov QWORD PTR [rsi+408], rax
|
|
adc r9, QWORD PTR [r12+416]
|
|
mov r10, QWORD PTR [rsi+424]
|
|
mov QWORD PTR [rsi+416], r9
|
|
adc r10, QWORD PTR [r12+424]
|
|
mov rax, QWORD PTR [rsi+432]
|
|
mov QWORD PTR [rsi+424], r10
|
|
adc rax, QWORD PTR [r12+432]
|
|
mov r9, QWORD PTR [rsi+440]
|
|
mov QWORD PTR [rsi+432], rax
|
|
adc r9, QWORD PTR [r12+440]
|
|
mov r10, QWORD PTR [rsi+448]
|
|
mov QWORD PTR [rsi+440], r9
|
|
adc r10, QWORD PTR [r12+448]
|
|
mov rax, QWORD PTR [rsi+456]
|
|
mov QWORD PTR [rsi+448], r10
|
|
adc rax, QWORD PTR [r12+456]
|
|
mov r9, QWORD PTR [rsi+464]
|
|
mov QWORD PTR [rsi+456], rax
|
|
adc r9, QWORD PTR [r12+464]
|
|
mov r10, QWORD PTR [rsi+472]
|
|
mov QWORD PTR [rsi+464], r9
|
|
adc r10, QWORD PTR [r12+472]
|
|
mov rax, QWORD PTR [rsi+480]
|
|
mov QWORD PTR [rsi+472], r10
|
|
adc rax, QWORD PTR [r12+480]
|
|
mov r9, QWORD PTR [rsi+488]
|
|
mov QWORD PTR [rsi+480], rax
|
|
adc r9, QWORD PTR [r12+488]
|
|
mov r10, QWORD PTR [rsi+496]
|
|
mov QWORD PTR [rsi+488], r9
|
|
adc r10, QWORD PTR [r12+496]
|
|
mov rax, QWORD PTR [rsi+504]
|
|
mov QWORD PTR [rsi+496], r10
|
|
adc rax, QWORD PTR [r12+504]
|
|
mov QWORD PTR [rsi+504], rax
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+768], r11
|
|
add rsi, 256
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
xor r11, r11
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r13+248]
|
|
mov r10, QWORD PTR [rsi+256]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r10, QWORD PTR [r13+256]
|
|
mov QWORD PTR [rsi+256], r10
|
|
adc r11, 0
|
|
; Add to zero
|
|
mov rax, QWORD PTR [r13+264]
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+272]
|
|
mov QWORD PTR [rsi+264], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+280]
|
|
mov QWORD PTR [rsi+272], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+288]
|
|
mov QWORD PTR [rsi+280], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+296]
|
|
mov QWORD PTR [rsi+288], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+304]
|
|
mov QWORD PTR [rsi+296], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+312]
|
|
mov QWORD PTR [rsi+304], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+320]
|
|
mov QWORD PTR [rsi+312], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+328]
|
|
mov QWORD PTR [rsi+320], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+336]
|
|
mov QWORD PTR [rsi+328], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+344]
|
|
mov QWORD PTR [rsi+336], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+352]
|
|
mov QWORD PTR [rsi+344], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+360]
|
|
mov QWORD PTR [rsi+352], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+368]
|
|
mov QWORD PTR [rsi+360], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+376]
|
|
mov QWORD PTR [rsi+368], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+384]
|
|
mov QWORD PTR [rsi+376], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+392]
|
|
mov QWORD PTR [rsi+384], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+400]
|
|
mov QWORD PTR [rsi+392], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+408]
|
|
mov QWORD PTR [rsi+400], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+416]
|
|
mov QWORD PTR [rsi+408], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+424]
|
|
mov QWORD PTR [rsi+416], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+432]
|
|
mov QWORD PTR [rsi+424], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+440]
|
|
mov QWORD PTR [rsi+432], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+448]
|
|
mov QWORD PTR [rsi+440], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+456]
|
|
mov QWORD PTR [rsi+448], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+464]
|
|
mov QWORD PTR [rsi+456], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+472]
|
|
mov QWORD PTR [rsi+464], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+480]
|
|
mov QWORD PTR [rsi+472], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+488]
|
|
mov QWORD PTR [rsi+480], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+496]
|
|
mov QWORD PTR [rsi+488], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+504]
|
|
mov QWORD PTR [rsi+496], r10
|
|
adc rax, 0
|
|
mov QWORD PTR [rsi+504], rax
|
|
add rsp, 1576
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_4096_mul_64 ENDP
|
|
_text ENDS
|
|
; /* Double a 2048-bit number. (r = a + a)
; *
; * Register use, as read from the code below (these are the first two
; * integer argument registers of the Microsoft x64 convention):
; *   rcx  r  Result buffer; 32 64-bit words are written.
; *   rdx  a  Operand; 32 64-bit words are read.
; *
; * Returns the carry out of the most significant word (0 or 1) in rax.
; * Scratch registers: r8 and r9. The stack is not touched.
; *
; * Word i+1 of a is always loaded before word i of r is stored, and only
; * mov instructions sit between the add/adc steps so the carry flag
; * survives from one word to the next.
; */
_text SEGMENT READONLY PARA
sp_2048_dbl_32 PROC
        ; Clear the carry-out accumulator before the chain starts; the
        ; flags written by xor are replaced by the add that follows.
        xor     eax, eax

        ; Word 0 opens the carry chain with a plain add.
        mov     r8, QWORD PTR [rdx]
        add     r8, r8

        ; Words 1..30, two per repetition, alternating r9 and r8.
        ; Each step: load the next word of a, store the previously
        ; doubled word of r, then double the loaded word with carry.
sp_2048_dbl_32_off = 0
    REPT 15
        mov     r9, QWORD PTR [rdx+sp_2048_dbl_32_off+8]
        mov     QWORD PTR [rcx+sp_2048_dbl_32_off], r8
        adc     r9, r9
        mov     r8, QWORD PTR [rdx+sp_2048_dbl_32_off+16]
        mov     QWORD PTR [rcx+sp_2048_dbl_32_off+8], r9
        adc     r8, r8
sp_2048_dbl_32_off = sp_2048_dbl_32_off + 16
    ENDM

        ; Word 31 (offset 248) closes the chain.
        mov     r9, QWORD PTR [rdx+248]
        mov     QWORD PTR [rcx+240], r8
        adc     r9, r9
        mov     QWORD PTR [rcx+248], r9

        ; rax was zero, so this leaves exactly the final carry in rax.
        adc     rax, 0
        ret
sp_2048_dbl_32 ENDP
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_4096_sqr_64 PROC
|
|
push r12
|
|
sub rsp, 1304
|
|
mov QWORD PTR [rsp+1280], rcx
|
|
mov QWORD PTR [rsp+1288], rdx
|
|
lea r10, QWORD PTR [rsp+1024]
|
|
lea r11, QWORD PTR [rdx+256]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r9, r9
|
|
add rax, QWORD PTR [r11]
|
|
mov r8, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r10], rax
|
|
adc r8, QWORD PTR [r11+8]
|
|
mov rax, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
adc rax, QWORD PTR [r11+16]
|
|
mov r8, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
adc r8, QWORD PTR [r11+24]
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
adc rax, QWORD PTR [r11+32]
|
|
mov r8, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
adc r8, QWORD PTR [r11+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
adc rax, QWORD PTR [r11+48]
|
|
mov r8, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
adc r8, QWORD PTR [r11+56]
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
adc rax, QWORD PTR [r11+64]
|
|
mov r8, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
adc r8, QWORD PTR [r11+72]
|
|
mov rax, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
adc rax, QWORD PTR [r11+80]
|
|
mov r8, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
adc r8, QWORD PTR [r11+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
adc rax, QWORD PTR [r11+96]
|
|
mov r8, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
adc r8, QWORD PTR [r11+104]
|
|
mov rax, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
adc rax, QWORD PTR [r11+112]
|
|
mov r8, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
adc r8, QWORD PTR [r11+120]
|
|
mov rax, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
adc rax, QWORD PTR [r11+128]
|
|
mov r8, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
adc r8, QWORD PTR [r11+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
adc rax, QWORD PTR [r11+144]
|
|
mov r8, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
adc r8, QWORD PTR [r11+152]
|
|
mov rax, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
adc rax, QWORD PTR [r11+160]
|
|
mov r8, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
adc r8, QWORD PTR [r11+168]
|
|
mov rax, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
adc rax, QWORD PTR [r11+176]
|
|
mov r8, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
adc r8, QWORD PTR [r11+184]
|
|
mov rax, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
adc rax, QWORD PTR [r11+192]
|
|
mov r8, QWORD PTR [rdx+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
adc r8, QWORD PTR [r11+200]
|
|
mov rax, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
adc rax, QWORD PTR [r11+208]
|
|
mov r8, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
adc r8, QWORD PTR [r11+216]
|
|
mov rax, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
adc rax, QWORD PTR [r11+224]
|
|
mov r8, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
adc r8, QWORD PTR [r11+232]
|
|
mov rax, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
adc rax, QWORD PTR [r11+240]
|
|
mov r8, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
adc r8, QWORD PTR [r11+248]
|
|
mov QWORD PTR [r10+248], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+1296], r9
|
|
mov rdx, r10
|
|
mov rcx, rsp
|
|
call sp_2048_sqr_32
|
|
mov rdx, QWORD PTR [rsp+1288]
|
|
lea rcx, QWORD PTR [rsp+512]
|
|
add rdx, 256
|
|
call sp_2048_sqr_32
|
|
mov rdx, QWORD PTR [rsp+1288]
|
|
mov rcx, QWORD PTR [rsp+1280]
|
|
call sp_2048_sqr_32
|
|
IFDEF _WIN64
|
|
mov rdx, QWORD PTR [rsp+1288]
|
|
mov rcx, QWORD PTR [rsp+1280]
|
|
ENDIF
|
|
mov r12, QWORD PTR [rsp+1296]
|
|
lea r10, QWORD PTR [rsp+1024]
|
|
mov r9, r12
|
|
neg r12
|
|
mov rax, QWORD PTR [r10]
|
|
mov r8, QWORD PTR [r10+8]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+512], rax
|
|
mov QWORD PTR [rcx+520], r8
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+528], rax
|
|
mov QWORD PTR [rcx+536], r8
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+544], rax
|
|
mov QWORD PTR [rcx+552], r8
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+560], rax
|
|
mov QWORD PTR [rcx+568], r8
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+576], rax
|
|
mov QWORD PTR [rcx+584], r8
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+592], rax
|
|
mov QWORD PTR [rcx+600], r8
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+608], rax
|
|
mov QWORD PTR [rcx+616], r8
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+624], rax
|
|
mov QWORD PTR [rcx+632], r8
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+640], rax
|
|
mov QWORD PTR [rcx+648], r8
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+656], rax
|
|
mov QWORD PTR [rcx+664], r8
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+672], rax
|
|
mov QWORD PTR [rcx+680], r8
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+688], rax
|
|
mov QWORD PTR [rcx+696], r8
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+704], rax
|
|
mov QWORD PTR [rcx+712], r8
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+720], rax
|
|
mov QWORD PTR [rcx+728], r8
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+736], rax
|
|
mov QWORD PTR [rcx+744], r8
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
and rax, r12
|
|
and r8, r12
|
|
mov QWORD PTR [rcx+752], rax
|
|
mov QWORD PTR [rcx+760], r8
|
|
mov rax, QWORD PTR [rcx+512]
|
|
add rax, rax
|
|
mov r8, QWORD PTR [rcx+520]
|
|
mov QWORD PTR [rcx+512], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+528]
|
|
mov QWORD PTR [rcx+520], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+536]
|
|
mov QWORD PTR [rcx+528], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+544]
|
|
mov QWORD PTR [rcx+536], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+552]
|
|
mov QWORD PTR [rcx+544], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+560]
|
|
mov QWORD PTR [rcx+552], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+568]
|
|
mov QWORD PTR [rcx+560], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+576]
|
|
mov QWORD PTR [rcx+568], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+584]
|
|
mov QWORD PTR [rcx+576], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+592]
|
|
mov QWORD PTR [rcx+584], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+600]
|
|
mov QWORD PTR [rcx+592], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+608]
|
|
mov QWORD PTR [rcx+600], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+616]
|
|
mov QWORD PTR [rcx+608], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+624]
|
|
mov QWORD PTR [rcx+616], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+632]
|
|
mov QWORD PTR [rcx+624], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+640]
|
|
mov QWORD PTR [rcx+632], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+648]
|
|
mov QWORD PTR [rcx+640], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+656]
|
|
mov QWORD PTR [rcx+648], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+664]
|
|
mov QWORD PTR [rcx+656], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+672]
|
|
mov QWORD PTR [rcx+664], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+680]
|
|
mov QWORD PTR [rcx+672], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+688]
|
|
mov QWORD PTR [rcx+680], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+696]
|
|
mov QWORD PTR [rcx+688], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+704]
|
|
mov QWORD PTR [rcx+696], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+712]
|
|
mov QWORD PTR [rcx+704], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+720]
|
|
mov QWORD PTR [rcx+712], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+728]
|
|
mov QWORD PTR [rcx+720], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+736]
|
|
mov QWORD PTR [rcx+728], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+744]
|
|
mov QWORD PTR [rcx+736], rax
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [rcx+752]
|
|
mov QWORD PTR [rcx+744], r8
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [rcx+760]
|
|
mov QWORD PTR [rcx+752], rax
|
|
adc r8, r8
|
|
mov QWORD PTR [rcx+760], r8
|
|
adc r9, 0
|
|
lea rdx, QWORD PTR [rsp+512]
|
|
mov r10, rsp
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rdx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rdx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rdx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rdx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rdx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rdx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rdx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rdx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rdx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rdx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rdx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rdx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rdx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rdx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rdx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rdx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rdx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rdx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rdx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rdx+376]
|
|
mov rax, QWORD PTR [r10+384]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb rax, QWORD PTR [rdx+384]
|
|
mov r8, QWORD PTR [r10+392]
|
|
mov QWORD PTR [r10+384], rax
|
|
sbb r8, QWORD PTR [rdx+392]
|
|
mov rax, QWORD PTR [r10+400]
|
|
mov QWORD PTR [r10+392], r8
|
|
sbb rax, QWORD PTR [rdx+400]
|
|
mov r8, QWORD PTR [r10+408]
|
|
mov QWORD PTR [r10+400], rax
|
|
sbb r8, QWORD PTR [rdx+408]
|
|
mov rax, QWORD PTR [r10+416]
|
|
mov QWORD PTR [r10+408], r8
|
|
sbb rax, QWORD PTR [rdx+416]
|
|
mov r8, QWORD PTR [r10+424]
|
|
mov QWORD PTR [r10+416], rax
|
|
sbb r8, QWORD PTR [rdx+424]
|
|
mov rax, QWORD PTR [r10+432]
|
|
mov QWORD PTR [r10+424], r8
|
|
sbb rax, QWORD PTR [rdx+432]
|
|
mov r8, QWORD PTR [r10+440]
|
|
mov QWORD PTR [r10+432], rax
|
|
sbb r8, QWORD PTR [rdx+440]
|
|
mov rax, QWORD PTR [r10+448]
|
|
mov QWORD PTR [r10+440], r8
|
|
sbb rax, QWORD PTR [rdx+448]
|
|
mov r8, QWORD PTR [r10+456]
|
|
mov QWORD PTR [r10+448], rax
|
|
sbb r8, QWORD PTR [rdx+456]
|
|
mov rax, QWORD PTR [r10+464]
|
|
mov QWORD PTR [r10+456], r8
|
|
sbb rax, QWORD PTR [rdx+464]
|
|
mov r8, QWORD PTR [r10+472]
|
|
mov QWORD PTR [r10+464], rax
|
|
sbb r8, QWORD PTR [rdx+472]
|
|
mov rax, QWORD PTR [r10+480]
|
|
mov QWORD PTR [r10+472], r8
|
|
sbb rax, QWORD PTR [rdx+480]
|
|
mov r8, QWORD PTR [r10+488]
|
|
mov QWORD PTR [r10+480], rax
|
|
sbb r8, QWORD PTR [rdx+488]
|
|
mov rax, QWORD PTR [r10+496]
|
|
mov QWORD PTR [r10+488], r8
|
|
sbb rax, QWORD PTR [rdx+496]
|
|
mov r8, QWORD PTR [r10+504]
|
|
mov QWORD PTR [r10+496], rax
|
|
sbb r8, QWORD PTR [rdx+504]
|
|
mov QWORD PTR [r10+504], r8
|
|
sbb r9, 0
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rcx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rcx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rcx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rcx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rcx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rcx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rcx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rcx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rcx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rcx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rcx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rcx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rcx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rcx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rcx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rcx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rcx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rcx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rcx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rcx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rcx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rcx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rcx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rcx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rcx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rcx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rcx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rcx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rcx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rcx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rcx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rcx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rcx+376]
|
|
mov rax, QWORD PTR [r10+384]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb rax, QWORD PTR [rcx+384]
|
|
mov r8, QWORD PTR [r10+392]
|
|
mov QWORD PTR [r10+384], rax
|
|
sbb r8, QWORD PTR [rcx+392]
|
|
mov rax, QWORD PTR [r10+400]
|
|
mov QWORD PTR [r10+392], r8
|
|
sbb rax, QWORD PTR [rcx+400]
|
|
mov r8, QWORD PTR [r10+408]
|
|
mov QWORD PTR [r10+400], rax
|
|
sbb r8, QWORD PTR [rcx+408]
|
|
mov rax, QWORD PTR [r10+416]
|
|
mov QWORD PTR [r10+408], r8
|
|
sbb rax, QWORD PTR [rcx+416]
|
|
mov r8, QWORD PTR [r10+424]
|
|
mov QWORD PTR [r10+416], rax
|
|
sbb r8, QWORD PTR [rcx+424]
|
|
mov rax, QWORD PTR [r10+432]
|
|
mov QWORD PTR [r10+424], r8
|
|
sbb rax, QWORD PTR [rcx+432]
|
|
mov r8, QWORD PTR [r10+440]
|
|
mov QWORD PTR [r10+432], rax
|
|
sbb r8, QWORD PTR [rcx+440]
|
|
mov rax, QWORD PTR [r10+448]
|
|
mov QWORD PTR [r10+440], r8
|
|
sbb rax, QWORD PTR [rcx+448]
|
|
mov r8, QWORD PTR [r10+456]
|
|
mov QWORD PTR [r10+448], rax
|
|
sbb r8, QWORD PTR [rcx+456]
|
|
mov rax, QWORD PTR [r10+464]
|
|
mov QWORD PTR [r10+456], r8
|
|
sbb rax, QWORD PTR [rcx+464]
|
|
mov r8, QWORD PTR [r10+472]
|
|
mov QWORD PTR [r10+464], rax
|
|
sbb r8, QWORD PTR [rcx+472]
|
|
mov rax, QWORD PTR [r10+480]
|
|
mov QWORD PTR [r10+472], r8
|
|
sbb rax, QWORD PTR [rcx+480]
|
|
mov r8, QWORD PTR [r10+488]
|
|
mov QWORD PTR [r10+480], rax
|
|
sbb r8, QWORD PTR [rcx+488]
|
|
mov rax, QWORD PTR [r10+496]
|
|
mov QWORD PTR [r10+488], r8
|
|
sbb rax, QWORD PTR [rcx+496]
|
|
mov r8, QWORD PTR [r10+504]
|
|
mov QWORD PTR [r10+496], rax
|
|
sbb r8, QWORD PTR [rcx+504]
|
|
mov QWORD PTR [r10+504], r8
|
|
sbb r9, 0
|
|
; Add in place
|
|
mov rax, QWORD PTR [rcx+256]
|
|
add rax, QWORD PTR [r10]
|
|
mov r8, QWORD PTR [rcx+264]
|
|
mov QWORD PTR [rcx+256], rax
|
|
adc r8, QWORD PTR [r10+8]
|
|
mov rax, QWORD PTR [rcx+272]
|
|
mov QWORD PTR [rcx+264], r8
|
|
adc rax, QWORD PTR [r10+16]
|
|
mov r8, QWORD PTR [rcx+280]
|
|
mov QWORD PTR [rcx+272], rax
|
|
adc r8, QWORD PTR [r10+24]
|
|
mov rax, QWORD PTR [rcx+288]
|
|
mov QWORD PTR [rcx+280], r8
|
|
adc rax, QWORD PTR [r10+32]
|
|
mov r8, QWORD PTR [rcx+296]
|
|
mov QWORD PTR [rcx+288], rax
|
|
adc r8, QWORD PTR [r10+40]
|
|
mov rax, QWORD PTR [rcx+304]
|
|
mov QWORD PTR [rcx+296], r8
|
|
adc rax, QWORD PTR [r10+48]
|
|
mov r8, QWORD PTR [rcx+312]
|
|
mov QWORD PTR [rcx+304], rax
|
|
adc r8, QWORD PTR [r10+56]
|
|
mov rax, QWORD PTR [rcx+320]
|
|
mov QWORD PTR [rcx+312], r8
|
|
adc rax, QWORD PTR [r10+64]
|
|
mov r8, QWORD PTR [rcx+328]
|
|
mov QWORD PTR [rcx+320], rax
|
|
adc r8, QWORD PTR [r10+72]
|
|
mov rax, QWORD PTR [rcx+336]
|
|
mov QWORD PTR [rcx+328], r8
|
|
adc rax, QWORD PTR [r10+80]
|
|
mov r8, QWORD PTR [rcx+344]
|
|
mov QWORD PTR [rcx+336], rax
|
|
adc r8, QWORD PTR [r10+88]
|
|
mov rax, QWORD PTR [rcx+352]
|
|
mov QWORD PTR [rcx+344], r8
|
|
adc rax, QWORD PTR [r10+96]
|
|
mov r8, QWORD PTR [rcx+360]
|
|
mov QWORD PTR [rcx+352], rax
|
|
adc r8, QWORD PTR [r10+104]
|
|
mov rax, QWORD PTR [rcx+368]
|
|
mov QWORD PTR [rcx+360], r8
|
|
adc rax, QWORD PTR [r10+112]
|
|
mov r8, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [rcx+368], rax
|
|
adc r8, QWORD PTR [r10+120]
|
|
mov rax, QWORD PTR [rcx+384]
|
|
mov QWORD PTR [rcx+376], r8
|
|
adc rax, QWORD PTR [r10+128]
|
|
mov r8, QWORD PTR [rcx+392]
|
|
mov QWORD PTR [rcx+384], rax
|
|
adc r8, QWORD PTR [r10+136]
|
|
mov rax, QWORD PTR [rcx+400]
|
|
mov QWORD PTR [rcx+392], r8
|
|
adc rax, QWORD PTR [r10+144]
|
|
mov r8, QWORD PTR [rcx+408]
|
|
mov QWORD PTR [rcx+400], rax
|
|
adc r8, QWORD PTR [r10+152]
|
|
mov rax, QWORD PTR [rcx+416]
|
|
mov QWORD PTR [rcx+408], r8
|
|
adc rax, QWORD PTR [r10+160]
|
|
mov r8, QWORD PTR [rcx+424]
|
|
mov QWORD PTR [rcx+416], rax
|
|
adc r8, QWORD PTR [r10+168]
|
|
mov rax, QWORD PTR [rcx+432]
|
|
mov QWORD PTR [rcx+424], r8
|
|
adc rax, QWORD PTR [r10+176]
|
|
mov r8, QWORD PTR [rcx+440]
|
|
mov QWORD PTR [rcx+432], rax
|
|
adc r8, QWORD PTR [r10+184]
|
|
mov rax, QWORD PTR [rcx+448]
|
|
mov QWORD PTR [rcx+440], r8
|
|
adc rax, QWORD PTR [r10+192]
|
|
mov r8, QWORD PTR [rcx+456]
|
|
mov QWORD PTR [rcx+448], rax
|
|
adc r8, QWORD PTR [r10+200]
|
|
mov rax, QWORD PTR [rcx+464]
|
|
mov QWORD PTR [rcx+456], r8
|
|
adc rax, QWORD PTR [r10+208]
|
|
mov r8, QWORD PTR [rcx+472]
|
|
mov QWORD PTR [rcx+464], rax
|
|
adc r8, QWORD PTR [r10+216]
|
|
mov rax, QWORD PTR [rcx+480]
|
|
mov QWORD PTR [rcx+472], r8
|
|
adc rax, QWORD PTR [r10+224]
|
|
mov r8, QWORD PTR [rcx+488]
|
|
mov QWORD PTR [rcx+480], rax
|
|
adc r8, QWORD PTR [r10+232]
|
|
mov rax, QWORD PTR [rcx+496]
|
|
mov QWORD PTR [rcx+488], r8
|
|
adc rax, QWORD PTR [r10+240]
|
|
mov r8, QWORD PTR [rcx+504]
|
|
mov QWORD PTR [rcx+496], rax
|
|
adc r8, QWORD PTR [r10+248]
|
|
mov rax, QWORD PTR [rcx+512]
|
|
mov QWORD PTR [rcx+504], r8
|
|
adc rax, QWORD PTR [r10+256]
|
|
mov r8, QWORD PTR [rcx+520]
|
|
mov QWORD PTR [rcx+512], rax
|
|
adc r8, QWORD PTR [r10+264]
|
|
mov rax, QWORD PTR [rcx+528]
|
|
mov QWORD PTR [rcx+520], r8
|
|
adc rax, QWORD PTR [r10+272]
|
|
mov r8, QWORD PTR [rcx+536]
|
|
mov QWORD PTR [rcx+528], rax
|
|
adc r8, QWORD PTR [r10+280]
|
|
mov rax, QWORD PTR [rcx+544]
|
|
mov QWORD PTR [rcx+536], r8
|
|
adc rax, QWORD PTR [r10+288]
|
|
mov r8, QWORD PTR [rcx+552]
|
|
mov QWORD PTR [rcx+544], rax
|
|
adc r8, QWORD PTR [r10+296]
|
|
mov rax, QWORD PTR [rcx+560]
|
|
mov QWORD PTR [rcx+552], r8
|
|
adc rax, QWORD PTR [r10+304]
|
|
mov r8, QWORD PTR [rcx+568]
|
|
mov QWORD PTR [rcx+560], rax
|
|
adc r8, QWORD PTR [r10+312]
|
|
mov rax, QWORD PTR [rcx+576]
|
|
mov QWORD PTR [rcx+568], r8
|
|
adc rax, QWORD PTR [r10+320]
|
|
mov r8, QWORD PTR [rcx+584]
|
|
mov QWORD PTR [rcx+576], rax
|
|
adc r8, QWORD PTR [r10+328]
|
|
mov rax, QWORD PTR [rcx+592]
|
|
mov QWORD PTR [rcx+584], r8
|
|
adc rax, QWORD PTR [r10+336]
|
|
mov r8, QWORD PTR [rcx+600]
|
|
mov QWORD PTR [rcx+592], rax
|
|
adc r8, QWORD PTR [r10+344]
|
|
mov rax, QWORD PTR [rcx+608]
|
|
mov QWORD PTR [rcx+600], r8
|
|
adc rax, QWORD PTR [r10+352]
|
|
mov r8, QWORD PTR [rcx+616]
|
|
mov QWORD PTR [rcx+608], rax
|
|
adc r8, QWORD PTR [r10+360]
|
|
mov rax, QWORD PTR [rcx+624]
|
|
mov QWORD PTR [rcx+616], r8
|
|
adc rax, QWORD PTR [r10+368]
|
|
mov r8, QWORD PTR [rcx+632]
|
|
mov QWORD PTR [rcx+624], rax
|
|
adc r8, QWORD PTR [r10+376]
|
|
mov rax, QWORD PTR [rcx+640]
|
|
mov QWORD PTR [rcx+632], r8
|
|
adc rax, QWORD PTR [r10+384]
|
|
mov r8, QWORD PTR [rcx+648]
|
|
mov QWORD PTR [rcx+640], rax
|
|
adc r8, QWORD PTR [r10+392]
|
|
mov rax, QWORD PTR [rcx+656]
|
|
mov QWORD PTR [rcx+648], r8
|
|
adc rax, QWORD PTR [r10+400]
|
|
mov r8, QWORD PTR [rcx+664]
|
|
mov QWORD PTR [rcx+656], rax
|
|
adc r8, QWORD PTR [r10+408]
|
|
mov rax, QWORD PTR [rcx+672]
|
|
mov QWORD PTR [rcx+664], r8
|
|
adc rax, QWORD PTR [r10+416]
|
|
mov r8, QWORD PTR [rcx+680]
|
|
mov QWORD PTR [rcx+672], rax
|
|
adc r8, QWORD PTR [r10+424]
|
|
mov rax, QWORD PTR [rcx+688]
|
|
mov QWORD PTR [rcx+680], r8
|
|
adc rax, QWORD PTR [r10+432]
|
|
mov r8, QWORD PTR [rcx+696]
|
|
mov QWORD PTR [rcx+688], rax
|
|
adc r8, QWORD PTR [r10+440]
|
|
mov rax, QWORD PTR [rcx+704]
|
|
mov QWORD PTR [rcx+696], r8
|
|
adc rax, QWORD PTR [r10+448]
|
|
mov r8, QWORD PTR [rcx+712]
|
|
mov QWORD PTR [rcx+704], rax
|
|
adc r8, QWORD PTR [r10+456]
|
|
mov rax, QWORD PTR [rcx+720]
|
|
mov QWORD PTR [rcx+712], r8
|
|
adc rax, QWORD PTR [r10+464]
|
|
mov r8, QWORD PTR [rcx+728]
|
|
mov QWORD PTR [rcx+720], rax
|
|
adc r8, QWORD PTR [r10+472]
|
|
mov rax, QWORD PTR [rcx+736]
|
|
mov QWORD PTR [rcx+728], r8
|
|
adc rax, QWORD PTR [r10+480]
|
|
mov r8, QWORD PTR [rcx+744]
|
|
mov QWORD PTR [rcx+736], rax
|
|
adc r8, QWORD PTR [r10+488]
|
|
mov rax, QWORD PTR [rcx+752]
|
|
mov QWORD PTR [rcx+744], r8
|
|
adc rax, QWORD PTR [r10+496]
|
|
mov r8, QWORD PTR [rcx+760]
|
|
mov QWORD PTR [rcx+752], rax
|
|
adc r8, QWORD PTR [r10+504]
|
|
mov QWORD PTR [rcx+760], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+768], r9
|
|
; Add in place
|
|
mov rax, QWORD PTR [rcx+512]
|
|
xor r9, r9
|
|
add rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [rcx+520]
|
|
mov QWORD PTR [rcx+512], rax
|
|
adc r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [rcx+528]
|
|
mov QWORD PTR [rcx+520], r8
|
|
adc rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [rcx+536]
|
|
mov QWORD PTR [rcx+528], rax
|
|
adc r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [rcx+544]
|
|
mov QWORD PTR [rcx+536], r8
|
|
adc rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [rcx+552]
|
|
mov QWORD PTR [rcx+544], rax
|
|
adc r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [rcx+560]
|
|
mov QWORD PTR [rcx+552], r8
|
|
adc rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [rcx+568]
|
|
mov QWORD PTR [rcx+560], rax
|
|
adc r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [rcx+576]
|
|
mov QWORD PTR [rcx+568], r8
|
|
adc rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [rcx+584]
|
|
mov QWORD PTR [rcx+576], rax
|
|
adc r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [rcx+592]
|
|
mov QWORD PTR [rcx+584], r8
|
|
adc rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [rcx+600]
|
|
mov QWORD PTR [rcx+592], rax
|
|
adc r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [rcx+608]
|
|
mov QWORD PTR [rcx+600], r8
|
|
adc rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [rcx+616]
|
|
mov QWORD PTR [rcx+608], rax
|
|
adc r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [rcx+624]
|
|
mov QWORD PTR [rcx+616], r8
|
|
adc rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [rcx+632]
|
|
mov QWORD PTR [rcx+624], rax
|
|
adc r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [rcx+640]
|
|
mov QWORD PTR [rcx+632], r8
|
|
adc rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [rcx+648]
|
|
mov QWORD PTR [rcx+640], rax
|
|
adc r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [rcx+656]
|
|
mov QWORD PTR [rcx+648], r8
|
|
adc rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [rcx+664]
|
|
mov QWORD PTR [rcx+656], rax
|
|
adc r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [rcx+672]
|
|
mov QWORD PTR [rcx+664], r8
|
|
adc rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [rcx+680]
|
|
mov QWORD PTR [rcx+672], rax
|
|
adc r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [rcx+688]
|
|
mov QWORD PTR [rcx+680], r8
|
|
adc rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [rcx+696]
|
|
mov QWORD PTR [rcx+688], rax
|
|
adc r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [rcx+704]
|
|
mov QWORD PTR [rcx+696], r8
|
|
adc rax, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [rcx+712]
|
|
mov QWORD PTR [rcx+704], rax
|
|
adc r8, QWORD PTR [rdx+200]
|
|
mov rax, QWORD PTR [rcx+720]
|
|
mov QWORD PTR [rcx+712], r8
|
|
adc rax, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [rcx+728]
|
|
mov QWORD PTR [rcx+720], rax
|
|
adc r8, QWORD PTR [rdx+216]
|
|
mov rax, QWORD PTR [rcx+736]
|
|
mov QWORD PTR [rcx+728], r8
|
|
adc rax, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [rcx+744]
|
|
mov QWORD PTR [rcx+736], rax
|
|
adc r8, QWORD PTR [rdx+232]
|
|
mov rax, QWORD PTR [rcx+752]
|
|
mov QWORD PTR [rcx+744], r8
|
|
adc rax, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [rcx+760]
|
|
mov QWORD PTR [rcx+752], rax
|
|
adc r8, QWORD PTR [rdx+248]
|
|
mov rax, QWORD PTR [rcx+768]
|
|
mov QWORD PTR [rcx+760], r8
|
|
adc rax, QWORD PTR [rdx+256]
|
|
mov QWORD PTR [rcx+768], rax
|
|
adc r9, 0
|
|
; Add to zero
|
|
mov rax, QWORD PTR [rdx+264]
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+272]
|
|
mov QWORD PTR [rcx+776], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+280]
|
|
mov QWORD PTR [rcx+784], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+288]
|
|
mov QWORD PTR [rcx+792], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+296]
|
|
mov QWORD PTR [rcx+800], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+304]
|
|
mov QWORD PTR [rcx+808], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+312]
|
|
mov QWORD PTR [rcx+816], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+320]
|
|
mov QWORD PTR [rcx+824], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+328]
|
|
mov QWORD PTR [rcx+832], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+336]
|
|
mov QWORD PTR [rcx+840], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+344]
|
|
mov QWORD PTR [rcx+848], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+352]
|
|
mov QWORD PTR [rcx+856], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+360]
|
|
mov QWORD PTR [rcx+864], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+368]
|
|
mov QWORD PTR [rcx+872], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+376]
|
|
mov QWORD PTR [rcx+880], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+384]
|
|
mov QWORD PTR [rcx+888], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+392]
|
|
mov QWORD PTR [rcx+896], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+400]
|
|
mov QWORD PTR [rcx+904], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+408]
|
|
mov QWORD PTR [rcx+912], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+416]
|
|
mov QWORD PTR [rcx+920], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+424]
|
|
mov QWORD PTR [rcx+928], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+432]
|
|
mov QWORD PTR [rcx+936], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+440]
|
|
mov QWORD PTR [rcx+944], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+448]
|
|
mov QWORD PTR [rcx+952], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+456]
|
|
mov QWORD PTR [rcx+960], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+464]
|
|
mov QWORD PTR [rcx+968], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+472]
|
|
mov QWORD PTR [rcx+976], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+480]
|
|
mov QWORD PTR [rcx+984], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+488]
|
|
mov QWORD PTR [rcx+992], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+496]
|
|
mov QWORD PTR [rcx+1000], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+504]
|
|
mov QWORD PTR [rcx+1008], r8
|
|
adc rax, 0
|
|
mov QWORD PTR [rcx+1016], rax
|
|
add rsp, 1304
|
|
pop r12
|
|
ret
|
|
sp_4096_sqr_64 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
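; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer that receives the product (rcx on entry).
|
|
; * a A single precision integer, the first operand: 64 64-bit words (rdx on entry).
|
|
; * b A single precision integer, the second operand: 64 64-bit words (r8 on entry).
|
|
; */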
|
|
_text SEGMENT READONLY PARA
|
|
sp_4096_mul_avx2_64 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 1576
|
|
mov QWORD PTR [rsp+1536], rcx
|
|
mov QWORD PTR [rsp+1544], rdx
|
|
mov QWORD PTR [rsp+1552], r8
|
|
lea r12, QWORD PTR [rsp+1024]
|
|
lea r14, QWORD PTR [rdx+256]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r15, r15
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r12], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov rax, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
adc rax, QWORD PTR [r14+192]
|
|
mov r9, QWORD PTR [rdx+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
adc r9, QWORD PTR [r14+200]
|
|
mov r10, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
adc r10, QWORD PTR [r14+208]
|
|
mov rax, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
adc rax, QWORD PTR [r14+216]
|
|
mov r9, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
adc r9, QWORD PTR [r14+224]
|
|
mov r10, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
adc r10, QWORD PTR [r14+232]
|
|
mov rax, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
adc rax, QWORD PTR [r14+240]
|
|
mov r9, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
adc r9, QWORD PTR [r14+248]
|
|
mov QWORD PTR [r12+248], r9
|
|
adc r15, 0
|
|
mov QWORD PTR [rsp+1560], r15
|
|
lea r13, QWORD PTR [rsp+1280]
|
|
lea r14, QWORD PTR [r8+256]
|
|
; Add
|
|
mov rax, QWORD PTR [r8]
|
|
xor rdi, rdi
|
|
add rax, QWORD PTR [r14]
|
|
mov r9, QWORD PTR [r8+8]
|
|
mov QWORD PTR [r13], rax
|
|
adc r9, QWORD PTR [r14+8]
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov QWORD PTR [r13+8], r9
|
|
adc r10, QWORD PTR [r14+16]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mov QWORD PTR [r13+16], r10
|
|
adc rax, QWORD PTR [r14+24]
|
|
mov r9, QWORD PTR [r8+32]
|
|
mov QWORD PTR [r13+24], rax
|
|
adc r9, QWORD PTR [r14+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
mov QWORD PTR [r13+32], r9
|
|
adc r10, QWORD PTR [r14+40]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mov QWORD PTR [r13+40], r10
|
|
adc rax, QWORD PTR [r14+48]
|
|
mov r9, QWORD PTR [r8+56]
|
|
mov QWORD PTR [r13+48], rax
|
|
adc r9, QWORD PTR [r14+56]
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov QWORD PTR [r13+56], r9
|
|
adc r10, QWORD PTR [r14+64]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mov QWORD PTR [r13+64], r10
|
|
adc rax, QWORD PTR [r14+72]
|
|
mov r9, QWORD PTR [r8+80]
|
|
mov QWORD PTR [r13+72], rax
|
|
adc r9, QWORD PTR [r14+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
mov QWORD PTR [r13+80], r9
|
|
adc r10, QWORD PTR [r14+88]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mov QWORD PTR [r13+88], r10
|
|
adc rax, QWORD PTR [r14+96]
|
|
mov r9, QWORD PTR [r8+104]
|
|
mov QWORD PTR [r13+96], rax
|
|
adc r9, QWORD PTR [r14+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov QWORD PTR [r13+104], r9
|
|
adc r10, QWORD PTR [r14+112]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mov QWORD PTR [r13+112], r10
|
|
adc rax, QWORD PTR [r14+120]
|
|
mov r9, QWORD PTR [r8+128]
|
|
mov QWORD PTR [r13+120], rax
|
|
adc r9, QWORD PTR [r14+128]
|
|
mov r10, QWORD PTR [r8+136]
|
|
mov QWORD PTR [r13+128], r9
|
|
adc r10, QWORD PTR [r14+136]
|
|
mov rax, QWORD PTR [r8+144]
|
|
mov QWORD PTR [r13+136], r10
|
|
adc rax, QWORD PTR [r14+144]
|
|
mov r9, QWORD PTR [r8+152]
|
|
mov QWORD PTR [r13+144], rax
|
|
adc r9, QWORD PTR [r14+152]
|
|
mov r10, QWORD PTR [r8+160]
|
|
mov QWORD PTR [r13+152], r9
|
|
adc r10, QWORD PTR [r14+160]
|
|
mov rax, QWORD PTR [r8+168]
|
|
mov QWORD PTR [r13+160], r10
|
|
adc rax, QWORD PTR [r14+168]
|
|
mov r9, QWORD PTR [r8+176]
|
|
mov QWORD PTR [r13+168], rax
|
|
adc r9, QWORD PTR [r14+176]
|
|
mov r10, QWORD PTR [r8+184]
|
|
mov QWORD PTR [r13+176], r9
|
|
adc r10, QWORD PTR [r14+184]
|
|
mov rax, QWORD PTR [r8+192]
|
|
mov QWORD PTR [r13+184], r10
|
|
adc rax, QWORD PTR [r14+192]
|
|
mov r9, QWORD PTR [r8+200]
|
|
mov QWORD PTR [r13+192], rax
|
|
adc r9, QWORD PTR [r14+200]
|
|
mov r10, QWORD PTR [r8+208]
|
|
mov QWORD PTR [r13+200], r9
|
|
adc r10, QWORD PTR [r14+208]
|
|
mov rax, QWORD PTR [r8+216]
|
|
mov QWORD PTR [r13+208], r10
|
|
adc rax, QWORD PTR [r14+216]
|
|
mov r9, QWORD PTR [r8+224]
|
|
mov QWORD PTR [r13+216], rax
|
|
adc r9, QWORD PTR [r14+224]
|
|
mov r10, QWORD PTR [r8+232]
|
|
mov QWORD PTR [r13+224], r9
|
|
adc r10, QWORD PTR [r14+232]
|
|
mov rax, QWORD PTR [r8+240]
|
|
mov QWORD PTR [r13+232], r10
|
|
adc rax, QWORD PTR [r14+240]
|
|
mov r9, QWORD PTR [r8+248]
|
|
mov QWORD PTR [r13+240], rax
|
|
adc r9, QWORD PTR [r14+248]
|
|
mov QWORD PTR [r13+248], r9
|
|
adc rdi, 0
|
|
mov QWORD PTR [rsp+1568], rdi
|
|
mov r8, r13
|
|
mov rdx, r12
|
|
mov rcx, rsp
|
|
call sp_2048_mul_avx2_32
|
|
mov r8, QWORD PTR [rsp+1552]
|
|
mov rdx, QWORD PTR [rsp+1544]
|
|
lea rcx, QWORD PTR [rsp+512]
|
|
add r8, 256
|
|
add rdx, 256
|
|
call sp_2048_mul_avx2_32
|
|
mov r8, QWORD PTR [rsp+1552]
|
|
mov rdx, QWORD PTR [rsp+1544]
|
|
mov rcx, QWORD PTR [rsp+1536]
|
|
call sp_2048_mul_avx2_32
|
|
IFDEF _WIN64
|
|
mov r8, QWORD PTR [rsp+1552]
|
|
mov rdx, QWORD PTR [rsp+1544]
|
|
mov rcx, QWORD PTR [rsp+1536]
|
|
ENDIF
|
|
mov r15, QWORD PTR [rsp+1560]
|
|
mov rdi, QWORD PTR [rsp+1568]
|
|
mov rsi, QWORD PTR [rsp+1536]
|
|
mov r11, r15
|
|
lea r12, QWORD PTR [rsp+1024]
|
|
lea r13, QWORD PTR [rsp+1280]
|
|
and r11, rdi
|
|
neg r15
|
|
neg rdi
|
|
add rsi, 512
|
|
mov rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [r13]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
add rax, r9
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [r13+8]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [r13+16]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [r13+24]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [r13+32]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [r13+40]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [r13+48]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [r13+56]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [r13+64]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [r13+72]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [r13+80]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [r13+88]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [r13+96]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [r13+104]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [r13+112]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [r13+120]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [r13+128]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [r13+136]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [r13+144]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [r13+152]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [r13+160]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [r13+168]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [r13+176]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [r13+184]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [r13+192]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov r10, QWORD PTR [r13+200]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov rax, QWORD PTR [r13+208]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [r13+216]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov r10, QWORD PTR [r13+224]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, r10
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov rax, QWORD PTR [r13+232]
|
|
pext r10, r10, rdi
|
|
pext rax, rax, r15
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, rax
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [r13+240]
|
|
pext rax, rax, rdi
|
|
pext r9, r9, r15
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, r9
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov r10, QWORD PTR [r13+248]
|
|
pext r9, r9, rdi
|
|
pext r10, r10, r15
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, r10
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r11, 0
|
|
lea r13, QWORD PTR [rsp+512]
|
|
mov r12, rsp
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [r13+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [r13+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [r13+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [r13+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [r13+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [r13+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [r13+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [r13+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [r13+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [r13+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [r13+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [r13+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [r13+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [r13+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [r13+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [r13+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [r13+376]
|
|
mov rax, QWORD PTR [r12+384]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb rax, QWORD PTR [r13+384]
|
|
mov r9, QWORD PTR [r12+392]
|
|
mov QWORD PTR [r12+384], rax
|
|
sbb r9, QWORD PTR [r13+392]
|
|
mov r10, QWORD PTR [r12+400]
|
|
mov QWORD PTR [r12+392], r9
|
|
sbb r10, QWORD PTR [r13+400]
|
|
mov rax, QWORD PTR [r12+408]
|
|
mov QWORD PTR [r12+400], r10
|
|
sbb rax, QWORD PTR [r13+408]
|
|
mov r9, QWORD PTR [r12+416]
|
|
mov QWORD PTR [r12+408], rax
|
|
sbb r9, QWORD PTR [r13+416]
|
|
mov r10, QWORD PTR [r12+424]
|
|
mov QWORD PTR [r12+416], r9
|
|
sbb r10, QWORD PTR [r13+424]
|
|
mov rax, QWORD PTR [r12+432]
|
|
mov QWORD PTR [r12+424], r10
|
|
sbb rax, QWORD PTR [r13+432]
|
|
mov r9, QWORD PTR [r12+440]
|
|
mov QWORD PTR [r12+432], rax
|
|
sbb r9, QWORD PTR [r13+440]
|
|
mov r10, QWORD PTR [r12+448]
|
|
mov QWORD PTR [r12+440], r9
|
|
sbb r10, QWORD PTR [r13+448]
|
|
mov rax, QWORD PTR [r12+456]
|
|
mov QWORD PTR [r12+448], r10
|
|
sbb rax, QWORD PTR [r13+456]
|
|
mov r9, QWORD PTR [r12+464]
|
|
mov QWORD PTR [r12+456], rax
|
|
sbb r9, QWORD PTR [r13+464]
|
|
mov r10, QWORD PTR [r12+472]
|
|
mov QWORD PTR [r12+464], r9
|
|
sbb r10, QWORD PTR [r13+472]
|
|
mov rax, QWORD PTR [r12+480]
|
|
mov QWORD PTR [r12+472], r10
|
|
sbb rax, QWORD PTR [r13+480]
|
|
mov r9, QWORD PTR [r12+488]
|
|
mov QWORD PTR [r12+480], rax
|
|
sbb r9, QWORD PTR [r13+488]
|
|
mov r10, QWORD PTR [r12+496]
|
|
mov QWORD PTR [r12+488], r9
|
|
sbb r10, QWORD PTR [r13+496]
|
|
mov rax, QWORD PTR [r12+504]
|
|
mov QWORD PTR [r12+496], r10
|
|
sbb rax, QWORD PTR [r13+504]
|
|
mov QWORD PTR [r12+504], rax
|
|
sbb r11, 0
|
|
mov rax, QWORD PTR [r12]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [r12+8]
|
|
mov QWORD PTR [r12], rax
|
|
sbb r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [r12+16]
|
|
mov QWORD PTR [r12+8], r9
|
|
sbb r10, QWORD PTR [rcx+16]
|
|
mov rax, QWORD PTR [r12+24]
|
|
mov QWORD PTR [r12+16], r10
|
|
sbb rax, QWORD PTR [rcx+24]
|
|
mov r9, QWORD PTR [r12+32]
|
|
mov QWORD PTR [r12+24], rax
|
|
sbb r9, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [r12+40]
|
|
mov QWORD PTR [r12+32], r9
|
|
sbb r10, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r12+48]
|
|
mov QWORD PTR [r12+40], r10
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [r12+56]
|
|
mov QWORD PTR [r12+48], rax
|
|
sbb r9, QWORD PTR [rcx+56]
|
|
mov r10, QWORD PTR [r12+64]
|
|
mov QWORD PTR [r12+56], r9
|
|
sbb r10, QWORD PTR [rcx+64]
|
|
mov rax, QWORD PTR [r12+72]
|
|
mov QWORD PTR [r12+64], r10
|
|
sbb rax, QWORD PTR [rcx+72]
|
|
mov r9, QWORD PTR [r12+80]
|
|
mov QWORD PTR [r12+72], rax
|
|
sbb r9, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [r12+88]
|
|
mov QWORD PTR [r12+80], r9
|
|
sbb r10, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r12+96]
|
|
mov QWORD PTR [r12+88], r10
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [r12+104]
|
|
mov QWORD PTR [r12+96], rax
|
|
sbb r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [r12+112]
|
|
mov QWORD PTR [r12+104], r9
|
|
sbb r10, QWORD PTR [rcx+112]
|
|
mov rax, QWORD PTR [r12+120]
|
|
mov QWORD PTR [r12+112], r10
|
|
sbb rax, QWORD PTR [rcx+120]
|
|
mov r9, QWORD PTR [r12+128]
|
|
mov QWORD PTR [r12+120], rax
|
|
sbb r9, QWORD PTR [rcx+128]
|
|
mov r10, QWORD PTR [r12+136]
|
|
mov QWORD PTR [r12+128], r9
|
|
sbb r10, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r12+144]
|
|
mov QWORD PTR [r12+136], r10
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r9, QWORD PTR [r12+152]
|
|
mov QWORD PTR [r12+144], rax
|
|
sbb r9, QWORD PTR [rcx+152]
|
|
mov r10, QWORD PTR [r12+160]
|
|
mov QWORD PTR [r12+152], r9
|
|
sbb r10, QWORD PTR [rcx+160]
|
|
mov rax, QWORD PTR [r12+168]
|
|
mov QWORD PTR [r12+160], r10
|
|
sbb rax, QWORD PTR [rcx+168]
|
|
mov r9, QWORD PTR [r12+176]
|
|
mov QWORD PTR [r12+168], rax
|
|
sbb r9, QWORD PTR [rcx+176]
|
|
mov r10, QWORD PTR [r12+184]
|
|
mov QWORD PTR [r12+176], r9
|
|
sbb r10, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r12+192]
|
|
mov QWORD PTR [r12+184], r10
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r9, QWORD PTR [r12+200]
|
|
mov QWORD PTR [r12+192], rax
|
|
sbb r9, QWORD PTR [rcx+200]
|
|
mov r10, QWORD PTR [r12+208]
|
|
mov QWORD PTR [r12+200], r9
|
|
sbb r10, QWORD PTR [rcx+208]
|
|
mov rax, QWORD PTR [r12+216]
|
|
mov QWORD PTR [r12+208], r10
|
|
sbb rax, QWORD PTR [rcx+216]
|
|
mov r9, QWORD PTR [r12+224]
|
|
mov QWORD PTR [r12+216], rax
|
|
sbb r9, QWORD PTR [rcx+224]
|
|
mov r10, QWORD PTR [r12+232]
|
|
mov QWORD PTR [r12+224], r9
|
|
sbb r10, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r12+240]
|
|
mov QWORD PTR [r12+232], r10
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r9, QWORD PTR [r12+248]
|
|
mov QWORD PTR [r12+240], rax
|
|
sbb r9, QWORD PTR [rcx+248]
|
|
mov r10, QWORD PTR [r12+256]
|
|
mov QWORD PTR [r12+248], r9
|
|
sbb r10, QWORD PTR [rcx+256]
|
|
mov rax, QWORD PTR [r12+264]
|
|
mov QWORD PTR [r12+256], r10
|
|
sbb rax, QWORD PTR [rcx+264]
|
|
mov r9, QWORD PTR [r12+272]
|
|
mov QWORD PTR [r12+264], rax
|
|
sbb r9, QWORD PTR [rcx+272]
|
|
mov r10, QWORD PTR [r12+280]
|
|
mov QWORD PTR [r12+272], r9
|
|
sbb r10, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r12+288]
|
|
mov QWORD PTR [r12+280], r10
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r9, QWORD PTR [r12+296]
|
|
mov QWORD PTR [r12+288], rax
|
|
sbb r9, QWORD PTR [rcx+296]
|
|
mov r10, QWORD PTR [r12+304]
|
|
mov QWORD PTR [r12+296], r9
|
|
sbb r10, QWORD PTR [rcx+304]
|
|
mov rax, QWORD PTR [r12+312]
|
|
mov QWORD PTR [r12+304], r10
|
|
sbb rax, QWORD PTR [rcx+312]
|
|
mov r9, QWORD PTR [r12+320]
|
|
mov QWORD PTR [r12+312], rax
|
|
sbb r9, QWORD PTR [rcx+320]
|
|
mov r10, QWORD PTR [r12+328]
|
|
mov QWORD PTR [r12+320], r9
|
|
sbb r10, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r12+336]
|
|
mov QWORD PTR [r12+328], r10
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r9, QWORD PTR [r12+344]
|
|
mov QWORD PTR [r12+336], rax
|
|
sbb r9, QWORD PTR [rcx+344]
|
|
mov r10, QWORD PTR [r12+352]
|
|
mov QWORD PTR [r12+344], r9
|
|
sbb r10, QWORD PTR [rcx+352]
|
|
mov rax, QWORD PTR [r12+360]
|
|
mov QWORD PTR [r12+352], r10
|
|
sbb rax, QWORD PTR [rcx+360]
|
|
mov r9, QWORD PTR [r12+368]
|
|
mov QWORD PTR [r12+360], rax
|
|
sbb r9, QWORD PTR [rcx+368]
|
|
mov r10, QWORD PTR [r12+376]
|
|
mov QWORD PTR [r12+368], r9
|
|
sbb r10, QWORD PTR [rcx+376]
|
|
mov rax, QWORD PTR [r12+384]
|
|
mov QWORD PTR [r12+376], r10
|
|
sbb rax, QWORD PTR [rcx+384]
|
|
mov r9, QWORD PTR [r12+392]
|
|
mov QWORD PTR [r12+384], rax
|
|
sbb r9, QWORD PTR [rcx+392]
|
|
mov r10, QWORD PTR [r12+400]
|
|
mov QWORD PTR [r12+392], r9
|
|
sbb r10, QWORD PTR [rcx+400]
|
|
mov rax, QWORD PTR [r12+408]
|
|
mov QWORD PTR [r12+400], r10
|
|
sbb rax, QWORD PTR [rcx+408]
|
|
mov r9, QWORD PTR [r12+416]
|
|
mov QWORD PTR [r12+408], rax
|
|
sbb r9, QWORD PTR [rcx+416]
|
|
mov r10, QWORD PTR [r12+424]
|
|
mov QWORD PTR [r12+416], r9
|
|
sbb r10, QWORD PTR [rcx+424]
|
|
mov rax, QWORD PTR [r12+432]
|
|
mov QWORD PTR [r12+424], r10
|
|
sbb rax, QWORD PTR [rcx+432]
|
|
mov r9, QWORD PTR [r12+440]
|
|
mov QWORD PTR [r12+432], rax
|
|
sbb r9, QWORD PTR [rcx+440]
|
|
mov r10, QWORD PTR [r12+448]
|
|
mov QWORD PTR [r12+440], r9
|
|
sbb r10, QWORD PTR [rcx+448]
|
|
mov rax, QWORD PTR [r12+456]
|
|
mov QWORD PTR [r12+448], r10
|
|
sbb rax, QWORD PTR [rcx+456]
|
|
mov r9, QWORD PTR [r12+464]
|
|
mov QWORD PTR [r12+456], rax
|
|
sbb r9, QWORD PTR [rcx+464]
|
|
mov r10, QWORD PTR [r12+472]
|
|
mov QWORD PTR [r12+464], r9
|
|
sbb r10, QWORD PTR [rcx+472]
|
|
mov rax, QWORD PTR [r12+480]
|
|
mov QWORD PTR [r12+472], r10
|
|
sbb rax, QWORD PTR [rcx+480]
|
|
mov r9, QWORD PTR [r12+488]
|
|
mov QWORD PTR [r12+480], rax
|
|
sbb r9, QWORD PTR [rcx+488]
|
|
mov r10, QWORD PTR [r12+496]
|
|
mov QWORD PTR [r12+488], r9
|
|
sbb r10, QWORD PTR [rcx+496]
|
|
mov rax, QWORD PTR [r12+504]
|
|
mov QWORD PTR [r12+496], r10
|
|
sbb rax, QWORD PTR [rcx+504]
|
|
mov QWORD PTR [r12+504], rax
|
|
sbb r11, 0
|
|
sub rsi, 256
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
add rax, QWORD PTR [r12]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r12+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r12+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r12+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r12+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r12+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r12+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r12+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r12+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r12+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r12+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r12+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r12+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r12+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r12+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r12+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r12+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r12+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r12+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r12+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r12+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r12+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r12+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r12+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r12+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r12+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r12+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r12+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r12+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r12+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r12+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r12+248]
|
|
mov r10, QWORD PTR [rsi+256]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r10, QWORD PTR [r12+256]
|
|
mov rax, QWORD PTR [rsi+264]
|
|
mov QWORD PTR [rsi+256], r10
|
|
adc rax, QWORD PTR [r12+264]
|
|
mov r9, QWORD PTR [rsi+272]
|
|
mov QWORD PTR [rsi+264], rax
|
|
adc r9, QWORD PTR [r12+272]
|
|
mov r10, QWORD PTR [rsi+280]
|
|
mov QWORD PTR [rsi+272], r9
|
|
adc r10, QWORD PTR [r12+280]
|
|
mov rax, QWORD PTR [rsi+288]
|
|
mov QWORD PTR [rsi+280], r10
|
|
adc rax, QWORD PTR [r12+288]
|
|
mov r9, QWORD PTR [rsi+296]
|
|
mov QWORD PTR [rsi+288], rax
|
|
adc r9, QWORD PTR [r12+296]
|
|
mov r10, QWORD PTR [rsi+304]
|
|
mov QWORD PTR [rsi+296], r9
|
|
adc r10, QWORD PTR [r12+304]
|
|
mov rax, QWORD PTR [rsi+312]
|
|
mov QWORD PTR [rsi+304], r10
|
|
adc rax, QWORD PTR [r12+312]
|
|
mov r9, QWORD PTR [rsi+320]
|
|
mov QWORD PTR [rsi+312], rax
|
|
adc r9, QWORD PTR [r12+320]
|
|
mov r10, QWORD PTR [rsi+328]
|
|
mov QWORD PTR [rsi+320], r9
|
|
adc r10, QWORD PTR [r12+328]
|
|
mov rax, QWORD PTR [rsi+336]
|
|
mov QWORD PTR [rsi+328], r10
|
|
adc rax, QWORD PTR [r12+336]
|
|
mov r9, QWORD PTR [rsi+344]
|
|
mov QWORD PTR [rsi+336], rax
|
|
adc r9, QWORD PTR [r12+344]
|
|
mov r10, QWORD PTR [rsi+352]
|
|
mov QWORD PTR [rsi+344], r9
|
|
adc r10, QWORD PTR [r12+352]
|
|
mov rax, QWORD PTR [rsi+360]
|
|
mov QWORD PTR [rsi+352], r10
|
|
adc rax, QWORD PTR [r12+360]
|
|
mov r9, QWORD PTR [rsi+368]
|
|
mov QWORD PTR [rsi+360], rax
|
|
adc r9, QWORD PTR [r12+368]
|
|
mov r10, QWORD PTR [rsi+376]
|
|
mov QWORD PTR [rsi+368], r9
|
|
adc r10, QWORD PTR [r12+376]
|
|
mov rax, QWORD PTR [rsi+384]
|
|
mov QWORD PTR [rsi+376], r10
|
|
adc rax, QWORD PTR [r12+384]
|
|
mov r9, QWORD PTR [rsi+392]
|
|
mov QWORD PTR [rsi+384], rax
|
|
adc r9, QWORD PTR [r12+392]
|
|
mov r10, QWORD PTR [rsi+400]
|
|
mov QWORD PTR [rsi+392], r9
|
|
adc r10, QWORD PTR [r12+400]
|
|
mov rax, QWORD PTR [rsi+408]
|
|
mov QWORD PTR [rsi+400], r10
|
|
adc rax, QWORD PTR [r12+408]
|
|
mov r9, QWORD PTR [rsi+416]
|
|
mov QWORD PTR [rsi+408], rax
|
|
adc r9, QWORD PTR [r12+416]
|
|
mov r10, QWORD PTR [rsi+424]
|
|
mov QWORD PTR [rsi+416], r9
|
|
adc r10, QWORD PTR [r12+424]
|
|
mov rax, QWORD PTR [rsi+432]
|
|
mov QWORD PTR [rsi+424], r10
|
|
adc rax, QWORD PTR [r12+432]
|
|
mov r9, QWORD PTR [rsi+440]
|
|
mov QWORD PTR [rsi+432], rax
|
|
adc r9, QWORD PTR [r12+440]
|
|
mov r10, QWORD PTR [rsi+448]
|
|
mov QWORD PTR [rsi+440], r9
|
|
adc r10, QWORD PTR [r12+448]
|
|
mov rax, QWORD PTR [rsi+456]
|
|
mov QWORD PTR [rsi+448], r10
|
|
adc rax, QWORD PTR [r12+456]
|
|
mov r9, QWORD PTR [rsi+464]
|
|
mov QWORD PTR [rsi+456], rax
|
|
adc r9, QWORD PTR [r12+464]
|
|
mov r10, QWORD PTR [rsi+472]
|
|
mov QWORD PTR [rsi+464], r9
|
|
adc r10, QWORD PTR [r12+472]
|
|
mov rax, QWORD PTR [rsi+480]
|
|
mov QWORD PTR [rsi+472], r10
|
|
adc rax, QWORD PTR [r12+480]
|
|
mov r9, QWORD PTR [rsi+488]
|
|
mov QWORD PTR [rsi+480], rax
|
|
adc r9, QWORD PTR [r12+488]
|
|
mov r10, QWORD PTR [rsi+496]
|
|
mov QWORD PTR [rsi+488], r9
|
|
adc r10, QWORD PTR [r12+496]
|
|
mov rax, QWORD PTR [rsi+504]
|
|
mov QWORD PTR [rsi+496], r10
|
|
adc rax, QWORD PTR [r12+504]
|
|
mov QWORD PTR [rsi+504], rax
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+768], r11
|
|
add rsi, 256
|
|
; Add
|
|
mov rax, QWORD PTR [rsi]
|
|
xor r11, r11
|
|
add rax, QWORD PTR [r13]
|
|
mov r9, QWORD PTR [rsi+8]
|
|
mov QWORD PTR [rsi], rax
|
|
adc r9, QWORD PTR [r13+8]
|
|
mov r10, QWORD PTR [rsi+16]
|
|
mov QWORD PTR [rsi+8], r9
|
|
adc r10, QWORD PTR [r13+16]
|
|
mov rax, QWORD PTR [rsi+24]
|
|
mov QWORD PTR [rsi+16], r10
|
|
adc rax, QWORD PTR [r13+24]
|
|
mov r9, QWORD PTR [rsi+32]
|
|
mov QWORD PTR [rsi+24], rax
|
|
adc r9, QWORD PTR [r13+32]
|
|
mov r10, QWORD PTR [rsi+40]
|
|
mov QWORD PTR [rsi+32], r9
|
|
adc r10, QWORD PTR [r13+40]
|
|
mov rax, QWORD PTR [rsi+48]
|
|
mov QWORD PTR [rsi+40], r10
|
|
adc rax, QWORD PTR [r13+48]
|
|
mov r9, QWORD PTR [rsi+56]
|
|
mov QWORD PTR [rsi+48], rax
|
|
adc r9, QWORD PTR [r13+56]
|
|
mov r10, QWORD PTR [rsi+64]
|
|
mov QWORD PTR [rsi+56], r9
|
|
adc r10, QWORD PTR [r13+64]
|
|
mov rax, QWORD PTR [rsi+72]
|
|
mov QWORD PTR [rsi+64], r10
|
|
adc rax, QWORD PTR [r13+72]
|
|
mov r9, QWORD PTR [rsi+80]
|
|
mov QWORD PTR [rsi+72], rax
|
|
adc r9, QWORD PTR [r13+80]
|
|
mov r10, QWORD PTR [rsi+88]
|
|
mov QWORD PTR [rsi+80], r9
|
|
adc r10, QWORD PTR [r13+88]
|
|
mov rax, QWORD PTR [rsi+96]
|
|
mov QWORD PTR [rsi+88], r10
|
|
adc rax, QWORD PTR [r13+96]
|
|
mov r9, QWORD PTR [rsi+104]
|
|
mov QWORD PTR [rsi+96], rax
|
|
adc r9, QWORD PTR [r13+104]
|
|
mov r10, QWORD PTR [rsi+112]
|
|
mov QWORD PTR [rsi+104], r9
|
|
adc r10, QWORD PTR [r13+112]
|
|
mov rax, QWORD PTR [rsi+120]
|
|
mov QWORD PTR [rsi+112], r10
|
|
adc rax, QWORD PTR [r13+120]
|
|
mov r9, QWORD PTR [rsi+128]
|
|
mov QWORD PTR [rsi+120], rax
|
|
adc r9, QWORD PTR [r13+128]
|
|
mov r10, QWORD PTR [rsi+136]
|
|
mov QWORD PTR [rsi+128], r9
|
|
adc r10, QWORD PTR [r13+136]
|
|
mov rax, QWORD PTR [rsi+144]
|
|
mov QWORD PTR [rsi+136], r10
|
|
adc rax, QWORD PTR [r13+144]
|
|
mov r9, QWORD PTR [rsi+152]
|
|
mov QWORD PTR [rsi+144], rax
|
|
adc r9, QWORD PTR [r13+152]
|
|
mov r10, QWORD PTR [rsi+160]
|
|
mov QWORD PTR [rsi+152], r9
|
|
adc r10, QWORD PTR [r13+160]
|
|
mov rax, QWORD PTR [rsi+168]
|
|
mov QWORD PTR [rsi+160], r10
|
|
adc rax, QWORD PTR [r13+168]
|
|
mov r9, QWORD PTR [rsi+176]
|
|
mov QWORD PTR [rsi+168], rax
|
|
adc r9, QWORD PTR [r13+176]
|
|
mov r10, QWORD PTR [rsi+184]
|
|
mov QWORD PTR [rsi+176], r9
|
|
adc r10, QWORD PTR [r13+184]
|
|
mov rax, QWORD PTR [rsi+192]
|
|
mov QWORD PTR [rsi+184], r10
|
|
adc rax, QWORD PTR [r13+192]
|
|
mov r9, QWORD PTR [rsi+200]
|
|
mov QWORD PTR [rsi+192], rax
|
|
adc r9, QWORD PTR [r13+200]
|
|
mov r10, QWORD PTR [rsi+208]
|
|
mov QWORD PTR [rsi+200], r9
|
|
adc r10, QWORD PTR [r13+208]
|
|
mov rax, QWORD PTR [rsi+216]
|
|
mov QWORD PTR [rsi+208], r10
|
|
adc rax, QWORD PTR [r13+216]
|
|
mov r9, QWORD PTR [rsi+224]
|
|
mov QWORD PTR [rsi+216], rax
|
|
adc r9, QWORD PTR [r13+224]
|
|
mov r10, QWORD PTR [rsi+232]
|
|
mov QWORD PTR [rsi+224], r9
|
|
adc r10, QWORD PTR [r13+232]
|
|
mov rax, QWORD PTR [rsi+240]
|
|
mov QWORD PTR [rsi+232], r10
|
|
adc rax, QWORD PTR [r13+240]
|
|
mov r9, QWORD PTR [rsi+248]
|
|
mov QWORD PTR [rsi+240], rax
|
|
adc r9, QWORD PTR [r13+248]
|
|
mov r10, QWORD PTR [rsi+256]
|
|
mov QWORD PTR [rsi+248], r9
|
|
adc r10, QWORD PTR [r13+256]
|
|
mov QWORD PTR [rsi+256], r10
|
|
adc r11, 0
|
|
; Add to zero
|
|
mov rax, QWORD PTR [r13+264]
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+272]
|
|
mov QWORD PTR [rsi+264], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+280]
|
|
mov QWORD PTR [rsi+272], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+288]
|
|
mov QWORD PTR [rsi+280], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+296]
|
|
mov QWORD PTR [rsi+288], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+304]
|
|
mov QWORD PTR [rsi+296], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+312]
|
|
mov QWORD PTR [rsi+304], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+320]
|
|
mov QWORD PTR [rsi+312], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+328]
|
|
mov QWORD PTR [rsi+320], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+336]
|
|
mov QWORD PTR [rsi+328], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+344]
|
|
mov QWORD PTR [rsi+336], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+352]
|
|
mov QWORD PTR [rsi+344], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+360]
|
|
mov QWORD PTR [rsi+352], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+368]
|
|
mov QWORD PTR [rsi+360], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+376]
|
|
mov QWORD PTR [rsi+368], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+384]
|
|
mov QWORD PTR [rsi+376], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+392]
|
|
mov QWORD PTR [rsi+384], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+400]
|
|
mov QWORD PTR [rsi+392], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+408]
|
|
mov QWORD PTR [rsi+400], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+416]
|
|
mov QWORD PTR [rsi+408], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+424]
|
|
mov QWORD PTR [rsi+416], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+432]
|
|
mov QWORD PTR [rsi+424], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+440]
|
|
mov QWORD PTR [rsi+432], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+448]
|
|
mov QWORD PTR [rsi+440], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+456]
|
|
mov QWORD PTR [rsi+448], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+464]
|
|
mov QWORD PTR [rsi+456], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+472]
|
|
mov QWORD PTR [rsi+464], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+480]
|
|
mov QWORD PTR [rsi+472], r10
|
|
adc rax, 0
|
|
mov r9, QWORD PTR [r13+488]
|
|
mov QWORD PTR [rsi+480], rax
|
|
adc r9, 0
|
|
mov r10, QWORD PTR [r13+496]
|
|
mov QWORD PTR [rsi+488], r9
|
|
adc r10, 0
|
|
mov rax, QWORD PTR [r13+504]
|
|
mov QWORD PTR [rsi+496], r10
|
|
adc rax, 0
|
|
mov QWORD PTR [rsi+504], rax
|
|
add rsp, 1576
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_4096_mul_avx2_64 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r Result: single precision integer that receives a * a (pointer passed in rcx).
|
|
; * a Input: single precision integer to square, read as 64 64-bit words (pointer passed in rdx).
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_4096_sqr_avx2_64 PROC
|
|
push r12
|
|
sub rsp, 1304
|
|
mov QWORD PTR [rsp+1280], rcx
|
|
mov QWORD PTR [rsp+1288], rdx
|
|
lea r10, QWORD PTR [rsp+1024]
|
|
lea r11, QWORD PTR [rdx+256]
|
|
; Add
|
|
mov rax, QWORD PTR [rdx]
|
|
xor r9, r9
|
|
add rax, QWORD PTR [r11]
|
|
mov r8, QWORD PTR [rdx+8]
|
|
mov QWORD PTR [r10], rax
|
|
adc r8, QWORD PTR [r11+8]
|
|
mov rax, QWORD PTR [rdx+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
adc rax, QWORD PTR [r11+16]
|
|
mov r8, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
adc r8, QWORD PTR [r11+24]
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
adc rax, QWORD PTR [r11+32]
|
|
mov r8, QWORD PTR [rdx+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
adc r8, QWORD PTR [r11+40]
|
|
mov rax, QWORD PTR [rdx+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
adc rax, QWORD PTR [r11+48]
|
|
mov r8, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
adc r8, QWORD PTR [r11+56]
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
adc rax, QWORD PTR [r11+64]
|
|
mov r8, QWORD PTR [rdx+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
adc r8, QWORD PTR [r11+72]
|
|
mov rax, QWORD PTR [rdx+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
adc rax, QWORD PTR [r11+80]
|
|
mov r8, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
adc r8, QWORD PTR [r11+88]
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
adc rax, QWORD PTR [r11+96]
|
|
mov r8, QWORD PTR [rdx+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
adc r8, QWORD PTR [r11+104]
|
|
mov rax, QWORD PTR [rdx+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
adc rax, QWORD PTR [r11+112]
|
|
mov r8, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
adc r8, QWORD PTR [r11+120]
|
|
mov rax, QWORD PTR [rdx+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
adc rax, QWORD PTR [r11+128]
|
|
mov r8, QWORD PTR [rdx+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
adc r8, QWORD PTR [r11+136]
|
|
mov rax, QWORD PTR [rdx+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
adc rax, QWORD PTR [r11+144]
|
|
mov r8, QWORD PTR [rdx+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
adc r8, QWORD PTR [r11+152]
|
|
mov rax, QWORD PTR [rdx+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
adc rax, QWORD PTR [r11+160]
|
|
mov r8, QWORD PTR [rdx+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
adc r8, QWORD PTR [r11+168]
|
|
mov rax, QWORD PTR [rdx+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
adc rax, QWORD PTR [r11+176]
|
|
mov r8, QWORD PTR [rdx+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
adc r8, QWORD PTR [r11+184]
|
|
mov rax, QWORD PTR [rdx+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
adc rax, QWORD PTR [r11+192]
|
|
mov r8, QWORD PTR [rdx+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
adc r8, QWORD PTR [r11+200]
|
|
mov rax, QWORD PTR [rdx+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
adc rax, QWORD PTR [r11+208]
|
|
mov r8, QWORD PTR [rdx+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
adc r8, QWORD PTR [r11+216]
|
|
mov rax, QWORD PTR [rdx+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
adc rax, QWORD PTR [r11+224]
|
|
mov r8, QWORD PTR [rdx+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
adc r8, QWORD PTR [r11+232]
|
|
mov rax, QWORD PTR [rdx+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
adc rax, QWORD PTR [r11+240]
|
|
mov r8, QWORD PTR [rdx+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
adc r8, QWORD PTR [r11+248]
|
|
mov QWORD PTR [r10+248], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+1296], r9
|
|
mov rdx, r10
|
|
mov rcx, rsp
|
|
call sp_2048_sqr_avx2_32
|
|
mov rdx, QWORD PTR [rsp+1288]
|
|
lea rcx, QWORD PTR [rsp+512]
|
|
add rdx, 256
|
|
call sp_2048_sqr_avx2_32
|
|
mov rdx, QWORD PTR [rsp+1288]
|
|
mov rcx, QWORD PTR [rsp+1280]
|
|
call sp_2048_sqr_avx2_32
|
|
IFDEF _WIN64
|
|
mov rdx, QWORD PTR [rsp+1288]
|
|
mov rcx, QWORD PTR [rsp+1280]
|
|
ENDIF
|
|
mov r12, QWORD PTR [rsp+1296]
|
|
lea r10, QWORD PTR [rsp+1024]
|
|
mov r9, r12
|
|
neg r12
|
|
mov rax, QWORD PTR [r10]
|
|
pext rax, rax, r12
|
|
add rax, rax
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [rcx+512], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [rcx+520], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [rcx+528], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [rcx+536], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [rcx+544], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [rcx+552], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [rcx+560], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [rcx+568], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [rcx+576], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [rcx+584], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [rcx+592], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [rcx+600], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [rcx+608], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [rcx+616], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [rcx+624], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [rcx+632], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [rcx+640], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [rcx+648], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [rcx+656], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [rcx+664], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [rcx+672], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [rcx+680], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [rcx+688], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [rcx+696], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [rcx+704], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [rcx+712], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [rcx+720], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [rcx+728], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [rcx+736], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [rcx+744], r8
|
|
pext rax, rax, r12
|
|
adc rax, rax
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [rcx+752], rax
|
|
pext r8, r8, r12
|
|
adc r8, r8
|
|
mov QWORD PTR [rcx+760], r8
|
|
adc r9, 0
|
|
lea rdx, QWORD PTR [rsp+512]
|
|
mov r10, rsp
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rdx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rdx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rdx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rdx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rdx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rdx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rdx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rdx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rdx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rdx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rdx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rdx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rdx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rdx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rdx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rdx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rdx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rdx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rdx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rdx+376]
|
|
mov rax, QWORD PTR [r10+384]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb rax, QWORD PTR [rdx+384]
|
|
mov r8, QWORD PTR [r10+392]
|
|
mov QWORD PTR [r10+384], rax
|
|
sbb r8, QWORD PTR [rdx+392]
|
|
mov rax, QWORD PTR [r10+400]
|
|
mov QWORD PTR [r10+392], r8
|
|
sbb rax, QWORD PTR [rdx+400]
|
|
mov r8, QWORD PTR [r10+408]
|
|
mov QWORD PTR [r10+400], rax
|
|
sbb r8, QWORD PTR [rdx+408]
|
|
mov rax, QWORD PTR [r10+416]
|
|
mov QWORD PTR [r10+408], r8
|
|
sbb rax, QWORD PTR [rdx+416]
|
|
mov r8, QWORD PTR [r10+424]
|
|
mov QWORD PTR [r10+416], rax
|
|
sbb r8, QWORD PTR [rdx+424]
|
|
mov rax, QWORD PTR [r10+432]
|
|
mov QWORD PTR [r10+424], r8
|
|
sbb rax, QWORD PTR [rdx+432]
|
|
mov r8, QWORD PTR [r10+440]
|
|
mov QWORD PTR [r10+432], rax
|
|
sbb r8, QWORD PTR [rdx+440]
|
|
mov rax, QWORD PTR [r10+448]
|
|
mov QWORD PTR [r10+440], r8
|
|
sbb rax, QWORD PTR [rdx+448]
|
|
mov r8, QWORD PTR [r10+456]
|
|
mov QWORD PTR [r10+448], rax
|
|
sbb r8, QWORD PTR [rdx+456]
|
|
mov rax, QWORD PTR [r10+464]
|
|
mov QWORD PTR [r10+456], r8
|
|
sbb rax, QWORD PTR [rdx+464]
|
|
mov r8, QWORD PTR [r10+472]
|
|
mov QWORD PTR [r10+464], rax
|
|
sbb r8, QWORD PTR [rdx+472]
|
|
mov rax, QWORD PTR [r10+480]
|
|
mov QWORD PTR [r10+472], r8
|
|
sbb rax, QWORD PTR [rdx+480]
|
|
mov r8, QWORD PTR [r10+488]
|
|
mov QWORD PTR [r10+480], rax
|
|
sbb r8, QWORD PTR [rdx+488]
|
|
mov rax, QWORD PTR [r10+496]
|
|
mov QWORD PTR [r10+488], r8
|
|
sbb rax, QWORD PTR [rdx+496]
|
|
mov r8, QWORD PTR [r10+504]
|
|
mov QWORD PTR [r10+496], rax
|
|
sbb r8, QWORD PTR [rdx+504]
|
|
mov QWORD PTR [r10+504], r8
|
|
sbb r9, 0
|
|
mov rax, QWORD PTR [r10]
|
|
sub rax, QWORD PTR [rcx]
|
|
mov r8, QWORD PTR [r10+8]
|
|
mov QWORD PTR [r10], rax
|
|
sbb r8, QWORD PTR [rcx+8]
|
|
mov rax, QWORD PTR [r10+16]
|
|
mov QWORD PTR [r10+8], r8
|
|
sbb rax, QWORD PTR [rcx+16]
|
|
mov r8, QWORD PTR [r10+24]
|
|
mov QWORD PTR [r10+16], rax
|
|
sbb r8, QWORD PTR [rcx+24]
|
|
mov rax, QWORD PTR [r10+32]
|
|
mov QWORD PTR [r10+24], r8
|
|
sbb rax, QWORD PTR [rcx+32]
|
|
mov r8, QWORD PTR [r10+40]
|
|
mov QWORD PTR [r10+32], rax
|
|
sbb r8, QWORD PTR [rcx+40]
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov QWORD PTR [r10+40], r8
|
|
sbb rax, QWORD PTR [rcx+48]
|
|
mov r8, QWORD PTR [r10+56]
|
|
mov QWORD PTR [r10+48], rax
|
|
sbb r8, QWORD PTR [rcx+56]
|
|
mov rax, QWORD PTR [r10+64]
|
|
mov QWORD PTR [r10+56], r8
|
|
sbb rax, QWORD PTR [rcx+64]
|
|
mov r8, QWORD PTR [r10+72]
|
|
mov QWORD PTR [r10+64], rax
|
|
sbb r8, QWORD PTR [rcx+72]
|
|
mov rax, QWORD PTR [r10+80]
|
|
mov QWORD PTR [r10+72], r8
|
|
sbb rax, QWORD PTR [rcx+80]
|
|
mov r8, QWORD PTR [r10+88]
|
|
mov QWORD PTR [r10+80], rax
|
|
sbb r8, QWORD PTR [rcx+88]
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov QWORD PTR [r10+88], r8
|
|
sbb rax, QWORD PTR [rcx+96]
|
|
mov r8, QWORD PTR [r10+104]
|
|
mov QWORD PTR [r10+96], rax
|
|
sbb r8, QWORD PTR [rcx+104]
|
|
mov rax, QWORD PTR [r10+112]
|
|
mov QWORD PTR [r10+104], r8
|
|
sbb rax, QWORD PTR [rcx+112]
|
|
mov r8, QWORD PTR [r10+120]
|
|
mov QWORD PTR [r10+112], rax
|
|
sbb r8, QWORD PTR [rcx+120]
|
|
mov rax, QWORD PTR [r10+128]
|
|
mov QWORD PTR [r10+120], r8
|
|
sbb rax, QWORD PTR [rcx+128]
|
|
mov r8, QWORD PTR [r10+136]
|
|
mov QWORD PTR [r10+128], rax
|
|
sbb r8, QWORD PTR [rcx+136]
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov QWORD PTR [r10+136], r8
|
|
sbb rax, QWORD PTR [rcx+144]
|
|
mov r8, QWORD PTR [r10+152]
|
|
mov QWORD PTR [r10+144], rax
|
|
sbb r8, QWORD PTR [rcx+152]
|
|
mov rax, QWORD PTR [r10+160]
|
|
mov QWORD PTR [r10+152], r8
|
|
sbb rax, QWORD PTR [rcx+160]
|
|
mov r8, QWORD PTR [r10+168]
|
|
mov QWORD PTR [r10+160], rax
|
|
sbb r8, QWORD PTR [rcx+168]
|
|
mov rax, QWORD PTR [r10+176]
|
|
mov QWORD PTR [r10+168], r8
|
|
sbb rax, QWORD PTR [rcx+176]
|
|
mov r8, QWORD PTR [r10+184]
|
|
mov QWORD PTR [r10+176], rax
|
|
sbb r8, QWORD PTR [rcx+184]
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov QWORD PTR [r10+184], r8
|
|
sbb rax, QWORD PTR [rcx+192]
|
|
mov r8, QWORD PTR [r10+200]
|
|
mov QWORD PTR [r10+192], rax
|
|
sbb r8, QWORD PTR [rcx+200]
|
|
mov rax, QWORD PTR [r10+208]
|
|
mov QWORD PTR [r10+200], r8
|
|
sbb rax, QWORD PTR [rcx+208]
|
|
mov r8, QWORD PTR [r10+216]
|
|
mov QWORD PTR [r10+208], rax
|
|
sbb r8, QWORD PTR [rcx+216]
|
|
mov rax, QWORD PTR [r10+224]
|
|
mov QWORD PTR [r10+216], r8
|
|
sbb rax, QWORD PTR [rcx+224]
|
|
mov r8, QWORD PTR [r10+232]
|
|
mov QWORD PTR [r10+224], rax
|
|
sbb r8, QWORD PTR [rcx+232]
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov QWORD PTR [r10+232], r8
|
|
sbb rax, QWORD PTR [rcx+240]
|
|
mov r8, QWORD PTR [r10+248]
|
|
mov QWORD PTR [r10+240], rax
|
|
sbb r8, QWORD PTR [rcx+248]
|
|
mov rax, QWORD PTR [r10+256]
|
|
mov QWORD PTR [r10+248], r8
|
|
sbb rax, QWORD PTR [rcx+256]
|
|
mov r8, QWORD PTR [r10+264]
|
|
mov QWORD PTR [r10+256], rax
|
|
sbb r8, QWORD PTR [rcx+264]
|
|
mov rax, QWORD PTR [r10+272]
|
|
mov QWORD PTR [r10+264], r8
|
|
sbb rax, QWORD PTR [rcx+272]
|
|
mov r8, QWORD PTR [r10+280]
|
|
mov QWORD PTR [r10+272], rax
|
|
sbb r8, QWORD PTR [rcx+280]
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov QWORD PTR [r10+280], r8
|
|
sbb rax, QWORD PTR [rcx+288]
|
|
mov r8, QWORD PTR [r10+296]
|
|
mov QWORD PTR [r10+288], rax
|
|
sbb r8, QWORD PTR [rcx+296]
|
|
mov rax, QWORD PTR [r10+304]
|
|
mov QWORD PTR [r10+296], r8
|
|
sbb rax, QWORD PTR [rcx+304]
|
|
mov r8, QWORD PTR [r10+312]
|
|
mov QWORD PTR [r10+304], rax
|
|
sbb r8, QWORD PTR [rcx+312]
|
|
mov rax, QWORD PTR [r10+320]
|
|
mov QWORD PTR [r10+312], r8
|
|
sbb rax, QWORD PTR [rcx+320]
|
|
mov r8, QWORD PTR [r10+328]
|
|
mov QWORD PTR [r10+320], rax
|
|
sbb r8, QWORD PTR [rcx+328]
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov QWORD PTR [r10+328], r8
|
|
sbb rax, QWORD PTR [rcx+336]
|
|
mov r8, QWORD PTR [r10+344]
|
|
mov QWORD PTR [r10+336], rax
|
|
sbb r8, QWORD PTR [rcx+344]
|
|
mov rax, QWORD PTR [r10+352]
|
|
mov QWORD PTR [r10+344], r8
|
|
sbb rax, QWORD PTR [rcx+352]
|
|
mov r8, QWORD PTR [r10+360]
|
|
mov QWORD PTR [r10+352], rax
|
|
sbb r8, QWORD PTR [rcx+360]
|
|
mov rax, QWORD PTR [r10+368]
|
|
mov QWORD PTR [r10+360], r8
|
|
sbb rax, QWORD PTR [rcx+368]
|
|
mov r8, QWORD PTR [r10+376]
|
|
mov QWORD PTR [r10+368], rax
|
|
sbb r8, QWORD PTR [rcx+376]
|
|
mov rax, QWORD PTR [r10+384]
|
|
mov QWORD PTR [r10+376], r8
|
|
sbb rax, QWORD PTR [rcx+384]
|
|
mov r8, QWORD PTR [r10+392]
|
|
mov QWORD PTR [r10+384], rax
|
|
sbb r8, QWORD PTR [rcx+392]
|
|
mov rax, QWORD PTR [r10+400]
|
|
mov QWORD PTR [r10+392], r8
|
|
sbb rax, QWORD PTR [rcx+400]
|
|
mov r8, QWORD PTR [r10+408]
|
|
mov QWORD PTR [r10+400], rax
|
|
sbb r8, QWORD PTR [rcx+408]
|
|
mov rax, QWORD PTR [r10+416]
|
|
mov QWORD PTR [r10+408], r8
|
|
sbb rax, QWORD PTR [rcx+416]
|
|
mov r8, QWORD PTR [r10+424]
|
|
mov QWORD PTR [r10+416], rax
|
|
sbb r8, QWORD PTR [rcx+424]
|
|
mov rax, QWORD PTR [r10+432]
|
|
mov QWORD PTR [r10+424], r8
|
|
sbb rax, QWORD PTR [rcx+432]
|
|
mov r8, QWORD PTR [r10+440]
|
|
mov QWORD PTR [r10+432], rax
|
|
sbb r8, QWORD PTR [rcx+440]
|
|
mov rax, QWORD PTR [r10+448]
|
|
mov QWORD PTR [r10+440], r8
|
|
sbb rax, QWORD PTR [rcx+448]
|
|
mov r8, QWORD PTR [r10+456]
|
|
mov QWORD PTR [r10+448], rax
|
|
sbb r8, QWORD PTR [rcx+456]
|
|
mov rax, QWORD PTR [r10+464]
|
|
mov QWORD PTR [r10+456], r8
|
|
sbb rax, QWORD PTR [rcx+464]
|
|
mov r8, QWORD PTR [r10+472]
|
|
mov QWORD PTR [r10+464], rax
|
|
sbb r8, QWORD PTR [rcx+472]
|
|
mov rax, QWORD PTR [r10+480]
|
|
mov QWORD PTR [r10+472], r8
|
|
sbb rax, QWORD PTR [rcx+480]
|
|
mov r8, QWORD PTR [r10+488]
|
|
mov QWORD PTR [r10+480], rax
|
|
sbb r8, QWORD PTR [rcx+488]
|
|
mov rax, QWORD PTR [r10+496]
|
|
mov QWORD PTR [r10+488], r8
|
|
sbb rax, QWORD PTR [rcx+496]
|
|
mov r8, QWORD PTR [r10+504]
|
|
mov QWORD PTR [r10+496], rax
|
|
sbb r8, QWORD PTR [rcx+504]
|
|
mov QWORD PTR [r10+504], r8
|
|
sbb r9, 0
|
|
; Add in place
|
|
mov rax, QWORD PTR [rcx+256]
|
|
add rax, QWORD PTR [r10]
|
|
mov r8, QWORD PTR [rcx+264]
|
|
mov QWORD PTR [rcx+256], rax
|
|
adc r8, QWORD PTR [r10+8]
|
|
mov rax, QWORD PTR [rcx+272]
|
|
mov QWORD PTR [rcx+264], r8
|
|
adc rax, QWORD PTR [r10+16]
|
|
mov r8, QWORD PTR [rcx+280]
|
|
mov QWORD PTR [rcx+272], rax
|
|
adc r8, QWORD PTR [r10+24]
|
|
mov rax, QWORD PTR [rcx+288]
|
|
mov QWORD PTR [rcx+280], r8
|
|
adc rax, QWORD PTR [r10+32]
|
|
mov r8, QWORD PTR [rcx+296]
|
|
mov QWORD PTR [rcx+288], rax
|
|
adc r8, QWORD PTR [r10+40]
|
|
mov rax, QWORD PTR [rcx+304]
|
|
mov QWORD PTR [rcx+296], r8
|
|
adc rax, QWORD PTR [r10+48]
|
|
mov r8, QWORD PTR [rcx+312]
|
|
mov QWORD PTR [rcx+304], rax
|
|
adc r8, QWORD PTR [r10+56]
|
|
mov rax, QWORD PTR [rcx+320]
|
|
mov QWORD PTR [rcx+312], r8
|
|
adc rax, QWORD PTR [r10+64]
|
|
mov r8, QWORD PTR [rcx+328]
|
|
mov QWORD PTR [rcx+320], rax
|
|
adc r8, QWORD PTR [r10+72]
|
|
mov rax, QWORD PTR [rcx+336]
|
|
mov QWORD PTR [rcx+328], r8
|
|
adc rax, QWORD PTR [r10+80]
|
|
mov r8, QWORD PTR [rcx+344]
|
|
mov QWORD PTR [rcx+336], rax
|
|
adc r8, QWORD PTR [r10+88]
|
|
mov rax, QWORD PTR [rcx+352]
|
|
mov QWORD PTR [rcx+344], r8
|
|
adc rax, QWORD PTR [r10+96]
|
|
mov r8, QWORD PTR [rcx+360]
|
|
mov QWORD PTR [rcx+352], rax
|
|
adc r8, QWORD PTR [r10+104]
|
|
mov rax, QWORD PTR [rcx+368]
|
|
mov QWORD PTR [rcx+360], r8
|
|
adc rax, QWORD PTR [r10+112]
|
|
mov r8, QWORD PTR [rcx+376]
|
|
mov QWORD PTR [rcx+368], rax
|
|
adc r8, QWORD PTR [r10+120]
|
|
mov rax, QWORD PTR [rcx+384]
|
|
mov QWORD PTR [rcx+376], r8
|
|
adc rax, QWORD PTR [r10+128]
|
|
mov r8, QWORD PTR [rcx+392]
|
|
mov QWORD PTR [rcx+384], rax
|
|
adc r8, QWORD PTR [r10+136]
|
|
mov rax, QWORD PTR [rcx+400]
|
|
mov QWORD PTR [rcx+392], r8
|
|
adc rax, QWORD PTR [r10+144]
|
|
mov r8, QWORD PTR [rcx+408]
|
|
mov QWORD PTR [rcx+400], rax
|
|
adc r8, QWORD PTR [r10+152]
|
|
mov rax, QWORD PTR [rcx+416]
|
|
mov QWORD PTR [rcx+408], r8
|
|
adc rax, QWORD PTR [r10+160]
|
|
mov r8, QWORD PTR [rcx+424]
|
|
mov QWORD PTR [rcx+416], rax
|
|
adc r8, QWORD PTR [r10+168]
|
|
mov rax, QWORD PTR [rcx+432]
|
|
mov QWORD PTR [rcx+424], r8
|
|
adc rax, QWORD PTR [r10+176]
|
|
mov r8, QWORD PTR [rcx+440]
|
|
mov QWORD PTR [rcx+432], rax
|
|
adc r8, QWORD PTR [r10+184]
|
|
mov rax, QWORD PTR [rcx+448]
|
|
mov QWORD PTR [rcx+440], r8
|
|
adc rax, QWORD PTR [r10+192]
|
|
mov r8, QWORD PTR [rcx+456]
|
|
mov QWORD PTR [rcx+448], rax
|
|
adc r8, QWORD PTR [r10+200]
|
|
mov rax, QWORD PTR [rcx+464]
|
|
mov QWORD PTR [rcx+456], r8
|
|
adc rax, QWORD PTR [r10+208]
|
|
mov r8, QWORD PTR [rcx+472]
|
|
mov QWORD PTR [rcx+464], rax
|
|
adc r8, QWORD PTR [r10+216]
|
|
mov rax, QWORD PTR [rcx+480]
|
|
mov QWORD PTR [rcx+472], r8
|
|
adc rax, QWORD PTR [r10+224]
|
|
mov r8, QWORD PTR [rcx+488]
|
|
mov QWORD PTR [rcx+480], rax
|
|
adc r8, QWORD PTR [r10+232]
|
|
mov rax, QWORD PTR [rcx+496]
|
|
mov QWORD PTR [rcx+488], r8
|
|
adc rax, QWORD PTR [r10+240]
|
|
mov r8, QWORD PTR [rcx+504]
|
|
mov QWORD PTR [rcx+496], rax
|
|
adc r8, QWORD PTR [r10+248]
|
|
mov rax, QWORD PTR [rcx+512]
|
|
mov QWORD PTR [rcx+504], r8
|
|
adc rax, QWORD PTR [r10+256]
|
|
mov r8, QWORD PTR [rcx+520]
|
|
mov QWORD PTR [rcx+512], rax
|
|
adc r8, QWORD PTR [r10+264]
|
|
mov rax, QWORD PTR [rcx+528]
|
|
mov QWORD PTR [rcx+520], r8
|
|
adc rax, QWORD PTR [r10+272]
|
|
mov r8, QWORD PTR [rcx+536]
|
|
mov QWORD PTR [rcx+528], rax
|
|
adc r8, QWORD PTR [r10+280]
|
|
mov rax, QWORD PTR [rcx+544]
|
|
mov QWORD PTR [rcx+536], r8
|
|
adc rax, QWORD PTR [r10+288]
|
|
mov r8, QWORD PTR [rcx+552]
|
|
mov QWORD PTR [rcx+544], rax
|
|
adc r8, QWORD PTR [r10+296]
|
|
mov rax, QWORD PTR [rcx+560]
|
|
mov QWORD PTR [rcx+552], r8
|
|
adc rax, QWORD PTR [r10+304]
|
|
mov r8, QWORD PTR [rcx+568]
|
|
mov QWORD PTR [rcx+560], rax
|
|
adc r8, QWORD PTR [r10+312]
|
|
mov rax, QWORD PTR [rcx+576]
|
|
mov QWORD PTR [rcx+568], r8
|
|
adc rax, QWORD PTR [r10+320]
|
|
mov r8, QWORD PTR [rcx+584]
|
|
mov QWORD PTR [rcx+576], rax
|
|
adc r8, QWORD PTR [r10+328]
|
|
mov rax, QWORD PTR [rcx+592]
|
|
mov QWORD PTR [rcx+584], r8
|
|
adc rax, QWORD PTR [r10+336]
|
|
mov r8, QWORD PTR [rcx+600]
|
|
mov QWORD PTR [rcx+592], rax
|
|
adc r8, QWORD PTR [r10+344]
|
|
mov rax, QWORD PTR [rcx+608]
|
|
mov QWORD PTR [rcx+600], r8
|
|
adc rax, QWORD PTR [r10+352]
|
|
mov r8, QWORD PTR [rcx+616]
|
|
mov QWORD PTR [rcx+608], rax
|
|
adc r8, QWORD PTR [r10+360]
|
|
mov rax, QWORD PTR [rcx+624]
|
|
mov QWORD PTR [rcx+616], r8
|
|
adc rax, QWORD PTR [r10+368]
|
|
mov r8, QWORD PTR [rcx+632]
|
|
mov QWORD PTR [rcx+624], rax
|
|
adc r8, QWORD PTR [r10+376]
|
|
mov rax, QWORD PTR [rcx+640]
|
|
mov QWORD PTR [rcx+632], r8
|
|
adc rax, QWORD PTR [r10+384]
|
|
mov r8, QWORD PTR [rcx+648]
|
|
mov QWORD PTR [rcx+640], rax
|
|
adc r8, QWORD PTR [r10+392]
|
|
mov rax, QWORD PTR [rcx+656]
|
|
mov QWORD PTR [rcx+648], r8
|
|
adc rax, QWORD PTR [r10+400]
|
|
mov r8, QWORD PTR [rcx+664]
|
|
mov QWORD PTR [rcx+656], rax
|
|
adc r8, QWORD PTR [r10+408]
|
|
mov rax, QWORD PTR [rcx+672]
|
|
mov QWORD PTR [rcx+664], r8
|
|
adc rax, QWORD PTR [r10+416]
|
|
mov r8, QWORD PTR [rcx+680]
|
|
mov QWORD PTR [rcx+672], rax
|
|
adc r8, QWORD PTR [r10+424]
|
|
mov rax, QWORD PTR [rcx+688]
|
|
mov QWORD PTR [rcx+680], r8
|
|
adc rax, QWORD PTR [r10+432]
|
|
mov r8, QWORD PTR [rcx+696]
|
|
mov QWORD PTR [rcx+688], rax
|
|
adc r8, QWORD PTR [r10+440]
|
|
mov rax, QWORD PTR [rcx+704]
|
|
mov QWORD PTR [rcx+696], r8
|
|
adc rax, QWORD PTR [r10+448]
|
|
mov r8, QWORD PTR [rcx+712]
|
|
mov QWORD PTR [rcx+704], rax
|
|
adc r8, QWORD PTR [r10+456]
|
|
mov rax, QWORD PTR [rcx+720]
|
|
mov QWORD PTR [rcx+712], r8
|
|
adc rax, QWORD PTR [r10+464]
|
|
mov r8, QWORD PTR [rcx+728]
|
|
mov QWORD PTR [rcx+720], rax
|
|
adc r8, QWORD PTR [r10+472]
|
|
mov rax, QWORD PTR [rcx+736]
|
|
mov QWORD PTR [rcx+728], r8
|
|
adc rax, QWORD PTR [r10+480]
|
|
mov r8, QWORD PTR [rcx+744]
|
|
mov QWORD PTR [rcx+736], rax
|
|
adc r8, QWORD PTR [r10+488]
|
|
mov rax, QWORD PTR [rcx+752]
|
|
mov QWORD PTR [rcx+744], r8
|
|
adc rax, QWORD PTR [r10+496]
|
|
mov r8, QWORD PTR [rcx+760]
|
|
mov QWORD PTR [rcx+752], rax
|
|
adc r8, QWORD PTR [r10+504]
|
|
mov QWORD PTR [rcx+760], r8
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+768], r9
|
|
; Add in place
|
|
mov rax, QWORD PTR [rcx+512]
|
|
xor r9, r9
|
|
add rax, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [rcx+520]
|
|
mov QWORD PTR [rcx+512], rax
|
|
adc r8, QWORD PTR [rdx+8]
|
|
mov rax, QWORD PTR [rcx+528]
|
|
mov QWORD PTR [rcx+520], r8
|
|
adc rax, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [rcx+536]
|
|
mov QWORD PTR [rcx+528], rax
|
|
adc r8, QWORD PTR [rdx+24]
|
|
mov rax, QWORD PTR [rcx+544]
|
|
mov QWORD PTR [rcx+536], r8
|
|
adc rax, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [rcx+552]
|
|
mov QWORD PTR [rcx+544], rax
|
|
adc r8, QWORD PTR [rdx+40]
|
|
mov rax, QWORD PTR [rcx+560]
|
|
mov QWORD PTR [rcx+552], r8
|
|
adc rax, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [rcx+568]
|
|
mov QWORD PTR [rcx+560], rax
|
|
adc r8, QWORD PTR [rdx+56]
|
|
mov rax, QWORD PTR [rcx+576]
|
|
mov QWORD PTR [rcx+568], r8
|
|
adc rax, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [rcx+584]
|
|
mov QWORD PTR [rcx+576], rax
|
|
adc r8, QWORD PTR [rdx+72]
|
|
mov rax, QWORD PTR [rcx+592]
|
|
mov QWORD PTR [rcx+584], r8
|
|
adc rax, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [rcx+600]
|
|
mov QWORD PTR [rcx+592], rax
|
|
adc r8, QWORD PTR [rdx+88]
|
|
mov rax, QWORD PTR [rcx+608]
|
|
mov QWORD PTR [rcx+600], r8
|
|
adc rax, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [rcx+616]
|
|
mov QWORD PTR [rcx+608], rax
|
|
adc r8, QWORD PTR [rdx+104]
|
|
mov rax, QWORD PTR [rcx+624]
|
|
mov QWORD PTR [rcx+616], r8
|
|
adc rax, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [rcx+632]
|
|
mov QWORD PTR [rcx+624], rax
|
|
adc r8, QWORD PTR [rdx+120]
|
|
mov rax, QWORD PTR [rcx+640]
|
|
mov QWORD PTR [rcx+632], r8
|
|
adc rax, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [rcx+648]
|
|
mov QWORD PTR [rcx+640], rax
|
|
adc r8, QWORD PTR [rdx+136]
|
|
mov rax, QWORD PTR [rcx+656]
|
|
mov QWORD PTR [rcx+648], r8
|
|
adc rax, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [rcx+664]
|
|
mov QWORD PTR [rcx+656], rax
|
|
adc r8, QWORD PTR [rdx+152]
|
|
mov rax, QWORD PTR [rcx+672]
|
|
mov QWORD PTR [rcx+664], r8
|
|
adc rax, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [rcx+680]
|
|
mov QWORD PTR [rcx+672], rax
|
|
adc r8, QWORD PTR [rdx+168]
|
|
mov rax, QWORD PTR [rcx+688]
|
|
mov QWORD PTR [rcx+680], r8
|
|
adc rax, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [rcx+696]
|
|
mov QWORD PTR [rcx+688], rax
|
|
adc r8, QWORD PTR [rdx+184]
|
|
mov rax, QWORD PTR [rcx+704]
|
|
mov QWORD PTR [rcx+696], r8
|
|
adc rax, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [rcx+712]
|
|
mov QWORD PTR [rcx+704], rax
|
|
adc r8, QWORD PTR [rdx+200]
|
|
mov rax, QWORD PTR [rcx+720]
|
|
mov QWORD PTR [rcx+712], r8
|
|
adc rax, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [rcx+728]
|
|
mov QWORD PTR [rcx+720], rax
|
|
adc r8, QWORD PTR [rdx+216]
|
|
mov rax, QWORD PTR [rcx+736]
|
|
mov QWORD PTR [rcx+728], r8
|
|
adc rax, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [rcx+744]
|
|
mov QWORD PTR [rcx+736], rax
|
|
adc r8, QWORD PTR [rdx+232]
|
|
mov rax, QWORD PTR [rcx+752]
|
|
mov QWORD PTR [rcx+744], r8
|
|
adc rax, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [rcx+760]
|
|
mov QWORD PTR [rcx+752], rax
|
|
adc r8, QWORD PTR [rdx+248]
|
|
mov rax, QWORD PTR [rcx+768]
|
|
mov QWORD PTR [rcx+760], r8
|
|
adc rax, QWORD PTR [rdx+256]
|
|
mov QWORD PTR [rcx+768], rax
|
|
adc r9, 0
|
|
; Add to zero
|
|
mov rax, QWORD PTR [rdx+264]
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+272]
|
|
mov QWORD PTR [rcx+776], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+280]
|
|
mov QWORD PTR [rcx+784], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+288]
|
|
mov QWORD PTR [rcx+792], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+296]
|
|
mov QWORD PTR [rcx+800], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+304]
|
|
mov QWORD PTR [rcx+808], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+312]
|
|
mov QWORD PTR [rcx+816], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+320]
|
|
mov QWORD PTR [rcx+824], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+328]
|
|
mov QWORD PTR [rcx+832], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+336]
|
|
mov QWORD PTR [rcx+840], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+344]
|
|
mov QWORD PTR [rcx+848], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+352]
|
|
mov QWORD PTR [rcx+856], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+360]
|
|
mov QWORD PTR [rcx+864], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+368]
|
|
mov QWORD PTR [rcx+872], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+376]
|
|
mov QWORD PTR [rcx+880], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+384]
|
|
mov QWORD PTR [rcx+888], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+392]
|
|
mov QWORD PTR [rcx+896], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+400]
|
|
mov QWORD PTR [rcx+904], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+408]
|
|
mov QWORD PTR [rcx+912], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+416]
|
|
mov QWORD PTR [rcx+920], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+424]
|
|
mov QWORD PTR [rcx+928], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+432]
|
|
mov QWORD PTR [rcx+936], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+440]
|
|
mov QWORD PTR [rcx+944], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+448]
|
|
mov QWORD PTR [rcx+952], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+456]
|
|
mov QWORD PTR [rcx+960], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+464]
|
|
mov QWORD PTR [rcx+968], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+472]
|
|
mov QWORD PTR [rcx+976], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+480]
|
|
mov QWORD PTR [rcx+984], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+488]
|
|
mov QWORD PTR [rcx+992], r8
|
|
adc rax, 0
|
|
mov r8, QWORD PTR [rdx+496]
|
|
mov QWORD PTR [rcx+1000], rax
|
|
adc r8, 0
|
|
mov rax, QWORD PTR [rdx+504]
|
|
mov QWORD PTR [rcx+1008], r8
|
|
adc rax, 0
|
|
mov QWORD PTR [rcx+1016], rax
|
|
add rsp, 1304
|
|
pop r12
|
|
ret
|
|
sp_4096_sqr_avx2_64 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Mul a by digit b into r. (r = a * b)
;  *
;  * r  A single precision integer; 65 digits are written (byte offsets 0..512).
;  * a  A single precision integer; 64 digits are read (byte offsets 0..504).
;  * b  A single precision digit.
;  *
;  * Register usage as seen in the body: rcx = r, rdx = a, r8 = b.
;  * r12 is pushed on entry and popped on exit.
;  * rax, rdx, r9, r10, r11 and the flags are overwritten.
;  */

;; One middle step of the digit multiply: r[i] = low word of running sum.
;;   byteOff  byte offset of digit i in both a and r (i * 8)
;;   regLo    accumulator holding the word that becomes r[i]
;;   regHi    accumulator that receives the high half of a[i] * b
;;   regNext  accumulator cleared here; collects the carry out of regHi
;; Expands to exactly seven instructions; the caller rotates the three
;; accumulators (r10, r11, r12) from one step to the next.
SP_4096_MUL_D_64_STEP MACRO byteOff, regLo, regHi, regNext
        mov     rax, r8
        xor     regNext, regNext
        mul     QWORD PTR [r9+byteOff]
        add     regLo, rax
        mov     QWORD PTR [rcx+byteOff], regLo
        adc     regHi, rdx
        adc     regNext, 0
ENDM

_text SEGMENT READONLY PARA
sp_4096_mul_d_64 PROC
        push    r12
        ; mul overwrites rdx, so keep the pointer to a in r9.
        mov     r9, rdx

        ; A[0] * B
        mov     rax, r8
        xor     r12, r12
        mul     QWORD PTR [r9]
        mov     r10, rax
        mov     r11, rdx
        mov     QWORD PTR [rcx], r10

        ; A[1] * B .. A[62] * B, accumulators rotate r11 -> r12 -> r10
        SP_4096_MUL_D_64_STEP   8, r11, r12, r10        ; A[1] * B
        SP_4096_MUL_D_64_STEP  16, r12, r10, r11        ; A[2] * B
        SP_4096_MUL_D_64_STEP  24, r10, r11, r12        ; A[3] * B
        SP_4096_MUL_D_64_STEP  32, r11, r12, r10        ; A[4] * B
        SP_4096_MUL_D_64_STEP  40, r12, r10, r11        ; A[5] * B
        SP_4096_MUL_D_64_STEP  48, r10, r11, r12        ; A[6] * B
        SP_4096_MUL_D_64_STEP  56, r11, r12, r10        ; A[7] * B
        SP_4096_MUL_D_64_STEP  64, r12, r10, r11        ; A[8] * B
        SP_4096_MUL_D_64_STEP  72, r10, r11, r12        ; A[9] * B
        SP_4096_MUL_D_64_STEP  80, r11, r12, r10        ; A[10] * B
        SP_4096_MUL_D_64_STEP  88, r12, r10, r11        ; A[11] * B
        SP_4096_MUL_D_64_STEP  96, r10, r11, r12        ; A[12] * B
        SP_4096_MUL_D_64_STEP 104, r11, r12, r10        ; A[13] * B
        SP_4096_MUL_D_64_STEP 112, r12, r10, r11        ; A[14] * B
        SP_4096_MUL_D_64_STEP 120, r10, r11, r12        ; A[15] * B
        SP_4096_MUL_D_64_STEP 128, r11, r12, r10        ; A[16] * B
        SP_4096_MUL_D_64_STEP 136, r12, r10, r11        ; A[17] * B
        SP_4096_MUL_D_64_STEP 144, r10, r11, r12        ; A[18] * B
        SP_4096_MUL_D_64_STEP 152, r11, r12, r10        ; A[19] * B
        SP_4096_MUL_D_64_STEP 160, r12, r10, r11        ; A[20] * B
        SP_4096_MUL_D_64_STEP 168, r10, r11, r12        ; A[21] * B
        SP_4096_MUL_D_64_STEP 176, r11, r12, r10        ; A[22] * B
        SP_4096_MUL_D_64_STEP 184, r12, r10, r11        ; A[23] * B
        SP_4096_MUL_D_64_STEP 192, r10, r11, r12        ; A[24] * B
        SP_4096_MUL_D_64_STEP 200, r11, r12, r10        ; A[25] * B
        SP_4096_MUL_D_64_STEP 208, r12, r10, r11        ; A[26] * B
        SP_4096_MUL_D_64_STEP 216, r10, r11, r12        ; A[27] * B
        SP_4096_MUL_D_64_STEP 224, r11, r12, r10        ; A[28] * B
        SP_4096_MUL_D_64_STEP 232, r12, r10, r11        ; A[29] * B
        SP_4096_MUL_D_64_STEP 240, r10, r11, r12        ; A[30] * B
        SP_4096_MUL_D_64_STEP 248, r11, r12, r10        ; A[31] * B
        SP_4096_MUL_D_64_STEP 256, r12, r10, r11        ; A[32] * B
        SP_4096_MUL_D_64_STEP 264, r10, r11, r12        ; A[33] * B
        SP_4096_MUL_D_64_STEP 272, r11, r12, r10        ; A[34] * B
        SP_4096_MUL_D_64_STEP 280, r12, r10, r11        ; A[35] * B
        SP_4096_MUL_D_64_STEP 288, r10, r11, r12        ; A[36] * B
        SP_4096_MUL_D_64_STEP 296, r11, r12, r10        ; A[37] * B
        SP_4096_MUL_D_64_STEP 304, r12, r10, r11        ; A[38] * B
        SP_4096_MUL_D_64_STEP 312, r10, r11, r12        ; A[39] * B
        SP_4096_MUL_D_64_STEP 320, r11, r12, r10        ; A[40] * B
        SP_4096_MUL_D_64_STEP 328, r12, r10, r11        ; A[41] * B
        SP_4096_MUL_D_64_STEP 336, r10, r11, r12        ; A[42] * B
        SP_4096_MUL_D_64_STEP 344, r11, r12, r10        ; A[43] * B
        SP_4096_MUL_D_64_STEP 352, r12, r10, r11        ; A[44] * B
        SP_4096_MUL_D_64_STEP 360, r10, r11, r12        ; A[45] * B
        SP_4096_MUL_D_64_STEP 368, r11, r12, r10        ; A[46] * B
        SP_4096_MUL_D_64_STEP 376, r12, r10, r11        ; A[47] * B
        SP_4096_MUL_D_64_STEP 384, r10, r11, r12        ; A[48] * B
        SP_4096_MUL_D_64_STEP 392, r11, r12, r10        ; A[49] * B
        SP_4096_MUL_D_64_STEP 400, r12, r10, r11        ; A[50] * B
        SP_4096_MUL_D_64_STEP 408, r10, r11, r12        ; A[51] * B
        SP_4096_MUL_D_64_STEP 416, r11, r12, r10        ; A[52] * B
        SP_4096_MUL_D_64_STEP 424, r12, r10, r11        ; A[53] * B
        SP_4096_MUL_D_64_STEP 432, r10, r11, r12        ; A[54] * B
        SP_4096_MUL_D_64_STEP 440, r11, r12, r10        ; A[55] * B
        SP_4096_MUL_D_64_STEP 448, r12, r10, r11        ; A[56] * B
        SP_4096_MUL_D_64_STEP 456, r10, r11, r12        ; A[57] * B
        SP_4096_MUL_D_64_STEP 464, r11, r12, r10        ; A[58] * B
        SP_4096_MUL_D_64_STEP 472, r12, r10, r11        ; A[59] * B
        SP_4096_MUL_D_64_STEP 480, r10, r11, r12        ; A[60] * B
        SP_4096_MUL_D_64_STEP 488, r11, r12, r10        ; A[61] * B
        SP_4096_MUL_D_64_STEP 496, r12, r10, r11        ; A[62] * B

        ; A[63] * B: last digit, both remaining words are stored.
        mov     rax, r8
        mul     QWORD PTR [r9+504]
        add     r10, rax
        adc     r11, rdx
        mov     QWORD PTR [rcx+504], r10
        mov     QWORD PTR [rcx+512], r11

        pop     r12
        ret
sp_4096_mul_d_64 ENDP
_text ENDS
|
|
; /* Conditionally subtract b from a using the mask m.
|
|
; * m is -1 to subtract and 0 when not subtracting.
|
|
; *
|
|
; * r A single precision number representing the conditional subtract result.
|
|
; * a A single precision number to subtract from.
|
|
; * b A single precision number to subtract.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_4096_cond_sub_64 PROC
|
|
sub rsp, 512
|
|
mov rax, 0
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp], r10
|
|
mov QWORD PTR [rsp+8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+16], r10
|
|
mov QWORD PTR [rsp+24], r11
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+32], r10
|
|
mov QWORD PTR [rsp+40], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+48], r10
|
|
mov QWORD PTR [rsp+56], r11
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+64], r10
|
|
mov QWORD PTR [rsp+72], r11
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+80], r10
|
|
mov QWORD PTR [rsp+88], r11
|
|
mov r10, QWORD PTR [r8+96]
|
|
mov r11, QWORD PTR [r8+104]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+96], r10
|
|
mov QWORD PTR [rsp+104], r11
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r11, QWORD PTR [r8+120]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+112], r10
|
|
mov QWORD PTR [rsp+120], r11
|
|
mov r10, QWORD PTR [r8+128]
|
|
mov r11, QWORD PTR [r8+136]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+128], r10
|
|
mov QWORD PTR [rsp+136], r11
|
|
mov r10, QWORD PTR [r8+144]
|
|
mov r11, QWORD PTR [r8+152]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+144], r10
|
|
mov QWORD PTR [rsp+152], r11
|
|
mov r10, QWORD PTR [r8+160]
|
|
mov r11, QWORD PTR [r8+168]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+160], r10
|
|
mov QWORD PTR [rsp+168], r11
|
|
mov r10, QWORD PTR [r8+176]
|
|
mov r11, QWORD PTR [r8+184]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+176], r10
|
|
mov QWORD PTR [rsp+184], r11
|
|
mov r10, QWORD PTR [r8+192]
|
|
mov r11, QWORD PTR [r8+200]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+192], r10
|
|
mov QWORD PTR [rsp+200], r11
|
|
mov r10, QWORD PTR [r8+208]
|
|
mov r11, QWORD PTR [r8+216]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+208], r10
|
|
mov QWORD PTR [rsp+216], r11
|
|
mov r10, QWORD PTR [r8+224]
|
|
mov r11, QWORD PTR [r8+232]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+224], r10
|
|
mov QWORD PTR [rsp+232], r11
|
|
mov r10, QWORD PTR [r8+240]
|
|
mov r11, QWORD PTR [r8+248]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+240], r10
|
|
mov QWORD PTR [rsp+248], r11
|
|
mov r10, QWORD PTR [r8+256]
|
|
mov r11, QWORD PTR [r8+264]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+256], r10
|
|
mov QWORD PTR [rsp+264], r11
|
|
mov r10, QWORD PTR [r8+272]
|
|
mov r11, QWORD PTR [r8+280]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+272], r10
|
|
mov QWORD PTR [rsp+280], r11
|
|
mov r10, QWORD PTR [r8+288]
|
|
mov r11, QWORD PTR [r8+296]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+288], r10
|
|
mov QWORD PTR [rsp+296], r11
|
|
mov r10, QWORD PTR [r8+304]
|
|
mov r11, QWORD PTR [r8+312]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+304], r10
|
|
mov QWORD PTR [rsp+312], r11
|
|
mov r10, QWORD PTR [r8+320]
|
|
mov r11, QWORD PTR [r8+328]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+320], r10
|
|
mov QWORD PTR [rsp+328], r11
|
|
mov r10, QWORD PTR [r8+336]
|
|
mov r11, QWORD PTR [r8+344]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+336], r10
|
|
mov QWORD PTR [rsp+344], r11
|
|
mov r10, QWORD PTR [r8+352]
|
|
mov r11, QWORD PTR [r8+360]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+352], r10
|
|
mov QWORD PTR [rsp+360], r11
|
|
mov r10, QWORD PTR [r8+368]
|
|
mov r11, QWORD PTR [r8+376]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+368], r10
|
|
mov QWORD PTR [rsp+376], r11
|
|
mov r10, QWORD PTR [r8+384]
|
|
mov r11, QWORD PTR [r8+392]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+384], r10
|
|
mov QWORD PTR [rsp+392], r11
|
|
mov r10, QWORD PTR [r8+400]
|
|
mov r11, QWORD PTR [r8+408]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+400], r10
|
|
mov QWORD PTR [rsp+408], r11
|
|
mov r10, QWORD PTR [r8+416]
|
|
mov r11, QWORD PTR [r8+424]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+416], r10
|
|
mov QWORD PTR [rsp+424], r11
|
|
mov r10, QWORD PTR [r8+432]
|
|
mov r11, QWORD PTR [r8+440]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+432], r10
|
|
mov QWORD PTR [rsp+440], r11
|
|
mov r10, QWORD PTR [r8+448]
|
|
mov r11, QWORD PTR [r8+456]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+448], r10
|
|
mov QWORD PTR [rsp+456], r11
|
|
mov r10, QWORD PTR [r8+464]
|
|
mov r11, QWORD PTR [r8+472]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+464], r10
|
|
mov QWORD PTR [rsp+472], r11
|
|
mov r10, QWORD PTR [r8+480]
|
|
mov r11, QWORD PTR [r8+488]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+480], r10
|
|
mov QWORD PTR [rsp+488], r11
|
|
mov r10, QWORD PTR [r8+496]
|
|
mov r11, QWORD PTR [r8+504]
|
|
and r10, r9
|
|
and r11, r9
|
|
mov QWORD PTR [rsp+496], r10
|
|
mov QWORD PTR [rsp+504], r11
|
|
mov r10, QWORD PTR [rdx]
|
|
mov r8, QWORD PTR [rsp]
|
|
sub r10, r8
|
|
mov r11, QWORD PTR [rdx+8]
|
|
mov r8, QWORD PTR [rsp+8]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx], r10
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov r8, QWORD PTR [rsp+16]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+8], r11
|
|
mov r11, QWORD PTR [rdx+24]
|
|
mov r8, QWORD PTR [rsp+24]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov r10, QWORD PTR [rdx+32]
|
|
mov r8, QWORD PTR [rsp+32]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov r11, QWORD PTR [rdx+40]
|
|
mov r8, QWORD PTR [rsp+40]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+32], r10
|
|
mov r10, QWORD PTR [rdx+48]
|
|
mov r8, QWORD PTR [rsp+48]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+40], r11
|
|
mov r11, QWORD PTR [rdx+56]
|
|
mov r8, QWORD PTR [rsp+56]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+48], r10
|
|
mov r10, QWORD PTR [rdx+64]
|
|
mov r8, QWORD PTR [rsp+64]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+56], r11
|
|
mov r11, QWORD PTR [rdx+72]
|
|
mov r8, QWORD PTR [rsp+72]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+64], r10
|
|
mov r10, QWORD PTR [rdx+80]
|
|
mov r8, QWORD PTR [rsp+80]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+72], r11
|
|
mov r11, QWORD PTR [rdx+88]
|
|
mov r8, QWORD PTR [rsp+88]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+80], r10
|
|
mov r10, QWORD PTR [rdx+96]
|
|
mov r8, QWORD PTR [rsp+96]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+88], r11
|
|
mov r11, QWORD PTR [rdx+104]
|
|
mov r8, QWORD PTR [rsp+104]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+96], r10
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov r8, QWORD PTR [rsp+112]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+104], r11
|
|
mov r11, QWORD PTR [rdx+120]
|
|
mov r8, QWORD PTR [rsp+120]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+112], r10
|
|
mov r10, QWORD PTR [rdx+128]
|
|
mov r8, QWORD PTR [rsp+128]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+120], r11
|
|
mov r11, QWORD PTR [rdx+136]
|
|
mov r8, QWORD PTR [rsp+136]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+128], r10
|
|
mov r10, QWORD PTR [rdx+144]
|
|
mov r8, QWORD PTR [rsp+144]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+136], r11
|
|
mov r11, QWORD PTR [rdx+152]
|
|
mov r8, QWORD PTR [rsp+152]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+144], r10
|
|
mov r10, QWORD PTR [rdx+160]
|
|
mov r8, QWORD PTR [rsp+160]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+152], r11
|
|
mov r11, QWORD PTR [rdx+168]
|
|
mov r8, QWORD PTR [rsp+168]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+160], r10
|
|
mov r10, QWORD PTR [rdx+176]
|
|
mov r8, QWORD PTR [rsp+176]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+168], r11
|
|
mov r11, QWORD PTR [rdx+184]
|
|
mov r8, QWORD PTR [rsp+184]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+176], r10
|
|
mov r10, QWORD PTR [rdx+192]
|
|
mov r8, QWORD PTR [rsp+192]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+184], r11
|
|
mov r11, QWORD PTR [rdx+200]
|
|
mov r8, QWORD PTR [rsp+200]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+192], r10
|
|
mov r10, QWORD PTR [rdx+208]
|
|
mov r8, QWORD PTR [rsp+208]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+200], r11
|
|
mov r11, QWORD PTR [rdx+216]
|
|
mov r8, QWORD PTR [rsp+216]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+208], r10
|
|
mov r10, QWORD PTR [rdx+224]
|
|
mov r8, QWORD PTR [rsp+224]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+216], r11
|
|
mov r11, QWORD PTR [rdx+232]
|
|
mov r8, QWORD PTR [rsp+232]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+224], r10
|
|
mov r10, QWORD PTR [rdx+240]
|
|
mov r8, QWORD PTR [rsp+240]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+232], r11
|
|
mov r11, QWORD PTR [rdx+248]
|
|
mov r8, QWORD PTR [rsp+248]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+240], r10
|
|
mov r10, QWORD PTR [rdx+256]
|
|
mov r8, QWORD PTR [rsp+256]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+248], r11
|
|
mov r11, QWORD PTR [rdx+264]
|
|
mov r8, QWORD PTR [rsp+264]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+256], r10
|
|
mov r10, QWORD PTR [rdx+272]
|
|
mov r8, QWORD PTR [rsp+272]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+264], r11
|
|
mov r11, QWORD PTR [rdx+280]
|
|
mov r8, QWORD PTR [rsp+280]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+272], r10
|
|
mov r10, QWORD PTR [rdx+288]
|
|
mov r8, QWORD PTR [rsp+288]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+280], r11
|
|
mov r11, QWORD PTR [rdx+296]
|
|
mov r8, QWORD PTR [rsp+296]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+288], r10
|
|
mov r10, QWORD PTR [rdx+304]
|
|
mov r8, QWORD PTR [rsp+304]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+296], r11
|
|
mov r11, QWORD PTR [rdx+312]
|
|
mov r8, QWORD PTR [rsp+312]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+304], r10
|
|
mov r10, QWORD PTR [rdx+320]
|
|
mov r8, QWORD PTR [rsp+320]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+312], r11
|
|
mov r11, QWORD PTR [rdx+328]
|
|
mov r8, QWORD PTR [rsp+328]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+320], r10
|
|
mov r10, QWORD PTR [rdx+336]
|
|
mov r8, QWORD PTR [rsp+336]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+328], r11
|
|
mov r11, QWORD PTR [rdx+344]
|
|
mov r8, QWORD PTR [rsp+344]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+336], r10
|
|
mov r10, QWORD PTR [rdx+352]
|
|
mov r8, QWORD PTR [rsp+352]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+344], r11
|
|
mov r11, QWORD PTR [rdx+360]
|
|
mov r8, QWORD PTR [rsp+360]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+352], r10
|
|
mov r10, QWORD PTR [rdx+368]
|
|
mov r8, QWORD PTR [rsp+368]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+360], r11
|
|
mov r11, QWORD PTR [rdx+376]
|
|
mov r8, QWORD PTR [rsp+376]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+368], r10
|
|
mov r10, QWORD PTR [rdx+384]
|
|
mov r8, QWORD PTR [rsp+384]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+376], r11
|
|
mov r11, QWORD PTR [rdx+392]
|
|
mov r8, QWORD PTR [rsp+392]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+384], r10
|
|
mov r10, QWORD PTR [rdx+400]
|
|
mov r8, QWORD PTR [rsp+400]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+392], r11
|
|
mov r11, QWORD PTR [rdx+408]
|
|
mov r8, QWORD PTR [rsp+408]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+400], r10
|
|
mov r10, QWORD PTR [rdx+416]
|
|
mov r8, QWORD PTR [rsp+416]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+408], r11
|
|
mov r11, QWORD PTR [rdx+424]
|
|
mov r8, QWORD PTR [rsp+424]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+416], r10
|
|
mov r10, QWORD PTR [rdx+432]
|
|
mov r8, QWORD PTR [rsp+432]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+424], r11
|
|
mov r11, QWORD PTR [rdx+440]
|
|
mov r8, QWORD PTR [rsp+440]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+432], r10
|
|
mov r10, QWORD PTR [rdx+448]
|
|
mov r8, QWORD PTR [rsp+448]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+440], r11
|
|
mov r11, QWORD PTR [rdx+456]
|
|
mov r8, QWORD PTR [rsp+456]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+448], r10
|
|
mov r10, QWORD PTR [rdx+464]
|
|
mov r8, QWORD PTR [rsp+464]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+456], r11
|
|
mov r11, QWORD PTR [rdx+472]
|
|
mov r8, QWORD PTR [rsp+472]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+464], r10
|
|
mov r10, QWORD PTR [rdx+480]
|
|
mov r8, QWORD PTR [rsp+480]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+472], r11
|
|
mov r11, QWORD PTR [rdx+488]
|
|
mov r8, QWORD PTR [rsp+488]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+480], r10
|
|
mov r10, QWORD PTR [rdx+496]
|
|
mov r8, QWORD PTR [rsp+496]
|
|
sbb r10, r8
|
|
mov QWORD PTR [rcx+488], r11
|
|
mov r11, QWORD PTR [rdx+504]
|
|
mov r8, QWORD PTR [rsp+504]
|
|
sbb r11, r8
|
|
mov QWORD PTR [rcx+496], r10
|
|
mov QWORD PTR [rcx+504], r11
|
|
sbb rax, 0
|
|
add rsp, 512
|
|
ret
|
|
sp_4096_cond_sub_64 ENDP
|
|
_text ENDS
|
|
; /* Reduce the number back to 4096 bits using Montgomery reduction.
|
|
; *
|
|
; * a A single precision number to reduce in place.
|
|
; * m The single precision number representing the modulus.
|
|
; * mp The digit representing the negative inverse of m mod 2^n.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; One inner step of a Montgomery reduction row (used for words 3..62):
;   a[i+k] += m[k] * mu + carry_in, leaving the new carry word in carry_out.
; disp is the byte offset k*8. Expects r13 = mu, r9 = m, rcx = &a[i].
; Clobbers rax, rdx, r14 and the flags.
sp_4096_mont_reduce_64_step MACRO disp, carry_in, carry_out
        mov     rax, r13
        xor     carry_out, carry_out
        mul     QWORD PTR [r9+disp]
        mov     r14, QWORD PTR [rcx+disp]
        add     r14, rax
        adc     carry_out, rdx
        add     r14, carry_in
        mov     QWORD PTR [rcx+disp], r14
        adc     carry_out, 0
ENDM

; Montgomery-reduce the number at a in place, then conditionally subtract m.
;
; In:    rcx = a, rdx = m, r8 = mp (argument order as in the header comment)
; Regs:  r9  = m (copied from rdx, which every mul overwrites)
;        r10 = remaining outer iterations
;        r13 = mu for the current row
;        r15 / rdi = the two lowest live words of the current row, kept in
;                    registers between iterations instead of being stored
;        r11 / r12 = alternating carry words
;        rsi = carry out of the top word of the row
; Saves and restores r12-r15, rdi and rsi.
; Calls sp_4096_cond_sub_64 with the mask derived from the final carry.
sp_4096_mont_reduce_64 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        mov     r9, rdx
        xor     rsi, rsi
        ; i = 64
        mov     r10, 64
        mov     r15, QWORD PTR [rcx]
        mov     rdi, QWORD PTR [rcx+8]
L_4096_mont_loop_64:
        ; mu = a[i] * mp
        mov     r13, r15
        imul    r13, r8
        ; a[i+0] += m[0] * mu   (only the carry in r12 is kept)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9]
        add     r15, rax
        adc     r12, rdx
        ; a[i+1] += m[1] * mu   (result stays in r15 for the next row)
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+8]
        mov     r15, rdi
        add     r15, rax
        adc     r11, rdx
        add     r15, r12
        adc     r11, 0
        ; a[i+2] += m[2] * mu   (result stays in rdi for the next row)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+16]
        mov     rdi, QWORD PTR [rcx+16]
        add     rdi, rax
        adc     r12, rdx
        add     rdi, r11
        adc     r12, 0
        ; a[i+3] .. a[i+62] += m[3..62] * mu
        ; Carries alternate between r11 and r12, so steps are emitted in pairs.
sp_4096_mont_reduce_64_disp = 24
REPEAT 30
        sp_4096_mont_reduce_64_step sp_4096_mont_reduce_64_disp, r12, r11
        sp_4096_mont_reduce_64_step sp_4096_mont_reduce_64_disp+8, r11, r12
sp_4096_mont_reduce_64_disp = sp_4096_mont_reduce_64_disp + 16
ENDM
        ; a[i+63] += m[63] * mu, folding in the carry saved from the previous row
        mov     rax, r13
        mul     QWORD PTR [r9+504]
        mov     r14, QWORD PTR [rcx+504]
        add     r12, rax
        adc     rdx, rsi
        mov     rsi, 0
        adc     rsi, 0
        add     r14, r12
        mov     QWORD PTR [rcx+504], r14
        adc     QWORD PTR [rcx+512], rdx
        adc     rsi, 0
        ; i -= 1
        add     rcx, 8
        dec     r10
        jnz     L_4096_mont_loop_64
        ; Write back the two words that were carried in registers.
        mov     QWORD PTR [rcx], r15
        mov     QWORD PTR [rcx+8], rdi
        ; Turn the accumulated top carry into a mask (0 stays 0, 1 becomes -1).
        neg     rsi
        ; Set up sp_4096_cond_sub_64(r = a, a = a + 64 words, b = m, mask).
        ; r8 must be loaded from r9 BEFORE r9 is overwritten with the mask.
        ; The previous non-_WIN64 arm did it the other way round and passed
        ; the mask as the b pointer. This procedure receives its own arguments
        ; in rcx/rdx/r8 either way, so the same ordering is used unconditionally.
        mov     r8, r9
        mov     r9, rsi
        mov     rdx, rcx
        sub     rcx, 512
        call    sp_4096_cond_sub_64
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_4096_mont_reduce_64 ENDP
|
|
_text ENDS
|
|
; /* Sub b from a into r. (r = a - b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; One borrow-chain step of the 64-word subtraction:
;   load a[k] into cur, store the previous result word from prev,
;   then cur -= b[k] + borrow.
; disp is the byte offset k*8. The store is a plain mov, so the borrow in CF
; is carried through unchanged from the preceding sub/sbb.
sp_4096_sub_64_step MACRO disp, cur, prev
        mov     cur, QWORD PTR [rdx+disp]
        mov     QWORD PTR [rcx+disp-8], prev
        sbb     cur, QWORD PTR [r8+disp]
ENDM

; r = a - b over 64 words of 64 bits.
; In:   rcx = r, rdx = a, r8 = b
; Out:  rax = 0 when there is no final borrow, -1 when there is
; Uses: r9 and r10 as alternating result words; no stack usage
sp_4096_sub_64 PROC
        mov     r9, QWORD PTR [rdx]
        xor     rax, rax
        sub     r9, QWORD PTR [r8]
        ; Words 1..62: emitted in pairs because the result register alternates.
sp_4096_sub_64_disp = 8
REPEAT 31
        sp_4096_sub_64_step sp_4096_sub_64_disp, r10, r9
        sp_4096_sub_64_step sp_4096_sub_64_disp+8, r9, r10
sp_4096_sub_64_disp = sp_4096_sub_64_disp + 16
ENDM
        ; Word 63, then flush the last result word.
        sp_4096_sub_64_step 504, r10, r9
        mov     QWORD PTR [rcx+504], r10
        ; rax = 0 - borrow
        sbb     rax, 0
        ret
sp_4096_sub_64 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Mul a by digit b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision digit.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; One step of the digit multiply for words 1..62:
;   r10:r9 = a[k] * b; acc (high half of the previous product) += r9 via the
;   CF chain and is stored as r[k]; nxt is reset from the zero register r13
;   and receives r10 via the OF chain.
; disp is the byte offset k*8. Expects rax = a, rcx = r, rdx = b, r13 = 0.
sp_4096_mul_d_avx2_64_step MACRO disp, acc, nxt
        mulx    r10, r9, QWORD PTR [rax+disp]
        mov     nxt, r13
        adcx    acc, r9
        adox    nxt, r10
        mov     QWORD PTR [rcx+disp], acc
ENDM

; r = a * b, where a is 64 words and b is a single 64-bit digit.
; In:   rcx = r (65 words are written: offsets 0..512), rdx = a, r8 = b
; Uses: rax = a (rdx is needed as the implicit mulx operand), r9-r11 scratch
; Saves and restores r12 and r13.
sp_4096_mul_d_avx2_64 PROC
        push    r12
        push    r13
        mov     rax, rdx
        ; A[0] * B
        mov     rdx, r8
        ; r13 = 0; the xor also clears CF and OF ahead of the adcx/adox chains
        xor     r13, r13
        mulx    r12, r11, QWORD PTR [rax]
        mov     QWORD PTR [rcx], r11
        ; A[1] * B .. A[62] * B, in pairs because r12/r11 swap roles each word
sp_4096_mul_d_avx2_64_disp = 8
REPEAT 31
        sp_4096_mul_d_avx2_64_step sp_4096_mul_d_avx2_64_disp, r12, r11
        sp_4096_mul_d_avx2_64_step sp_4096_mul_d_avx2_64_disp+8, r11, r12
sp_4096_mul_d_avx2_64_disp = sp_4096_mul_d_avx2_64_disp + 16
ENDM
        ; A[63] * B, then fold the last CF into the top word
        mulx    r10, r9, QWORD PTR [rax+504]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        adcx    r11, r13
        mov     QWORD PTR [rcx+504], r12
        mov     QWORD PTR [rcx+512], r11
        pop     r13
        pop     r12
        ret
sp_4096_mul_d_avx2_64 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF _WIN64
|
|
; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
|
|
; *
|
|
; * d1 The high order half of the number to divide.
|
|
; * d0 The low order half of the number to divide.
|
|
; * div The divisor.
|
|
; * returns the result of the division.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; Unsigned 128-by-64-bit division: returns (d1:d0) / div.
; ABI:  Microsoft x64
; In:   rcx = d1   high 64 bits of the number being divided
;       rdx = d0   low 64 bits of the number being divided
;       r8  = div  the divisor
; Out:  rax = quotient (DIV also leaves the remainder in rdx)
; Clobbers: rdx, flags
; NOTE(review): DIV raises #DE when div is 0 or when the quotient does not
; fit in 64 bits, so callers presumably guarantee d1 < div - confirm at the
; call sites.
div_4096_word_asm_64 PROC
        mov     rax, rdx                ; rax = d0; must be read before rdx is overwritten
        mov     rdx, rcx                ; rdx = d1, so rdx:rax now holds d1:d0
        div     r8                      ; rax = (d1:d0) / div, rdx = (d1:d0) % div
        ret
div_4096_word_asm_64 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally subtract b from a using the mask m.
|
|
; * m is -1 to subtract and 0 when not subtracting.
|
|
; *
|
|
; * r A single precision number representing the conditional subtract result.
|
|
; * a A single precision number to subtract from.
|
|
; * b A single precision number to subtract.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; One software-pipelined limb of the 4096-bit conditional subtract.
;   limb_disp  byte displacement of the limb being loaded
;   b_reg      scratch register that receives b[limb] and is then masked
;   a_reg      scratch register that receives a[limb] and then the difference
;   done_reg   register still holding the previous limb's difference
; The previous result is stored only after the next operands are loaded,
; and nothing between the SUB/SBB instructions touches the carry flag
; (MOV and PEXT leave the flags alone), so the borrow chain stays intact.
SP4096_CSUB_LIMB MACRO limb_disp, b_reg, a_reg, done_reg
        mov     b_reg, QWORD PTR [r8+(limb_disp)]
        mov     a_reg, QWORD PTR [rdx+(limb_disp)]
        pext    b_reg, b_reg, r9
        mov     QWORD PTR [rcx+(limb_disp)-8], done_reg
        sbb     a_reg, b_reg
ENDM

; r = a - (b selected by m), over 64 limbs of 64 bits each.
; ABI:  Microsoft x64
; In:   rcx = r   destination, 64 limbs
;       rdx = a   minuend, 64 limbs
;       r8  = b   subtrahend, 64 limbs
;       r9  = m   mask: -1 to subtract b, 0 to subtract nothing
; Out:  rax = 0 when the subtraction did not borrow, -1 when it did
; Clobbers: r10, r11, flags (r12 is used but saved and restored)
; Each b limb is passed through PEXT with m as the bit selector: m = -1
; yields the limb unchanged and m = 0 yields zero, so the same instruction
; sequence executes whichever mask value is supplied.
sp_4096_cond_sub_avx2_64 PROC
        push    r12                     ; callee-saved on Win64, used as a scratch register
        mov     rax, 0                  ; borrow accumulator; MOV keeps the flags untouched

        ; Limb 0 starts the borrow chain with a plain SUB.
        mov     r12, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    r12, r12, r9
        sub     r10, r12

        ; Limbs 1..63: the three scratch registers rotate roles with a
        ; period of three limbs, so 21 groups of three cover the rest.
sp4096_csub_off = 8
        REPEAT  21
        SP4096_CSUB_LIMB sp4096_csub_off,    r12, r11, r10
        SP4096_CSUB_LIMB sp4096_csub_off+8,  r10, r12, r11
        SP4096_CSUB_LIMB sp4096_csub_off+16, r11, r10, r12
sp4096_csub_off = sp4096_csub_off + 24
        ENDM

        mov     QWORD PTR [rcx+504], r10 ; store the final limb (index 63)
        sbb     rax, 0                  ; rax = 0 - borrow: 0 or -1
        pop     r12
        ret
sp_4096_cond_sub_avx2_64 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Compare a with b in constant time.
|
|
; *
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; * return -ve, 0 or +ve if a is less than, equal to or greater than b
|
|
; * respectively.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
; Branch-free comparison of two 64-limb (4096-bit) numbers.
; ABI:  Microsoft x64
; In:   rcx = a   64 limbs of 64 bits
;       rdx = b   64 limbs of 64 bits
; Out:  rax = 1 when a > b, 0 when a == b, -1 when a < b
; Clobbers: r8, r9, r10, r11, flags (r12 is used but saved and restored)
; Limbs are visited from most to least significant. r8 is an all-ones mask
; until the first differing limb is seen and zero afterwards; once it is
; zero every later pair is masked to 0 - 0 and cannot change the verdict.
; All 64 limbs are always read and there is no branch on the data.
sp_4096_cmp_64 PROC
        push    r12                     ; callee-saved on Win64, used as a scratch register
        xor     r9, r9                  ; r9  = 0, the value the mask collapses to
        mov     r8, -1                  ; r8  = limb mask, all ones while a and b still agree
        mov     rax, -1                 ; rax = provisional verdict, corrected after the loop
        mov     r10, 1                  ; r10 = the "a is greater" verdict

sp4096_cmp_off = 504                    ; byte displacement of limb 63
        REPEAT  64
        mov     r11, QWORD PTR [rcx+sp4096_cmp_off]
        mov     r12, QWORD PTR [rdx+sp4096_cmp_off]
        and     r11, r8                 ; hide both limbs once a difference was found
        and     r12, r8
        sub     r11, r12                ; flags describe a[i] versus b[i]
        cmova   rax, r10                ; a[i] > b[i]  -> verdict 1
        cmovc   rax, r8                 ; a[i] < b[i]  -> verdict -1 (r8 is still all ones here)
        cmovnz  r8, r9                  ; limbs differ -> clear the mask for all lower limbs
sp4096_cmp_off = sp4096_cmp_off - 8
        ENDM

        ; Equal inputs leave rax = r8 = -1, which XORs to 0. Otherwise r8 is
        ; zero and the stored verdict passes through unchanged.
        xor     rax, r8
        pop     r12
        ret
sp_4096_cmp_64 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Reduce the number back to 4096 bits using Montgomery reduction.
|
|
; *
|
|
; * a A single precision number to reduce in place.
|
|
; * m The single precision number representing the modulus.
|
|
; * mp The digit representing the negative inverse of m mod 2^n.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_4096_mont_reduce_avx2_64 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
push rbp
|
|
mov r9, rcx
|
|
mov r10, rdx
|
|
xor rbp, rbp
|
|
; i = 64
|
|
mov r11, 64
|
|
mov r15, QWORD PTR [r9]
|
|
mov rdi, QWORD PTR [r9+8]
|
|
mov rsi, QWORD PTR [r9+16]
|
|
mov rbx, QWORD PTR [r9+24]
|
|
add r9, 256
|
|
xor rbp, rbp
|
|
L_4096_mont_loop_avx2_64:
|
|
; mu = a[i] * mp
|
|
mov rdx, r15
|
|
mov r12, r15
|
|
imul rdx, r8
|
|
xor r14, r14
|
|
; a[i+0] += m[0] * mu
|
|
mulx rcx, rax, QWORD PTR [r10]
|
|
mov r15, rdi
|
|
adcx r12, rax
|
|
adox r15, rcx
|
|
; a[i+1] += m[1] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+8]
|
|
mov rdi, rsi
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
; a[i+2] += m[2] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+16]
|
|
mov rsi, rbx
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; a[i+3] += m[3] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+24]
|
|
mov rbx, QWORD PTR [r9+-224]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; a[i+4] += m[4] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+32]
|
|
mov r13, QWORD PTR [r9+-216]
|
|
adcx rbx, rax
|
|
adox r13, rcx
|
|
; a[i+5] += m[5] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+40]
|
|
mov r12, QWORD PTR [r9+-208]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-216], r13
|
|
; a[i+6] += m[6] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+48]
|
|
mov r13, QWORD PTR [r9+-200]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-208], r12
|
|
; a[i+7] += m[7] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+56]
|
|
mov r12, QWORD PTR [r9+-192]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-200], r13
|
|
; a[i+8] += m[8] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+64]
|
|
mov r13, QWORD PTR [r9+-184]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-192], r12
|
|
; a[i+9] += m[9] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+72]
|
|
mov r12, QWORD PTR [r9+-176]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-184], r13
|
|
; a[i+10] += m[10] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+80]
|
|
mov r13, QWORD PTR [r9+-168]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-176], r12
|
|
; a[i+11] += m[11] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+88]
|
|
mov r12, QWORD PTR [r9+-160]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-168], r13
|
|
; a[i+12] += m[12] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+96]
|
|
mov r13, QWORD PTR [r9+-152]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-160], r12
|
|
; a[i+13] += m[13] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+104]
|
|
mov r12, QWORD PTR [r9+-144]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-152], r13
|
|
; a[i+14] += m[14] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+112]
|
|
mov r13, QWORD PTR [r9+-136]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-144], r12
|
|
; a[i+15] += m[15] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+120]
|
|
mov r12, QWORD PTR [r9+-128]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-136], r13
|
|
; a[i+16] += m[16] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+128]
|
|
mov r13, QWORD PTR [r9+-120]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-128], r12
|
|
; a[i+17] += m[17] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+136]
|
|
mov r12, QWORD PTR [r9+-112]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-120], r13
|
|
; a[i+18] += m[18] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+144]
|
|
mov r13, QWORD PTR [r9+-104]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-112], r12
|
|
; a[i+19] += m[19] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+152]
|
|
mov r12, QWORD PTR [r9+-96]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-104], r13
|
|
; a[i+20] += m[20] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+160]
|
|
mov r13, QWORD PTR [r9+-88]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-96], r12
|
|
; a[i+21] += m[21] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+168]
|
|
mov r12, QWORD PTR [r9+-80]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-88], r13
|
|
; a[i+22] += m[22] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+176]
|
|
mov r13, QWORD PTR [r9+-72]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-80], r12
|
|
; a[i+23] += m[23] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+184]
|
|
mov r12, QWORD PTR [r9+-64]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-72], r13
|
|
; a[i+24] += m[24] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+192]
|
|
mov r13, QWORD PTR [r9+-56]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-64], r12
|
|
; a[i+25] += m[25] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+200]
|
|
mov r12, QWORD PTR [r9+-48]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-56], r13
|
|
; a[i+26] += m[26] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+208]
|
|
mov r13, QWORD PTR [r9+-40]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-48], r12
|
|
; a[i+27] += m[27] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+216]
|
|
mov r12, QWORD PTR [r9+-32]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-40], r13
|
|
; a[i+28] += m[28] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+224]
|
|
mov r13, QWORD PTR [r9+-24]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-32], r12
|
|
; a[i+29] += m[29] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+232]
|
|
mov r12, QWORD PTR [r9+-16]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-24], r13
|
|
; a[i+30] += m[30] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+240]
|
|
mov r13, QWORD PTR [r9+-8]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-16], r12
|
|
; a[i+31] += m[31] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+248]
|
|
mov r12, QWORD PTR [r9]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-8], r13
|
|
; a[i+32] += m[32] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+256]
|
|
mov r13, QWORD PTR [r9+8]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9], r12
|
|
; a[i+33] += m[33] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+264]
|
|
mov r12, QWORD PTR [r9+16]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+8], r13
|
|
; a[i+34] += m[34] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+272]
|
|
mov r13, QWORD PTR [r9+24]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+16], r12
|
|
; a[i+35] += m[35] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+280]
|
|
mov r12, QWORD PTR [r9+32]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+24], r13
|
|
; a[i+36] += m[36] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+288]
|
|
mov r13, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+32], r12
|
|
; a[i+37] += m[37] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+296]
|
|
mov r12, QWORD PTR [r9+48]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+40], r13
|
|
; a[i+38] += m[38] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+304]
|
|
mov r13, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+48], r12
|
|
; a[i+39] += m[39] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+312]
|
|
mov r12, QWORD PTR [r9+64]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+56], r13
|
|
; a[i+40] += m[40] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+320]
|
|
mov r13, QWORD PTR [r9+72]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+64], r12
|
|
; a[i+41] += m[41] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+328]
|
|
mov r12, QWORD PTR [r9+80]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+72], r13
|
|
; a[i+42] += m[42] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+336]
|
|
mov r13, QWORD PTR [r9+88]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+80], r12
|
|
; a[i+43] += m[43] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+344]
|
|
mov r12, QWORD PTR [r9+96]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+88], r13
|
|
; a[i+44] += m[44] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+352]
|
|
mov r13, QWORD PTR [r9+104]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+96], r12
|
|
; a[i+45] += m[45] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+360]
|
|
mov r12, QWORD PTR [r9+112]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+104], r13
|
|
; a[i+46] += m[46] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+368]
|
|
mov r13, QWORD PTR [r9+120]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+112], r12
|
|
; a[i+47] += m[47] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+376]
|
|
mov r12, QWORD PTR [r9+128]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+120], r13
|
|
; a[i+48] += m[48] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+384]
|
|
mov r13, QWORD PTR [r9+136]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+128], r12
|
|
; a[i+49] += m[49] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+392]
|
|
mov r12, QWORD PTR [r9+144]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+136], r13
|
|
; a[i+50] += m[50] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+400]
|
|
mov r13, QWORD PTR [r9+152]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+144], r12
|
|
; a[i+51] += m[51] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+408]
|
|
mov r12, QWORD PTR [r9+160]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+152], r13
|
|
; a[i+52] += m[52] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+416]
|
|
mov r13, QWORD PTR [r9+168]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+160], r12
|
|
; a[i+53] += m[53] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+424]
|
|
mov r12, QWORD PTR [r9+176]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+168], r13
|
|
; a[i+54] += m[54] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+432]
|
|
mov r13, QWORD PTR [r9+184]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+176], r12
|
|
; a[i+55] += m[55] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+440]
|
|
mov r12, QWORD PTR [r9+192]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+184], r13
|
|
; a[i+56] += m[56] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+448]
|
|
mov r13, QWORD PTR [r9+200]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+192], r12
|
|
; a[i+57] += m[57] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+456]
|
|
mov r12, QWORD PTR [r9+208]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+200], r13
|
|
; a[i+58] += m[58] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+464]
|
|
mov r13, QWORD PTR [r9+216]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+208], r12
|
|
; a[i+59] += m[59] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+472]
|
|
mov r12, QWORD PTR [r9+224]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+216], r13
|
|
; a[i+60] += m[60] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+480]
|
|
mov r13, QWORD PTR [r9+232]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+224], r12
|
|
; a[i+61] += m[61] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+488]
|
|
mov r12, QWORD PTR [r9+240]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+232], r13
|
|
; a[i+62] += m[62] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+496]
|
|
mov r13, QWORD PTR [r9+248]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+240], r12
|
|
; a[i+63] += m[63] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+504]
|
|
mov r12, QWORD PTR [r9+256]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+248], r13
|
|
adcx r12, rbp
|
|
mov rbp, r14
|
|
mov QWORD PTR [r9+256], r12
|
|
adox rbp, r14
|
|
adcx rbp, r14
|
|
; a += 1
|
|
add r9, 8
|
|
; i -= 1
|
|
sub r11, 1
|
|
jnz L_4096_mont_loop_avx2_64
|
|
sub r9, 256
|
|
neg rbp
|
|
mov r8, r9
|
|
sub r9, 512
|
|
mov rcx, QWORD PTR [r10]
|
|
mov rdx, r15
|
|
pext rcx, rcx, rbp
|
|
sub rdx, rcx
|
|
mov rcx, QWORD PTR [r10+8]
|
|
mov rax, rdi
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+16]
|
|
mov rcx, rsi
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+8], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+24]
|
|
mov rdx, rbx
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+16], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+32]
|
|
mov rax, QWORD PTR [r8+32]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+24], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+40]
|
|
mov rcx, QWORD PTR [r8+40]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+32], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov rdx, QWORD PTR [r8+48]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+40], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+56]
|
|
mov rax, QWORD PTR [r8+56]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+48], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+64]
|
|
mov rcx, QWORD PTR [r8+64]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+56], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+72]
|
|
mov rdx, QWORD PTR [r8+72]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+64], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+80]
|
|
mov rax, QWORD PTR [r8+80]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+72], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+88]
|
|
mov rcx, QWORD PTR [r8+88]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+80], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov rdx, QWORD PTR [r8+96]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+88], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+104]
|
|
mov rax, QWORD PTR [r8+104]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+96], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+112]
|
|
mov rcx, QWORD PTR [r8+112]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+104], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+120]
|
|
mov rdx, QWORD PTR [r8+120]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+112], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+128]
|
|
mov rax, QWORD PTR [r8+128]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+120], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+136]
|
|
mov rcx, QWORD PTR [r8+136]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+128], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+144]
|
|
mov rdx, QWORD PTR [r8+144]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+136], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+152]
|
|
mov rax, QWORD PTR [r8+152]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+144], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+160]
|
|
mov rcx, QWORD PTR [r8+160]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+152], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+168]
|
|
mov rdx, QWORD PTR [r8+168]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+160], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+176]
|
|
mov rax, QWORD PTR [r8+176]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+168], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+184]
|
|
mov rcx, QWORD PTR [r8+184]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+176], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+192]
|
|
mov rdx, QWORD PTR [r8+192]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+184], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+200]
|
|
mov rax, QWORD PTR [r8+200]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+192], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+208]
|
|
mov rcx, QWORD PTR [r8+208]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+200], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+216]
|
|
mov rdx, QWORD PTR [r8+216]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+208], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+224]
|
|
mov rax, QWORD PTR [r8+224]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+216], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+232]
|
|
mov rcx, QWORD PTR [r8+232]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+224], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+240]
|
|
mov rdx, QWORD PTR [r8+240]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+232], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+248]
|
|
mov rax, QWORD PTR [r8+248]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+240], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+256]
|
|
mov rcx, QWORD PTR [r8+256]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+248], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+264]
|
|
mov rdx, QWORD PTR [r8+264]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+256], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+272]
|
|
mov rax, QWORD PTR [r8+272]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+264], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+280]
|
|
mov rcx, QWORD PTR [r8+280]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+272], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+288]
|
|
mov rdx, QWORD PTR [r8+288]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+280], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+296]
|
|
mov rax, QWORD PTR [r8+296]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+288], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+304]
|
|
mov rcx, QWORD PTR [r8+304]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+296], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+312]
|
|
mov rdx, QWORD PTR [r8+312]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+304], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+320]
|
|
mov rax, QWORD PTR [r8+320]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+312], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+328]
|
|
mov rcx, QWORD PTR [r8+328]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+320], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+336]
|
|
mov rdx, QWORD PTR [r8+336]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+328], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+344]
|
|
mov rax, QWORD PTR [r8+344]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+336], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+352]
|
|
mov rcx, QWORD PTR [r8+352]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+344], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+360]
|
|
mov rdx, QWORD PTR [r8+360]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+352], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+368]
|
|
mov rax, QWORD PTR [r8+368]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+360], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+376]
|
|
mov rcx, QWORD PTR [r8+376]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+368], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+384]
|
|
mov rdx, QWORD PTR [r8+384]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+376], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+392]
|
|
mov rax, QWORD PTR [r8+392]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+384], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+400]
|
|
mov rcx, QWORD PTR [r8+400]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+392], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+408]
|
|
mov rdx, QWORD PTR [r8+408]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+400], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+416]
|
|
mov rax, QWORD PTR [r8+416]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+408], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+424]
|
|
mov rcx, QWORD PTR [r8+424]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+416], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+432]
|
|
mov rdx, QWORD PTR [r8+432]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+424], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+440]
|
|
mov rax, QWORD PTR [r8+440]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+432], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+448]
|
|
mov rcx, QWORD PTR [r8+448]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+440], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+456]
|
|
mov rdx, QWORD PTR [r8+456]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+448], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+464]
|
|
mov rax, QWORD PTR [r8+464]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+456], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+472]
|
|
mov rcx, QWORD PTR [r8+472]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+464], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+480]
|
|
mov rdx, QWORD PTR [r8+480]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+472], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+488]
|
|
mov rax, QWORD PTR [r8+488]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+480], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+496]
|
|
mov rcx, QWORD PTR [r8+496]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+488], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+504]
|
|
mov rdx, QWORD PTR [r8+504]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+496], rcx
|
|
sbb rdx, rax
|
|
mov QWORD PTR [r9+504], rdx
|
|
pop rbp
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_4096_mont_reduce_avx2_64 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Conditionally add a and b using the mask m. (r = a + (b & m))
;  * m is -1 to add and 0 when not.
;  *
;  * r  (rcx)  Result, 32 64-bit words.
;  * a  (rdx)  Number to add with, 32 64-bit words.
;  * b  (r8)   Number to add, 32 64-bit words; every word is ANDed with m.
;  * m  (r9)   Mask value to apply.
;  *
;  * Returns the carry out of the most significant word (0 or 1) in rax.
;  * Scratch registers: r8, r10, r11. Uses 256 bytes of stack.
;  *
;  * The masked copy of b is built on the stack first because AND rewrites
;  * the carry flag and therefore cannot be interleaved with the ADC chain.
;  */
_text SEGMENT READONLY PARA
sp_4096_cond_add_32 PROC
        sub     rsp, 256
        xor     eax, eax                        ; rax = 0, carry is added at the end

        ; Pass 1: tmp[i] = b[i] & m, two words per step (tmp is at rsp).
sp_4096_cond_add_32_disp = 0
        REPEAT  16
        mov     r10, QWORD PTR [r8+sp_4096_cond_add_32_disp]
        mov     r11, QWORD PTR [r8+sp_4096_cond_add_32_disp+8]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+sp_4096_cond_add_32_disp], r10
        mov     QWORD PTR [rsp+sp_4096_cond_add_32_disp+8], r11
sp_4096_cond_add_32_disp = sp_4096_cond_add_32_disp + 16
        ENDM

        ; Pass 2: r[i] = a[i] + tmp[i] + carry.
        ; b is no longer needed, so r8 is reused to hold tmp[i].
        ; Only MOV sits between the ADD/ADC instructions, so CF survives.
        mov     r10, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rsp]
        add     r10, r8

        ; Words 1..30, two per step. Each sum is stored one step late,
        ; after the next word of a has already been loaded.
sp_4096_cond_add_32_disp = 8
        REPEAT  15
        mov     r11, QWORD PTR [rdx+sp_4096_cond_add_32_disp]
        mov     r8, QWORD PTR [rsp+sp_4096_cond_add_32_disp]
        adc     r11, r8
        mov     QWORD PTR [rcx+sp_4096_cond_add_32_disp-8], r10
        mov     r10, QWORD PTR [rdx+sp_4096_cond_add_32_disp+8]
        mov     r8, QWORD PTR [rsp+sp_4096_cond_add_32_disp+8]
        adc     r10, r8
        mov     QWORD PTR [rcx+sp_4096_cond_add_32_disp], r11
sp_4096_cond_add_32_disp = sp_4096_cond_add_32_disp + 16
        ENDM

        ; Word 31 and the two pending stores.
        mov     r11, QWORD PTR [rdx+248]
        mov     r8, QWORD PTR [rsp+248]
        adc     r11, r8
        mov     QWORD PTR [rcx+240], r10
        mov     QWORD PTR [rcx+248], r11
        adc     rax, 0                          ; return the final carry
        add     rsp, 256
        ret
sp_4096_cond_add_32 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally add a and b using the mask m. (r = a + (b & m))
;  * m is -1 to add and 0 when not.
;  *
;  * r  (rcx)  Result, 32 64-bit words.
;  * a  (rdx)  Number to add with, 32 64-bit words.
;  * b  (r8)   Number to add, 32 64-bit words.
;  * m  (r9)   Mask value to apply.
;  *
;  * Returns the carry out of the most significant word (0 or 1) in rax.
;  * Scratch registers: r10, r11; r12 is saved and restored.
;  *
;  * The mask is applied with PEXT (a BMI2 instruction): for m = -1 it
;  * returns the word unchanged and for m = 0 it returns 0. PEXT does not
;  * write the flags, so masking can sit inside the ADD/ADC carry chain.
;  * For any other value of m PEXT is not equivalent to AND.
;  */
_text SEGMENT READONLY PARA

; One word of the carry chain:
;   bReg = b[disp/8] masked by m, aReg = a[disp/8] + bReg + CF.
; The sum of the previous word (in prevReg) is stored between the loads
; and the ADC. None of MOV/PEXT changes CF.
SP_4096_COND_ADD_AVX2_32_STEP MACRO disp, bReg, aReg, prevReg
        mov     bReg, QWORD PTR [r8+disp]
        mov     aReg, QWORD PTR [rdx+disp]
        pext    bReg, bReg, r9
        mov     QWORD PTR [rcx+disp-8], prevReg
        adc     aReg, bReg
ENDM

sp_4096_cond_add_avx2_32 PROC
        push    r12
        xor     eax, eax                        ; rax = 0, carry is added at the end

        ; Word 0 starts the chain with ADD; the sum is left in r10.
        mov     r12, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    r12, r12, r9
        add     r10, r12

        ; Words 1..30: the three registers rotate roles every step,
        ; so the pattern repeats every three words.
sp_4096_cond_add_avx2_32_disp = 8
        REPEAT  10
        SP_4096_COND_ADD_AVX2_32_STEP sp_4096_cond_add_avx2_32_disp, r12, r11, r10
        SP_4096_COND_ADD_AVX2_32_STEP sp_4096_cond_add_avx2_32_disp+8, r10, r12, r11
        SP_4096_COND_ADD_AVX2_32_STEP sp_4096_cond_add_avx2_32_disp+16, r11, r10, r12
sp_4096_cond_add_avx2_32_disp = sp_4096_cond_add_avx2_32_disp + 24
        ENDM

        ; Word 31 (also stores word 30), then the last pending store.
        SP_4096_COND_ADD_AVX2_32_STEP 248, r12, r11, r10
        mov     QWORD PTR [rcx+248], r11
        adc     rax, 0                          ; return the final carry
        pop     r12
        ret
sp_4096_cond_add_avx2_32 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Shift number left by n bit. (r = a << n)
;  *
;  * r  (rcx)  Result of left shift by n, 65 64-bit words (r[64] receives
;  *           the bits shifted out of a[63]).
;  * a  (rdx)  Number to shift, 64 64-bit words.
;  * n  (r8)   Amount to shift. Only the low byte is used (as cl), and the
;  *           CPU masks the count of a 64-bit SHLD/SHL to 6 bits.
;  *
;  * Scratch registers: rax, rcx, r8, r9, r10, r11; r12 and r13 are saved
;  * and restored. Words are processed from the most significant end down,
;  * and every group loads its inputs before storing its outputs.
;  */
_text SEGMENT READONLY PARA

; Shift one group of four words.
;   On entry topReg holds the unshifted word at byte offset disp+32.
;   Loads a[] at disp..disp+24 (lowReg receives the word at disp),
;   stores the shifted words at disp+8..disp+32.
;   On exit lowReg still holds the unshifted word at disp, ready to be
;   the topReg of the next (lower) group.
SP_4096_LSHIFT_64_GROUP MACRO disp, topReg, lowReg
        mov     lowReg, QWORD PTR [rdx+disp]
        mov     rax, QWORD PTR [rdx+disp+8]
        mov     r8, QWORD PTR [rdx+disp+16]
        mov     r10, QWORD PTR [rdx+disp+24]
        shld    topReg, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, lowReg, cl
        mov     QWORD PTR [r9+disp+8], rax
        mov     QWORD PTR [r9+disp+16], r8
        mov     QWORD PTR [r9+disp+24], r10
        mov     QWORD PTR [r9+disp+32], topReg
ENDM

sp_4096_lshift_64 PROC
        push    r12
        push    r13
        mov     r9, rcx                         ; r9 = r, frees rcx for the count
        mov     rcx, r8                         ; cl = n
        xor     r12d, r12d                      ; r12 = 0, becomes r[64]

        ; Top group: a[59..63] -> r[60..64]. r13 keeps a[59] unshifted.
        mov     r13, QWORD PTR [rdx+472]
        mov     rax, QWORD PTR [rdx+480]
        mov     r8, QWORD PTR [rdx+488]
        mov     r10, QWORD PTR [rdx+496]
        mov     r11, QWORD PTR [rdx+504]
        shld    r12, r11, cl
        shld    r11, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shld    rax, r13, cl
        mov     QWORD PTR [r9+480], rax
        mov     QWORD PTR [r9+488], r8
        mov     QWORD PTR [r9+496], r10
        mov     QWORD PTR [r9+504], r11
        mov     QWORD PTR [r9+512], r12

        ; 14 groups of four words, r[59] down to r[4]. r13 and r11 swap
        ; roles each group, so two groups are emitted per repetition.
sp_4096_lshift_64_disp = 440
        REPEAT  7
        SP_4096_LSHIFT_64_GROUP sp_4096_lshift_64_disp, r13, r11
        SP_4096_LSHIFT_64_GROUP sp_4096_lshift_64_disp-32, r11, r13
sp_4096_lshift_64_disp = sp_4096_lshift_64_disp - 64
        ENDM

        ; Bottom group: a[0..2] (plus a[3] held in r13) -> r[0..3].
        ; The lowest word has nothing below it, so it uses a plain SHL.
        mov     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        shld    r13, r10, cl
        shld    r10, r8, cl
        shld    r8, rax, cl
        shl     rax, cl
        mov     QWORD PTR [r9], rax
        mov     QWORD PTR [r9+8], r8
        mov     QWORD PTR [r9+16], r10
        mov     QWORD PTR [r9+24], r13
        pop     r13
        pop     r12
        ret
sp_4096_lshift_64 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFNDEF WOLFSSL_SP_NO_256
|
|
; /* Multiply a and b into r. (r = a * b)
;  *
;  * r  (rcx)  Product, 8 64-bit words.
;  * a  (rdx)  First factor, 4 64-bit words (pointer is moved to r9
;  *           because MUL overwrites rdx).
;  * b  (r8)   Second factor, 4 64-bit words.
;  *
;  * The product is built one result word (column) at a time in a rotating
;  * three-register accumulator made of r10, r11 and r12. The low four
;  * result words are parked on the stack and copied to r only after the
;  * last multiply - presumably so that r may be the same buffer as a or b.
;  * Scratch registers: rax, rdx, r9, r10, r11; r12 is saved and restored.
;  */
_text SEGMENT READONLY PARA

; First product of a column: (accHi:accMid:accLo) += b[bDisp/8] * a[aDisp/8]
; with accHi cleared first (it is the register freed by the previous column).
SP_256_MUL_4_COL_FIRST MACRO bDisp, aDisp, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+bDisp]
        mul     QWORD PTR [r9+aDisp]
        xor     accHi, accHi
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
ENDM

; Any further product of a column: (accHi:accMid:accLo) += b[..] * a[..].
SP_256_MUL_4_COL_NEXT MACRO bDisp, aDisp, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+bDisp]
        mul     QWORD PTR [r9+aDisp]
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
ENDM

sp_256_mul_4 PROC
        push    r12
        mov     r9, rdx                         ; r9 = a
        sub     rsp, 32                         ; room for r[0..3]

        ; Column 0: A[0] * B[0]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r9]
        xor     r12, r12
        mov     QWORD PTR [rsp], rax
        mov     r11, rdx

        ; Column 1: A[0]*B[1] + A[1]*B[0]
        SP_256_MUL_4_COL_FIRST 8, 0, r11, r12, r10
        SP_256_MUL_4_COL_NEXT  0, 8, r11, r12, r10
        mov     QWORD PTR [rsp+8], r11

        ; Column 2: A[0]*B[2] + A[1]*B[1] + A[2]*B[0]
        SP_256_MUL_4_COL_FIRST 16, 0, r12, r10, r11
        SP_256_MUL_4_COL_NEXT  8, 8, r12, r10, r11
        SP_256_MUL_4_COL_NEXT  0, 16, r12, r10, r11
        mov     QWORD PTR [rsp+16], r12

        ; Column 3: A[0]*B[3] + A[1]*B[2] + A[2]*B[1] + A[3]*B[0]
        SP_256_MUL_4_COL_FIRST 24, 0, r10, r11, r12
        SP_256_MUL_4_COL_NEXT  16, 8, r10, r11, r12
        SP_256_MUL_4_COL_NEXT  8, 16, r10, r11, r12
        SP_256_MUL_4_COL_NEXT  0, 24, r10, r11, r12
        mov     QWORD PTR [rsp+24], r10

        ; Column 4: A[1]*B[3] + A[2]*B[2] + A[3]*B[1]
        SP_256_MUL_4_COL_FIRST 24, 8, r11, r12, r10
        SP_256_MUL_4_COL_NEXT  16, 16, r11, r12, r10
        SP_256_MUL_4_COL_NEXT  8, 24, r11, r12, r10
        mov     QWORD PTR [rcx+32], r11

        ; Column 5: A[2]*B[3] + A[3]*B[2]
        SP_256_MUL_4_COL_FIRST 24, 16, r12, r10, r11
        SP_256_MUL_4_COL_NEXT  16, 24, r12, r10, r11
        mov     QWORD PTR [rcx+40], r12

        ; Columns 6 and 7: A[3] * B[3]; no carry is taken past r[7].
        mov     rax, QWORD PTR [r8+24]
        mul     QWORD PTR [r9+24]
        add     r10, rax
        adc     r11, rdx
        mov     QWORD PTR [rcx+48], r10
        mov     QWORD PTR [rcx+56], r11

        ; Move the parked low half into r[0..3].
        mov     rax, QWORD PTR [rsp]
        mov     rdx, QWORD PTR [rsp+8]
        mov     r10, QWORD PTR [rsp+16]
        mov     r11, QWORD PTR [rsp+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], rdx
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        add     rsp, 32
        pop     r12
        ret
sp_256_mul_4 ENDP
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
;  *
;  * r  (rcx)  Square, 8 64-bit words.
;  * a  (rdx)  Number to square, 4 64-bit words (pointer is moved to r8
;  *           because MUL overwrites rdx).
;  *
;  * The square is built one result word (column) at a time in a rotating
;  * three-register accumulator made of r9, r10 and r11. Each cross product
;  * A[i]*A[j] (i != j) is computed once and added twice; each A[i]*A[i] is
;  * added once. The low four result words are parked on the stack and
;  * copied to r only after the last multiply - presumably so that r may be
;  * the same buffer as a.
;  * Only rax, rdx, r8, r9, r10 and r11 are written, so no register has to
;  * be saved.
;  */
_text SEGMENT READONLY PARA

; First product of a column, a cross term:
; (accHi:accMid:accLo) += 2 * A[hiDisp/8] * A[loDisp/8], accHi cleared first.
SP_256_SQR_4_CROSS_FIRST MACRO hiDisp, loDisp, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+hiDisp]
        mul     QWORD PTR [r8+loDisp]
        xor     accHi, accHi
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
ENDM

; Further cross term of a column: (accHi:accMid:accLo) += 2 * A[..] * A[..].
SP_256_SQR_4_CROSS_NEXT MACRO hiDisp, loDisp, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+hiDisp]
        mul     QWORD PTR [r8+loDisp]
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
ENDM

; Diagonal term: (accHi:accMid:accLo) += A[disp/8] * A[disp/8].
SP_256_SQR_4_DIAG MACRO disp, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+disp]
        mul     rax
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
ENDM

sp_256_sqr_4 PROC
        mov     r8, rdx                         ; r8 = a
        sub     rsp, 32                         ; room for r[0..3]

        ; Column 0: A[0] * A[0]
        mov     rax, QWORD PTR [r8]
        mul     rax
        xor     r11, r11
        mov     QWORD PTR [rsp], rax
        mov     r10, rdx

        ; Column 1: 2 * A[0]*A[1]
        SP_256_SQR_4_CROSS_FIRST 8, 0, r10, r11, r9
        mov     QWORD PTR [rsp+8], r10

        ; Column 2: 2 * A[0]*A[2] + A[1]*A[1]
        SP_256_SQR_4_CROSS_FIRST 16, 0, r11, r9, r10
        SP_256_SQR_4_DIAG        8, r11, r9, r10
        mov     QWORD PTR [rsp+16], r11

        ; Column 3: 2 * A[0]*A[3] + 2 * A[1]*A[2]
        SP_256_SQR_4_CROSS_FIRST 24, 0, r9, r10, r11
        SP_256_SQR_4_CROSS_NEXT  16, 8, r9, r10, r11
        mov     QWORD PTR [rsp+24], r9

        ; Column 4: 2 * A[1]*A[3] + A[2]*A[2]
        SP_256_SQR_4_CROSS_FIRST 24, 8, r10, r11, r9
        SP_256_SQR_4_DIAG        16, r10, r11, r9
        mov     QWORD PTR [rcx+32], r10

        ; Column 5: 2 * A[2]*A[3]
        SP_256_SQR_4_CROSS_FIRST 24, 16, r11, r9, r10
        mov     QWORD PTR [rcx+40], r11

        ; Columns 6 and 7: A[3] * A[3]; no carry is taken past r[7].
        mov     rax, QWORD PTR [r8+24]
        mul     rax
        add     r9, rax
        adc     r10, rdx
        mov     QWORD PTR [rcx+48], r9
        mov     QWORD PTR [rcx+56], r10

        ; Move the parked low half into r[0..3]. r10 and r11 are free
        ; here: their column values have already been stored.
        mov     rax, QWORD PTR [rsp]
        mov     rdx, QWORD PTR [rsp+8]
        mov     r10, QWORD PTR [rsp+16]
        mov     r11, QWORD PTR [rsp+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], rdx
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        add     rsp, 32
        ret
sp_256_sqr_4 ENDP
_text ENDS
|
|
; /* Add b to a into r. (r = a + b)
;  *
;  * r  A single precision integer (rcx); four words are written.
;  * a  A single precision integer (rdx); four words are read.
;  * b  A single precision integer (r8); four words are read.
;  *
;  * rax returns the carry out of the top word (0 or 1).
;  */
_text SEGMENT READONLY PARA
sp_256_add_4 PROC
        ; Fetch all of a first; the pointer in rdx is dead after its last
        ; load, so it doubles as the fourth temporary.
        mov     r9, QWORD PTR [rdx]
        mov     r10, QWORD PTR [rdx+8]
        mov     r11, QWORD PTR [rdx+16]
        mov     rdx, QWORD PTR [rdx+24]
        xor     rax, rax                        ; cleared before the carry chain
        ; Add
        add     r9, QWORD PTR [r8]
        adc     r10, QWORD PTR [r8+8]
        adc     r11, QWORD PTR [r8+16]
        adc     rdx, QWORD PTR [r8+24]
        ; mov leaves CF untouched, so the carry survives the stores.
        mov     QWORD PTR [rcx], r9
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], r11
        mov     QWORD PTR [rcx+24], rdx
        adc     rax, 0                          ; rax = carry out
        ret
sp_256_add_4 ENDP
_text ENDS
|
|
; /* Sub b from a into r. (r = a - b)
;  *
;  * r  A single precision integer (rcx); four words are written.
;  * a  A single precision integer (rdx); four words are read.
;  * b  A single precision integer (r8); four words are read.
;  *
;  * rax returns 0 when no borrow left the top word and all ones (-1) when
;  * one did.
;  */
_text SEGMENT READONLY PARA
sp_256_sub_4 PROC
        xor     rax, rax
        ; Fetch a; rdx is reused for the top word once the pointer is dead.
        mov     r9, QWORD PTR [rdx]
        mov     r10, QWORD PTR [rdx+8]
        mov     r11, QWORD PTR [rdx+16]
        mov     rdx, QWORD PTR [rdx+24]
        ; Subtract
        sub     r9, QWORD PTR [r8]
        sbb     r10, QWORD PTR [r8+8]
        sbb     r11, QWORD PTR [r8+16]
        sbb     rdx, QWORD PTR [r8+24]
        ; mov leaves CF untouched, so the borrow survives the stores.
        mov     QWORD PTR [rcx], r9
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], r11
        mov     QWORD PTR [rcx+24], rdx
        sbb     rax, 0                          ; rax = 0 - borrow
        ret
sp_256_sub_4 ENDP
_text ENDS
|
|
; /* Conditionally copy a into r using the mask m.
;  * m is -1 to copy and 0 when not.
;  *
;  * r  A single precision number to copy over (rcx).
;  * a  A single precision number to copy (rdx).
;  * m  Mask value to apply (r8).
;  *
;  * Computes r ^= (a ^ r) & m for each of the four words; the instruction
;  * sequence is the same whichever mask value is passed.
;  */
_text SEGMENT READONLY PARA
sp_256_cond_copy_4 PROC
        ; t = a ^ r
        mov     r10, QWORD PTR [rdx]
        mov     r11, QWORD PTR [rdx+8]
        mov     rax, QWORD PTR [rdx+16]
        mov     r9, QWORD PTR [rdx+24]
        xor     r10, QWORD PTR [rcx]
        xor     r11, QWORD PTR [rcx+8]
        xor     rax, QWORD PTR [rcx+16]
        xor     r9, QWORD PTR [rcx+24]
        ; t &= m  (t becomes 0 when m is 0)
        and     r10, r8
        and     r11, r8
        and     rax, r8
        and     r9, r8
        ; r ^= t  (turns r into a when m is -1, leaves r alone when m is 0)
        xor     QWORD PTR [rcx], r10
        xor     QWORD PTR [rcx+8], r11
        xor     QWORD PTR [rcx+16], rax
        xor     QWORD PTR [rcx+24], r9
        ret
sp_256_cond_copy_4 ENDP
_text ENDS
|
|
; /* Multiply two Montgomery form numbers mod the modulus (prime).
;  * (r = a * b mod m)
;  *
;  * r   Result of multiplication (rcx); four words are written.
;  * a   First number to multiply in Montgomery form (rdx).
;  * b   Second number to multiply in Montgomery form (r8).
;  * m   Modulus (prime).  Not read: the modulus words
;  *     { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  * mp  Montgomery multiplier.  Not read.
;  *
;  * The 8-word product t is built in r11, r12, r13, r14, r15, rdi, rsi, rbx
;  * (low to high) and then reduced.  The reduction only tracks words 3..7 of
;  * t + mu * m.  Words 0..2 of that sum are always zero, but they can carry 1
;  * into word 3.  That carry equals the carry out of word 2 of
;  * t[0..2] + (t[0..2] << 96), which is computed while building mu; it is
;  * captured there and injected when mu << 192 is added.  Dropping it makes
;  * the result one too small whenever mu[3] == 0 and the carry is set
;  * (for example a = 2^256 mod m, b = 1).
;  */
_text SEGMENT READONLY PARA
sp_256_mont_mul_4 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        mov     r10, rdx                        ; r10 = a (mul overwrites rdx)
        ; A[0] * B[0]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r10]
        mov     r11, rax
        mov     r12, rdx
        ; A[0] * B[1]
        mov     rax, QWORD PTR [r8+8]
        mul     QWORD PTR [r10]
        xor     r13, r13
        add     r12, rax
        adc     r13, rdx
        ; A[1] * B[0]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r10+8]
        xor     r14, r14
        add     r12, rax
        adc     r13, rdx
        adc     r14, 0
        ; A[0] * B[2]
        mov     rax, QWORD PTR [r8+16]
        mul     QWORD PTR [r10]
        add     r13, rax
        adc     r14, rdx
        ; A[1] * B[1]
        mov     rax, QWORD PTR [r8+8]
        mul     QWORD PTR [r10+8]
        xor     r15, r15
        add     r13, rax
        adc     r14, rdx
        adc     r15, 0
        ; A[2] * B[0]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r10+16]
        add     r13, rax
        adc     r14, rdx
        adc     r15, 0
        ; A[0] * B[3]
        mov     rax, QWORD PTR [r8+24]
        mul     QWORD PTR [r10]
        xor     rdi, rdi
        add     r14, rax
        adc     r15, rdx
        adc     rdi, 0
        ; A[1] * B[2]
        mov     rax, QWORD PTR [r8+16]
        mul     QWORD PTR [r10+8]
        add     r14, rax
        adc     r15, rdx
        adc     rdi, 0
        ; A[2] * B[1]
        mov     rax, QWORD PTR [r8+8]
        mul     QWORD PTR [r10+16]
        add     r14, rax
        adc     r15, rdx
        adc     rdi, 0
        ; A[3] * B[0]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r10+24]
        add     r14, rax
        adc     r15, rdx
        adc     rdi, 0
        ; A[1] * B[3]
        mov     rax, QWORD PTR [r8+24]
        mul     QWORD PTR [r10+8]
        xor     rsi, rsi
        add     r15, rax
        adc     rdi, rdx
        adc     rsi, 0
        ; A[2] * B[2]
        mov     rax, QWORD PTR [r8+16]
        mul     QWORD PTR [r10+16]
        add     r15, rax
        adc     rdi, rdx
        adc     rsi, 0
        ; A[3] * B[1]
        mov     rax, QWORD PTR [r8+8]
        mul     QWORD PTR [r10+24]
        add     r15, rax
        adc     rdi, rdx
        adc     rsi, 0
        ; A[2] * B[3]
        mov     rax, QWORD PTR [r8+24]
        mul     QWORD PTR [r10+16]
        xor     rbx, rbx
        add     rdi, rax
        adc     rsi, rdx
        adc     rbx, 0
        ; A[3] * B[2]
        mov     rax, QWORD PTR [r8+16]
        mul     QWORD PTR [r10+24]
        add     rdi, rax
        adc     rsi, rdx
        adc     rbx, 0
        ; A[3] * B[3]
        mov     rax, QWORD PTR [r8+24]
        mul     QWORD PTR [r10+24]
        add     rsi, rax
        adc     rbx, rdx
        ; Start Reduction
        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
        ;    - a[0] << 32 << 192
        ;   + (a[0] * 2) << 192
        ; mu ends up in rax, r10, r8, rdx (low to high).
        mov     rax, r11
        mov     rdx, r14
        add     rdx, r11
        mov     r10, r12
        add     rdx, r11
        mov     r8, r13
        ; a[0]-a[2] << 32
        shl     r11, 32
        shld    r13, r10, 32
        shld    r12, rax, 32
        ;   - a[0] << 32 << 192
        sub     rdx, r11
        ;   + a[0]-a[2] << 32 << 64
        add     r10, r11
        adc     r8, r12
        ; Keep the carry out of word 2: it is the carry from words 0..2 of
        ; the full sum into word 3.  setc does not change flags and r12 is
        ; dead from here on.
        setc    r12b
        adc     rdx, r13
        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
        ; r11 accumulates minus the overflow of the 5-word running sum.
        ;   a += mu << 256
        xor     r11, r11
        add     r15, rax
        adc     rdi, r10
        adc     rsi, r8
        adc     rbx, rdx
        sbb     r11, 0
        ;   a += mu << 192, plus the carry saved above
        bt      r12, 0                          ; CF = saved carry
        adc     r14, rax
        adc     r15, r10
        adc     rdi, r8
        adc     rsi, rdx
        adc     rbx, 0
        sbb     r11, 0
        ; mu <<= 32  (five words: rax, r10, r8, rdx, r9)
        mov     r9, rdx
        shld    rdx, r8, 32
        shld    r8, r10, 32
        shld    r10, rax, 32
        shr     r9, 32
        shl     rax, 32
        ;   a += (mu << 32) << 64
        add     r14, r8
        adc     r15, rdx
        adc     rdi, r9
        adc     rsi, 0
        adc     rbx, 0
        sbb     r11, 0
        ;   a -= (mu << 32) << 192
        sub     r14, rax
        sbb     r15, r10
        sbb     rdi, r8
        sbb     rsi, rdx
        sbb     rbx, r9
        adc     r11, 0
        mov     rax, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     r10, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        ; mask m and sub from result if overflow
        ;   m[0] = -1 & mask = mask
        and     rax, r11
        ;   m[2] =  0 & mask = 0
        and     r10, r11
        sub     r15, r11
        sbb     rdi, rax
        sbb     rsi, 0
        sbb     rbx, r10
        mov     QWORD PTR [rcx], r15
        mov     QWORD PTR [rcx+8], rdi
        mov     QWORD PTR [rcx+16], rsi
        mov     QWORD PTR [rcx+24], rbx
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_256_mont_mul_4 ENDP
_text ENDS
|
|
; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
;  *
;  * r   Result of squaring (rcx); four words are written.
;  * a   Number to square in Montgomery form (rdx).
;  * m   Modulus (prime).  Not read: the modulus words
;  *     { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  * mp  Montgomery multiplier.  Not read.
;  *
;  * The off-diagonal products are summed once and doubled, then the squares
;  * A[i] * A[i] are added.  The 8-word result t sits in
;  * r10, r11, r12, r13, r14, r15, rdi, rsi (low to high) before reduction.
;  *
;  * The reduction only tracks words 3..7 of t + mu * m.  Words 0..2 of that
;  * sum are always zero, but they can carry 1 into word 3.  That carry equals
;  * the carry out of word 2 of t[0..2] + (t[0..2] << 96), computed while
;  * building mu; it is captured there and injected when mu << 192 is added.
;  * Dropping it makes the result one too small whenever mu[3] == 0 and the
;  * carry is set.
;  */
_text SEGMENT READONLY PARA
sp_256_mont_sqr_4 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        mov     r8, rdx                         ; r8 = a (mul overwrites rdx)
        ; A[0] * A[1]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r8+8]
        mov     r11, rax
        mov     r12, rdx
        ; A[0] * A[2]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r8+16]
        xor     r13, r13
        add     r12, rax
        adc     r13, rdx
        ; A[0] * A[3]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r8+24]
        xor     r14, r14
        add     r13, rax
        adc     r14, rdx
        ; A[1] * A[2]
        mov     rax, QWORD PTR [r8+8]
        mul     QWORD PTR [r8+16]
        xor     r15, r15
        add     r13, rax
        adc     r14, rdx
        adc     r15, 0
        ; A[1] * A[3]
        mov     rax, QWORD PTR [r8+8]
        mul     QWORD PTR [r8+24]
        add     r14, rax
        adc     r15, rdx
        ; A[2] * A[3]
        mov     rax, QWORD PTR [r8+16]
        mul     QWORD PTR [r8+24]
        xor     rdi, rdi
        add     r15, rax
        adc     rdi, rdx
        ; Double
        xor     rsi, rsi
        add     r11, r11
        adc     r12, r12
        adc     r13, r13
        adc     r14, r14
        adc     r15, r15
        adc     rdi, rdi
        adc     rsi, 0
        ; A[0] * A[0]
        mov     rax, QWORD PTR [r8]
        mul     rax
        mov     r10, rax
        mov     rbx, rdx
        ; A[1] * A[1]
        mov     rax, QWORD PTR [r8+8]
        mul     rax
        add     r11, rbx
        adc     r12, rax
        adc     rdx, 0
        mov     rbx, rdx
        ; A[2] * A[2]
        mov     rax, QWORD PTR [r8+16]
        mul     rax
        add     r13, rbx
        adc     r14, rax
        adc     rdx, 0
        mov     rbx, rdx
        ; A[3] * A[3]
        mov     rax, QWORD PTR [r8+24]
        mul     rax
        add     r15, rbx
        adc     rdi, rax
        adc     rsi, rdx
        ; Start Reduction
        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
        ;    - a[0] << 32 << 192
        ;   + (a[0] * 2) << 192
        ; mu ends up in rax, r8, rbx, rdx (low to high).
        mov     rax, r10
        mov     rdx, r13
        add     rdx, r10
        mov     r8, r11
        add     rdx, r10
        mov     rbx, r12
        ; a[0]-a[2] << 32
        shl     r10, 32
        shld    r12, r8, 32
        shld    r11, rax, 32
        ;   - a[0] << 32 << 192
        sub     rdx, r10
        ;   + a[0]-a[2] << 32 << 64
        add     r8, r10
        adc     rbx, r11
        ; Keep the carry out of word 2: it is the carry from words 0..2 of
        ; the full sum into word 3.  setc does not change flags and r11 is
        ; dead from here on.
        setc    r11b
        adc     rdx, r12
        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
        ; r10 accumulates minus the overflow of the 5-word running sum.
        ;   a += mu << 256
        xor     r10, r10
        add     r14, rax
        adc     r15, r8
        adc     rdi, rbx
        adc     rsi, rdx
        sbb     r10, 0
        ;   a += mu << 192, plus the carry saved above
        bt      r11, 0                          ; CF = saved carry
        adc     r13, rax
        adc     r14, r8
        adc     r15, rbx
        adc     rdi, rdx
        adc     rsi, 0
        sbb     r10, 0
        ; mu <<= 32  (five words: rax, r8, rbx, rdx, r9)
        mov     r9, rdx
        shld    rdx, rbx, 32
        shld    rbx, r8, 32
        shld    r8, rax, 32
        shr     r9, 32
        shl     rax, 32
        ;   a += (mu << 32) << 64
        add     r13, rbx
        adc     r14, rdx
        adc     r15, r9
        adc     rdi, 0
        adc     rsi, 0
        sbb     r10, 0
        ;   a -= (mu << 32) << 192
        sub     r13, rax
        sbb     r14, r8
        sbb     r15, rbx
        sbb     rdi, rdx
        sbb     rsi, r9
        adc     r10, 0
        mov     rax, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     r8, 18446744069414584321        ; 0xFFFFFFFF00000001 = m[3]
        ; mask m and sub from result if overflow
        ;   m[0] = -1 & mask = mask
        and     rax, r10
        ;   m[2] =  0 & mask = 0
        and     r8, r10
        sub     r14, r10
        sbb     r15, rax
        sbb     rdi, 0
        sbb     rsi, r8
        mov     QWORD PTR [rcx], r14
        mov     QWORD PTR [rcx+8], r15
        mov     QWORD PTR [rcx+16], rdi
        mov     QWORD PTR [rcx+24], rsi
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_256_mont_sqr_4 ENDP
_text ENDS
|
|
; /* Compare a with b in constant time.
;  *
;  * a  A single precision integer (rcx).
;  * b  A single precision integer (rdx).
;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
;  * respectively.
;  *
;  * Words are examined from most to least significant.  r8 is an
;  * "everything above was equal" mask: once a pair of words differs it is
;  * cleared, which forces every later pair to compare as 0 == 0, so all four
;  * words are always processed and no branch is taken.
;  */
_text SEGMENT READONLY PARA
sp_256_cmp_4 PROC
        push    r12
        mov     r8, -1                          ; mask: still equal so far
        mov     rax, r8                         ; running result
        xor     r9, r9                          ; constant 0 for cmov
        mov     r10, 1                          ; constant +1 for cmov
        ; word 3
        mov     r12, QWORD PTR [rdx+24]
        mov     r11, QWORD PTR [rcx+24]
        and     r12, r8
        and     r11, r8
        sub     r11, r12
        cmova   rax, r10                        ; a > b: result = +1
        cmovc   rax, r8                         ; a < b: result = mask = -1
        cmovnz  r8, r9                          ; differ: stop comparing
        ; word 2
        mov     r12, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rcx+16]
        and     r12, r8
        and     r11, r8
        sub     r11, r12
        cmova   rax, r10
        cmovc   rax, r8
        cmovnz  r8, r9
        ; word 1
        mov     r12, QWORD PTR [rdx+8]
        mov     r11, QWORD PTR [rcx+8]
        and     r12, r8
        and     r11, r8
        sub     r11, r12
        cmova   rax, r10
        cmovc   rax, r8
        cmovnz  r8, r9
        ; word 0
        mov     r12, QWORD PTR [rdx]
        mov     r11, QWORD PTR [rcx]
        and     r12, r8
        and     r11, r8
        sub     r11, r12
        cmova   rax, r10
        cmovc   rax, r8
        cmovnz  r8, r9
        ; All equal: rax and r8 are both still -1, so the xor yields 0.
        ; Otherwise r8 is 0 and rax is returned unchanged.
        xor     rax, r8
        pop     r12
        ret
sp_256_cmp_4 ENDP
_text ENDS
|
|
; /* Conditionally subtract b from a using the mask m.
;  * m is -1 to subtract and 0 when not subtracting.
;  *
;  * r  A single precision number representing condition subtract result (rcx).
;  * a  A single precision number to subtract from (rdx).
;  * b  A single precision number to subtract (r8).
;  * m  Mask value to apply (r9).
;  *
;  * Computes r = a - (b & m) over four words.  rax returns 0 when no borrow
;  * left the top word and all ones (-1) when one did.
;  *
;  * Only volatile registers are used.  All of b is read before anything is
;  * written, and word i of a is read before word i of r is written, so r may
;  * be the same buffer as a or as b.
;  */
_text SEGMENT READONLY PARA
sp_256_cond_sub_4 PROC
        ; Load b; the pointer in r8 is dead after the last load, so it holds
        ; the top word.
        mov     r10, QWORD PTR [r8]
        mov     r11, QWORD PTR [r8+8]
        mov     rax, QWORD PTR [r8+16]
        mov     r8, QWORD PTR [r8+24]
        ; Mask b.  AND rewrites CF, so this must finish before the borrow
        ; chain starts.
        and     r10, r9
        and     r11, r9
        and     rax, r9
        and     r8, r9
        ; r = a - (b & m).  The mask is no longer needed, so r9 is the
        ; working register; mov leaves the borrow in CF untouched.
        mov     r9, QWORD PTR [rdx]
        sub     r9, r10
        mov     QWORD PTR [rcx], r9
        mov     r9, QWORD PTR [rdx+8]
        sbb     r9, r11
        mov     QWORD PTR [rcx+8], r9
        mov     r9, QWORD PTR [rdx+16]
        sbb     r9, rax
        mov     QWORD PTR [rcx+16], r9
        mov     r9, QWORD PTR [rdx+24]
        sbb     r9, r8
        mov     QWORD PTR [rcx+24], r9
        sbb     rax, rax                        ; rax = 0 - borrow
        ret
sp_256_cond_sub_4 ENDP
_text ENDS
|
|
; /* Reduce the number back to 256 bits using Montgomery reduction.
;  *
;  * a   A single precision number to reduce in place (rcx).  Eight words
;  *     are read; the result is left in a[0..3].
;  * m   The single precision number representing the modulus (rdx).
;  * mp  The digit representing the negative inverse of m mod 2^n (r8).
;  *
;  * Register use inside the loop:
;  *   r9  = m            r15 = &a[i]          r10 = iterations left
;  *   r14 = mu           rdi = carry carried from one iteration to the next
;  */
_text SEGMENT READONLY PARA
sp_256_mont_reduce_4 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        mov     r9, rdx                         ; r9 = m (mul overwrites rdx)
        ; i = 0
        xor     rdi, rdi
        mov     r10, 4
        mov     r15, rcx
L_mont_loop_4:
        ; mu = a[i] * mp
        mov     r14, QWORD PTR [r15]
        imul    r14, r8
        ; a[i+0] += m[0] * mu
        mov     rax, QWORD PTR [r9]
        mov     r12, QWORD PTR [r9+8]           ; prefetch m[1]
        mul     r14
        mov     rsi, QWORD PTR [r15]
        add     rsi, rax
        mov     r11, rdx
        mov     QWORD PTR [r15], rsi
        adc     r11, 0
        ; a[i+1] += m[1] * mu
        mov     rax, r12
        mul     r14
        mov     r12, QWORD PTR [r9+16]          ; prefetch m[2]
        mov     rsi, QWORD PTR [r15+8]
        add     rax, r11
        mov     r13, rdx
        adc     r13, 0
        add     rsi, rax
        mov     QWORD PTR [r15+8], rsi
        adc     r13, 0
        ; a[i+2] += m[2] * mu
        mov     rax, r12
        mul     r14
        mov     r12, QWORD PTR [r9+24]          ; prefetch m[3]
        mov     rsi, QWORD PTR [r15+16]
        add     rax, r13
        mov     r11, rdx
        adc     r11, 0
        add     rsi, rax
        mov     QWORD PTR [r15+16], rsi
        adc     r11, 0
        ; a[i+3] += m[3] * mu
        mov     rax, r12
        mul     r14
        mov     rsi, QWORD PTR [r15+24]
        add     rax, r11
        adc     rdx, rdi                        ; fold in last iteration's carry
        mov     rdi, 0                          ; mov, not xor: CF is still live
        adc     rdi, 0
        add     rsi, rax
        mov     QWORD PTR [r15+24], rsi
        adc     QWORD PTR [r15+32], rdx
        adc     rdi, 0
        ; i += 1
        add     r15, 8
        dec     r10
        jnz     L_mont_loop_4
        ; rax = 0 - carry: all ones when the reduced value overflowed
        xor     rax, rax
        sub     rax, rdi
        ; upper half of a
        mov     rdx, QWORD PTR [rcx+32]
        mov     r10, QWORD PTR [rcx+40]
        mov     rsi, QWORD PTR [rcx+48]
        mov     r11, QWORD PTR [rcx+56]
        ; m & mask
        mov     r12, QWORD PTR [r9]
        mov     r13, QWORD PTR [r9+8]
        mov     r14, QWORD PTR [r9+16]
        mov     r15, QWORD PTR [r9+24]
        and     r12, rax
        and     r13, rax
        and     r14, rax
        and     r15, rax
        ; a[0..3] = a[4..7] - (m & mask)
        sub     rdx, r12
        sbb     r10, r13
        sbb     rsi, r14
        sbb     r11, r15
        mov     QWORD PTR [rcx], rdx
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], rsi
        mov     QWORD PTR [rcx+24], r11
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_256_mont_reduce_4 ENDP
_text ENDS
|
|
; /* Add two Montgomery form numbers (r = a + b % m).
;  *
;  * r  Result of addition (rcx).
;  * a  First number to add in Montgomery form (rdx).
;  * b  Second number to add in Montgomery form (r8).
;  * m  Modulus (prime).  Not read: the modulus words
;  *    { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  *
;  * The modulus is subtracted only when the 256-bit addition carries out,
;  * and a second time if that subtraction did not borrow.
;  */
_text SEGMENT READONLY PARA
sp_256_mont_add_4 PROC
        push    r12
        push    r13
        ; sum = a + b
        mov     rax, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        add     rax, QWORD PTR [r8]
        adc     r9, QWORD PTR [r8+8]
        adc     r10, QWORD PTR [r8+16]
        adc     r11, QWORD PTR [r8+24]
        sbb     rdx, rdx                        ; rdx = 0 - carry (0 or -1)
        ; first conditional subtract of m
        mov     r12, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     r13, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        and     r12, rdx
        and     r13, rdx
        sub     rax, rdx                        ; m[0] & mask == mask
        sbb     r9, r12
        sbb     r10, 0                          ; m[2] == 0
        sbb     r11, r13
        ; A borrow here cancels the carry from the addition: mask becomes 0.
        adc     rdx, 0
        ; second conditional subtract of m
        and     r12, rdx
        and     r13, rdx
        sub     rax, rdx
        sbb     r9, r12
        sbb     r10, 0
        sbb     r11, r13
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        pop     r13
        pop     r12
        ret
sp_256_mont_add_4 ENDP
_text ENDS
|
|
; /* Double a Montgomery form number (r = a + a % m).
;  *
;  * r  Result of doubling (rcx).
;  * a  Number to double in Montgomery form (rdx).
;  * m  Modulus (prime).  Not read: the modulus words
;  *    { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  *
;  * The modulus is subtracted only when the 256-bit doubling carries out,
;  * and a second time if that subtraction did not borrow.
;  */
_text SEGMENT READONLY PARA
sp_256_mont_dbl_4 PROC
        push    r12
        push    r13
        ; sum = a + a
        mov     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rdx+8]
        mov     r9, QWORD PTR [rdx+16]
        mov     r10, QWORD PTR [rdx+24]
        add     rax, rax
        adc     r8, r8
        adc     r9, r9
        adc     r10, r10
        sbb     r13, r13                        ; r13 = 0 - carry (0 or -1)
        ; first conditional subtract of m
        mov     r11, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     r12, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        and     r11, r13
        and     r12, r13
        sub     rax, r13                        ; m[0] & mask == mask
        sbb     r8, r11
        sbb     r9, 0                           ; m[2] == 0
        sbb     r10, r12
        ; A borrow here cancels the carry from the doubling: mask becomes 0.
        adc     r13, 0
        ; second conditional subtract of m
        and     r11, r13
        and     r12, r13
        sub     rax, r13
        sbb     r8, r11
        sbb     r9, 0
        sbb     r10, r12
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r8
        mov     QWORD PTR [rcx+16], r9
        mov     QWORD PTR [rcx+24], r10
        pop     r13
        pop     r12
        ret
sp_256_mont_dbl_4 ENDP
_text ENDS
|
|
; /* Triple a Montgomery form number (r = a + a + a % m).
;  *
;  * r  Result of tripling (rcx).
;  * a  Number to triple in Montgomery form (rdx).
;  * m  Modulus (prime).  Not read: the modulus words
;  *    { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  *
;  * Done as a doubling followed by an addition of a.  After each of the two
;  * steps the modulus is subtracted only when the 256-bit addition carried
;  * out, and once more if that subtraction did not borrow.
;  */
_text SEGMENT READONLY PARA
sp_256_mont_tpl_4 PROC
        push    r12
        push    r13
        ; ---- step 1: t = a + a ----
        mov     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rdx+8]
        mov     r9, QWORD PTR [rdx+16]
        mov     r10, QWORD PTR [rdx+24]
        add     rax, rax
        adc     r8, r8
        adc     r9, r9
        adc     r10, r10
        sbb     r13, r13                        ; r13 = 0 - carry (0 or -1)
        mov     r11, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     r12, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        and     r11, r13
        and     r12, r13
        sub     rax, r13
        sbb     r8, r11
        sbb     r9, 0
        sbb     r10, r12
        adc     r13, 0                          ; a borrow cancels the carry
        and     r11, r13
        and     r12, r13
        sub     rax, r13
        sbb     r8, r11
        sbb     r9, 0
        sbb     r10, r12
        ; ---- step 2: t += a ----
        add     rax, QWORD PTR [rdx]
        adc     r8, QWORD PTR [rdx+8]
        adc     r9, QWORD PTR [rdx+16]
        adc     r10, QWORD PTR [rdx+24]
        sbb     r13, r13                        ; r13 = 0 - carry (0 or -1)
        mov     r11, 4294967295                 ; the masked copies were consumed
        mov     r12, 18446744069414584321
        and     r11, r13
        and     r12, r13
        sub     rax, r13
        sbb     r8, r11
        sbb     r9, 0
        sbb     r10, r12
        adc     r13, 0                          ; a borrow cancels the carry
        and     r11, r13
        and     r12, r13
        sub     rax, r13
        sbb     r8, r11
        sbb     r9, 0
        sbb     r10, r12
        ; r is written only after the final read of a.
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r8
        mov     QWORD PTR [rcx+16], r9
        mov     QWORD PTR [rcx+24], r10
        pop     r13
        pop     r12
        ret
sp_256_mont_tpl_4 ENDP
_text ENDS
|
|
; /* Subtract two Montgomery form numbers (r = a - b % m).
;  *
;  * r  Result of subtraction (rcx).
;  * a  Number to subtract from in Montgomery form (rdx).
;  * b  Number to subtract with in Montgomery form (r8).
;  * m  Modulus (prime).  Not read: the modulus words
;  *    { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  *
;  * The modulus is added back only when the 256-bit subtraction borrows,
;  * and a second time if that addition did not carry.
;  */
_text SEGMENT READONLY PARA
sp_256_mont_sub_4 PROC
        push    r12
        push    r13
        ; diff = a - b
        mov     rax, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        sub     rax, QWORD PTR [r8]
        sbb     r9, QWORD PTR [r8+8]
        sbb     r10, QWORD PTR [r8+16]
        sbb     r11, QWORD PTR [r8+24]
        sbb     rdx, rdx                        ; rdx = 0 - borrow (0 or -1)
        ; first conditional add of m
        mov     r12, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     r13, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        and     r12, rdx
        and     r13, rdx
        add     rax, rdx                        ; m[0] & mask == mask
        adc     r9, r12
        adc     r10, 0                          ; m[2] == 0
        adc     r11, r13
        ; A carry here cancels the borrow from the subtraction: mask becomes 0.
        adc     rdx, 0
        ; second conditional add of m
        and     r12, rdx
        and     r13, rdx
        add     rax, rdx
        adc     r9, r12
        adc     r10, 0
        adc     r11, r13
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        pop     r13
        pop     r12
        ret
sp_256_mont_sub_4 ENDP
_text ENDS
|
|
; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
;  *
;  * r  Result of division by 2 (rcx).
;  * a  Number to divide (rdx).
;  * m  Modulus (prime).  Not read: the modulus words
;  *    { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  *
;  * When a is odd the modulus is added first so the value becomes even; the
;  * 257-bit sum is then shifted right by one bit.
;  */
_text SEGMENT READONLY PARA
sp_256_div2_4 PROC
        push    r12
        push    r13
        mov     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rdx+8]
        mov     r9, QWORD PTR [rdx+16]
        mov     r10, QWORD PTR [rdx+24]
        ; r13 = all ones when a is odd, 0 when even (bit 0 spread over the word)
        mov     r13, rax
        shl     r13, 63
        sar     r13, 63
        mov     r11, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     r12, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        and     r11, r13
        and     r12, r13
        ; a += m & mask
        add     rax, r13                        ; m[0] & mask == mask
        adc     r8, r11
        adc     r9, 0                           ; m[2] == 0
        adc     r10, r12
        mov     r13, 0                          ; mov, not xor: CF is still live
        adc     r13, 0                          ; r13 = bit 256 of the sum
        ; shift the five-word value right by one
        shrd    rax, r8, 1
        shrd    r8, r9, 1
        shrd    r9, r10, 1
        shrd    r10, r13, 1
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r8
        mov     QWORD PTR [rcx+16], r9
        mov     QWORD PTR [rcx+24], r10
        pop     r13
        pop     r12
        ret
sp_256_div2_4 ENDP
_text ENDS
|
|
IFNDEF WC_NO_CACHE_RESISTANT
|
|
; /* Touch each possible point that could be being copied.
;  *
;  * r      Point to copy into (rcx).
;  * table  Table - start of the entries to access (rdx).
;  * idx    Index of point to retrieve (r8d).
;  *
;  * Entries are 200 bytes apart.  Entries 1..32 are all read, whatever idx
;  * is; each is ANDed with an all-ones/all-zeros compare mask and ORed into
;  * the accumulators, so the memory access pattern does not depend on idx.
;  * From the selected entry the 32 bytes at offsets 0, 64 and 128 are written
;  * to the same offsets of r.  An idx outside 1..32 yields all zeros.
;  *
;  * xmm6-xmm15 are nonvolatile in the Microsoft x64 calling convention, so
;  * they are saved on entry and restored before returning.
;  */
_text SEGMENT READONLY PARA
sp_256_get_point_33_4 PROC
        ; Save the nonvolatile XMM registers used below.  movdqu is used
        ; because rsp is not assumed to be 16-byte aligned here.
        sub     rsp, 160
        movdqu  OWORD PTR [rsp], xmm6
        movdqu  OWORD PTR [rsp+16], xmm7
        movdqu  OWORD PTR [rsp+32], xmm8
        movdqu  OWORD PTR [rsp+48], xmm9
        movdqu  OWORD PTR [rsp+64], xmm10
        movdqu  OWORD PTR [rsp+80], xmm11
        movdqu  OWORD PTR [rsp+96], xmm12
        movdqu  OWORD PTR [rsp+112], xmm13
        movdqu  OWORD PTR [rsp+128], xmm14
        movdqu  OWORD PTR [rsp+144], xmm15
        mov     rax, 1
        movd    xmm13, r8d
        add     rdx, 200                        ; start at entry 1
        movd    xmm15, eax
        mov     rax, 32                         ; entries to scan
        pshufd  xmm15, xmm15, 0                 ; xmm15 = { 1, 1, 1, 1 }
        pshufd  xmm13, xmm13, 0                 ; xmm13 = { idx, idx, idx, idx }
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        pxor    xmm4, xmm4
        pxor    xmm5, xmm5
        movdqa  xmm14, xmm15                    ; xmm14 = current entry number
L_256_get_point_33_4_start:
        movdqa  xmm12, xmm14
        paddd   xmm14, xmm15
        pcmpeqd xmm12, xmm13                    ; all ones when entry == idx
        movdqu  xmm6, [rdx]
        movdqu  xmm7, [rdx+16]
        movdqu  xmm8, [rdx+64]
        movdqu  xmm9, [rdx+80]
        movdqu  xmm10, [rdx+128]
        movdqu  xmm11, [rdx+144]
        add     rdx, 200
        pand    xmm6, xmm12
        pand    xmm7, xmm12
        pand    xmm8, xmm12
        pand    xmm9, xmm12
        pand    xmm10, xmm12
        pand    xmm11, xmm12
        por     xmm0, xmm6
        por     xmm1, xmm7
        por     xmm2, xmm8
        por     xmm3, xmm9
        por     xmm4, xmm10
        por     xmm5, xmm11
        dec     rax
        jnz     L_256_get_point_33_4_start
        movdqu  [rcx], xmm0
        movdqu  [rcx+16], xmm1
        movdqu  [rcx+64], xmm2
        movdqu  [rcx+80], xmm3
        movdqu  [rcx+128], xmm4
        movdqu  [rcx+144], xmm5
        ; Restore the caller's XMM registers.
        movdqu  xmm6, OWORD PTR [rsp]
        movdqu  xmm7, OWORD PTR [rsp+16]
        movdqu  xmm8, OWORD PTR [rsp+32]
        movdqu  xmm9, OWORD PTR [rsp+48]
        movdqu  xmm10, OWORD PTR [rsp+64]
        movdqu  xmm11, OWORD PTR [rsp+80]
        movdqu  xmm12, OWORD PTR [rsp+96]
        movdqu  xmm13, OWORD PTR [rsp+112]
        movdqu  xmm14, OWORD PTR [rsp+128]
        movdqu  xmm15, OWORD PTR [rsp+144]
        add     rsp, 160
        ret
sp_256_get_point_33_4 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Touch each possible point that could be being copied.
;  *
;  * r      Point to copy into (rcx).
;  * table  Table - start of the entries to access (rdx).
;  * idx    Index of point to retrieve (r8d).
;  *
;  * Entries are 200 bytes apart.  Entries 1..32 are all read, whatever idx
;  * is; each is ANDed with an all-ones/all-zeros compare mask and ORed into
;  * the accumulators, so the memory access pattern does not depend on idx.
;  * From the selected entry the 32 bytes at offsets 0, 64 and 128 are written
;  * to the same offsets of r.  An idx outside 1..32 yields all zeros.
;  *
;  * xmm6-xmm9 (the low halves of ymm6-ymm9) are nonvolatile in the
;  * Microsoft x64 calling convention, so they are saved on entry and
;  * restored before returning.
;  */
_text SEGMENT READONLY PARA
sp_256_get_point_33_avx2_4 PROC
        ; Save the nonvolatile XMM registers used below.
        sub     rsp, 64
        vmovdqu OWORD PTR [rsp], xmm6
        vmovdqu OWORD PTR [rsp+16], xmm7
        vmovdqu OWORD PTR [rsp+32], xmm8
        vmovdqu OWORD PTR [rsp+48], xmm9
        mov     rax, 1
        movd    xmm7, r8d
        add     rdx, 200                        ; start at entry 1
        movd    xmm9, eax
        mov     rax, 32                         ; entries to scan
        vpxor   ymm8, ymm8, ymm8
        ; An all-zero index vector makes vpermd copy element 0 to every lane.
        vpermd  ymm7, ymm8, ymm7                ; ymm7 = idx in every lane
        vpermd  ymm9, ymm8, ymm9                ; ymm9 = 1 in every lane
        vpxor   ymm0, ymm0, ymm0
        vpxor   ymm1, ymm1, ymm1
        vpxor   ymm2, ymm2, ymm2
        vmovdqa ymm8, ymm9                      ; ymm8 = current entry number
L_256_get_point_33_avx2_4_start:
        vpcmpeqd        ymm6, ymm8, ymm7        ; all ones when entry == idx
        vpaddd  ymm8, ymm8, ymm9
        vmovupd ymm3, [rdx]
        vmovupd ymm4, [rdx+64]
        vmovupd ymm5, [rdx+128]
        add     rdx, 200
        vpand   ymm3, ymm3, ymm6
        vpand   ymm4, ymm4, ymm6
        vpand   ymm5, ymm5, ymm6
        vpor    ymm0, ymm0, ymm3
        vpor    ymm1, ymm1, ymm4
        vpor    ymm2, ymm2, ymm5
        dec     rax
        jnz     L_256_get_point_33_avx2_4_start
        vmovupd YMMWORD PTR [rcx], ymm0
        vmovupd YMMWORD PTR [rcx+64], ymm1
        vmovupd YMMWORD PTR [rcx+128], ymm2
        ; Restore the caller's XMM registers.
        vmovdqu xmm6, OWORD PTR [rsp]
        vmovdqu xmm7, OWORD PTR [rsp+16]
        vmovdqu xmm8, OWORD PTR [rsp+32]
        vmovdqu xmm9, OWORD PTR [rsp+48]
        add     rsp, 64
        ret
sp_256_get_point_33_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply two Montgomery form numbers mod the modulus (prime).
;  * (r = a * b mod m)
;  *
;  * r   Result of multiplication (rcx); four words are written.
;  * a   First number to multiply in Montgomery form (rdx).
;  * b   Second number to multiply in Montgomery form (r8).
;  * m   Modulus (prime).  Not read: the modulus words
;  *     { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  * mp  Montgomery multiplier.  Not read.
;  *
;  * The product is formed with MULX and two interleaved carry chains
;  * (ADCX uses CF, ADOX uses OF); the order of those instructions is
;  * significant.  The 8-word product t ends up in r8..r15 (low to high).
;  *
;  * The reduction only tracks words 3..7 of t + mu * m.  Words 0..2 of that
;  * sum are always zero, but they can carry 1 into word 3.  That carry equals
;  * the carry out of word 2 of t[0..2] + (t[0..2] << 96), computed while
;  * building mu; it is captured there and injected when mu << 192 is added.
;  * Dropping it makes the result one too small whenever mu[3] == 0 and the
;  * carry is set (for example a = 2^256 mod m, b = 1).
;  */
_text SEGMENT READONLY PARA
sp_256_mont_mul_avx2_4 PROC
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
        push    rbp
        push    rdi
        push    rsi
        mov     rbp, r8                         ; rbp = b
        mov     rdi, rdx                        ; rdi = a (mulx uses rdx)
        ; A[0] * B[0]
        mov     rdx, QWORD PTR [rbp]
        mulx    r9, r8, QWORD PTR [rdi]
        ; A[2] * B[0]
        mulx    r11, r10, QWORD PTR [rdi+16]
        ; A[1] * B[0]
        mulx    rsi, rax, QWORD PTR [rdi+8]
        xor     r15, r15                        ; r15 = 0, clears CF and OF
        adcx    r9, rax
        ; A[1] * B[3]
        mov     rdx, QWORD PTR [rbp+24]
        mulx    r13, r12, QWORD PTR [rdi+8]
        adcx    r10, rsi
        ; A[0] * B[1]
        mov     rdx, QWORD PTR [rbp+8]
        mulx    rsi, rax, QWORD PTR [rdi]
        adox    r9, rax
        ; A[2] * B[1]
        mulx    r14, rax, QWORD PTR [rdi+16]
        adox    r10, rsi
        adcx    r11, rax
        ; A[1] * B[2]
        mov     rdx, QWORD PTR [rbp+16]
        mulx    rsi, rax, QWORD PTR [rdi+8]
        adcx    r12, r14
        adox    r11, rax
        adcx    r13, r15
        adox    r12, rsi
        ; A[0] * B[2]
        mulx    rsi, rax, QWORD PTR [rdi]
        adox    r13, r15
        xor     r14, r14                        ; r14 = 0, clears CF and OF
        adcx    r10, rax
        ; A[1] * B[1]
        mov     rdx, QWORD PTR [rbp+8]
        mulx    rax, rdx, QWORD PTR [rdi+8]
        adcx    r11, rsi
        adox    r10, rdx
        ; A[3] * B[1]
        mov     rdx, QWORD PTR [rbp+8]
        adox    r11, rax
        mulx    rsi, rax, QWORD PTR [rdi+24]
        adcx    r12, rax
        ; A[2] * B[2]
        mov     rdx, QWORD PTR [rbp+16]
        mulx    rax, rdx, QWORD PTR [rdi+16]
        adcx    r13, rsi
        adox    r12, rdx
        ; A[3] * B[3]
        mov     rdx, QWORD PTR [rbp+24]
        adox    r13, rax
        mulx    rsi, rax, QWORD PTR [rdi+24]
        adox    r14, r15
        adcx    r14, rax
        ; A[0] * B[3]
        mulx    rax, rdx, QWORD PTR [rdi]
        adcx    r15, rsi
        xor     rsi, rsi                        ; rsi = 0, clears CF and OF
        adcx    r11, rdx
        ; A[3] * B[0]
        mov     rdx, QWORD PTR [rdi+24]
        adcx    r12, rax
        mulx    rax, rbx, QWORD PTR [rbp]
        adox    r11, rbx
        adox    r12, rax
        ; A[3] * B[2]
        mulx    rax, rdx, QWORD PTR [rbp+16]
        adcx    r13, rdx
        ; A[2] * B[3]
        mov     rdx, QWORD PTR [rbp+24]
        adcx    r14, rax
        mulx    rdx, rax, QWORD PTR [rdi+16]
        adcx    r15, rsi
        adox    r13, rax
        adox    r14, rdx
        adox    r15, rsi
        ; Start Reduction
        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
        ;    - a[0] << 32 << 192
        ;   + (a[0] * 2) << 192
        ; mu ends up in rax, rdi, rbp, rdx (low to high).
        mov     rax, r8
        mov     rdx, r11
        add     rdx, r8
        mov     rdi, r9
        add     rdx, r8
        mov     rbp, r10
        ; a[0]-a[2] << 32
        shl     r8, 32
        shld    r10, rdi, 32
        shld    r9, rax, 32
        ;   - a[0] << 32 << 192
        sub     rdx, r8
        ;   + a[0]-a[2] << 32 << 64
        add     rdi, r8
        adc     rbp, r9
        ; Keep the carry out of word 2: it is the carry from words 0..2 of
        ; the full sum into word 3.  setc does not change flags and r9 is
        ; dead from here on.
        setc    r9b
        adc     rdx, r10
        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
        ; r8 accumulates minus the overflow of the 5-word running sum.
        ;   a += mu << 256
        xor     r8, r8
        add     r12, rax
        adc     r13, rdi
        adc     r14, rbp
        adc     r15, rdx
        sbb     r8, 0
        ;   a += mu << 192, plus the carry saved above
        bt      r9, 0                           ; CF = saved carry
        adc     r11, rax
        adc     r12, rdi
        adc     r13, rbp
        adc     r14, rdx
        adc     r15, 0
        sbb     r8, 0
        ; mu <<= 32  (five words: rax, rdi, rbp, rdx, rsi)
        mov     rsi, rdx
        shld    rdx, rbp, 32
        shld    rbp, rdi, 32
        shld    rdi, rax, 32
        shr     rsi, 32
        shl     rax, 32
        ;   a += (mu << 32) << 64
        add     r11, rbp
        adc     r12, rdx
        adc     r13, rsi
        adc     r14, 0
        adc     r15, 0
        sbb     r8, 0
        ;   a -= (mu << 32) << 192
        sub     r11, rax
        sbb     r12, rdi
        sbb     r13, rbp
        sbb     r14, rdx
        sbb     r15, rsi
        adc     r8, 0
        mov     rax, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     rdi, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        ; mask m and sub from result if overflow
        ;   m[0] = -1 & mask = mask
        and     rax, r8
        ;   m[2] =  0 & mask = 0
        and     rdi, r8
        sub     r12, r8
        sbb     r13, rax
        sbb     r14, 0
        sbb     r15, rdi
        mov     QWORD PTR [rcx], r12
        mov     QWORD PTR [rcx+8], r13
        mov     QWORD PTR [rcx+16], r14
        mov     QWORD PTR [rcx+24], r15
        pop     rsi
        pop     rdi
        pop     rbp
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        ret
sp_256_mont_mul_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
;  *
;  * r   Result of squaring (rcx); four words are written.
;  * a   Number to square in Montgomery form (rdx).
;  * m   Modulus (prime).  Not read: the modulus words
;  *     { -1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 } are hard-coded.
;  * mp  Montgomery multiplier.  Not read.
;  *
;  * The off-diagonal products are summed with MULX and two interleaved carry
;  * chains (ADCX uses CF, ADOX uses OF), then doubled on the CF chain while
;  * the squares A[i] * A[i] are added on the OF chain; the order of those
;  * instructions is significant.  The 8-word result t ends up in r8..r15
;  * (low to high).
;  *
;  * The reduction only tracks words 3..7 of t + mu * m.  Words 0..2 of that
;  * sum are always zero, but they can carry 1 into word 3.  That carry equals
;  * the carry out of word 2 of t[0..2] + (t[0..2] << 96), computed while
;  * building mu; it is captured there and injected when mu << 192 is added.
;  * Dropping it makes the result one too small whenever mu[3] == 0 and the
;  * carry is set.
;  */
_text SEGMENT READONLY PARA
sp_256_mont_sqr_avx2_4 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        mov     rax, rdx                        ; rax = a (mulx uses rdx)
        ; A[0] * A[1]
        mov     rdx, QWORD PTR [rax]
        mov     r15, QWORD PTR [rax+16]
        mulx    r10, r9, QWORD PTR [rax+8]
        ; A[0] * A[3]
        mulx    r12, r11, QWORD PTR [rax+24]
        ; A[2] * A[1]
        mov     rdx, r15
        mulx    rbx, rsi, QWORD PTR [rax+8]
        ; A[2] * A[3]
        mulx    r14, r13, QWORD PTR [rax+24]
        xor     r15, r15                        ; r15 = 0, clears CF and OF
        adox    r11, rsi
        adox    r12, rbx
        ; A[2] * A[0]
        mulx    rbx, rsi, QWORD PTR [rax]
        ; A[1] * A[3]
        mov     rdx, QWORD PTR [rax+8]
        adox    r13, r15
        mulx    r8, rdi, QWORD PTR [rax+24]
        adcx    r10, rsi
        adox    r14, r15
        adcx    r11, rbx
        adcx    r12, rdi
        adcx    r13, r8
        adcx    r14, r15
        ; Double with Carry Flag
        xor     r15, r15                        ; r15 = 0, clears CF and OF
        ; A[0] * A[0]
        mov     rdx, QWORD PTR [rax]
        mulx    rdi, r8, rdx
        adcx    r9, r9
        adcx    r10, r10
        adox    r9, rdi
        ; A[1] * A[1]
        mov     rdx, QWORD PTR [rax+8]
        mulx    rbx, rsi, rdx
        adcx    r11, r11
        adox    r10, rsi
        ; A[2] * A[2]
        mov     rdx, QWORD PTR [rax+16]
        mulx    rsi, rdi, rdx
        adcx    r12, r12
        adox    r11, rbx
        adcx    r13, r13
        adox    r12, rdi
        adcx    r14, r14
        ; A[3] * A[3]
        mov     rdx, QWORD PTR [rax+24]
        mulx    rbx, rdi, rdx
        adox    r13, rsi
        adcx    r15, r15
        adox    r14, rdi
        adox    r15, rbx
        ; Start Reduction
        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
        ;    - a[0] << 32 << 192
        ;   + (a[0] * 2) << 192
        ; mu ends up in rdi, rax, rsi, rdx (low to high).
        mov     rdi, r8
        mov     rdx, r11
        add     rdx, r8
        mov     rax, r9
        add     rdx, r8
        mov     rsi, r10
        ; a[0]-a[2] << 32
        shl     r8, 32
        shld    r10, rax, 32
        shld    r9, rdi, 32
        ;   - a[0] << 32 << 192
        sub     rdx, r8
        ;   + a[0]-a[2] << 32 << 64
        add     rax, r8
        adc     rsi, r9
        ; Keep the carry out of word 2: it is the carry from words 0..2 of
        ; the full sum into word 3.  setc does not change flags and r9 is
        ; dead from here on.
        setc    r9b
        adc     rdx, r10
        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
        ; r8 accumulates minus the overflow of the 5-word running sum.
        ;   a += mu << 256
        xor     r8, r8
        add     r12, rdi
        adc     r13, rax
        adc     r14, rsi
        adc     r15, rdx
        sbb     r8, 0
        ;   a += mu << 192, plus the carry saved above
        bt      r9, 0                           ; CF = saved carry
        adc     r11, rdi
        adc     r12, rax
        adc     r13, rsi
        adc     r14, rdx
        adc     r15, 0
        sbb     r8, 0
        ; mu <<= 32  (five words: rdi, rax, rsi, rdx, rbx)
        mov     rbx, rdx
        shld    rdx, rsi, 32
        shld    rsi, rax, 32
        shld    rax, rdi, 32
        shr     rbx, 32
        shl     rdi, 32
        ;   a += (mu << 32) << 64
        add     r11, rsi
        adc     r12, rdx
        adc     r13, rbx
        adc     r14, 0
        adc     r15, 0
        sbb     r8, 0
        ;   a -= (mu << 32) << 192
        sub     r11, rdi
        sbb     r12, rax
        sbb     r13, rsi
        sbb     r14, rdx
        sbb     r15, rbx
        adc     r8, 0
        mov     rdi, 4294967295                 ; 0x00000000FFFFFFFF = m[1]
        mov     rax, 18446744069414584321       ; 0xFFFFFFFF00000001 = m[3]
        ; mask m and sub from result if overflow
        ;   m[0] = -1 & mask = mask
        and     rdi, r8
        ;   m[2] =  0 & mask = 0
        and     rax, r8
        sub     r12, r8
        sbb     r13, rdi
        sbb     r14, 0
        sbb     r15, rax
        mov     QWORD PTR [rcx], r12
        mov     QWORD PTR [rcx+8], r13
        mov     QWORD PTR [rcx+16], r14
        mov     QWORD PTR [rcx+24], r15
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_256_mont_sqr_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally subtract b from a using the mask m.
;  * m is -1 to subtract and 0 when not subtracting.
;  *
;  * r  A single precision number representing condition subtract result (rcx).
;  * a  A single precision number to subtract from (rdx).
;  * b  A single precision number to subtract (r8).
;  * m  Mask value to apply (r9).
;  *
;  * Computes r = a - (b & m) over four words.  rax returns 0 when no borrow
;  * left the top word and all ones (-1) when one did.
;  *
;  * Only volatile registers are used.  All of b is read before anything is
;  * written, and word i of a is read before word i of r is written, so r may
;  * be the same buffer as a or as b.
;  */
_text SEGMENT READONLY PARA
sp_256_cond_sub_avx2_4 PROC
        ; Load b; the pointer in r8 is dead after the last load, so it holds
        ; the top word.
        mov     r10, QWORD PTR [r8]
        mov     r11, QWORD PTR [r8+8]
        mov     rax, QWORD PTR [r8+16]
        mov     r8, QWORD PTR [r8+24]
        ; Mask b.  AND rewrites CF, so this must finish before the borrow
        ; chain starts.
        and     r10, r9
        and     r11, r9
        and     rax, r9
        and     r8, r9
        ; r = a - (b & m).  The mask is no longer needed, so r9 is the
        ; working register; mov leaves the borrow in CF untouched.
        mov     r9, QWORD PTR [rdx]
        sub     r9, r10
        mov     QWORD PTR [rcx], r9
        mov     r9, QWORD PTR [rdx+8]
        sbb     r9, r11
        mov     QWORD PTR [rcx+8], r9
        mov     r9, QWORD PTR [rdx+16]
        sbb     r9, rax
        mov     QWORD PTR [rcx+16], r9
        mov     r9, QWORD PTR [rdx+24]
        sbb     r9, r8
        mov     QWORD PTR [rcx+24], r9
        sbb     rax, rax                        ; rax = 0 - borrow
        ret
sp_256_cond_sub_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Reduce the number back to 256 bits using Montgomery reduction.
|
|
; *
|
|
; * a A single precision number to reduce in place.
|
|
; * m The single precision number representing the modulus.
|
|
; * mp The digit representing the negative inverse of m mod 2^n.
|
|
; *
; * Win64 arguments: rcx = a, rdx = m, r8 = mp. They are moved to rax, r10 and
; * r11 because rcx and rdx are reused by the mulx instructions.
; * Eight words of a are read (offsets 0-56). Four rounds each fold one low
; * word away, and the remaining four words are stored to a[0..3].
; * Two carry chains run interleaved: adcx uses only CF, adox uses only OF.
; * r12 is kept at zero throughout; r13 carries the overflow between rounds.
; * NOTE(review): the final conditional subtraction uses immediate constants
; * rather than the words at m - presumably they equal the only modulus this
; * routine is called with; confirm against the caller.
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_256_mont_reduce_avx2_4 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
mov rax, rcx ; rax = a
|
|
mov r10, rdx ; r10 = m
|
|
mov r11, r8 ; r11 = mp
|
|
mov r14, QWORD PTR [rax]
|
|
mov r15, QWORD PTR [rax+8]
|
|
mov rdi, QWORD PTR [rax+16]
|
|
mov rsi, QWORD PTR [rax+24]
|
|
xor r13, r13
|
|
xor r12, r12 ; r12 = 0 from here on; xor also clears CF and OF
|
|
; a[0-4] += m[0-3] * mu = m[0-3] * (a[0] * mp)
|
|
mov rbx, QWORD PTR [rax+32]
|
|
; mu = a[0] * mp
|
|
mov rdx, r14
|
|
mulx rcx, rdx, r11 ; rdx = low word of the product; rcx is overwritten before use
|
|
; a[0] += m[0] * mu
|
|
mulx r9, r8, QWORD PTR [r10]
|
|
adcx r14, r8
|
|
; a[1] += m[1] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+8]
|
|
adox r15, r9
|
|
adcx r15, r8
|
|
; a[2] += m[2] * mu
|
|
mulx r9, r8, QWORD PTR [r10+16]
|
|
adox rdi, rcx
|
|
adcx rdi, r8
|
|
; a[3] += m[3] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+24]
|
|
adox rsi, r9
|
|
adcx rsi, r8
|
|
; a[4] += carry
|
|
adox rbx, rcx
|
|
adcx rbx, r12
|
|
; carry
|
|
adox r13, r12 ; r13 = OF + CF left over from this round
|
|
adcx r13, r12
|
|
; a[1-5] += m[0-3] * mu = m[0-3] * (a[1] * mp)
|
|
mov r14, QWORD PTR [rax+40]
|
|
; mu = a[1] * mp
|
|
mov rdx, r15
|
|
mulx rcx, rdx, r11
|
|
; a[1] += m[0] * mu
|
|
mulx r9, r8, QWORD PTR [r10]
|
|
adcx r15, r8
|
|
; a[2] += m[1] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+8]
|
|
adox rdi, r9
|
|
adcx rdi, r8
|
|
; a[3] += m[2] * mu
|
|
mulx r9, r8, QWORD PTR [r10+16]
|
|
adox rsi, rcx
|
|
adcx rsi, r8
|
|
; a[4] += m[3] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+24]
|
|
adox rbx, r9
|
|
adcx rbx, r8
|
|
; a[5] += carry
|
|
adox r14, rcx
|
|
adcx r14, r13 ; adds the carry saved by the previous round
|
|
mov r13, r12 ; reset r13 to zero without touching the flags
|
|
; carry
|
|
adox r13, r12
|
|
adcx r13, r12
|
|
; a[2-6] += m[0-3] * mu = m[0-3] * (a[2] * mp)
|
|
mov r15, QWORD PTR [rax+48]
|
|
; mu = a[2] * mp
|
|
mov rdx, rdi
|
|
mulx rcx, rdx, r11
|
|
; a[2] += m[0] * mu
|
|
mulx r9, r8, QWORD PTR [r10]
|
|
adcx rdi, r8
|
|
; a[3] += m[1] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+8]
|
|
adox rsi, r9
|
|
adcx rsi, r8
|
|
; a[4] += m[2] * mu
|
|
mulx r9, r8, QWORD PTR [r10+16]
|
|
adox rbx, rcx
|
|
adcx rbx, r8
|
|
; a[5] += m[3] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+24]
|
|
adox r14, r9
|
|
adcx r14, r8
|
|
; a[6] += carry
|
|
adox r15, rcx
|
|
adcx r15, r13
|
|
mov r13, r12
|
|
; carry
|
|
adox r13, r12
|
|
adcx r13, r12
|
|
; a[3-7] += m[0-3] * mu = m[0-3] * (a[3] * mp)
|
|
mov rdi, QWORD PTR [rax+56]
|
|
; mu = a[3] * mp
|
|
mov rdx, rsi
|
|
mulx rcx, rdx, r11
|
|
; a[3] += m[0] * mu
|
|
mulx r9, r8, QWORD PTR [r10]
|
|
adcx rsi, r8
|
|
; a[4] += m[1] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+8]
|
|
adox rbx, r9
|
|
adcx rbx, r8
|
|
; a[5] += m[2] * mu
|
|
mulx r9, r8, QWORD PTR [r10+16]
|
|
adox r14, rcx
|
|
adcx r14, r8
|
|
; a[6] += m[3] * mu
|
|
mulx rcx, r8, QWORD PTR [r10+24]
|
|
adox r15, r9
|
|
adcx r15, r8
|
|
; a[7] += carry
|
|
adox rdi, rcx
|
|
adcx rdi, r13
|
|
mov r13, r12
|
|
; carry
|
|
adox r13, r12
|
|
adcx r13, r12
|
|
; Subtract mod if carry
|
|
neg r13 ; r13 = 0 - carry, used as an AND mask below
|
|
mov r8, 17562291160714782033
|
|
mov r9, 13611842547513532036
|
|
mov rdx, 18446744069414584320
|
|
and r8, r13
|
|
and r9, r13
|
|
and rdx, r13
|
|
sub rbx, r8
|
|
sbb r14, r9
|
|
sbb r15, r13 ; the third word subtracted is the mask itself
|
|
sbb rdi, rdx
|
|
mov QWORD PTR [rax], rbx
|
|
mov QWORD PTR [rax+8], r14
|
|
mov QWORD PTR [rax+16], r15
|
|
mov QWORD PTR [rax+24], rdi
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_256_mont_reduce_avx2_4 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Halve a number modulo a prime. (r = a / 2 % m)
;  * When a is odd the modulus is added first so that the shift is exact.
;  *
;  * r  [rcx]  Result of division by 2 (four words).
;  * a  [rdx]  Number to divide (four words).
;  * m  [r8]   Modulus (prime). This routine does not read it: the modulus
;  *           words are built from the immediates below (the lowest word is
;  *           the all-ones mask itself and the third word is zero).
;  */
_text SEGMENT READONLY PARA
sp_256_div2_avx2_4 PROC
        push    r12
        push    r13
        mov     rax, QWORD PTR [rdx]
        ; r13 = all ones when a is odd, zero when a is even
        mov     r13, rax
        and     r13, 1
        neg     r13
        ; Masked copies of the second and fourth modulus words
        mov     r11, 4294967295
        mov     r12, 18446744069414584321
        and     r11, r13
        and     r12, r13
        mov     r8, QWORD PTR [rdx+8]
        mov     r9, QWORD PTR [rdx+16]
        mov     r10, QWORD PTR [rdx+24]
        ; a += (modulus AND mask)
        add     rax, r13
        adc     r8, r11
        adc     r9, 0
        adc     r10, r12
        ; Capture the carry out as a fifth word; mov keeps CF intact.
        mov     r13, 0
        adc     r13, 0
        ; Shift the five-word value right by one bit
        shrd    rax, r8, 1
        shrd    r8, r9, 1
        shrd    r9, r10, 1
        shrd    r10, r13, 1
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r8
        mov     QWORD PTR [rcx+16], r9
        mov     QWORD PTR [rcx+24], r10
        pop     r13
        pop     r12
        ret
sp_256_div2_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
IFNDEF WC_NO_CACHE_RESISTANT
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * Entries are 64 bytes. table[0] is skipped; table[1]..table[63] are all
;  * loaded, masked with (entry number == idx) and ORed together, so the same
;  * memory is read whatever idx is. idx = 0 matches no entry and leaves the
;  * output all zeros.
;  *
;  * r      [rcx]  Point to copy into. Bytes 0-31 of the entry go to r+0 and
;  *               bytes 32-63 go to r+64.
;  * table  [rdx]  Table - start of the entries to access.
;  * idx    [r8d]  Index of entry to retrieve (compared as a 32-bit value).
;  *
;  * xmm6-xmm11 are callee-saved in the Microsoft x64 calling convention, so
;  * they are spilled on entry and reloaded before returning.
;  */
_text SEGMENT READONLY PARA
sp_256_get_entry_64_4 PROC
        ; Save the non-volatile XMM registers used below. rsp is only 8-byte
        ; aligned here, hence the unaligned moves.
        sub     rsp, 96
        movdqu  [rsp], xmm6
        movdqu  [rsp+16], xmm7
        movdqu  [rsp+32], xmm8
        movdqu  [rsp+48], xmm9
        movdqu  [rsp+64], xmm10
        movdqu  [rsp+80], xmm11
        ; Start from entry 1
        mov     rax, 1
        movd    xmm9, r8d
        add     rdx, 64
        movd    xmm11, eax
        ; 63 entries to visit
        mov     rax, 63
        ; xmm11 = {1,1,1,1}, xmm9 = {idx,idx,idx,idx}
        pshufd  xmm11, xmm11, 0
        pshufd  xmm9, xmm9, 0
        ; Accumulators
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        ; xmm10 = number of the entry about to be read
        movdqa  xmm10, xmm11
L_256_get_entry_64_4_start:
        ; xmm8 = all ones when this entry is the requested one, else zero
        movdqa  xmm8, xmm10
        paddd   xmm10, xmm11
        pcmpeqd xmm8, xmm9
        movdqu  xmm4, [rdx]
        movdqu  xmm5, [rdx+16]
        movdqu  xmm6, [rdx+32]
        movdqu  xmm7, [rdx+48]
        add     rdx, 64
        pand    xmm4, xmm8
        pand    xmm5, xmm8
        pand    xmm6, xmm8
        pand    xmm7, xmm8
        por     xmm0, xmm4
        por     xmm1, xmm5
        por     xmm2, xmm6
        por     xmm3, xmm7
        dec     rax
        jnz     L_256_get_entry_64_4_start
        movdqu  [rcx], xmm0
        movdqu  [rcx+16], xmm1
        movdqu  [rcx+64], xmm2
        movdqu  [rcx+80], xmm3
        ; Restore the non-volatile XMM registers
        movdqu  xmm6, [rsp]
        movdqu  xmm7, [rsp+16]
        movdqu  xmm8, [rsp+32]
        movdqu  xmm9, [rsp+48]
        movdqu  xmm10, [rsp+64]
        movdqu  xmm11, [rsp+80]
        add     rsp, 96
        ret
sp_256_get_entry_64_4 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * Entries are 64 bytes. table[0] is skipped; table[1]..table[63] are all
;  * loaded, masked with (entry number == idx) and ORed together, so the same
;  * memory is read whatever idx is. idx = 0 matches no entry and leaves the
;  * output all zeros.
;  *
;  * The loop runs 63 times, matching sp_256_get_entry_64_4. A count of 64
;  * would also load the 64 bytes following table[63]; that extra entry can
;  * never be selected, so it only reads past the entries that matter.
;  *
;  * r      [rcx]  Point to copy into. Bytes 0-31 of the entry go to r+0 and
;  *               bytes 32-63 go to r+64.
;  * table  [rdx]  Table - start of the entries to access.
;  * idx    [r8d]  Index of entry to retrieve (compared as a 32-bit value).
;  *
;  * xmm6 and xmm7 are callee-saved in the Microsoft x64 calling convention
;  * (low 128 bits), so they are spilled on entry and reloaded before returning.
;  */
_text SEGMENT READONLY PARA
sp_256_get_entry_64_avx2_4 PROC
        ; Save the non-volatile XMM registers used below. rsp is only 8-byte
        ; aligned here, hence the unaligned moves.
        sub     rsp, 32
        vmovdqu XMMWORD PTR [rsp], xmm6
        vmovdqu XMMWORD PTR [rsp+16], xmm7
        ; Start from entry 1
        mov     rax, 1
        movd    xmm5, r8d
        add     rdx, 64
        movd    xmm7, eax
        ; 63 entries to visit
        mov     rax, 63
        ; A zero index vector makes vpermd broadcast element 0:
        ; ymm5 = idx in every lane, ymm7 = 1 in every lane
        vpxor   ymm6, ymm6, ymm6
        vpermd  ymm5, ymm6, ymm5
        vpermd  ymm7, ymm6, ymm7
        ; Accumulators
        vpxor   ymm0, ymm0, ymm0
        vpxor   ymm1, ymm1, ymm1
        ; ymm6 = number of the entry about to be read
        vmovdqa ymm6, ymm7
L_256_get_entry_64_avx2_4_start:
        ; ymm4 = all ones when this entry is the requested one, else zero
        vpcmpeqd        ymm4, ymm6, ymm5
        vpaddd  ymm6, ymm6, ymm7
        vmovupd ymm2, [rdx]
        vmovupd ymm3, [rdx+32]
        add     rdx, 64
        vpand   ymm2, ymm2, ymm4
        vpand   ymm3, ymm3, ymm4
        vpor    ymm0, ymm0, ymm2
        vpor    ymm1, ymm1, ymm3
        dec     rax
        jnz     L_256_get_entry_64_avx2_4_start
        vmovupd YMMWORD PTR [rcx], ymm0
        vmovupd YMMWORD PTR [rcx+64], ymm1
        ; Clear the upper YMM halves before returning to non-AVX code
        vzeroupper
        ; Restore the non-volatile XMM registers
        vmovdqu xmm6, XMMWORD PTR [rsp]
        vmovdqu xmm7, XMMWORD PTR [rsp+16]
        add     rsp, 32
        ret
sp_256_get_entry_64_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFNDEF WC_NO_CACHE_RESISTANT
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * Entries are 64 bytes. table[0] is skipped; table[1]..table[64] are all
;  * loaded, masked with (entry number == idx) and ORed together, so the same
;  * memory is read whatever idx is. idx = 0 matches no entry and leaves the
;  * output all zeros.
;  *
;  * r      [rcx]  Point to copy into. Bytes 0-31 of the entry go to r+0 and
;  *               bytes 32-63 go to r+64.
;  * table  [rdx]  Table - start of the entries to access.
;  * idx    [r8d]  Index of entry to retrieve (compared as a 32-bit value).
;  *
;  * xmm6-xmm11 are callee-saved in the Microsoft x64 calling convention, so
;  * they are spilled on entry and reloaded before returning.
;  */
_text SEGMENT READONLY PARA
sp_256_get_entry_65_4 PROC
        ; Save the non-volatile XMM registers used below. rsp is only 8-byte
        ; aligned here, hence the unaligned moves.
        sub     rsp, 96
        movdqu  [rsp], xmm6
        movdqu  [rsp+16], xmm7
        movdqu  [rsp+32], xmm8
        movdqu  [rsp+48], xmm9
        movdqu  [rsp+64], xmm10
        movdqu  [rsp+80], xmm11
        ; Start from entry 1
        mov     rax, 1
        movd    xmm9, r8d
        add     rdx, 64
        movd    xmm11, eax
        ; 64 entries to visit
        mov     rax, 64
        ; xmm11 = {1,1,1,1}, xmm9 = {idx,idx,idx,idx}
        pshufd  xmm11, xmm11, 0
        pshufd  xmm9, xmm9, 0
        ; Accumulators
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        ; xmm10 = number of the entry about to be read
        movdqa  xmm10, xmm11
L_256_get_entry_65_4_start:
        ; xmm8 = all ones when this entry is the requested one, else zero
        movdqa  xmm8, xmm10
        paddd   xmm10, xmm11
        pcmpeqd xmm8, xmm9
        movdqu  xmm4, [rdx]
        movdqu  xmm5, [rdx+16]
        movdqu  xmm6, [rdx+32]
        movdqu  xmm7, [rdx+48]
        add     rdx, 64
        pand    xmm4, xmm8
        pand    xmm5, xmm8
        pand    xmm6, xmm8
        pand    xmm7, xmm8
        por     xmm0, xmm4
        por     xmm1, xmm5
        por     xmm2, xmm6
        por     xmm3, xmm7
        dec     rax
        jnz     L_256_get_entry_65_4_start
        movdqu  [rcx], xmm0
        movdqu  [rcx+16], xmm1
        movdqu  [rcx+64], xmm2
        movdqu  [rcx+80], xmm3
        ; Restore the non-volatile XMM registers
        movdqu  xmm6, [rsp]
        movdqu  xmm7, [rsp+16]
        movdqu  xmm8, [rsp+32]
        movdqu  xmm9, [rsp+48]
        movdqu  xmm10, [rsp+64]
        movdqu  xmm11, [rsp+80]
        add     rsp, 96
        ret
sp_256_get_entry_65_4 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * Entries are 64 bytes. table[0] is skipped; table[1]..table[64] are all
;  * loaded, masked with (entry number == idx) and ORed together, so the same
;  * memory is read whatever idx is. idx = 0 matches no entry and leaves the
;  * output all zeros.
;  *
;  * The loop runs 64 times, matching sp_256_get_entry_65_4. A count of 65
;  * would also load the 64 bytes following table[64]; that extra entry can
;  * never be selected, so it only reads past the entries that matter.
;  *
;  * r      [rcx]  Point to copy into. Bytes 0-31 of the entry go to r+0 and
;  *               bytes 32-63 go to r+64.
;  * table  [rdx]  Table - start of the entries to access.
;  * idx    [r8d]  Index of entry to retrieve (compared as a 32-bit value).
;  *
;  * xmm6 and xmm7 are callee-saved in the Microsoft x64 calling convention
;  * (low 128 bits), so they are spilled on entry and reloaded before returning.
;  */
_text SEGMENT READONLY PARA
sp_256_get_entry_65_avx2_4 PROC
        ; Save the non-volatile XMM registers used below. rsp is only 8-byte
        ; aligned here, hence the unaligned moves.
        sub     rsp, 32
        vmovdqu XMMWORD PTR [rsp], xmm6
        vmovdqu XMMWORD PTR [rsp+16], xmm7
        ; Start from entry 1
        mov     rax, 1
        movd    xmm5, r8d
        add     rdx, 64
        movd    xmm7, eax
        ; 64 entries to visit
        mov     rax, 64
        ; A zero index vector makes vpermd broadcast element 0:
        ; ymm5 = idx in every lane, ymm7 = 1 in every lane
        vpxor   ymm6, ymm6, ymm6
        vpermd  ymm5, ymm6, ymm5
        vpermd  ymm7, ymm6, ymm7
        ; Accumulators
        vpxor   ymm0, ymm0, ymm0
        vpxor   ymm1, ymm1, ymm1
        ; ymm6 = number of the entry about to be read
        vmovdqa ymm6, ymm7
L_256_get_entry_65_avx2_4_start:
        ; ymm4 = all ones when this entry is the requested one, else zero
        vpcmpeqd        ymm4, ymm6, ymm5
        vpaddd  ymm6, ymm6, ymm7
        vmovupd ymm2, [rdx]
        vmovupd ymm3, [rdx+32]
        add     rdx, 64
        vpand   ymm2, ymm2, ymm4
        vpand   ymm3, ymm3, ymm4
        vpor    ymm0, ymm0, ymm2
        vpor    ymm1, ymm1, ymm3
        dec     rax
        jnz     L_256_get_entry_65_avx2_4_start
        vmovupd YMMWORD PTR [rcx], ymm0
        vmovupd YMMWORD PTR [rcx+64], ymm1
        ; Clear the upper YMM halves before returning to non-AVX code
        vzeroupper
        ; Restore the non-volatile XMM registers
        vmovdqu xmm6, XMMWORD PTR [rsp]
        vmovdqu xmm7, XMMWORD PTR [rsp+16]
        add     rsp, 32
        ret
sp_256_get_entry_65_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
; /* Increment a four-word number in place. (a = a + 1)
;  *
;  * a  [rcx]  A single precision integer, updated in memory.
;  *
;  * A carry out of the top word is discarded.
;  */
_text SEGMENT READONLY PARA
sp_256_add_one_4 PROC
        ; Add 1 to the lowest word, then ripple the carry upwards.
        add     QWORD PTR [rcx], 1
        adc     QWORD PTR [rcx+8], 0
        adc     QWORD PTR [rcx+16], 0
        adc     QWORD PTR [rcx+24], 0
        ret
sp_256_add_one_4 ENDP
_text ENDS
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the bswap instruction.
;  *
;  * The array is consumed from its last byte backwards, so the least
;  * significant bytes land in r[0]. Whole 64-byte and 8-byte groups are
;  * byte-swapped word by word; any 1-7 leading bytes are assembled one byte at
;  * a time; the rest of r up to r+32 is zero filled.
;  *
;  * r     [rcx]  A single precision integer.
;  * size  [rdx]  Maximum number of bytes to convert. This routine does not
;  *              read it; the output length is fixed at 32 bytes.
;  * a     [r8]   Byte array.
;  * n     [r9]   Number of bytes in array to read.
;  *
;  * NOTE(review): n is compared as a signed 64-bit value, which assumes the
;  * caller passes it properly extended in r9 - confirm against the C prototype.
;  * The pointer comparisons in the zero fill are unsigned.
;  */
_text SEGMENT READONLY PARA
sp_256_from_bin_bswap PROC
        push    r12
        push    r13
        mov     r11, r8
        mov     r12, rcx
        ; r11 = one past the last input byte
        add     r11, r9
        ; r12 = one past the last output byte
        add     r12, 32
        ; r13 = 0, used as the zero constant below
        xor     r13, r13
        jmp     L_256_from_bin_bswap_64_end
L_256_from_bin_bswap_64_start:
        ; Convert 64 bytes (eight words) per pass
        sub     r11, 64
        mov     rax, QWORD PTR [r11+56]
        mov     r10, QWORD PTR [r11+48]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     rax, QWORD PTR [r11+40]
        mov     r10, QWORD PTR [r11+32]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        mov     rax, QWORD PTR [r11+24]
        mov     r10, QWORD PTR [r11+16]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        mov     rax, QWORD PTR [r11+8]
        mov     r10, QWORD PTR [r11]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_256_from_bin_bswap_64_end:
        cmp     r9, 63
        jg      L_256_from_bin_bswap_64_start
        jmp     L_256_from_bin_bswap_8_end
L_256_from_bin_bswap_8_start:
        ; Convert one 8-byte word per pass
        sub     r11, 8
        mov     rax, QWORD PTR [r11]
        bswap   rax
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_256_from_bin_bswap_8_end:
        cmp     r9, 7
        jg      L_256_from_bin_bswap_8_start
        ; Nothing left over?
        cmp     r9, r13
        je      L_256_from_bin_bswap_hi_end
        ; Build the top partial word from the first r9 bytes of the array
        mov     r10, r13
        mov     rax, r13
L_256_from_bin_bswap_hi_start:
        mov     al, BYTE PTR [r8]
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_256_from_bin_bswap_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_256_from_bin_bswap_hi_end:
        ; Zero fill up to r+32. Addresses are unsigned, and nothing is
        ; written if the output pointer is already at or past the end.
        cmp     rcx, r12
        jae     L_256_from_bin_bswap_zero_end
L_256_from_bin_bswap_zero_start:
        mov     QWORD PTR [rcx], r13
        add     rcx, 8
        cmp     rcx, r12
        jb      L_256_from_bin_bswap_zero_start
L_256_from_bin_bswap_zero_end:
        pop     r13
        pop     r12
        ret
sp_256_from_bin_bswap ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the movbe instruction which is an optional instruction.
;  *
;  * The array is consumed from its last byte backwards, so the least
;  * significant bytes land in r[0]. Whole 64-byte and 8-byte groups are loaded
;  * byte-reversed word by word; any 1-7 leading bytes are assembled one byte
;  * at a time; the rest of r up to r+32 is zero filled.
;  *
;  * r     [rcx]  A single precision integer.
;  * size  [rdx]  Maximum number of bytes to convert. This routine does not
;  *              read it; the output length is fixed at 32 bytes.
;  * a     [r8]   Byte array.
;  * n     [r9]   Number of bytes in array to read.
;  *
;  * NOTE(review): n is compared as a signed 64-bit value, which assumes the
;  * caller passes it properly extended in r9 - confirm against the C prototype.
;  * The pointer comparisons in the zero fill are unsigned.
;  */
_text SEGMENT READONLY PARA
sp_256_from_bin_movbe PROC
        push    r12
        push    r13
        mov     r11, r8
        mov     r12, rcx
        ; r11 = one past the last input byte
        add     r11, r9
        ; r12 = one past the last output byte
        add     r12, 32
        ; r13 = 0, used as the zero constant below
        xor     r13, r13
        jmp     L_256_from_bin_movbe_64_end
L_256_from_bin_movbe_64_start:
        ; Convert 64 bytes (eight words) per pass
        sub     r11, 64
        movbe   rax, QWORD PTR [r11+56]
        movbe   r10, QWORD PTR [r11+48]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        movbe   rax, QWORD PTR [r11+40]
        movbe   r10, QWORD PTR [r11+32]
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        movbe   rax, QWORD PTR [r11+24]
        movbe   r10, QWORD PTR [r11+16]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        movbe   rax, QWORD PTR [r11+8]
        movbe   r10, QWORD PTR [r11]
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_256_from_bin_movbe_64_end:
        cmp     r9, 63
        jg      L_256_from_bin_movbe_64_start
        jmp     L_256_from_bin_movbe_8_end
L_256_from_bin_movbe_8_start:
        ; Convert one 8-byte word per pass
        sub     r11, 8
        movbe   rax, QWORD PTR [r11]
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_256_from_bin_movbe_8_end:
        cmp     r9, 7
        jg      L_256_from_bin_movbe_8_start
        ; Nothing left over?
        cmp     r9, r13
        je      L_256_from_bin_movbe_hi_end
        ; Build the top partial word from the first r9 bytes of the array
        mov     r10, r13
        mov     rax, r13
L_256_from_bin_movbe_hi_start:
        mov     al, BYTE PTR [r8]
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_256_from_bin_movbe_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_256_from_bin_movbe_hi_end:
        ; Zero fill up to r+32. Addresses are unsigned, and nothing is
        ; written if the output pointer is already at or past the end.
        cmp     rcx, r12
        jae     L_256_from_bin_movbe_zero_end
L_256_from_bin_movbe_zero_start:
        mov     QWORD PTR [rcx], r13
        add     rcx, 8
        cmp     rcx, r12
        jb      L_256_from_bin_movbe_zero_start
L_256_from_bin_movbe_zero_end:
        pop     r13
        pop     r12
        ret
sp_256_from_bin_movbe ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Serialise r as a big endian byte array.
;  * Always writes exactly 32 bytes.
;  * Uses the bswap instruction.
;  *
;  * r  [rcx]  A single precision integer (four words, least significant first).
;  * a  [rdx]  Byte array that receives the 32 bytes.
;  */
_text SEGMENT READONLY PARA
sp_256_to_bin_bswap_4 PROC
        ; Top two words -> first 16 output bytes
        mov     r9, QWORD PTR [rcx+24]
        mov     r10, QWORD PTR [rcx+16]
        bswap   r9
        bswap   r10
        mov     QWORD PTR [rdx], r9
        mov     QWORD PTR [rdx+8], r10
        ; Bottom two words -> last 16 output bytes
        mov     r9, QWORD PTR [rcx+8]
        mov     r10, QWORD PTR [rcx]
        bswap   r9
        bswap   r10
        mov     QWORD PTR [rdx+16], r9
        mov     QWORD PTR [rdx+24], r10
        ret
sp_256_to_bin_bswap_4 ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
|
|
; /* Serialise r as a big endian byte array.
;  * Always writes exactly 32 bytes.
;  * Uses the movbe instruction which is optional.
;  *
;  * r  [rcx]  A single precision integer (four words, least significant first).
;  * a  [rdx]  Byte array that receives the 32 bytes.
;  */
_text SEGMENT READONLY PARA
sp_256_to_bin_movbe_4 PROC
        ; Top two words -> first 16 output bytes
        movbe   r9, QWORD PTR [rcx+24]
        movbe   r10, QWORD PTR [rcx+16]
        mov     QWORD PTR [rdx], r9
        mov     QWORD PTR [rdx+8], r10
        ; Bottom two words -> last 16 output bytes
        movbe   r9, QWORD PTR [rcx+8]
        movbe   r10, QWORD PTR [rcx]
        mov     QWORD PTR [rdx+16], r9
        mov     QWORD PTR [rdx+24], r10
        ret
sp_256_to_bin_movbe_4 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r Result of multiplication.
|
|
; * a First number to multiply.
|
|
; * b Second number to multiply.
|
|
; *
; * Win64 arguments: rcx = r, rdx = a, r8 = b. a is moved to rdi and b to rbp
; * because rdx is the implicit mulx multiplier and r8 holds result word 0.
; * a and b are read as four words each; r receives eight words (offsets 0-56).
; * The result words are kept in r8-r15 and stored only after the last load
; * from a and b.
; * Each "A[i] * B[j]" comment names the partial product formed by the next
; * mulx. Additions are spread over two carry chains: adcx uses only CF and
; * adox uses only OF. Each xor zeroes a register and clears both flags.
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_256_mul_avx2_4 PROC
|
|
push rbx
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rbp
|
|
push rdi
|
|
push rsi
|
|
mov rbp, r8 ; rbp = b
|
|
mov rdi, rdx ; rdi = a
|
|
; A[0] * B[0]
|
|
mov rdx, QWORD PTR [rbp]
|
|
mulx r9, r8, QWORD PTR [rdi]
|
|
; A[2] * B[0]
|
|
mulx r11, r10, QWORD PTR [rdi+16]
|
|
; A[1] * B[0]
|
|
mulx rsi, rax, QWORD PTR [rdi+8]
|
|
xor r15, r15 ; r15 = 0; CF and OF cleared
|
|
adcx r9, rax
|
|
; A[1] * B[3]
|
|
mov rdx, QWORD PTR [rbp+24]
|
|
mulx r13, r12, QWORD PTR [rdi+8]
|
|
adcx r10, rsi
|
|
; A[0] * B[1]
|
|
mov rdx, QWORD PTR [rbp+8]
|
|
mulx rsi, rax, QWORD PTR [rdi]
|
|
adox r9, rax
|
|
; A[2] * B[1]
|
|
mulx r14, rax, QWORD PTR [rdi+16]
|
|
adox r10, rsi
|
|
adcx r11, rax
|
|
; A[1] * B[2]
|
|
mov rdx, QWORD PTR [rbp+16]
|
|
mulx rsi, rax, QWORD PTR [rdi+8]
|
|
adcx r12, r14
|
|
adox r11, rax
|
|
adcx r13, r15
|
|
adox r12, rsi
|
|
; A[0] * B[2]
|
|
mulx rsi, rax, QWORD PTR [rdi]
|
|
adox r13, r15
|
|
xor r14, r14 ; r14 = 0; CF and OF cleared for the next pair of chains
|
|
adcx r10, rax
|
|
; A[1] * B[1]
|
|
mov rdx, QWORD PTR [rbp+8]
|
|
mulx rax, rdx, QWORD PTR [rdi+8] ; low half lands in rdx, high half in rax
|
|
adcx r11, rsi
|
|
adox r10, rdx
|
|
; A[3] * B[1]
|
|
mov rdx, QWORD PTR [rbp+8]
|
|
adox r11, rax
|
|
mulx rsi, rax, QWORD PTR [rdi+24]
|
|
adcx r12, rax
|
|
; A[2] * B[2]
|
|
mov rdx, QWORD PTR [rbp+16]
|
|
mulx rax, rdx, QWORD PTR [rdi+16]
|
|
adcx r13, rsi
|
|
adox r12, rdx
|
|
; A[3] * B[3]
|
|
mov rdx, QWORD PTR [rbp+24]
|
|
adox r13, rax
|
|
mulx rsi, rax, QWORD PTR [rdi+24]
|
|
adox r14, r15
|
|
adcx r14, rax
|
|
; A[0] * B[3]
|
|
mulx rax, rdx, QWORD PTR [rdi]
|
|
adcx r15, rsi
|
|
xor rsi, rsi ; rsi = 0; CF and OF cleared for the last pair of chains
|
|
adcx r11, rdx
|
|
; A[3] * B[0]
|
|
mov rdx, QWORD PTR [rdi+24] ; multiplier is now A[3]; B words come from memory
|
|
adcx r12, rax
|
|
mulx rax, rbx, QWORD PTR [rbp]
|
|
adox r11, rbx
|
|
adox r12, rax
|
|
; A[3] * B[2]
|
|
mulx rax, rdx, QWORD PTR [rbp+16]
|
|
adcx r13, rdx
|
|
; A[2] * B[3]
|
|
mov rdx, QWORD PTR [rbp+24]
|
|
adcx r14, rax
|
|
mulx rdx, rax, QWORD PTR [rdi+16] ; here the high half lands in rdx, the low half in rax
|
|
adcx r15, rsi
|
|
adox r13, rax
|
|
adox r14, rdx
|
|
adox r15, rsi
|
|
mov QWORD PTR [rcx], r8
|
|
mov QWORD PTR [rcx+8], r9
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov QWORD PTR [rcx+32], r12
|
|
mov QWORD PTR [rcx+40], r13
|
|
mov QWORD PTR [rcx+48], r14
|
|
mov QWORD PTR [rcx+56], r15
|
|
pop rsi
|
|
pop rdi
|
|
pop rbp
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbx
|
|
ret
|
|
sp_256_mul_avx2_4 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Subtract b from a, leaving the difference in a. (a -= b)
;  *
;  * a  [rcx]  A single precision integer and result (four words).
;  * b  [rdx]  A single precision integer (four words).
;  *
;  * Returns rax = 0 when the subtraction did not borrow and -1 when it did.
;  */
_text SEGMENT READONLY PARA
sp_256_sub_in_place_4 PROC
        ; Fetch all of b before a is modified
        mov     r8, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        ; Borrow accumulator, cleared before the sub/sbb chain begins.
        xor     eax, eax
        sub     QWORD PTR [rcx], r8
        sbb     QWORD PTR [rcx+8], r9
        sbb     QWORD PTR [rcx+16], r10
        sbb     QWORD PTR [rcx+24], r11
        ; rax = 0 - 0 - CF
        sbb     rax, 0
        ret
sp_256_sub_in_place_4 ENDP
_text ENDS
|
|
; /* Multiply a four-word number by a single digit. (r = a * b)
;  *
;  * r  [rcx]  A single precision integer; receives five words (offsets 0-32).
;  * a  [rdx]  A single precision integer (four words).
;  * b  [r8]   A single precision digit.
;  *
;  * r10, r11 and r12 rotate through the roles of current word, next word and
;  * carry word as the products are accumulated.
;  */
_text SEGMENT READONLY PARA
sp_256_mul_d_4 PROC
        push    r12
        ; mul writes rdx:rax, so keep the pointer to a in r9
        mov     r9, rdx
        ; A[0] * B
        xor     r12, r12
        mov     rax, r8
        mul     QWORD PTR [r9]
        mov     QWORD PTR [rcx], rax
        mov     r11, rdx
        ; A[1] * B
        xor     r10, r10
        mov     rax, r8
        mul     QWORD PTR [r9+8]
        add     r11, rax
        adc     r12, rdx
        adc     r10, 0
        mov     QWORD PTR [rcx+8], r11
        ; A[2] * B
        xor     r11, r11
        mov     rax, r8
        mul     QWORD PTR [r9+16]
        add     r12, rax
        adc     r10, rdx
        adc     r11, 0
        mov     QWORD PTR [rcx+16], r12
        ; A[3] * B
        mov     rax, r8
        mul     QWORD PTR [r9+24]
        add     r10, rax
        adc     r11, rdx
        mov     QWORD PTR [rcx+24], r10
        mov     QWORD PTR [rcx+32], r11
        pop     r12
        ret
sp_256_mul_d_4 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply a four-word number by a single digit using mulx. (r = a * b)
;  *
;  * r  [rcx]  A single precision integer; receives five words (offsets 0-32).
;  * a  [rdx]  A single precision integer (four words).
;  * b  [r8]   A single precision digit.
;  *
;  * Low product halves are added on the CF chain (adcx) and high halves on
;  * the OF chain (adox); mulx and mov leave both flags untouched.
;  */
_text SEGMENT READONLY PARA
sp_256_mul_d_avx2_4 PROC
        push    r12
        push    r13
        ; rax = a, rdx = b (the implicit mulx multiplier)
        mov     rax, rdx
        mov     rdx, r8
        ; r13 = 0; also clears CF and OF for the two chains
        xor     r13, r13
        ; A[0] * B
        mulx    r12, r11, QWORD PTR [rax]
        mov     QWORD PTR [rcx], r11
        ; A[1] * B
        mulx    r10, r9, QWORD PTR [rax+8]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+8], r12
        ; A[2] * B
        mulx    r10, r9, QWORD PTR [rax+16]
        mov     r12, r13
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+16], r11
        ; A[3] * B
        mulx    r10, r9, QWORD PTR [rax+24]
        mov     r11, r13
        adcx    r12, r9
        adox    r11, r10
        ; Fold the last CF into the top word
        adcx    r11, r13
        mov     QWORD PTR [rcx+24], r12
        mov     QWORD PTR [rcx+32], r11
        pop     r13
        pop     r12
        ret
sp_256_mul_d_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF _WIN64
|
|
; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
;  *
;  * d1   [rcx]  The high order half of the number to divide.
;  * d0   [rdx]  The low order half of the number to divide.
;  * div  [r8]   The divisor.
;  * returns the quotient in rax.
;  *
;  * The div instruction faults if the quotient does not fit in 64 bits,
;  * i.e. when d1 is not smaller than div.
;  */
_text SEGMENT READONLY PARA
div_256_word_asm_4 PROC
        ; div takes its dividend in rdx:rax
        mov     rax, rdx
        mov     rdx, rcx
        div     r8
        ret
div_256_word_asm_4 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply two Montgomery form numbers mod the modulus (prime).
|
|
; * (r = a * b mod m)
|
|
; *
|
|
; * r Result of multiplication.
|
|
; * a First number to multiply in Montgomery form.
|
|
; * b Second number to multiply in Montgomery form.
|
|
; *
; * Win64 arguments: rcx = r, rdx = a, r8 = b. a is moved to rdi and b to rbp.
; * The first part forms the eight-word product in r8-r15 with interleaved
; * adcx (CF) and adox (OF) carry chains. The reduction then runs four rounds;
; * each computes mu = rbx * (lowest live word) with imul and adds mu times
; * the four modulus words, which are loaded as immediates into rax.
; * r8 is reused as the carry between rounds once its word has been folded.
; * NOTE(review): the modulus words and the multiplier in rbx are hard-coded;
; * by the routine name they are presumably the curve order and its
; * Montgomery constant - confirm.
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_256_mont_mul_order_avx2_4 PROC
|
|
push rbx
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rbp
|
|
push rdi
|
|
push rsi
|
|
mov rbp, r8 ; rbp = b
|
|
mov rdi, rdx ; rdi = a
|
|
; A[0] * B[0]
|
|
mov rdx, QWORD PTR [rbp]
|
|
mulx r9, r8, QWORD PTR [rdi]
|
|
; A[2] * B[0]
|
|
mulx r11, r10, QWORD PTR [rdi+16]
|
|
; A[1] * B[0]
|
|
mulx rsi, rax, QWORD PTR [rdi+8]
|
|
xor r15, r15 ; r15 = 0; CF and OF cleared
|
|
adcx r9, rax
|
|
; A[1] * B[3]
|
|
mov rdx, QWORD PTR [rbp+24]
|
|
mulx r13, r12, QWORD PTR [rdi+8]
|
|
adcx r10, rsi
|
|
; A[0] * B[1]
|
|
mov rdx, QWORD PTR [rbp+8]
|
|
mulx rsi, rax, QWORD PTR [rdi]
|
|
adox r9, rax
|
|
; A[2] * B[1]
|
|
mulx r14, rax, QWORD PTR [rdi+16]
|
|
adox r10, rsi
|
|
adcx r11, rax
|
|
; A[1] * B[2]
|
|
mov rdx, QWORD PTR [rbp+16]
|
|
mulx rsi, rax, QWORD PTR [rdi+8]
|
|
adcx r12, r14
|
|
adox r11, rax
|
|
adcx r13, r15
|
|
adox r12, rsi
|
|
; A[0] * B[2]
|
|
mulx rsi, rax, QWORD PTR [rdi]
|
|
adox r13, r15
|
|
xor r14, r14 ; r14 = 0; CF and OF cleared
|
|
adcx r10, rax
|
|
; A[1] * B[1]
|
|
mov rdx, QWORD PTR [rbp+8]
|
|
mulx rax, rdx, QWORD PTR [rdi+8]
|
|
adcx r11, rsi
|
|
adox r10, rdx
|
|
; A[3] * B[1]
|
|
mov rdx, QWORD PTR [rbp+8]
|
|
adox r11, rax
|
|
mulx rsi, rax, QWORD PTR [rdi+24]
|
|
adcx r12, rax
|
|
; A[2] * B[2]
|
|
mov rdx, QWORD PTR [rbp+16]
|
|
mulx rax, rdx, QWORD PTR [rdi+16]
|
|
adcx r13, rsi
|
|
adox r12, rdx
|
|
; A[3] * B[3]
|
|
mov rdx, QWORD PTR [rbp+24]
|
|
adox r13, rax
|
|
mulx rsi, rax, QWORD PTR [rdi+24]
|
|
adox r14, r15
|
|
adcx r14, rax
|
|
; A[0] * B[3]
|
|
mulx rax, rdx, QWORD PTR [rdi]
|
|
adcx r15, rsi
|
|
xor rsi, rsi ; rsi = 0; CF and OF cleared
|
|
adcx r11, rdx
|
|
; A[3] * B[0]
|
|
mov rdx, QWORD PTR [rdi+24]
|
|
adcx r12, rax
|
|
mulx rax, rbx, QWORD PTR [rbp]
|
|
adox r11, rbx
|
|
adox r12, rax
|
|
; A[3] * B[2]
|
|
mulx rax, rdx, QWORD PTR [rbp+16]
|
|
adcx r13, rdx
|
|
; A[2] * B[3]
|
|
mov rdx, QWORD PTR [rbp+24]
|
|
adcx r14, rax
|
|
mulx rdx, rax, QWORD PTR [rdi+16]
|
|
adcx r15, rsi
|
|
adox r13, rax
|
|
adox r14, rdx
|
|
adox r15, rsi
|
|
; Start Reduction
|
|
mov rbx, 14758798090332847183 ; multiplier used to derive mu in every round
|
|
; A[0]
|
|
mov rdx, rbx
|
|
imul rdx, r8 ; rdx = mu = low 64 bits of rbx * r8
|
|
mov rax, 17562291160714782033
|
|
xor rbp, rbp ; rbp = 0; CF and OF cleared
|
|
mulx rdi, rsi, rax
|
|
mov rax, 13611842547513532036
|
|
adcx r8, rsi
|
|
adox r9, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744073709551615
|
|
adcx r9, rsi
|
|
adox r10, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744069414584320
|
|
adcx r10, rsi
|
|
adox r11, rdi
|
|
mulx rdi, rsi, rax
|
|
adcx r11, rsi
|
|
adox r12, rdi
|
|
adcx r12, rbp
|
|
mov r8, rbp ; r8 now collects this round's carry
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
; A[1]
|
|
mov rdx, rbx
|
|
imul rdx, r9
|
|
mov rax, 17562291160714782033
|
|
xor rbp, rbp
|
|
mulx rdi, rsi, rax
|
|
mov rax, 13611842547513532036
|
|
adcx r9, rsi
|
|
adox r10, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744073709551615
|
|
adcx r10, rsi
|
|
adox r11, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744069414584320
|
|
adcx r11, rsi
|
|
adox r12, rdi
|
|
mulx rdi, rsi, rax
|
|
adcx r12, rsi
|
|
adox r13, rdi
|
|
adcx r13, r8 ; adds the carry saved by the previous round
|
|
mov r8, rbp
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
; A[2]
|
|
mov rdx, rbx
|
|
imul rdx, r10
|
|
mov rax, 17562291160714782033
|
|
xor rbp, rbp
|
|
mulx rdi, rsi, rax
|
|
mov rax, 13611842547513532036
|
|
adcx r10, rsi
|
|
adox r11, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744073709551615
|
|
adcx r11, rsi
|
|
adox r12, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744069414584320
|
|
adcx r12, rsi
|
|
adox r13, rdi
|
|
mulx rdi, rsi, rax
|
|
adcx r13, rsi
|
|
adox r14, rdi
|
|
adcx r14, r8
|
|
mov r8, rbp
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
; A[3]
|
|
mov rdx, rbx
|
|
imul rdx, r11
|
|
mov rax, 17562291160714782033
|
|
xor rbp, rbp
|
|
mulx rdi, rsi, rax
|
|
mov rax, 13611842547513532036
|
|
adcx r11, rsi
|
|
adox r12, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744073709551615
|
|
adcx r12, rsi
|
|
adox r13, rdi
|
|
mulx rdi, rsi, rax
|
|
mov rax, 18446744069414584320
|
|
adcx r13, rsi
|
|
adox r14, rdi
|
|
mulx rdi, rsi, rax
|
|
adcx r14, rsi
|
|
adox r15, rdi
|
|
adcx r15, r8
|
|
mov r8, rbp
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
neg r8 ; r8 = 0 - carry, used as an AND mask for the final subtraction
|
|
mov rax, 17562291160714782033
|
|
mov rbx, 13611842547513532036
|
|
and rax, r8
|
|
mov rbp, 18446744069414584320
|
|
and rbx, r8
|
|
and rbp, r8
|
|
sub r12, rax
|
|
sbb r13, rbx
|
|
mov QWORD PTR [rcx], r12 ; mov leaves CF intact between the sbb instructions
|
|
sbb r14, r8 ; the third word subtracted is the mask itself
|
|
mov QWORD PTR [rcx+8], r13
|
|
sbb r15, rbp
|
|
mov QWORD PTR [rcx+16], r14
|
|
mov QWORD PTR [rcx+24], r15
|
|
pop rsi
|
|
pop rdi
|
|
pop rbp
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbx
|
|
ret
|
|
sp_256_mont_mul_order_avx2_4 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
|
|
; *
|
|
; * r Result of squaring.
|
|
; * a Number to square in Montgomery form.
|
|
; *
; * Win64 arguments: rcx = r, rdx = a. a is moved to rax because rdx is the
; * implicit mulx multiplier.
; * The six cross products A[i] * A[j] (i != j) are summed into r9-r14, then
; * doubled with "adcx reg, reg" while the squares A[i] * A[i] are added on
; * the adox chain, giving the eight-word square in r8-r15.
; * The reduction then runs four rounds; each computes mu = rbx * (lowest live
; * word) with imul and adds mu times the four modulus words, which are
; * loaded as immediates into rdi. r8 is reused as the carry between rounds.
; * NOTE(review): the modulus words and the multiplier in rbx are hard-coded;
; * by the routine name they are presumably the curve order and its
; * Montgomery constant - confirm.
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_256_mont_sqr_order_avx2_4 PROC
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
mov rax, rdx ; rax = a
|
|
; A[0] * A[1]
|
|
mov rdx, QWORD PTR [rax]
|
|
mov r15, QWORD PTR [rax+16] ; A[2], kept for the products below
|
|
mulx r10, r9, QWORD PTR [rax+8]
|
|
; A[0] * A[3]
|
|
mulx r12, r11, QWORD PTR [rax+24]
|
|
; A[2] * A[1]
|
|
mov rdx, r15
|
|
mulx rbx, rsi, QWORD PTR [rax+8]
|
|
; A[2] * A[3]
|
|
mulx r14, r13, QWORD PTR [rax+24]
|
|
xor r15, r15 ; r15 = 0; CF and OF cleared
|
|
adox r11, rsi
|
|
adox r12, rbx
|
|
; A[2] * A[0]
|
|
mulx rbx, rsi, QWORD PTR [rax]
|
|
; A[1] * A[3]
|
|
mov rdx, QWORD PTR [rax+8]
|
|
adox r13, r15
|
|
mulx r8, rdi, QWORD PTR [rax+24]
|
|
adcx r10, rsi
|
|
adox r14, r15
|
|
adcx r11, rbx
|
|
adcx r12, rdi
|
|
adcx r13, r8
|
|
adcx r14, r15
|
|
; Double with Carry Flag
|
|
xor r15, r15 ; r15 = 0; CF and OF cleared
|
|
; A[0] * A[0]
|
|
mov rdx, QWORD PTR [rax]
|
|
mulx rdi, r8, rdx
|
|
adcx r9, r9
|
|
adcx r10, r10
|
|
adox r9, rdi
|
|
; A[1] * A[1]
|
|
mov rdx, QWORD PTR [rax+8]
|
|
mulx rbx, rsi, rdx
|
|
adcx r11, r11
|
|
adox r10, rsi
|
|
; A[2] * A[2]
|
|
mov rdx, QWORD PTR [rax+16]
|
|
mulx rsi, rdi, rdx
|
|
adcx r12, r12
|
|
adox r11, rbx
|
|
adcx r13, r13
|
|
adox r12, rdi
|
|
adcx r14, r14
|
|
; A[3] * A[3]
|
|
mov rdx, QWORD PTR [rax+24]
|
|
mulx rbx, rdi, rdx
|
|
adox r13, rsi
|
|
adcx r15, r15 ; r15 = carry out of the doubling
|
|
adox r14, rdi
|
|
adox r15, rbx
|
|
; Start Reduction
|
|
mov rbx, 14758798090332847183 ; multiplier used to derive mu in every round
|
|
; A[0]
|
|
mov rdx, rbx
|
|
imul rdx, r8 ; rdx = mu = low 64 bits of rbx * r8
|
|
mov rdi, 17562291160714782033
|
|
xor rbp, rbp ; rbp = 0; CF and OF cleared
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 13611842547513532036
|
|
adcx r8, rsi
|
|
adox r9, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744073709551615
|
|
adcx r9, rsi
|
|
adox r10, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744069414584320
|
|
adcx r10, rsi
|
|
adox r11, rax
|
|
mulx rax, rsi, rdi
|
|
adcx r11, rsi
|
|
adox r12, rax
|
|
adcx r12, rbp
|
|
mov r8, rbp ; r8 now collects this round's carry
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
; A[1]
|
|
mov rdx, rbx
|
|
imul rdx, r9
|
|
mov rdi, 17562291160714782033
|
|
xor rbp, rbp
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 13611842547513532036
|
|
adcx r9, rsi
|
|
adox r10, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744073709551615
|
|
adcx r10, rsi
|
|
adox r11, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744069414584320
|
|
adcx r11, rsi
|
|
adox r12, rax
|
|
mulx rax, rsi, rdi
|
|
adcx r12, rsi
|
|
adox r13, rax
|
|
adcx r13, r8 ; adds the carry saved by the previous round
|
|
mov r8, rbp
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
; A[2]
|
|
mov rdx, rbx
|
|
imul rdx, r10
|
|
mov rdi, 17562291160714782033
|
|
xor rbp, rbp
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 13611842547513532036
|
|
adcx r10, rsi
|
|
adox r11, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744073709551615
|
|
adcx r11, rsi
|
|
adox r12, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744069414584320
|
|
adcx r12, rsi
|
|
adox r13, rax
|
|
mulx rax, rsi, rdi
|
|
adcx r13, rsi
|
|
adox r14, rax
|
|
adcx r14, r8
|
|
mov r8, rbp
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
; A[3]
|
|
mov rdx, rbx
|
|
imul rdx, r11
|
|
mov rdi, 17562291160714782033
|
|
xor rbp, rbp
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 13611842547513532036
|
|
adcx r11, rsi
|
|
adox r12, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744073709551615
|
|
adcx r12, rsi
|
|
adox r13, rax
|
|
mulx rax, rsi, rdi
|
|
mov rdi, 18446744069414584320
|
|
adcx r13, rsi
|
|
adox r14, rax
|
|
mulx rax, rsi, rdi
|
|
adcx r14, rsi
|
|
adox r15, rax
|
|
adcx r15, r8
|
|
mov r8, rbp
|
|
; carry
|
|
adox r8, rbp
|
|
adcx r8, rbp
|
|
neg r8 ; r8 = 0 - carry, used as an AND mask for the final subtraction
|
|
mov rdi, 17562291160714782033
|
|
mov rbx, 13611842547513532036
|
|
and rdi, r8
|
|
mov rbp, 18446744069414584320
|
|
and rbx, r8
|
|
and rbp, r8
|
|
sub r12, rdi
|
|
sbb r13, rbx
|
|
mov QWORD PTR [rcx], r12 ; mov leaves CF intact between the sbb instructions
|
|
sbb r14, r8 ; the third word subtracted is the mask itself
|
|
mov QWORD PTR [rcx+8], r13
|
|
sbb r15, rbp
|
|
mov QWORD PTR [rcx+16], r14
|
|
mov QWORD PTR [rcx+24], r15
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
ret
|
|
sp_256_mont_sqr_order_avx2_4 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Non-constant time modular inversion.
|
|
; *
|
|
; * @param [out] r Resulting number.
|
|
; * @param [in] a Number to invert.
|
|
; * @param [in] m Modulus.
|
|
; * @return MP_OKAY on success.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_256_mod_inv_4 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
sub rsp, 513
|
|
mov r9, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [r8+8]
|
|
mov r11, QWORD PTR [r8+16]
|
|
mov r12, QWORD PTR [r8+24]
|
|
mov r13, QWORD PTR [rdx]
|
|
mov r14, QWORD PTR [rdx+8]
|
|
mov r15, QWORD PTR [rdx+16]
|
|
mov rdi, QWORD PTR [rdx+24]
|
|
mov rsi, 0
|
|
test r13b, 1
|
|
jnz L_256_mod_inv_4_v_even_end
|
|
L_256_mod_inv_4_v_even_start:
|
|
shrd r13, r14, 1
|
|
shrd r14, r15, 1
|
|
shrd r15, rdi, 1
|
|
shr rdi, 1
|
|
mov BYTE PTR [rsp+rsi], 1
|
|
inc rsi
|
|
test r13b, 1
|
|
jz L_256_mod_inv_4_v_even_start
|
|
L_256_mod_inv_4_v_even_end:
|
|
L_256_mod_inv_4_uv_start:
|
|
cmp r12, rdi
|
|
jb L_256_mod_inv_4_uv_v
|
|
ja L_256_mod_inv_4_uv_u
|
|
cmp r11, r15
|
|
jb L_256_mod_inv_4_uv_v
|
|
ja L_256_mod_inv_4_uv_u
|
|
cmp r10, r14
|
|
jb L_256_mod_inv_4_uv_v
|
|
ja L_256_mod_inv_4_uv_u
|
|
cmp r9, r13
|
|
jb L_256_mod_inv_4_uv_v
|
|
L_256_mod_inv_4_uv_u:
|
|
mov BYTE PTR [rsp+rsi], 2
|
|
inc rsi
|
|
sub r9, r13
|
|
sbb r10, r14
|
|
sbb r11, r15
|
|
sbb r12, rdi
|
|
shrd r9, r10, 1
|
|
shrd r10, r11, 1
|
|
shrd r11, r12, 1
|
|
shr r12, 1
|
|
test r9b, 1
|
|
jnz L_256_mod_inv_4_usubv_even_end
|
|
L_256_mod_inv_4_usubv_even_start:
|
|
shrd r9, r10, 1
|
|
shrd r10, r11, 1
|
|
shrd r11, r12, 1
|
|
shr r12, 1
|
|
mov BYTE PTR [rsp+rsi], 0
|
|
inc rsi
|
|
test r9b, 1
|
|
jz L_256_mod_inv_4_usubv_even_start
|
|
L_256_mod_inv_4_usubv_even_end:
|
|
cmp r9, 1
|
|
jne L_256_mod_inv_4_uv_start
|
|
mov rdx, r10
|
|
or rdx, r11
|
|
jne L_256_mod_inv_4_uv_start
|
|
or rdx, r12
|
|
jne L_256_mod_inv_4_uv_start
|
|
mov al, 1
|
|
jmp L_256_mod_inv_4_uv_end
|
|
L_256_mod_inv_4_uv_v:
|
|
mov BYTE PTR [rsp+rsi], 3
|
|
inc rsi
|
|
sub r13, r9
|
|
sbb r14, r10
|
|
sbb r15, r11
|
|
sbb rdi, r12
|
|
shrd r13, r14, 1
|
|
shrd r14, r15, 1
|
|
shrd r15, rdi, 1
|
|
shr rdi, 1
|
|
test r13b, 1
|
|
jnz L_256_mod_inv_4_vsubu_even_end
|
|
L_256_mod_inv_4_vsubu_even_start:
|
|
shrd r13, r14, 1
|
|
shrd r14, r15, 1
|
|
shrd r15, rdi, 1
|
|
shr rdi, 1
|
|
mov BYTE PTR [rsp+rsi], 1
|
|
inc rsi
|
|
test r13b, 1
|
|
jz L_256_mod_inv_4_vsubu_even_start
|
|
L_256_mod_inv_4_vsubu_even_end:
|
|
cmp r13, 1
|
|
jne L_256_mod_inv_4_uv_start
|
|
mov rdx, r14
|
|
or rdx, r15
|
|
jne L_256_mod_inv_4_uv_start
|
|
or rdx, rdi
|
|
jne L_256_mod_inv_4_uv_start
|
|
mov al, 0
|
|
L_256_mod_inv_4_uv_end:
|
|
mov r9, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [r8+8]
|
|
mov r11, QWORD PTR [r8+16]
|
|
mov r12, QWORD PTR [r8+24]
|
|
mov r13, 1
|
|
xor r14, r14
|
|
xor r15, r15
|
|
xor rdi, rdi
|
|
mov BYTE PTR [rsp+rsi], 7
|
|
mov dl, BYTE PTR [rsp]
|
|
mov rsi, 1
|
|
cmp dl, 1
|
|
je L_256_mod_inv_4_op_div2_d
|
|
jl L_256_mod_inv_4_op_div2_b
|
|
cmp dl, 3
|
|
je L_256_mod_inv_4_op_d_sub_b
|
|
jl L_256_mod_inv_4_op_b_sub_d
|
|
jmp L_256_mod_inv_4_op_end
|
|
L_256_mod_inv_4_op_b_sub_d:
|
|
sub r9, r13
|
|
sbb r10, r14
|
|
sbb r11, r15
|
|
sbb r12, rdi
|
|
jnc L_256_mod_inv_4_op_div2_b
|
|
add r9, QWORD PTR [r8]
|
|
adc r10, QWORD PTR [r8+8]
|
|
adc r11, QWORD PTR [r8+16]
|
|
adc r12, QWORD PTR [r8+24]
|
|
L_256_mod_inv_4_op_div2_b:
|
|
test r9b, 1
|
|
mov rdx, 0
|
|
jz L_256_mod_inv_4_op_div2_b_mod
|
|
add r9, QWORD PTR [r8]
|
|
adc r10, QWORD PTR [r8+8]
|
|
adc r11, QWORD PTR [r8+16]
|
|
adc r12, QWORD PTR [r8+24]
|
|
adc rdx, 0
|
|
L_256_mod_inv_4_op_div2_b_mod:
|
|
shrd r9, r10, 1
|
|
shrd r10, r11, 1
|
|
shrd r11, r12, 1
|
|
shrd r12, rdx, 1
|
|
mov dl, BYTE PTR [rsp+rsi]
|
|
inc rsi
|
|
cmp dl, 1
|
|
je L_256_mod_inv_4_op_div2_d
|
|
jl L_256_mod_inv_4_op_div2_b
|
|
cmp dl, 3
|
|
je L_256_mod_inv_4_op_d_sub_b
|
|
jl L_256_mod_inv_4_op_b_sub_d
|
|
jmp L_256_mod_inv_4_op_end
|
|
L_256_mod_inv_4_op_d_sub_b:
|
|
sub r13, r9
|
|
sbb r14, r10
|
|
sbb r15, r11
|
|
sbb rdi, r12
|
|
jnc L_256_mod_inv_4_op_div2_d
|
|
add r13, QWORD PTR [r8]
|
|
adc r14, QWORD PTR [r8+8]
|
|
adc r15, QWORD PTR [r8+16]
|
|
adc rdi, QWORD PTR [r8+24]
|
|
L_256_mod_inv_4_op_div2_d:
|
|
test r13b, 1
|
|
mov rdx, 0
|
|
jz L_256_mod_inv_4_op_div2_d_mod
|
|
add r13, QWORD PTR [r8]
|
|
adc r14, QWORD PTR [r8+8]
|
|
adc r15, QWORD PTR [r8+16]
|
|
adc rdi, QWORD PTR [r8+24]
|
|
adc rdx, 0
|
|
L_256_mod_inv_4_op_div2_d_mod:
|
|
shrd r13, r14, 1
|
|
shrd r14, r15, 1
|
|
shrd r15, rdi, 1
|
|
shrd rdi, rdx, 1
|
|
mov dl, BYTE PTR [rsp+rsi]
|
|
inc rsi
|
|
cmp dl, 1
|
|
je L_256_mod_inv_4_op_div2_d
|
|
jl L_256_mod_inv_4_op_div2_b
|
|
cmp dl, 3
|
|
je L_256_mod_inv_4_op_d_sub_b
|
|
jl L_256_mod_inv_4_op_b_sub_d
|
|
L_256_mod_inv_4_op_end:
|
|
cmp al, 1
|
|
jne L_256_mod_inv_4_store_d
|
|
mov QWORD PTR [rcx], r9
|
|
mov QWORD PTR [rcx+8], r10
|
|
mov QWORD PTR [rcx+16], r11
|
|
mov QWORD PTR [rcx+24], r12
|
|
jmp L_256_mod_inv_4_store_end
|
|
L_256_mod_inv_4_store_d:
|
|
mov QWORD PTR [rcx], r13
|
|
mov QWORD PTR [rcx+8], r14
|
|
mov QWORD PTR [rcx+16], r15
|
|
mov QWORD PTR [rcx+24], rdi
|
|
L_256_mod_inv_4_store_end:
|
|
add rsp, 513
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_256_mod_inv_4 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; Constant tables for sp_256_mod_inv_avx2_4.
; That procedure keeps each coefficient as ten 26-bit limbs split over two
; 8-dword vectors: limbs 0,2,4,6,8 sit in lanes 0-4 of the first vector and
; limbs 1,3,5,7,9 in lanes 0-4 of the second; lanes 5-7 are unused.
; Every table is followed by a QWORD holding its address; the procedure loads
; that QWORD into rbx and then reads the table through [rbx].
_DATA SEGMENT
|
|
ALIGN 16
|
|
; Value added back to a coefficient when it is odd (before halving) or has
; gone negative: first 8 dwords = even limbs (-> ymm6), next 8 dwords = odd
; limbs (-> ymm7). Read as radix-2^26 digits this is presumably the P-256
; group order -- confirm against the C-side p256_order constant.
L_sp256_mod_inv_avx2_4_order DWORD 6497617,32001851,62711546,67108863,67043328,0,0,0,41070783,45522014,67108863,1023,4194303,0,0,0
|
|
ptr_L_sp256_mod_inv_avx2_4_order QWORD L_sp256_mod_inv_avx2_4_order
|
|
_DATA ENDS
|
|
_DATA SEGMENT
|
|
ALIGN 16
|
|
; 256-bit value 1 (-> ymm8). Used both as the initial coefficient for v and,
; with vptest, as the "is limb 0 odd?" mask.
L_sp256_mod_inv_avx2_4_one QWORD 1, 0,
|
|
0, 0
|
|
ptr_L_sp256_mod_inv_avx2_4_one QWORD L_sp256_mod_inv_avx2_4_one
|
|
_DATA ENDS
|
|
_DATA SEGMENT
|
|
ALIGN 16
|
|
; Bit 0 of every lane (-> ymm10): picks the low bit of each odd limb so it
; can be moved to bit 25 of the even limb in the same lane during a halving.
L_sp256_mod_inv_avx2_4_all_one DWORD 1,1,1,1,1,1,1,1
|
|
ptr_L_sp256_mod_inv_avx2_4_all_one QWORD L_sp256_mod_inv_avx2_4_all_one
|
|
_DATA ENDS
|
|
_DATA SEGMENT
|
|
ALIGN 16
|
|
; Bit 0 of lanes 1-4 (-> ymm9): picks the low bit of even limbs 2,4,6,8,
; which a halving moves into bit 25 of the odd limb one lane below.
L_sp256_mod_inv_avx2_4_mask01111 DWORD 0,1,1,1,1,0,0,0
|
|
ptr_L_sp256_mod_inv_avx2_4_mask01111 QWORD L_sp256_mod_inv_avx2_4_mask01111
|
|
_DATA ENDS
|
|
_DATA SEGMENT
|
|
ALIGN 16
|
|
; vpermd index vector (-> ymm11): destination lane i takes source lane i+1,
; i.e. shifts the lanes down by one position.
L_sp256_mod_inv_avx2_4_down_one_dword DWORD 1,2,3,4,5,6,7,7
|
|
ptr_L_sp256_mod_inv_avx2_4_down_one_dword QWORD L_sp256_mod_inv_avx2_4_down_one_dword
|
|
_DATA ENDS
|
|
_DATA SEGMENT
|
|
ALIGN 16
|
|
; Sign bit of lane 4 (-> ymm12): vptest against the odd-limb vector checks
; whether the top limb (limb 9) went negative after a subtraction.
L_sp256_mod_inv_avx2_4_neg DWORD 0,0,0,0,2147483648,0,0,0
|
|
ptr_L_sp256_mod_inv_avx2_4_neg QWORD L_sp256_mod_inv_avx2_4_neg
|
|
_DATA ENDS
|
|
_DATA SEGMENT
|
|
ALIGN 16
|
|
; vpermd index vector (-> ymm13): destination lanes 1-4 take source lanes
; 0-3 (shift up by one); lanes 0 and 5-7 take source lane 7, one of the
; unused lanes.
L_sp256_mod_inv_avx2_4_up_one_dword DWORD 7,0,1,2,3,7,7,7
|
|
ptr_L_sp256_mod_inv_avx2_4_up_one_dword QWORD L_sp256_mod_inv_avx2_4_up_one_dword
|
|
_DATA ENDS
|
|
_DATA SEGMENT
|
|
ALIGN 16
|
|
; 2^26 - 1 in lanes 0-4 (-> ymm14): keeps the low 26 bits of each limb while
; carries are propagated at the end of the inversion.
L_sp256_mod_inv_avx2_4_mask26 DWORD 67108863,67108863,67108863,67108863,67108863,0,0,0
|
|
ptr_L_sp256_mod_inv_avx2_4_mask26 QWORD L_sp256_mod_inv_avx2_4_mask26
|
|
_DATA ENDS
|
|
; /* Non-constant time modular inversion.
;  *
;  * Binary (shift-and-subtract) inversion. Two 256-bit values are kept in
;  * general registers: u (rax, r9, r10, r11), loaded from m, and
;  * v (r12, r13, r14, r15), loaded from a. Each has a coefficient held as ten
;  * 26-bit limbs in a pair of YMM registers: (ymm0, ymm1) for u, starting at 0,
;  * and (ymm2, ymm3) for v, starting at 1. Whenever u or v is halved or
;  * reduced by the other, its coefficient gets the same operation, with the
;  * constant from L_sp256_mod_inv_avx2_4_order added first when the coefficient
;  * is odd or has gone negative. The loop stops when u or v reaches 1; the
;  * matching coefficient is carry-normalised and stored to r.
;  *
;  * NOTE: coefficient arithmetic uses the L_sp256_mod_inv_avx2_4_order table,
;  * not the words of m, so m is expected to equal that constant.
;  *
;  * Win64 ABI: xmm6-xmm14 are non-volatile, so they are saved on entry and
;  * restored before returning. vzeroupper is issued on exit so callers using
;  * legacy SSE do not pay an AVX transition penalty.
;  *
;  * @param  [out]  r  (rcx) Resulting number, 4 words.
;  * @param  [in]   a  (rdx) Number to invert, 4 words.
;  * @param  [in]   m  (r8)  Modulus, 4 words.
;  * @return  MP_OKAY (0) on success.
;  */
_text SEGMENT READONLY PARA
sp_256_mod_inv_avx2_4 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        ; Seven pushes leave rsp 16-byte aligned; reserve 9 * 16 bytes for the
        ; non-volatile XMM registers this routine overwrites.
        sub     rsp, 144
        vmovdqu OWORD PTR [rsp], xmm6
        vmovdqu OWORD PTR [rsp+16], xmm7
        vmovdqu OWORD PTR [rsp+32], xmm8
        vmovdqu OWORD PTR [rsp+48], xmm9
        vmovdqu OWORD PTR [rsp+64], xmm10
        vmovdqu OWORD PTR [rsp+80], xmm11
        vmovdqu OWORD PTR [rsp+96], xmm12
        vmovdqu OWORD PTR [rsp+112], xmm13
        vmovdqu OWORD PTR [rsp+128], xmm14
        ; u = m
        mov     rax, QWORD PTR [r8]
        mov     r9, QWORD PTR [r8+8]
        mov     r10, QWORD PTR [r8+16]
        mov     r11, QWORD PTR [r8+24]
        ; v = a
        mov     r12, QWORD PTR [rdx]
        mov     r13, QWORD PTR [rdx+8]
        mov     r14, QWORD PTR [rdx+16]
        mov     r15, QWORD PTR [rdx+24]
        ; Constants: ymm6/ymm7 = order (even/odd limbs), ymm8 = 1,
        ; ymm9/ymm10 = low-bit masks, ymm11 = lane-down permute,
        ; ymm12 = top-limb sign bit, ymm13 = lane-up permute, ymm14 = 2^26-1.
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_order
        vmovupd ymm6, [rbx]
        vmovupd ymm7, [rbx+32]
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_one
        vmovupd ymm8, [rbx]
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_mask01111
        vmovupd ymm9, [rbx]
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_all_one
        vmovupd ymm10, [rbx]
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_down_one_dword
        vmovupd ymm11, [rbx]
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_neg
        vmovupd ymm12, [rbx]
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_up_one_dword
        vmovupd ymm13, [rbx]
        mov     rbx, ptr_L_sp256_mod_inv_avx2_4_mask26
        vmovupd ymm14, [rbx]
        ; Coefficient of u = 0, coefficient of v = 1.
        vpxor   xmm0, xmm0, xmm0
        vpxor   xmm1, xmm1, xmm1
        vmovdqu ymm2, ymm8
        vpxor   xmm3, xmm3, xmm3
        ; Strip factors of two from v before entering the main loop.
        test    r12b, 1
        jnz     L_256_mod_inv_avx2_4_v_even_end
L_256_mod_inv_avx2_4_v_even_start:
        ; v >>= 1
        shrd    r12, r13, 1
        shrd    r13, r14, 1
        shrd    r14, r15, 1
        shr     r15, 1
        ; Make the coefficient even (add the order if limb 0 is odd) ...
        vptest  ymm2, ymm8
        jz      L_256_mod_inv_avx2_4_v_even_shr1
        vpaddd  ymm2, ymm2, ymm6
        vpaddd  ymm3, ymm3, ymm7
L_256_mod_inv_avx2_4_v_even_shr1:
        ; ... then halve it: each limb's bit 0 becomes bit 25 of the limb below.
        vpand   ymm4, ymm2, ymm9
        vpand   ymm5, ymm3, ymm10
        vpermd  ymm4, ymm11, ymm4
        vpsrad  ymm2, ymm2, 1
        vpsrad  ymm3, ymm3, 1
        vpslld  ymm5, ymm5, 25
        vpslld  xmm4, xmm4, 25
        vpaddd  ymm2, ymm2, ymm5
        vpaddd  ymm3, ymm3, ymm4
        test    r12b, 1
        jz      L_256_mod_inv_avx2_4_v_even_start
L_256_mod_inv_avx2_4_v_even_end:
L_256_mod_inv_avx2_4_uv_start:
        ; Unsigned compare of u and v, most significant word first.
        cmp     r11, r15
        jb      L_256_mod_inv_avx2_4_uv_v
        ja      L_256_mod_inv_avx2_4_uv_u
        cmp     r10, r14
        jb      L_256_mod_inv_avx2_4_uv_v
        ja      L_256_mod_inv_avx2_4_uv_u
        cmp     r9, r13
        jb      L_256_mod_inv_avx2_4_uv_v
        ja      L_256_mod_inv_avx2_4_uv_u
        cmp     rax, r12
        jb      L_256_mod_inv_avx2_4_uv_v
L_256_mod_inv_avx2_4_uv_u:
        ; u >= v: u -= v, and the same on the coefficients (the vector
        ; instructions do not touch the carry flag used by the sbb chain).
        sub     rax, r12
        sbb     r9, r13
        vpsubd  ymm0, ymm0, ymm2
        sbb     r10, r14
        vpsubd  ymm1, ymm1, ymm3
        sbb     r11, r15
        ; Top limb negative -> add the order back.
        vptest  ymm1, ymm12
        jz      L_256_mod_inv_avx2_4_usubv_done_neg
        vpaddd  ymm0, ymm0, ymm6
        vpaddd  ymm1, ymm1, ymm7
L_256_mod_inv_avx2_4_usubv_done_neg:
L_256_mod_inv_avx2_4_usubv_shr1:
        ; u >>= 1 (u is even here: difference of two odd values, or still
        ; even from the previous pass of this loop).
        shrd    rax, r9, 1
        shrd    r9, r10, 1
        shrd    r10, r11, 1
        shr     r11, 1
        vptest  ymm0, ymm8
        jz      L_256_mod_inv_avx2_4_usubv_sub_shr1
        vpaddd  ymm0, ymm0, ymm6
        vpaddd  ymm1, ymm1, ymm7
L_256_mod_inv_avx2_4_usubv_sub_shr1:
        vpand   ymm4, ymm0, ymm9
        vpand   ymm5, ymm1, ymm10
        vpermd  ymm4, ymm11, ymm4
        vpsrad  ymm0, ymm0, 1
        vpsrad  ymm1, ymm1, 1
        vpslld  ymm5, ymm5, 25
        vpslld  xmm4, xmm4, 25
        vpaddd  ymm0, ymm0, ymm5
        vpaddd  ymm1, ymm1, ymm4
        test    al, 1
        jz      L_256_mod_inv_avx2_4_usubv_shr1
        ; Finished when u == 1.
        cmp     rax, 1
        jne     L_256_mod_inv_avx2_4_uv_start
        mov     rdx, r9
        or      rdx, r10
        jne     L_256_mod_inv_avx2_4_uv_start
        or      rdx, r11
        jne     L_256_mod_inv_avx2_4_uv_start
        ; Two rounds of carry propagation between the 26-bit limbs of the
        ; u coefficient.
        vpsrad  ymm5, ymm1, 26
        vpsrad  ymm4, ymm0, 26
        vpermd  ymm5, ymm13, ymm5
        vpand   ymm0, ymm0, ymm14
        vpand   ymm1, ymm1, ymm14
        vpaddd  ymm0, ymm0, ymm5
        vpaddd  ymm1, ymm1, ymm4
        vpsrad  ymm5, ymm1, 26
        vpsrad  ymm4, ymm0, 26
        vpermd  ymm5, ymm13, ymm5
        vpand   ymm0, ymm0, ymm14
        vpand   ymm1, ymm1, ymm14
        vpaddd  ymm0, ymm0, ymm5
        vpaddd  ymm1, ymm1, ymm4
        ; Even limbs -> eax, r10d, r12d, r14d, edi; odd limbs -> r9d, r11d,
        ; r13d, r15d, esi.
        vpextrd eax, xmm0, 0
        vpextrd r10d, xmm0, 1
        vpextrd r12d, xmm0, 2
        vpextrd r14d, xmm0, 3
        vextracti128 xmm0, ymm0, 1
        vpextrd r9d, xmm1, 0
        vpextrd r11d, xmm1, 1
        vpextrd r13d, xmm1, 2
        vpextrd r15d, xmm1, 3
        vextracti128 xmm1, ymm1, 1
        vpextrd edi, xmm0, 0
        vpextrd esi, xmm1, 0
        jmp     L_256_mod_inv_avx2_4_store_done
L_256_mod_inv_avx2_4_uv_v:
        ; u < v: v -= u, mirrored on the v coefficient (ymm2, ymm3).
        sub     r12, rax
        sbb     r13, r9
        vpsubd  ymm2, ymm2, ymm0
        sbb     r14, r10
        vpsubd  ymm3, ymm3, ymm1
        sbb     r15, r11
        vptest  ymm3, ymm12
        jz      L_256_mod_inv_avx2_4_vsubu_done_neg
        vpaddd  ymm2, ymm2, ymm6
        vpaddd  ymm3, ymm3, ymm7
L_256_mod_inv_avx2_4_vsubu_done_neg:
L_256_mod_inv_avx2_4_vsubu_shr1:
        ; v >>= 1
        shrd    r12, r13, 1
        shrd    r13, r14, 1
        shrd    r14, r15, 1
        shr     r15, 1
        vptest  ymm2, ymm8
        jz      L_256_mod_inv_avx2_4_vsubu_sub_shr1
        vpaddd  ymm2, ymm2, ymm6
        vpaddd  ymm3, ymm3, ymm7
L_256_mod_inv_avx2_4_vsubu_sub_shr1:
        vpand   ymm4, ymm2, ymm9
        vpand   ymm5, ymm3, ymm10
        vpermd  ymm4, ymm11, ymm4
        vpsrad  ymm2, ymm2, 1
        vpsrad  ymm3, ymm3, 1
        vpslld  ymm5, ymm5, 25
        vpslld  xmm4, xmm4, 25
        vpaddd  ymm2, ymm2, ymm5
        vpaddd  ymm3, ymm3, ymm4
        test    r12b, 1
        jz      L_256_mod_inv_avx2_4_vsubu_shr1
        ; Finished when v == 1.
        cmp     r12, 1
        jne     L_256_mod_inv_avx2_4_uv_start
        mov     rdx, r13
        or      rdx, r14
        jne     L_256_mod_inv_avx2_4_uv_start
        or      rdx, r15
        jne     L_256_mod_inv_avx2_4_uv_start
        ; Two rounds of carry propagation on the v coefficient.
        vpsrad  ymm5, ymm3, 26
        vpsrad  ymm4, ymm2, 26
        vpermd  ymm5, ymm13, ymm5
        vpand   ymm2, ymm2, ymm14
        vpand   ymm3, ymm3, ymm14
        vpaddd  ymm2, ymm2, ymm5
        vpaddd  ymm3, ymm3, ymm4
        vpsrad  ymm5, ymm3, 26
        vpsrad  ymm4, ymm2, 26
        vpermd  ymm5, ymm13, ymm5
        vpand   ymm2, ymm2, ymm14
        vpand   ymm3, ymm3, ymm14
        vpaddd  ymm2, ymm2, ymm5
        vpaddd  ymm3, ymm3, ymm4
        vpextrd eax, xmm2, 0
        vpextrd r10d, xmm2, 1
        vpextrd r12d, xmm2, 2
        vpextrd r14d, xmm2, 3
        vextracti128 xmm2, ymm2, 1
        vpextrd r9d, xmm3, 0
        vpextrd r11d, xmm3, 1
        vpextrd r13d, xmm3, 2
        vpextrd r15d, xmm3, 3
        vextracti128 xmm3, ymm3, 1
        vpextrd edi, xmm2, 0
        vpextrd esi, xmm3, 0
L_256_mod_inv_avx2_4_store_done:
        ; Join limb pairs into 52-bit values: even limb (sign-extended) plus
        ; odd limb << 26.
        movsxd  rax, eax
        shl     r9, 26
        add     rax, r9
        movsxd  r10, r10d
        shl     r11, 26
        add     r10, r11
        movsxd  r12, r12d
        shl     r13, 26
        add     r12, r13
        movsxd  r14, r14d
        shl     r15, 26
        add     r14, r15
        movsxd  rdi, edi
        shl     rsi, 26
        add     rdi, rsi
        ; Pack the five 52-bit values into four 64-bit words.
        mov     r9, r10
        mov     r11, r12
        mov     r13, r14
        shl     r9, 52
        sar     r10, 12
        shl     r11, 40
        sar     r12, 24
        shl     r13, 28
        sar     r14, 36
        shl     rdi, 16
        add     rax, r9
        adc     r10, r11
        adc     r12, r13
        adc     r14, rdi
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], r12
        mov     QWORD PTR [rcx+24], r14
        ; Restore the caller's non-volatile XMM registers.
        vmovdqu xmm6, OWORD PTR [rsp]
        vmovdqu xmm7, OWORD PTR [rsp+16]
        vmovdqu xmm8, OWORD PTR [rsp+32]
        vmovdqu xmm9, OWORD PTR [rsp+48]
        vmovdqu xmm10, OWORD PTR [rsp+64]
        vmovdqu xmm11, OWORD PTR [rsp+80]
        vmovdqu xmm12, OWORD PTR [rsp+96]
        vmovdqu xmm13, OWORD PTR [rsp+112]
        vmovdqu xmm14, OWORD PTR [rsp+128]
        add     rsp, 144
        vzeroupper
        ; Return MP_OKAY as documented (rax held the low result word).
        xor     eax, eax
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_256_mod_inv_avx2_4 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFDEF WOLFSSL_SP_384
|
|
; One product-scanning step for sp_384_mul_6:
;   (accHi:accMid:accLo) += A[aOff/8] * B[bOff/8]
; with A addressed through r9 and B through r8. Clobbers rax, rdx and flags.
sp_384_mul_6_step MACRO aOff, bOff, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+bOff]
        mul     QWORD PTR [r9+aOff]
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
        ENDM

; /* Multiply a and b into r. (r = a * b)
;  *
;  * Column-by-column (product-scanning) 6x6 word multiplication. A three
;  * register accumulator rotates through r10, r11 and r12. Result words 0-5
;  * are staged on the stack and copied to r at the end; words 6-11 are
;  * written to r directly.
;  *
;  * r  (rcx) Result, 12 words.
;  * a  (rdx) First operand, 6 words (copied to r9 because mul uses rdx).
;  * b  (r8)  Second operand, 6 words.
;  */
_text SEGMENT READONLY PARA
sp_384_mul_6 PROC
        push    r12
        mov     r9, rdx
        sub     rsp, 48
        ; Column 0: A[0] * B[0]
        mov     rax, QWORD PTR [r8]
        mul     QWORD PTR [r9]
        xor     r12, r12
        mov     QWORD PTR [rsp], rax
        mov     r11, rdx
        ; Column 1 (accumulator r10:r12:r11)
        xor     r10, r10
        sp_384_mul_6_step 0, 8, r11, r12, r10
        sp_384_mul_6_step 8, 0, r11, r12, r10
        mov     QWORD PTR [rsp+8], r11
        ; Column 2 (accumulator r11:r10:r12)
        xor     r11, r11
        sp_384_mul_6_step 0, 16, r12, r10, r11
        sp_384_mul_6_step 8, 8, r12, r10, r11
        sp_384_mul_6_step 16, 0, r12, r10, r11
        mov     QWORD PTR [rsp+16], r12
        ; Column 3 (accumulator r12:r11:r10)
        xor     r12, r12
        sp_384_mul_6_step 0, 24, r10, r11, r12
        sp_384_mul_6_step 8, 16, r10, r11, r12
        sp_384_mul_6_step 16, 8, r10, r11, r12
        sp_384_mul_6_step 24, 0, r10, r11, r12
        mov     QWORD PTR [rsp+24], r10
        ; Column 4 (accumulator r10:r12:r11)
        xor     r10, r10
        sp_384_mul_6_step 0, 32, r11, r12, r10
        sp_384_mul_6_step 8, 24, r11, r12, r10
        sp_384_mul_6_step 16, 16, r11, r12, r10
        sp_384_mul_6_step 24, 8, r11, r12, r10
        sp_384_mul_6_step 32, 0, r11, r12, r10
        mov     QWORD PTR [rsp+32], r11
        ; Column 5 (accumulator r11:r10:r12)
        xor     r11, r11
        sp_384_mul_6_step 0, 40, r12, r10, r11
        sp_384_mul_6_step 8, 32, r12, r10, r11
        sp_384_mul_6_step 16, 24, r12, r10, r11
        sp_384_mul_6_step 24, 16, r12, r10, r11
        sp_384_mul_6_step 32, 8, r12, r10, r11
        sp_384_mul_6_step 40, 0, r12, r10, r11
        mov     QWORD PTR [rsp+40], r12
        ; Column 6 (accumulator r12:r11:r10)
        xor     r12, r12
        sp_384_mul_6_step 8, 40, r10, r11, r12
        sp_384_mul_6_step 16, 32, r10, r11, r12
        sp_384_mul_6_step 24, 24, r10, r11, r12
        sp_384_mul_6_step 32, 16, r10, r11, r12
        sp_384_mul_6_step 40, 8, r10, r11, r12
        mov     QWORD PTR [rcx+48], r10
        ; Column 7 (accumulator r10:r12:r11)
        xor     r10, r10
        sp_384_mul_6_step 16, 40, r11, r12, r10
        sp_384_mul_6_step 24, 32, r11, r12, r10
        sp_384_mul_6_step 32, 24, r11, r12, r10
        sp_384_mul_6_step 40, 16, r11, r12, r10
        mov     QWORD PTR [rcx+56], r11
        ; Column 8 (accumulator r11:r10:r12)
        xor     r11, r11
        sp_384_mul_6_step 24, 40, r12, r10, r11
        sp_384_mul_6_step 32, 32, r12, r10, r11
        sp_384_mul_6_step 40, 24, r12, r10, r11
        mov     QWORD PTR [rcx+64], r12
        ; Column 9 (accumulator r12:r11:r10)
        xor     r12, r12
        sp_384_mul_6_step 32, 40, r10, r11, r12
        sp_384_mul_6_step 40, 32, r10, r11, r12
        mov     QWORD PTR [rcx+72], r10
        ; Column 10: A[5] * B[5] completes words 10 and 11
        mov     rax, QWORD PTR [r8+40]
        mul     QWORD PTR [r9+40]
        add     r11, rax
        adc     r12, rdx
        mov     QWORD PTR [rcx+80], r11
        mov     QWORD PTR [rcx+88], r12
        ; Copy the staged low half (words 0-5) from the stack into r.
        mov     rax, QWORD PTR [rsp]
        mov     rdx, QWORD PTR [rsp+8]
        mov     r10, QWORD PTR [rsp+16]
        mov     r11, QWORD PTR [rsp+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], rdx
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        mov     rax, QWORD PTR [rsp+32]
        mov     rdx, QWORD PTR [rsp+40]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], rdx
        add     rsp, 48
        pop     r12
        ret
sp_384_mul_6 ENDP
_text ENDS
|
|
; Off-diagonal step for sp_384_sqr_6: the product A[loOff/8] * A[hiOff/8] is
; added to (accHi:accMid:accLo) twice. Clobbers rax, rdx and flags.
sp_384_sqr_6_dbl MACRO loOff, hiOff, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+hiOff]
        mul     QWORD PTR [r8+loOff]
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
        ENDM

; Diagonal step for sp_384_sqr_6: A[aOff/8]^2 is added to
; (accHi:accMid:accLo) once. Clobbers rax, rdx and flags.
sp_384_sqr_6_diag MACRO aOff, accLo, accMid, accHi
        mov     rax, QWORD PTR [r8+aOff]
        mul     rax
        add     accLo, rax
        adc     accMid, rdx
        adc     accHi, 0
        ENDM

; /* Square a and put result in r. (r = a * a)
;  *
;  * Column-by-column squaring with a rotating accumulator in r9, r10, r11.
;  * Column 5 sums its three cross products in r12:r13:r14 first and doubles
;  * that sum once. Result words 0-5 are staged on the stack and copied to r
;  * at the end; words 6-11 are written to r directly.
;  *
;  * r  (rcx) Result, 12 words.
;  * a  (rdx) Operand, 6 words (copied to r8 because mul uses rdx).
;  */
_text SEGMENT READONLY PARA
sp_384_sqr_6 PROC
        push    r12
        push    r13
        push    r14
        mov     r8, rdx
        sub     rsp, 48
        ; Column 0: A[0]^2
        mov     rax, QWORD PTR [r8]
        mul     rax
        xor     r11, r11
        mov     QWORD PTR [rsp], rax
        mov     r10, rdx
        ; Column 1 (accumulator r9:r11:r10)
        xor     r9, r9
        sp_384_sqr_6_dbl 0, 8, r10, r11, r9
        mov     QWORD PTR [rsp+8], r10
        ; Column 2 (accumulator r10:r9:r11)
        xor     r10, r10
        sp_384_sqr_6_dbl 0, 16, r11, r9, r10
        sp_384_sqr_6_diag 8, r11, r9, r10
        mov     QWORD PTR [rsp+16], r11
        ; Column 3 (accumulator r11:r10:r9)
        xor     r11, r11
        sp_384_sqr_6_dbl 0, 24, r9, r10, r11
        sp_384_sqr_6_dbl 8, 16, r9, r10, r11
        mov     QWORD PTR [rsp+24], r9
        ; Column 4 (accumulator r9:r11:r10)
        xor     r9, r9
        sp_384_sqr_6_dbl 0, 32, r10, r11, r9
        sp_384_sqr_6_dbl 8, 24, r10, r11, r9
        sp_384_sqr_6_diag 16, r10, r11, r9
        mov     QWORD PTR [rsp+32], r10
        ; Column 5: r14:r13:r12 = A[0]*A[5] + A[1]*A[4] + A[2]*A[3]
        xor     r10, r10
        xor     r14, r14
        mov     rax, QWORD PTR [r8+40]
        mul     QWORD PTR [r8]
        mov     r12, rax
        mov     r13, rdx
        mov     rax, QWORD PTR [r8+32]
        mul     QWORD PTR [r8+8]
        add     r12, rax
        adc     r13, rdx
        adc     r14, 0
        mov     rax, QWORD PTR [r8+24]
        mul     QWORD PTR [r8+16]
        add     r12, rax
        adc     r13, rdx
        adc     r14, 0
        ; double the cross-product sum, then fold it into r10:r9:r11
        add     r12, r12
        adc     r13, r13
        adc     r14, r14
        add     r11, r12
        adc     r9, r13
        adc     r10, r14
        mov     QWORD PTR [rsp+40], r11
        ; Column 6 (accumulator r11:r10:r9)
        xor     r11, r11
        sp_384_sqr_6_dbl 8, 40, r9, r10, r11
        sp_384_sqr_6_dbl 16, 32, r9, r10, r11
        sp_384_sqr_6_diag 24, r9, r10, r11
        mov     QWORD PTR [rcx+48], r9
        ; Column 7 (accumulator r9:r11:r10)
        xor     r9, r9
        sp_384_sqr_6_dbl 16, 40, r10, r11, r9
        sp_384_sqr_6_dbl 24, 32, r10, r11, r9
        mov     QWORD PTR [rcx+56], r10
        ; Column 8 (accumulator r10:r9:r11)
        xor     r10, r10
        sp_384_sqr_6_dbl 24, 40, r11, r9, r10
        sp_384_sqr_6_diag 32, r11, r9, r10
        mov     QWORD PTR [rcx+64], r11
        ; Column 9 (accumulator r11:r10:r9)
        xor     r11, r11
        sp_384_sqr_6_dbl 32, 40, r9, r10, r11
        mov     QWORD PTR [rcx+72], r9
        ; Column 10: A[5]^2 completes words 10 and 11
        mov     rax, QWORD PTR [r8+40]
        mul     rax
        add     r10, rax
        adc     r11, rdx
        mov     QWORD PTR [rcx+80], r10
        mov     QWORD PTR [rcx+88], r11
        ; Copy the staged low half (words 0-5) from the stack into r.
        mov     rax, QWORD PTR [rsp]
        mov     rdx, QWORD PTR [rsp+8]
        mov     r12, QWORD PTR [rsp+16]
        mov     r13, QWORD PTR [rsp+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], rdx
        mov     QWORD PTR [rcx+16], r12
        mov     QWORD PTR [rcx+24], r13
        mov     rax, QWORD PTR [rsp+32]
        mov     rdx, QWORD PTR [rsp+40]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], rdx
        add     rsp, 48
        pop     r14
        pop     r13
        pop     r12
        ret
sp_384_sqr_6 ENDP
_text ENDS
|
|
; /* Add b to a into r. (r = a + b)
;  *
;  * Six-word add with a single carry chain. Two scratch registers alternate
;  * so each sum is stored while the next word is already loaded; mov does
;  * not disturb the carry flag.
;  *
;  * r  (rcx) Result, 6 words.
;  * a  (rdx) First operand, 6 words.
;  * b  (r8)  Second operand, 6 words.
;  * Returns the carry out of the top word (0 or 1) in rax.
;  */
_text SEGMENT READONLY PARA
sp_384_add_6 PROC
        xor     eax, eax                        ; rax = 0, before the carry chain starts
        mov     r10, QWORD PTR [rdx]
        add     r10, QWORD PTR [r8]
        mov     r11, QWORD PTR [rdx+8]
        mov     QWORD PTR [rcx], r10
        adc     r11, QWORD PTR [r8+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     QWORD PTR [rcx+8], r11
        adc     r10, QWORD PTR [r8+16]
        mov     r11, QWORD PTR [rdx+24]
        mov     QWORD PTR [rcx+16], r10
        adc     r11, QWORD PTR [r8+24]
        mov     r10, QWORD PTR [rdx+32]
        mov     QWORD PTR [rcx+24], r11
        adc     r10, QWORD PTR [r8+32]
        mov     r11, QWORD PTR [rdx+40]
        mov     QWORD PTR [rcx+32], r10
        adc     r11, QWORD PTR [r8+40]
        mov     QWORD PTR [rcx+40], r11
        adc     rax, 0                          ; rax = final carry
        ret
sp_384_add_6 ENDP
_text ENDS
|
|
; /* Sub b from a into r. (r = a - b)
;  *
;  * Six-word subtract. Every word of a and b is read before any word of r
;  * is written.
;  *
;  * r  (rcx) Result, 6 words.
;  * a  (rdx) Minuend, 6 words.
;  * b  (r8)  Subtrahend, 6 words.
;  * Returns 0 in rax when there is no borrow out of the top word, otherwise
;  * all ones (-1).
;  */
_text SEGMENT READONLY PARA
sp_384_sub_6 PROC
        push    r12
        push    r13
        push    r14
        xor     eax, eax
        ; Load each word of a and subtract the matching word of b right away;
        ; mov leaves the borrow flag untouched between the sbb steps.
        mov     r9, QWORD PTR [rdx]
        sub     r9, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx+8]
        sbb     r10, QWORD PTR [r8+8]
        mov     r11, QWORD PTR [rdx+16]
        sbb     r11, QWORD PTR [r8+16]
        mov     r12, QWORD PTR [rdx+24]
        sbb     r12, QWORD PTR [r8+24]
        mov     r13, QWORD PTR [rdx+32]
        sbb     r13, QWORD PTR [r8+32]
        mov     r14, QWORD PTR [rdx+40]
        sbb     r14, QWORD PTR [r8+40]
        sbb     rax, 0                          ; rax = 0 - borrow
        mov     QWORD PTR [rcx], r9
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], r11
        mov     QWORD PTR [rcx+24], r12
        mov     QWORD PTR [rcx+32], r13
        mov     QWORD PTR [rcx+40], r14
        pop     r14
        pop     r13
        pop     r12
        ret
sp_384_sub_6 ENDP
_text ENDS
|
|
; /* Conditionally copy a into r using the mask m.
;  * m is -1 to copy and 0 when not.
;  *
;  * Branch-free: for every word, r ^= (r ^ a) & m. With m = 0 the word is
;  * XORed with zero and keeps its value; with m = -1 it becomes the word of a.
;  *
;  * r  (rcx) A single precision number to copy over, 6 words.
;  * a  (rdx) A single precision number to copy, 6 words.
;  * m  (r8)  Mask value to apply.
;  */
_text SEGMENT READONLY PARA
sp_384_cond_copy_6 PROC
        push    r12
        push    r13
        ; Build (r[i] ^ a[i]) & m for each word.
        mov     rax, QWORD PTR [rcx]
        xor     rax, QWORD PTR [rdx]
        and     rax, r8
        mov     r9, QWORD PTR [rcx+8]
        xor     r9, QWORD PTR [rdx+8]
        and     r9, r8
        mov     r10, QWORD PTR [rcx+16]
        xor     r10, QWORD PTR [rdx+16]
        and     r10, r8
        mov     r11, QWORD PTR [rcx+24]
        xor     r11, QWORD PTR [rdx+24]
        and     r11, r8
        mov     r12, QWORD PTR [rcx+32]
        xor     r12, QWORD PTR [rdx+32]
        and     r12, r8
        mov     r13, QWORD PTR [rcx+40]
        xor     r13, QWORD PTR [rdx+40]
        and     r13, r8
        ; Apply the masked differences to r.
        xor     QWORD PTR [rcx], rax
        xor     QWORD PTR [rcx+8], r9
        xor     QWORD PTR [rcx+16], r10
        xor     QWORD PTR [rcx+24], r11
        xor     QWORD PTR [rcx+32], r12
        xor     QWORD PTR [rcx+40], r13
        pop     r13
        pop     r12
        ret
sp_384_cond_copy_6 ENDP
_text ENDS
|
|
; /* Conditionally subtract b from a using the mask m.
;  * m is -1 to subtract and 0 when not.
;  *
;  * The masked copy of b is built on the stack first, because the AND would
;  * destroy the borrow flag if it were interleaved with the sbb chain.
;  *
;  * r  (rcx) A single precision number representing the conditional subtract
;  *          result, 6 words.
;  * a  (rdx) A single precision number to subtract from, 6 words.
;  * b  (r8)  A single precision number to subtract, 6 words.
;  * m  (r9)  Mask value to apply.
;  * Returns 0 in rax when there is no borrow out of the top word, otherwise
;  * all ones (-1).
;  */
_text SEGMENT READONLY PARA
sp_384_cond_sub_6 PROC
        sub     rsp, 48
        xor     eax, eax
        ; Stage b & m on the stack, one word at a time.
        FOR limbOff, <0, 8, 16, 24, 32, 40>
        mov     r10, QWORD PTR [r8+limbOff]
        and     r10, r9
        mov     QWORD PTR [rsp+limbOff], r10
        ENDM
        ; r = a - (b & m); r10 and r11 alternate so each difference is stored
        ; after the next word has been computed.
        mov     r10, QWORD PTR [rdx]
        sub     r10, QWORD PTR [rsp]
        mov     r11, QWORD PTR [rdx+8]
        sbb     r11, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx], r10
        mov     r10, QWORD PTR [rdx+16]
        sbb     r10, QWORD PTR [rsp+16]
        mov     QWORD PTR [rcx+8], r11
        mov     r11, QWORD PTR [rdx+24]
        sbb     r11, QWORD PTR [rsp+24]
        mov     QWORD PTR [rcx+16], r10
        mov     r10, QWORD PTR [rdx+32]
        sbb     r10, QWORD PTR [rsp+32]
        mov     QWORD PTR [rcx+24], r11
        mov     r11, QWORD PTR [rdx+40]
        sbb     r11, QWORD PTR [rsp+40]
        mov     QWORD PTR [rcx+32], r10
        mov     QWORD PTR [rcx+40], r11
        sbb     rax, 0                          ; rax = 0 - borrow
        add     rsp, 48
        ret
sp_384_cond_sub_6 ENDP
_text ENDS
|
|
; /* Reduce the number back to 384 bits using Montgomery reduction.
|
|
; *
|
|
; * a A single precision number to reduce in place (rcx). Words 0-11 are read;
; * the result is written to words 0-5.
|
|
; * m The single precision number representing the modulus (rdx). Not read:
; * rdx is overwritten as scratch, the reduction uses fixed shift/add steps.
|
|
; * mp The digit representing the negative inverse of m mod 2^n (r8). Not read:
; * r8 is overwritten as scratch.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_384_mont_reduce_6 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
push rbp
|
|
; Working window: r12, r13, r14, r15, rdi, rsi = a[0..5]; r11 = carry word.
mov r12, QWORD PTR [rcx]
|
|
mov r13, QWORD PTR [rcx+8]
|
|
mov r14, QWORD PTR [rcx+16]
|
|
mov r15, QWORD PTR [rcx+24]
|
|
mov rdi, QWORD PTR [rcx+32]
|
|
mov rsi, QWORD PTR [rcx+40]
|
|
xor r11, r11
|
|
; a[0-7] += m[0-5] * mu[0..1] = m[0-5] * (a[0..1] * mp)
|
|
mov rbx, QWORD PTR [rcx+48]
|
|
mov rbp, QWORD PTR [rcx+56]
|
|
; mu (rax:rdx) = (a[1]:a[0]) * (2^64 + 2^32 + 1) mod 2^128, built from a
; 32-bit left shift and two additions instead of a multiply by mp.
mov rdx, r12
|
|
mov rax, r13
|
|
shld rax, rdx, 32
|
|
shl rdx, 32
|
|
add rdx, r12
|
|
adc rax, r13
|
|
add rax, r12
|
|
; r10:r9:r8 = mu << 32 (three words).
mov r8, rdx
|
|
mov r9, rax
|
|
mov r10, rax
|
|
shld r9, r8, 32
|
|
shl r8, 32
|
|
shr r10, 32
|
|
; Add mu << 32 at word 0 and mu at word 6; the carry out goes to r11.
add r12, r8
|
|
adc r13, r9
|
|
adc r14, r10
|
|
adc r15, 0
|
|
adc rdi, 0
|
|
adc rsi, 0
|
|
adc rbx, rdx
|
|
adc rbp, rax
|
|
adc r11, 0
|
|
; Correction term: add mu's high word at word 0 and mu at word 1 of
; r10:r9:r8, keep the carry in rax (mov does not change the carry flag), then
; subtract rax:r10:r9 starting at word 2. r8 only feeds the carry into r9.
add r8, rax
|
|
adc r9, rdx
|
|
adc r10, rax
|
|
mov rax, 0
|
|
adc rax, 0
|
|
sub r14, r9
|
|
sbb r15, r10
|
|
sbb rdi, rax
|
|
sbb rsi, 0
|
|
sbb rbx, 0
|
|
sbb rbp, 0
|
|
sbb r11, 0
|
|
; a[2-9] += m[0-5] * mu[0..1] = m[0-5] * (a[2..3] * mp)
|
|
; Same steps shifted up two words; r12 and r13 are reused for a[8], a[9].
mov r12, QWORD PTR [rcx+64]
|
|
mov r13, QWORD PTR [rcx+72]
|
|
mov rdx, r14
|
|
mov rax, r15
|
|
shld rax, rdx, 32
|
|
shl rdx, 32
|
|
add rdx, r14
|
|
adc rax, r15
|
|
add rax, r14
|
|
mov r8, rdx
|
|
mov r9, rax
|
|
mov r10, rax
|
|
shld r9, r8, 32
|
|
shl r8, 32
|
|
shr r10, 32
|
|
; Fold the carry word left by the previous round into a[8], a[9] and start a
; fresh carry word in r11.
add r12, r11
|
|
adc r13, 0
|
|
mov r11, 0
|
|
adc r11, 0
|
|
add r14, r8
|
|
adc r15, r9
|
|
adc rdi, r10
|
|
adc rsi, 0
|
|
adc rbx, 0
|
|
adc rbp, 0
|
|
adc r12, rdx
|
|
adc r13, rax
|
|
adc r11, 0
|
|
add r8, rax
|
|
adc r9, rdx
|
|
adc r10, rax
|
|
mov rax, 0
|
|
adc rax, 0
|
|
sub rdi, r9
|
|
sbb rsi, r10
|
|
sbb rbx, rax
|
|
sbb rbp, 0
|
|
sbb r12, 0
|
|
sbb r13, 0
|
|
sbb r11, 0
|
|
; a[4-11] += m[0-5] * mu[0..1] = m[0-5] * (a[4..5] * mp)
|
|
; Same steps shifted up four words; r14 and r15 are reused for a[10], a[11].
mov r14, QWORD PTR [rcx+80]
|
|
mov r15, QWORD PTR [rcx+88]
|
|
mov rdx, rdi
|
|
mov rax, rsi
|
|
shld rax, rdx, 32
|
|
shl rdx, 32
|
|
add rdx, rdi
|
|
adc rax, rsi
|
|
add rax, rdi
|
|
mov r8, rdx
|
|
mov r9, rax
|
|
mov r10, rax
|
|
shld r9, r8, 32
|
|
shl r8, 32
|
|
shr r10, 32
|
|
add r14, r11
|
|
adc r15, 0
|
|
mov r11, 0
|
|
adc r11, 0
|
|
add rdi, r8
|
|
adc rsi, r9
|
|
adc rbx, r10
|
|
adc rbp, 0
|
|
adc r12, 0
|
|
adc r13, 0
|
|
adc r14, rdx
|
|
adc r15, rax
|
|
adc r11, 0
|
|
add r8, rax
|
|
adc r9, rdx
|
|
adc r10, rax
|
|
mov rax, 0
|
|
adc rax, 0
|
|
sub rbx, r9
|
|
sbb rbp, r10
|
|
sbb r12, rax
|
|
sbb r13, 0
|
|
sbb r14, 0
|
|
sbb r15, 0
|
|
sbb r11, 0
|
|
; Subtract mod if carry
|
|
; neg turns a carry word of 1 into an all-ones mask (0 stays 0). The six words
; subtracted from rbx, rbp, r12, r13, r14, r15 are then
; mask & {0x00000000FFFFFFFF, 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFE, -1, -1, -1},
; presumably the words of the P-384 prime -- confirm against the C-side
; modulus constant.
neg r11
|
|
mov r10, 18446744073709551614
|
|
mov r8, r11
|
|
mov r9, r11
|
|
shr r8, 32
|
|
shl r9, 32
|
|
and r10, r11
|
|
sub rbx, r8
|
|
sbb rbp, r9
|
|
sbb r12, r10
|
|
sbb r13, r11
|
|
sbb r14, r11
|
|
sbb r15, r11
|
|
; Store the reduced value to a[0..5].
mov QWORD PTR [rcx], rbx
|
|
mov QWORD PTR [rcx+8], rbp
|
|
mov QWORD PTR [rcx+16], r12
|
|
mov QWORD PTR [rcx+24], r13
|
|
mov QWORD PTR [rcx+32], r14
|
|
mov QWORD PTR [rcx+40], r15
|
|
pop rbp
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_384_mont_reduce_6 ENDP
|
|
_text ENDS
|
|
; /* Reduce the number back to 384 bits using Montgomery reduction.
;  *
;  * Word-by-word form that reads the modulus and mp at run time. Six rounds:
;  * in round i, mu = a[i] * mp (low 64 bits) and m * mu is added into
;  * a[i..i+6]. Afterwards the modulus is conditionally subtracted from the
;  * upper six words and the result is written to a[0..5] by
;  * sp_384_cond_sub_6.
;  *
;  * a   (rcx) A single precision number to reduce in place; words 0-11 are
;  *           accessed.
;  * m   (rdx) The single precision number representing the modulus, 6 words.
;  * mp  (r8)  The digit representing the negative inverse of m mod 2^n.
;  */
_text SEGMENT READONLY PARA
sp_384_mont_reduce_order_6 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        mov     r9, rdx                         ; r9 = m (mul overwrites rdx)
        xor     rsi, rsi                        ; rsi = carry out of the top word
        ; i = 6
        mov     r10, 6
        ; a[i] and a[i+1] are carried in r15 and rdi between rounds.
        mov     r15, QWORD PTR [rcx]
        mov     rdi, QWORD PTR [rcx+8]
L_384_mont_loop_order_6:
        ; mu = a[i] * mp
        mov     r13, r15
        imul    r13, r8
        ; a[i+0] += m[0] * mu  (only the carry into the next word is kept)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9]
        add     r15, rax
        adc     r12, rdx
        ; a[i+1] += m[1] * mu  (becomes the next round's a[i], in r15)
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+8]
        mov     r15, rdi
        add     r15, rax
        adc     r11, rdx
        add     r15, r12
        adc     r11, 0
        ; a[i+2] += m[2] * mu  (becomes the next round's a[i+1], in rdi)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+16]
        mov     rdi, QWORD PTR [rcx+16]
        add     rdi, rax
        adc     r12, rdx
        add     rdi, r11
        adc     r12, 0
        ; a[i+3] += m[3] * mu
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+24]
        mov     r14, QWORD PTR [rcx+24]
        add     r14, rax
        adc     r11, rdx
        add     r14, r12
        mov     QWORD PTR [rcx+24], r14
        adc     r11, 0
        ; a[i+4] += m[4] * mu
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+32]
        mov     r14, QWORD PTR [rcx+32]
        add     r14, rax
        adc     r12, rdx
        add     r14, r11
        mov     QWORD PTR [rcx+32], r14
        adc     r12, 0
        ; a[i+5] += m[5] * mu, then a[i+6] += high word + previous carry;
        ; rsi collects the carry out of a[i+6] for the next round.
        mov     rax, r13
        mul     QWORD PTR [r9+40]
        mov     r14, QWORD PTR [rcx+40]
        add     r12, rax
        adc     rdx, rsi
        mov     rsi, 0                          ; mov keeps the carry flag intact
        adc     rsi, 0
        add     r14, r12
        mov     QWORD PTR [rcx+40], r14
        adc     QWORD PTR [rcx+48], rdx
        adc     rsi, 0
        ; i -= 1
        add     rcx, 8
        dec     r10
        jnz     L_384_mont_loop_order_6
        ; rcx now points at the original a[6]; write back the two cached words.
        mov     QWORD PTR [rcx], r15
        mov     QWORD PTR [rcx+8], rdi
        neg     rsi                             ; carry -> mask for cond_sub
        ; Arguments for sp_384_cond_sub_6(r, a, b, m). The modulus pointer
        ; must be copied out of r9 BEFORE r9 receives the mask.
        mov     r8, r9                          ; b = modulus
        mov     r9, rsi                         ; m = mask
        mov     rdx, rcx                        ; a = upper half
        sub     rcx, 48                         ; r = start of the original a
        ; 32 bytes of shadow space plus 8 bytes so rsp is 16-byte aligned at
        ; the call, as the Win64 calling convention requires.
        sub     rsp, 40
        call    sp_384_cond_sub_6
        add     rsp, 40
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_384_mont_reduce_order_6 ENDP
_text ENDS
|
|
; /* Compare a with b in constant time.
;  *
;  * Words are examined from most to least significant with no branches. r8
;  * is a "still undecided" mask: it starts as all ones and is cleared by the
;  * first pair of words that differ, so later words are masked to zero and
;  * cannot change the outcome.
;  *
;  * a  (rcx) A single precision integer, 6 words.
;  * b  (rdx) A single precision integer, 6 words.
;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
;  * respectively.
;  */
_text SEGMENT READONLY PARA
sp_384_cmp_6 PROC
        push    r12
        xor     r9, r9                          ; constant 0: cleared mask
        mov     r8, -1                          ; mask: all ones while undecided
        mov     rax, -1                         ; result, fixed up by the final xor
        mov     r10, 1                          ; constant 1: "a is greater"
        FOR limbOff, <40, 32, 24, 16, 8, 0>
        mov     r11, QWORD PTR [rcx+limbOff]
        mov     r12, QWORD PTR [rdx+limbOff]
        and     r11, r8
        and     r12, r8
        sub     r11, r12
        cmova   rax, r10                        ; a word above b word -> 1
        cmovc   rax, r8                         ; a word below b word -> mask (-1)
        cmovnz  r8, r9                          ; words differ -> stop comparing
        ENDM
        ; All words equal: rax = -1 and r8 = -1 give 0. Otherwise r8 = 0 and
        ; rax keeps its 1 or -1.
        xor     rax, r8
        pop     r12
        ret
sp_384_cmp_6 ENDP
_text ENDS
|
|
; /* Add a to a into r. (r = a + a)
;  *
;  * rcx  r  A single precision integer (receives the six result words).
;  * rdx  a  A single precision integer.
;  * return (rax) the carry out of the most significant word.
;  */
_text SEGMENT READONLY PARA
sp_384_dbl_6 PROC
        xor     rax, rax                        ; carry-out accumulator
        mov     r8, QWORD PTR [rdx]
        add     r8, r8                          ; starts the carry chain
        mov     r9, QWORD PTR [rdx+8]
        adc     r9, r9
        mov     QWORD PTR [rcx], r8
        mov     r8, QWORD PTR [rdx+16]
        adc     r8, r8
        mov     QWORD PTR [rcx+8], r9
        mov     r9, QWORD PTR [rdx+24]
        adc     r9, r9
        mov     QWORD PTR [rcx+16], r8
        mov     r8, QWORD PTR [rdx+32]
        adc     r8, r8
        mov     QWORD PTR [rcx+24], r9
        mov     r9, QWORD PTR [rdx+40]
        adc     r9, r9
        mov     QWORD PTR [rcx+32], r8
        mov     QWORD PTR [rcx+40], r9
        adc     rax, 0                          ; rax = final carry
        ret
sp_384_dbl_6 ENDP
_text ENDS
|
|
; /* Conditionally add a and b using the mask m.
;  * m is -1 to add and 0 when not.
;  *
;  * b is first AND-ed with m into a 48-byte stack buffer, then that buffer is
;  * added to a, so the same instructions execute for either mask value.
;  *
;  * rcx  r  A single precision number representing conditional add result.
;  * rdx  a  A single precision number to add with.
;  * r8   b  A single precision number to add.
;  * r9   m  Mask value to apply.
;  * return (rax) the carry out of the most significant word.
;  */
_text SEGMENT READONLY PARA
sp_384_cond_add_6 PROC
        sub     rsp, 48
        xor     eax, eax                        ; rax = 0, receives the carry
        ; Stage 1: tmp[i] = b[i] & m
        mov     r10, QWORD PTR [r8]
        and     r10, r9
        mov     QWORD PTR [rsp], r10
        mov     r11, QWORD PTR [r8+8]
        and     r11, r9
        mov     QWORD PTR [rsp+8], r11
        mov     r10, QWORD PTR [r8+16]
        and     r10, r9
        mov     QWORD PTR [rsp+16], r10
        mov     r11, QWORD PTR [r8+24]
        and     r11, r9
        mov     QWORD PTR [rsp+24], r11
        mov     r10, QWORD PTR [r8+32]
        and     r10, r9
        mov     QWORD PTR [rsp+32], r10
        mov     r11, QWORD PTR [r8+40]
        and     r11, r9
        mov     QWORD PTR [rsp+40], r11
        ; Stage 2: r = a + tmp (r8 is reused as scratch from here on)
        mov     r10, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rsp]
        add     r10, r8
        mov     r11, QWORD PTR [rdx+8]
        mov     r8, QWORD PTR [rsp+8]
        adc     r11, r8
        mov     QWORD PTR [rcx], r10
        mov     r10, QWORD PTR [rdx+16]
        mov     r8, QWORD PTR [rsp+16]
        adc     r10, r8
        mov     QWORD PTR [rcx+8], r11
        mov     r11, QWORD PTR [rdx+24]
        mov     r8, QWORD PTR [rsp+24]
        adc     r11, r8
        mov     QWORD PTR [rcx+16], r10
        mov     r10, QWORD PTR [rdx+32]
        mov     r8, QWORD PTR [rsp+32]
        adc     r10, r8
        mov     QWORD PTR [rcx+24], r11
        mov     r11, QWORD PTR [rdx+40]
        mov     r8, QWORD PTR [rsp+40]
        adc     r11, r8
        mov     QWORD PTR [rcx+32], r10
        mov     QWORD PTR [rcx+40], r11
        adc     rax, 0
        add     rsp, 48
        ret
sp_384_cond_add_6 ENDP
_text ENDS
|
|
; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
;  *
;  * When a is odd the modulus is added first (selected with a mask derived
;  * from bit 0 of a, not with a branch); the 385-bit sum is then shifted right
;  * by one bit.
;  *
;  * rcx  r  Result of division by 2.
;  * rdx  a  Number to divide.
;  * r8   m  Modulus (prime).
;  */
_text SEGMENT READONLY PARA
sp_384_div2_6 PROC
        push    r12
        push    r13
        sub     rsp, 48
        xor     r12, r12                        ; will hold the carry out of a + (m & mask)
        mov     r13, QWORD PTR [rdx]
        mov     rax, r13                        ; keep a[0] for the addition
        and     r13, 1
        neg     r13                             ; r13 = -1 when a is odd, else 0
        ; tmp = m & mask
        mov     r10, QWORD PTR [r8]
        and     r10, r13
        mov     QWORD PTR [rsp], r10
        mov     r10, QWORD PTR [r8+8]
        and     r10, r13
        mov     QWORD PTR [rsp+8], r10
        mov     r10, QWORD PTR [r8+16]
        and     r10, r13
        mov     QWORD PTR [rsp+16], r10
        mov     r10, QWORD PTR [r8+24]
        and     r10, r13
        mov     QWORD PTR [rsp+24], r10
        mov     r10, QWORD PTR [r8+32]
        and     r10, r13
        mov     QWORD PTR [rsp+32], r10
        mov     r10, QWORD PTR [r8+40]
        and     r10, r13
        mov     QWORD PTR [rsp+40], r10
        ; tmp += a
        add     QWORD PTR [rsp], rax
        mov     rax, QWORD PTR [rdx+8]
        adc     QWORD PTR [rsp+8], rax
        mov     rax, QWORD PTR [rdx+16]
        adc     QWORD PTR [rsp+16], rax
        mov     rax, QWORD PTR [rdx+24]
        adc     QWORD PTR [rsp+24], rax
        mov     rax, QWORD PTR [rdx+32]
        adc     QWORD PTR [rsp+32], rax
        mov     rax, QWORD PTR [rdx+40]
        adc     QWORD PTR [rsp+40], rax
        adc     r12, 0
        ; r = (r12:tmp) >> 1, each word taking its top bit from the next one up
        mov     rax, QWORD PTR [rsp]
        mov     r9, QWORD PTR [rsp+8]
        shrd    rax, r9, 1
        mov     QWORD PTR [rcx], rax
        mov     rax, QWORD PTR [rsp+16]
        shrd    r9, rax, 1
        mov     QWORD PTR [rcx+8], r9
        mov     r9, QWORD PTR [rsp+24]
        shrd    rax, r9, 1
        mov     QWORD PTR [rcx+16], rax
        mov     rax, QWORD PTR [rsp+32]
        shrd    r9, rax, 1
        mov     QWORD PTR [rcx+24], r9
        mov     r9, QWORD PTR [rsp+40]
        shrd    rax, r9, 1
        mov     QWORD PTR [rcx+32], rax
        shrd    r9, r12, 1
        mov     QWORD PTR [rcx+40], r9
        add     rsp, 48
        pop     r13
        pop     r12
        ret
sp_384_div2_6 ENDP
_text ENDS
|
|
IFNDEF WC_NO_CACHE_RESISTANT
|
|
; /* Touch each possible point that could be being copied.
;  *
;  * Entries 1..32 of the table (296 bytes apart) are all loaded; each is
;  * AND-ed with a mask that is all ones only when the running counter equals
;  * idx, and OR-ed into the accumulators.  idx 0 therefore yields zeros.
;  * Pass 1 gathers the 48 bytes at entry offsets 0 and 96, pass 2 the
;  * 48 bytes at entry offset 192.
;  *
;  * xmm6-xmm15 are used as scratch.  They are non-volatile in the Microsoft
;  * x64 calling convention, so they are saved on entry and restored on exit.
;  *
;  * rcx  r      Point to copy into.
;  * rdx  table  Table - start of the entries to access
;  * r8d  idx    Index of point to retrieve.
;  */
_text SEGMENT READONLY PARA
sp_384_get_point_33_6 PROC
        ; Save callee-saved XMM registers (rsp is not 16-byte aligned here,
        ; so unaligned moves are required).
        sub     rsp, 160
        movdqu  OWORD PTR [rsp], xmm6
        movdqu  OWORD PTR [rsp+16], xmm7
        movdqu  OWORD PTR [rsp+32], xmm8
        movdqu  OWORD PTR [rsp+48], xmm9
        movdqu  OWORD PTR [rsp+64], xmm10
        movdqu  OWORD PTR [rsp+80], xmm11
        movdqu  OWORD PTR [rsp+96], xmm12
        movdqu  OWORD PTR [rsp+112], xmm13
        movdqu  OWORD PTR [rsp+128], xmm14
        movdqu  OWORD PTR [rsp+144], xmm15
        ; Pass 1: bytes 0..47 and 96..143 of each entry
        mov     rax, 1
        movd    xmm13, r8d
        add     rdx, 296                        ; start at entry 1
        movd    xmm15, eax
        mov     rax, 32                         ; entries 1..32
        pshufd  xmm15, xmm15, 0                 ; xmm15 = {1,1,1,1}
        pshufd  xmm13, xmm13, 0                 ; xmm13 = {idx,idx,idx,idx}
        pxor    xmm14, xmm14
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        pxor    xmm4, xmm4
        pxor    xmm5, xmm5
        movdqa  xmm14, xmm15                    ; counter = 1
L_384_get_point_33_6_start:
        movdqa  xmm12, xmm14
        paddd   xmm14, xmm15
        pcmpeqd xmm12, xmm13                    ; mask = (counter == idx)
        movdqu  xmm6, [rdx]
        movdqu  xmm7, [rdx+16]
        movdqu  xmm8, [rdx+32]
        movdqu  xmm9, [rdx+96]
        movdqu  xmm10, [rdx+112]
        movdqu  xmm11, [rdx+128]
        add     rdx, 296
        pand    xmm6, xmm12
        pand    xmm7, xmm12
        pand    xmm8, xmm12
        pand    xmm9, xmm12
        pand    xmm10, xmm12
        pand    xmm11, xmm12
        por     xmm0, xmm6
        por     xmm1, xmm7
        por     xmm2, xmm8
        por     xmm3, xmm9
        por     xmm4, xmm10
        por     xmm5, xmm11
        dec     rax
        jnz     L_384_get_point_33_6_start
        movdqu  [rcx], xmm0
        movdqu  [rcx+16], xmm1
        movdqu  [rcx+32], xmm2
        movdqu  [rcx+96], xmm3
        movdqu  [rcx+112], xmm4
        movdqu  [rcx+128], xmm5
        ; Pass 2: bytes 192..239 of each entry
        mov     rax, 1
        movd    xmm13, r8d
        sub     rdx, 9472                       ; 32 * 296: back to entry 1
        movd    xmm15, eax
        mov     rax, 32
        pshufd  xmm15, xmm15, 0
        pshufd  xmm13, xmm13, 0
        pxor    xmm14, xmm14
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        movdqa  xmm14, xmm15
L_384_get_point_33_6_start_2:
        movdqa  xmm12, xmm14
        paddd   xmm14, xmm15
        pcmpeqd xmm12, xmm13
        movdqu  xmm6, [rdx+192]
        movdqu  xmm7, [rdx+208]
        movdqu  xmm8, [rdx+224]
        add     rdx, 296
        pand    xmm6, xmm12
        pand    xmm7, xmm12
        pand    xmm8, xmm12
        por     xmm0, xmm6
        por     xmm1, xmm7
        por     xmm2, xmm8
        dec     rax
        jnz     L_384_get_point_33_6_start_2
        movdqu  [rcx+192], xmm0
        movdqu  [rcx+208], xmm1
        movdqu  [rcx+224], xmm2
        ; Restore callee-saved XMM registers.
        movdqu  xmm6, OWORD PTR [rsp]
        movdqu  xmm7, OWORD PTR [rsp+16]
        movdqu  xmm8, OWORD PTR [rsp+32]
        movdqu  xmm9, OWORD PTR [rsp+48]
        movdqu  xmm10, OWORD PTR [rsp+64]
        movdqu  xmm11, OWORD PTR [rsp+80]
        movdqu  xmm12, OWORD PTR [rsp+96]
        movdqu  xmm13, OWORD PTR [rsp+112]
        movdqu  xmm14, OWORD PTR [rsp+128]
        movdqu  xmm15, OWORD PTR [rsp+144]
        add     rsp, 160
        ret
sp_384_get_point_33_6 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Touch each possible point that could be being copied.
;  *
;  * AVX2 variant: entries 1..32 of the table (296 bytes apart) are all
;  * loaded, masked with (counter == idx) and OR-ed into the accumulators in
;  * a single pass.  For each of the entry offsets 0, 96 and 192, 48 bytes are
;  * gathered as one 32-byte and one 16-byte load.  idx 0 yields zeros.
;  *
;  * xmm6-xmm15 are used as scratch.  They are non-volatile in the Microsoft
;  * x64 calling convention, so they are saved on entry and restored on exit.
;  *
;  * rcx  r      Point to copy into.
;  * rdx  table  Table - start of the entries to access
;  * r8d  idx    Index of point to retrieve.
;  */
_text SEGMENT READONLY PARA
sp_384_get_point_33_avx2_6 PROC
        ; Save callee-saved XMM registers (unaligned: rsp is not 16-byte
        ; aligned inside a leaf after the return address was pushed).
        sub     rsp, 160
        vmovdqu OWORD PTR [rsp], xmm6
        vmovdqu OWORD PTR [rsp+16], xmm7
        vmovdqu OWORD PTR [rsp+32], xmm8
        vmovdqu OWORD PTR [rsp+48], xmm9
        vmovdqu OWORD PTR [rsp+64], xmm10
        vmovdqu OWORD PTR [rsp+80], xmm11
        vmovdqu OWORD PTR [rsp+96], xmm12
        vmovdqu OWORD PTR [rsp+112], xmm13
        vmovdqu OWORD PTR [rsp+128], xmm14
        vmovdqu OWORD PTR [rsp+144], xmm15
        mov     rax, 1
        movd    xmm13, r8d
        add     rdx, 296                        ; start at entry 1
        movd    xmm15, eax
        mov     rax, 32                         ; entries 1..32
        vpxor   ymm14, ymm14, ymm14
        vpermd  ymm13, ymm14, ymm13             ; broadcast idx to all lanes
        vpermd  ymm15, ymm14, ymm15             ; broadcast 1 to all lanes
        vpxor   ymm0, ymm0, ymm0
        vpxor   xmm1, xmm1, xmm1
        vpxor   ymm2, ymm2, ymm2
        vpxor   xmm3, xmm3, xmm3
        vpxor   ymm4, ymm4, ymm4
        vpxor   xmm5, xmm5, xmm5
        vmovdqa ymm14, ymm15                    ; counter = 1
L_384_get_point_33_avx2_6_start:
        vpcmpeqd        ymm12, ymm14, ymm13     ; mask = (counter == idx)
        vpaddd  ymm14, ymm14, ymm15
        vmovupd ymm6, [rdx]
        vmovdqu xmm7, OWORD PTR [rdx+32]
        vmovupd ymm8, [rdx+96]
        vmovdqu xmm9, OWORD PTR [rdx+128]
        vmovupd ymm10, [rdx+192]
        vmovdqu xmm11, OWORD PTR [rdx+224]
        add     rdx, 296
        vpand   ymm6, ymm6, ymm12
        vpand   xmm7, xmm7, xmm12
        vpand   ymm8, ymm8, ymm12
        vpand   xmm9, xmm9, xmm12
        vpand   ymm10, ymm10, ymm12
        vpand   xmm11, xmm11, xmm12
        vpor    ymm0, ymm0, ymm6
        vpor    xmm1, xmm1, xmm7
        vpor    ymm2, ymm2, ymm8
        vpor    xmm3, xmm3, xmm9
        vpor    ymm4, ymm4, ymm10
        vpor    xmm5, xmm5, xmm11
        dec     rax
        jnz     L_384_get_point_33_avx2_6_start
        vmovupd YMMWORD PTR [rcx], ymm0
        vmovdqu [rcx+32], xmm1
        vmovupd YMMWORD PTR [rcx+96], ymm2
        vmovdqu [rcx+128], xmm3
        vmovupd YMMWORD PTR [rcx+192], ymm4
        vmovdqu [rcx+224], xmm5
        ; Restore callee-saved XMM registers.
        vmovdqu xmm6, OWORD PTR [rsp]
        vmovdqu xmm7, OWORD PTR [rsp+16]
        vmovdqu xmm8, OWORD PTR [rsp+32]
        vmovdqu xmm9, OWORD PTR [rsp+48]
        vmovdqu xmm10, OWORD PTR [rsp+64]
        vmovdqu xmm11, OWORD PTR [rsp+80]
        vmovdqu xmm12, OWORD PTR [rsp+96]
        vmovdqu xmm13, OWORD PTR [rsp+112]
        vmovdqu xmm14, OWORD PTR [rsp+128]
        vmovdqu xmm15, OWORD PTR [rsp+144]
        add     rsp, 160
        ret
sp_384_get_point_33_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply a and b into r. (r = a * b)
;  *
;  * Schoolbook 6x6 word multiplication using MULX with the two independent
;  * carry chains of ADCX (CF) and ADOX (OF).  One row (A[i] * B[0..5]) is
;  * accumulated at a time into a rotating window of seven registers.  The
;  * five lowest result words are parked on the stack and only copied to r
;  * after the last read of a and b.
;  *
;  * rcx  r  Result of multiplication (12 words).
;  * rdx  a  First number to multiply (moved to rax; rdx feeds MULX).
;  * r8   b  Second number to multiply.
;  */
_text SEGMENT READONLY PARA
sp_384_mul_avx2_6 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        mov     rax, rdx
        sub     rsp, 40                         ; room for result words 0..4
        xor     rbx, rbx                        ; rbx = 0, CF = OF = 0
        ; ---- Row 0: A[0] * B[0..5] ----
        mov     rdx, QWORD PTR [rax]
        ; A[0] * B[0]
        mulx    r12, r11, QWORD PTR [r8]
        ; A[0] * B[1]
        mulx    r13, r9, QWORD PTR [r8+8]
        adcx    r12, r9
        ; A[0] * B[2]
        mulx    r14, r9, QWORD PTR [r8+16]
        adcx    r13, r9
        ; A[0] * B[3]
        mulx    r15, r9, QWORD PTR [r8+24]
        adcx    r14, r9
        ; A[0] * B[4]
        mulx    rdi, r9, QWORD PTR [r8+32]
        adcx    r15, r9
        ; A[0] * B[5]
        mulx    rsi, r9, QWORD PTR [r8+40]
        adcx    rdi, r9
        adcx    rsi, rbx
        mov     QWORD PTR [rsp], r11            ; result word 0
        mov     r11, 0                          ; mov keeps the flags intact
        adcx    r11, rbx
        xor     rbx, rbx                        ; reset both carry chains
        ; ---- Row 1: A[1] * B[0..5] ----
        mov     rdx, QWORD PTR [rax+8]
        ; A[1] * B[0]
        mulx    r10, r9, QWORD PTR [r8]
        adcx    r12, r9
        adox    r13, r10
        ; A[1] * B[1]
        mulx    r10, r9, QWORD PTR [r8+8]
        adcx    r13, r9
        adox    r14, r10
        ; A[1] * B[2]
        mulx    r10, r9, QWORD PTR [r8+16]
        adcx    r14, r9
        adox    r15, r10
        ; A[1] * B[3]
        mulx    r10, r9, QWORD PTR [r8+24]
        adcx    r15, r9
        adox    rdi, r10
        ; A[1] * B[4]
        mulx    r10, r9, QWORD PTR [r8+32]
        adcx    rdi, r9
        adox    rsi, r10
        ; A[1] * B[5]
        mulx    r10, r9, QWORD PTR [r8+40]
        adcx    rsi, r9
        adox    r11, r10
        adcx    r11, rbx
        mov     QWORD PTR [rsp+8], r12          ; result word 1
        mov     r12, 0
        adcx    r12, rbx
        adox    r12, rbx
        xor     rbx, rbx
        ; ---- Row 2: A[2] * B[0..5] ----
        mov     rdx, QWORD PTR [rax+16]
        ; A[2] * B[0]
        mulx    r10, r9, QWORD PTR [r8]
        adcx    r13, r9
        adox    r14, r10
        ; A[2] * B[1]
        mulx    r10, r9, QWORD PTR [r8+8]
        adcx    r14, r9
        adox    r15, r10
        ; A[2] * B[2]
        mulx    r10, r9, QWORD PTR [r8+16]
        adcx    r15, r9
        adox    rdi, r10
        ; A[2] * B[3]
        mulx    r10, r9, QWORD PTR [r8+24]
        adcx    rdi, r9
        adox    rsi, r10
        ; A[2] * B[4]
        mulx    r10, r9, QWORD PTR [r8+32]
        adcx    rsi, r9
        adox    r11, r10
        ; A[2] * B[5]
        mulx    r10, r9, QWORD PTR [r8+40]
        adcx    r11, r9
        adox    r12, r10
        adcx    r12, rbx
        mov     QWORD PTR [rsp+16], r13         ; result word 2
        mov     r13, 0
        adcx    r13, rbx
        adox    r13, rbx
        xor     rbx, rbx
        ; ---- Row 3: A[3] * B[0..5] ----
        mov     rdx, QWORD PTR [rax+24]
        ; A[3] * B[0]
        mulx    r10, r9, QWORD PTR [r8]
        adcx    r14, r9
        adox    r15, r10
        ; A[3] * B[1]
        mulx    r10, r9, QWORD PTR [r8+8]
        adcx    r15, r9
        adox    rdi, r10
        ; A[3] * B[2]
        mulx    r10, r9, QWORD PTR [r8+16]
        adcx    rdi, r9
        adox    rsi, r10
        ; A[3] * B[3]
        mulx    r10, r9, QWORD PTR [r8+24]
        adcx    rsi, r9
        adox    r11, r10
        ; A[3] * B[4]
        mulx    r10, r9, QWORD PTR [r8+32]
        adcx    r11, r9
        adox    r12, r10
        ; A[3] * B[5]
        mulx    r10, r9, QWORD PTR [r8+40]
        adcx    r12, r9
        adox    r13, r10
        adcx    r13, rbx
        mov     QWORD PTR [rsp+24], r14         ; result word 3
        mov     r14, 0
        adcx    r14, rbx
        adox    r14, rbx
        xor     rbx, rbx
        ; ---- Row 4: A[4] * B[0..5] ----
        mov     rdx, QWORD PTR [rax+32]
        ; A[4] * B[0]
        mulx    r10, r9, QWORD PTR [r8]
        adcx    r15, r9
        adox    rdi, r10
        ; A[4] * B[1]
        mulx    r10, r9, QWORD PTR [r8+8]
        adcx    rdi, r9
        adox    rsi, r10
        ; A[4] * B[2]
        mulx    r10, r9, QWORD PTR [r8+16]
        adcx    rsi, r9
        adox    r11, r10
        ; A[4] * B[3]
        mulx    r10, r9, QWORD PTR [r8+24]
        adcx    r11, r9
        adox    r12, r10
        ; A[4] * B[4]
        mulx    r10, r9, QWORD PTR [r8+32]
        adcx    r12, r9
        adox    r13, r10
        ; A[4] * B[5]
        mulx    r10, r9, QWORD PTR [r8+40]
        adcx    r13, r9
        adox    r14, r10
        adcx    r14, rbx
        mov     QWORD PTR [rsp+32], r15         ; result word 4
        ; ---- Row 5: A[5] * B[0..5] ----
        ; No flag reset here: the row continues on the flags left by row 4.
        mov     rdx, QWORD PTR [rax+40]
        ; A[5] * B[0]
        mulx    r10, r9, QWORD PTR [r8]
        adcx    rdi, r9
        adox    rsi, r10
        ; A[5] * B[1]
        mulx    r10, r9, QWORD PTR [r8+8]
        adcx    rsi, r9
        adox    r11, r10
        ; A[5] * B[2]
        mulx    r10, r9, QWORD PTR [r8+16]
        adcx    r11, r9
        adox    r12, r10
        ; A[5] * B[3]
        mulx    r10, r9, QWORD PTR [r8+24]
        adcx    r12, r9
        adox    r13, r10
        ; A[5] * B[4]
        mulx    r10, r9, QWORD PTR [r8+32]
        adcx    r13, r9
        adox    r14, r10
        ; A[5] * B[5]
        mulx    r15, r9, QWORD PTR [r8+40]
        adcx    r14, r9
        adox    r15, rbx
        adcx    r15, rbx
        ; ---- Write out result words 5..11 from registers ----
        mov     QWORD PTR [rcx+40], rdi
        mov     QWORD PTR [rcx+48], rsi
        mov     QWORD PTR [rcx+56], r11
        mov     QWORD PTR [rcx+64], r12
        mov     QWORD PTR [rcx+72], r13
        mov     QWORD PTR [rcx+80], r14
        mov     QWORD PTR [rcx+88], r15
        ; ---- Copy result words 0..4 from the stack ----
        mov     r11, QWORD PTR [rsp]
        mov     QWORD PTR [rcx], r11
        mov     r12, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx+8], r12
        mov     r13, QWORD PTR [rsp+16]
        mov     QWORD PTR [rcx+16], r13
        mov     r14, QWORD PTR [rsp+24]
        mov     QWORD PTR [rcx+24], r14
        mov     r15, QWORD PTR [rsp+32]
        mov     QWORD PTR [rcx+32], r15
        add     rsp, 40
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_384_mul_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Reduce the number back to 384 bits using Montgomery reduction.
;  *
;  * Six fully unrolled steps.  Step i computes mu = a[i] * mp and adds
;  * mu * m into a[i..i+6] using MULX with the ADCX/ADOX carry chains.
;  * r14 carries the next a[i] between steps, r15 the carry out of the
;  * previous step.  Afterwards m, masked by the negated final carry, is
;  * subtracted from the upper six words and the result is written to a[0..5].
;  *
;  * rcx  a   A single precision number to reduce in place.
;  * rdx  m   The single precision number representing the modulus
;  *          (moved to rax; rdx feeds MULX).
;  * r8   mp  The digit representing the negative inverse of m mod 2^n.
;  */
_text SEGMENT READONLY PARA
sp_384_mont_reduce_order_avx2_6 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        mov     rax, rdx
        xor     r15, r15                        ; carry from previous step
        mov     r14, QWORD PTR [rcx]            ; a[0]
        xor     r13, r13
L_mont_loop_order_avx2_6:
        ; ---- step i = 0 ----
        ; mu = a[i] * mp
        mov     rdx, r14
        mov     r11, r14
        imul    rdx, r8
        xor     r13, r13                        ; r13 = 0, CF = OF = 0
        ; a[i+0] += m[0] * mu
        mulx    r10, r9, QWORD PTR [rax]
        mov     r14, QWORD PTR [rcx+8]
        adcx    r11, r9
        adox    r14, r10
        ; a[i+1] += m[1] * mu
        mulx    r10, r9, QWORD PTR [rax+8]
        mov     r11, QWORD PTR [rcx+16]
        adcx    r14, r9
        adox    r11, r10
        ; a[i+2] += m[2] * mu
        mulx    r10, r9, QWORD PTR [rax+16]
        mov     r12, QWORD PTR [rcx+24]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+16], r11
        ; a[i+3] += m[3] * mu
        mulx    r10, r9, QWORD PTR [rax+24]
        mov     r11, QWORD PTR [rcx+32]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+24], r12
        ; a[i+4] += m[4] * mu
        mulx    r10, r9, QWORD PTR [rax+32]
        mov     r12, QWORD PTR [rcx+40]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+32], r11
        ; a[i+5] += m[5] * mu
        mulx    r10, r9, QWORD PTR [rax+40]
        mov     r11, QWORD PTR [rcx+48]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+40], r12
        adcx    r11, r15
        mov     QWORD PTR [rcx+48], r11
        mov     r15, r13
        adox    r15, r13
        adcx    r15, r13
        ; ---- step i = 1 ----
        ; mu = a[i] * mp
        mov     rdx, r14
        mov     r11, r14
        imul    rdx, r8
        xor     r13, r13
        ; a[i+0] += m[0] * mu
        mulx    r10, r9, QWORD PTR [rax]
        mov     r14, QWORD PTR [rcx+16]
        adcx    r11, r9
        adox    r14, r10
        ; a[i+1] += m[1] * mu
        mulx    r10, r9, QWORD PTR [rax+8]
        mov     r11, QWORD PTR [rcx+24]
        adcx    r14, r9
        adox    r11, r10
        ; a[i+2] += m[2] * mu
        mulx    r10, r9, QWORD PTR [rax+16]
        mov     r12, QWORD PTR [rcx+32]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+24], r11
        ; a[i+3] += m[3] * mu
        mulx    r10, r9, QWORD PTR [rax+24]
        mov     r11, QWORD PTR [rcx+40]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+32], r12
        ; a[i+4] += m[4] * mu
        mulx    r10, r9, QWORD PTR [rax+32]
        mov     r12, QWORD PTR [rcx+48]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+40], r11
        ; a[i+5] += m[5] * mu
        mulx    r10, r9, QWORD PTR [rax+40]
        mov     r11, QWORD PTR [rcx+56]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+48], r12
        adcx    r11, r15
        mov     QWORD PTR [rcx+56], r11
        mov     r15, r13
        adox    r15, r13
        adcx    r15, r13
        ; ---- step i = 2 ----
        ; mu = a[i] * mp
        mov     rdx, r14
        mov     r11, r14
        imul    rdx, r8
        xor     r13, r13
        ; a[i+0] += m[0] * mu
        mulx    r10, r9, QWORD PTR [rax]
        mov     r14, QWORD PTR [rcx+24]
        adcx    r11, r9
        adox    r14, r10
        ; a[i+1] += m[1] * mu
        mulx    r10, r9, QWORD PTR [rax+8]
        mov     r11, QWORD PTR [rcx+32]
        adcx    r14, r9
        adox    r11, r10
        ; a[i+2] += m[2] * mu
        mulx    r10, r9, QWORD PTR [rax+16]
        mov     r12, QWORD PTR [rcx+40]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+32], r11
        ; a[i+3] += m[3] * mu
        mulx    r10, r9, QWORD PTR [rax+24]
        mov     r11, QWORD PTR [rcx+48]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+40], r12
        ; a[i+4] += m[4] * mu
        mulx    r10, r9, QWORD PTR [rax+32]
        mov     r12, QWORD PTR [rcx+56]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+48], r11
        ; a[i+5] += m[5] * mu
        mulx    r10, r9, QWORD PTR [rax+40]
        mov     r11, QWORD PTR [rcx+64]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+56], r12
        adcx    r11, r15
        mov     QWORD PTR [rcx+64], r11
        mov     r15, r13
        adox    r15, r13
        adcx    r15, r13
        ; ---- step i = 3 ----
        ; mu = a[i] * mp
        mov     rdx, r14
        mov     r11, r14
        imul    rdx, r8
        xor     r13, r13
        ; a[i+0] += m[0] * mu
        mulx    r10, r9, QWORD PTR [rax]
        mov     r14, QWORD PTR [rcx+32]
        adcx    r11, r9
        adox    r14, r10
        ; a[i+1] += m[1] * mu
        mulx    r10, r9, QWORD PTR [rax+8]
        mov     r11, QWORD PTR [rcx+40]
        adcx    r14, r9
        adox    r11, r10
        ; a[i+2] += m[2] * mu
        mulx    r10, r9, QWORD PTR [rax+16]
        mov     r12, QWORD PTR [rcx+48]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+40], r11
        ; a[i+3] += m[3] * mu
        mulx    r10, r9, QWORD PTR [rax+24]
        mov     r11, QWORD PTR [rcx+56]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+48], r12
        ; a[i+4] += m[4] * mu
        mulx    r10, r9, QWORD PTR [rax+32]
        mov     r12, QWORD PTR [rcx+64]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+56], r11
        ; a[i+5] += m[5] * mu
        mulx    r10, r9, QWORD PTR [rax+40]
        mov     r11, QWORD PTR [rcx+72]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+64], r12
        adcx    r11, r15
        mov     QWORD PTR [rcx+72], r11
        mov     r15, r13
        adox    r15, r13
        adcx    r15, r13
        ; ---- step i = 4 ----
        ; mu = a[i] * mp
        mov     rdx, r14
        mov     r11, r14
        imul    rdx, r8
        xor     r13, r13
        ; a[i+0] += m[0] * mu
        mulx    r10, r9, QWORD PTR [rax]
        mov     r14, QWORD PTR [rcx+40]
        adcx    r11, r9
        adox    r14, r10
        ; a[i+1] += m[1] * mu
        mulx    r10, r9, QWORD PTR [rax+8]
        mov     r11, QWORD PTR [rcx+48]
        adcx    r14, r9
        adox    r11, r10
        ; a[i+2] += m[2] * mu
        mulx    r10, r9, QWORD PTR [rax+16]
        mov     r12, QWORD PTR [rcx+56]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+48], r11
        ; a[i+3] += m[3] * mu
        mulx    r10, r9, QWORD PTR [rax+24]
        mov     r11, QWORD PTR [rcx+64]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+56], r12
        ; a[i+4] += m[4] * mu
        mulx    r10, r9, QWORD PTR [rax+32]
        mov     r12, QWORD PTR [rcx+72]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+64], r11
        ; a[i+5] += m[5] * mu
        mulx    r10, r9, QWORD PTR [rax+40]
        mov     r11, QWORD PTR [rcx+80]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+72], r12
        adcx    r11, r15
        mov     QWORD PTR [rcx+80], r11
        mov     r15, r13
        adox    r15, r13
        adcx    r15, r13
        ; ---- step i = 5 ----
        ; mu = a[i] * mp
        mov     rdx, r14
        mov     r11, r14
        imul    rdx, r8
        xor     r13, r13
        ; a[i+0] += m[0] * mu
        mulx    r10, r9, QWORD PTR [rax]
        mov     r14, QWORD PTR [rcx+48]
        adcx    r11, r9
        adox    r14, r10
        ; a[i+1] += m[1] * mu
        mulx    r10, r9, QWORD PTR [rax+8]
        mov     r11, QWORD PTR [rcx+56]
        adcx    r14, r9
        adox    r11, r10
        ; a[i+2] += m[2] * mu
        mulx    r10, r9, QWORD PTR [rax+16]
        mov     r12, QWORD PTR [rcx+64]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+56], r11
        ; a[i+3] += m[3] * mu
        mulx    r10, r9, QWORD PTR [rax+24]
        mov     r11, QWORD PTR [rcx+72]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+64], r12
        ; a[i+4] += m[4] * mu
        mulx    r10, r9, QWORD PTR [rax+32]
        mov     r12, QWORD PTR [rcx+80]
        adcx    r11, r9
        adox    r12, r10
        mov     QWORD PTR [rcx+72], r11
        ; a[i+5] += m[5] * mu
        mulx    r10, r9, QWORD PTR [rax+40]
        mov     r11, QWORD PTR [rcx+88]
        adcx    r12, r9
        adox    r11, r10
        mov     QWORD PTR [rcx+80], r12
        adcx    r11, r15
        mov     QWORD PTR [rcx+88], r11
        mov     r15, r13
        adox    r15, r13
        adcx    r15, r13
        ; ---- final conditional subtraction ----
        ; r15 becomes the PEXT mask: -1 keeps each word of m, 0 yields zero.
        neg     r15
        mov     r8, rcx                         ; destination: a[0..5]
        add     rcx, 48                         ; source: a[6..11] (a[6] is in r14)
        mov     r10, QWORD PTR [rax]
        mov     rdx, r14
        pext    r10, r10, r15
        sub     rdx, r10
        mov     r10, QWORD PTR [rax+8]
        mov     r9, QWORD PTR [rcx+8]
        pext    r10, r10, r15
        mov     QWORD PTR [r8], rdx
        sbb     r9, r10
        mov     rdx, QWORD PTR [rax+16]
        mov     r10, QWORD PTR [rcx+16]
        pext    rdx, rdx, r15
        mov     QWORD PTR [r8+8], r9
        sbb     r10, rdx
        mov     r9, QWORD PTR [rax+24]
        mov     rdx, QWORD PTR [rcx+24]
        pext    r9, r9, r15
        mov     QWORD PTR [r8+16], r10
        sbb     rdx, r9
        mov     r10, QWORD PTR [rax+32]
        mov     r9, QWORD PTR [rcx+32]
        pext    r10, r10, r15
        mov     QWORD PTR [r8+24], rdx
        sbb     r9, r10
        mov     rdx, QWORD PTR [rax+40]
        mov     r10, QWORD PTR [rcx+40]
        pext    rdx, rdx, r15
        mov     QWORD PTR [r8+32], r9
        sbb     r10, rdx
        mov     QWORD PTR [r8+40], r10
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_384_mont_reduce_order_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Square a and put result in r. (r = a * a)
;  *
;  * The off-diagonal products A[i]*A[j] (i != j) are accumulated once using
;  * MULX with the ADCX/ADOX carry chains; that sum is then doubled word by
;  * word (ADOX x, x) while the square terms A[i]*A[i] are added (ADCX).
;  * The r pointer is kept on the stack so that rcx can serve as the zero
;  * register during the computation.
;  *
;  * rcx  r  Result of squaring (12 words).
;  * rdx  a  Number to square in Montgomery form (moved to rax; rdx feeds MULX).
;  */
_text SEGMENT READONLY PARA
sp_384_sqr_avx2_6 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        push    rbx
        push    rbp
        mov     rax, rdx
        push    rcx                             ; park r
        xor     rcx, rcx                        ; rcx = 0, CF = OF = 0
        mov     rdx, QWORD PTR [rax]
        mov     rsi, QWORD PTR [rax+8]
        mov     rbx, QWORD PTR [rax+16]
        mov     rbp, QWORD PTR [rax+24]
        ; Diagonal 0
        ; A[1] * A[0]
        mulx    r11, r10, QWORD PTR [rax+8]
        ; A[2] * A[0]
        mulx    r12, r8, QWORD PTR [rax+16]
        adcx    r11, r8
        ; A[3] * A[0]
        mulx    r13, r8, QWORD PTR [rax+24]
        adcx    r12, r8
        ; A[4] * A[0]
        mulx    r14, r8, QWORD PTR [rax+32]
        adcx    r13, r8
        ; A[5] * A[0]
        mulx    r15, r8, QWORD PTR [rax+40]
        adcx    r14, r8
        adcx    r15, rcx
        ; Diagonal 1
        mov     rdx, rsi
        ; A[2] * A[1]
        mulx    r9, r8, QWORD PTR [rax+16]
        adcx    r12, r8
        adox    r13, r9
        ; A[3] * A[1]
        mulx    r9, r8, QWORD PTR [rax+24]
        adcx    r13, r8
        adox    r14, r9
        ; A[4] * A[1]
        mulx    r9, r8, QWORD PTR [rax+32]
        adcx    r14, r8
        adox    r15, r9
        ; A[5] * A[1]
        mulx    rdi, r8, QWORD PTR [rax+40]
        adcx    r15, r8
        adox    rdi, rcx
        mov     rdx, rbx
        ; A[5] * A[2]
        mulx    rsi, r8, QWORD PTR [rax+40]
        adcx    rdi, r8
        adox    rsi, rcx
        adcx    rsi, rcx
        ; rbx is not read again before the A[5]*A[3] MULX overwrites it.
        adcx    rbx, rcx
        ; Diagonal 2
        ; A[3] * A[2]
        mulx    r9, r8, QWORD PTR [rax+24]
        adcx    r14, r8
        adox    r15, r9
        ; A[4] * A[2]
        mulx    r9, r8, QWORD PTR [rax+32]
        adcx    r15, r8
        adox    rdi, r9
        mov     rdx, rbp
        ; A[4] * A[3]
        mulx    r9, r8, QWORD PTR [rax+32]
        adcx    rdi, r8
        adox    rsi, r9
        ; A[5] * A[3]
        mulx    rbx, r8, QWORD PTR [rax+40]
        adcx    rsi, r8
        adox    rbx, rcx
        mov     rdx, QWORD PTR [rax+32]
        ; A[5] * A[4]
        mulx    rbp, r8, QWORD PTR [rax+40]
        adcx    rbx, r8
        adox    rbp, rcx
        adcx    rbp, rcx
        adcx    rcx, rcx
        ; Doubling previous result as we add in square words results
        ; A[0] * A[0]
        mov     rdx, QWORD PTR [rax]
        mulx    r9, r8, rdx
        pop     rdx                             ; rdx = r (pop/push leave flags alone)
        mov     QWORD PTR [rdx], r8             ; result word 0
        adox    r10, r10
        push    rdx
        adcx    r10, r9
        ; A[1] * A[1]
        mov     rdx, QWORD PTR [rax+8]
        mulx    r9, r8, rdx
        adox    r11, r11
        adcx    r11, r8
        adox    r12, r12
        adcx    r12, r9
        ; A[2] * A[2]
        mov     rdx, QWORD PTR [rax+16]
        mulx    r9, r8, rdx
        adox    r13, r13
        adcx    r13, r8
        adox    r14, r14
        adcx    r14, r9
        ; A[3] * A[3]
        mov     rdx, QWORD PTR [rax+24]
        mulx    r9, r8, rdx
        adox    r15, r15
        adcx    r15, r8
        adox    rdi, rdi
        adcx    rdi, r9
        ; A[4] * A[4]
        mov     rdx, QWORD PTR [rax+32]
        mulx    r9, r8, rdx
        adox    rsi, rsi
        adcx    rsi, r8
        adox    rbx, rbx
        adcx    rbx, r9
        ; A[5] * A[5]
        mov     rdx, QWORD PTR [rax+40]
        mulx    r9, r8, rdx
        adox    rbp, rbp
        adcx    rbp, r8
        adcx    r9, rcx
        mov     r8, 0                           ; mov keeps OF for the adox below
        adox    r9, r8
        pop     rcx                             ; rcx = r
        ; Store result words 1..11
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], r11
        mov     QWORD PTR [rcx+24], r12
        mov     QWORD PTR [rcx+32], r13
        mov     QWORD PTR [rcx+40], r14
        mov     QWORD PTR [rcx+48], r15
        mov     QWORD PTR [rcx+56], rdi
        mov     QWORD PTR [rcx+64], rsi
        mov     QWORD PTR [rcx+72], rbx
        mov     QWORD PTR [rcx+80], rbp
        mov     QWORD PTR [rcx+88], r9
        pop     rbp
        pop     rbx
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_384_sqr_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally subtract b from a using the mask m.
;  * m is -1 to subtract and 0 when not copying.
;  *
;  * Each word of b is passed through PEXT with m as the bit mask, which
;  * keeps the word for m = -1 and gives zero for m = 0, before it is
;  * subtracted with borrow.
;  *
;  * rcx  r  A single precision number representing condition subtract result.
;  * rdx  a  A single precision number to subtract from.
;  * r8   b  A single precision number to subtract.
;  * r9   m  Mask value to apply.
;  * return (rax) 0 when there was no borrow out of the top word, -1 otherwise.
;  */
_text SEGMENT READONLY PARA
sp_384_cond_sub_avx2_6 PROC
        push    r12
        xor     eax, eax                        ; rax = 0, receives the borrow
        ; word 0
        mov     r10, QWORD PTR [rdx]
        mov     r12, QWORD PTR [r8]
        pext    r12, r12, r9
        sub     r10, r12
        ; word 1
        mov     r11, QWORD PTR [rdx+8]
        mov     r12, QWORD PTR [r8+8]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx], r10
        sbb     r11, r12
        ; word 2
        mov     r12, QWORD PTR [rdx+16]
        mov     r10, QWORD PTR [r8+16]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+8], r11
        sbb     r12, r10
        ; word 3
        mov     r10, QWORD PTR [rdx+24]
        mov     r11, QWORD PTR [r8+24]
        pext    r11, r11, r9
        mov     QWORD PTR [rcx+16], r12
        sbb     r10, r11
        ; word 4
        mov     r11, QWORD PTR [rdx+32]
        mov     r12, QWORD PTR [r8+32]
        pext    r12, r12, r9
        mov     QWORD PTR [rcx+24], r10
        sbb     r11, r12
        ; word 5
        mov     r12, QWORD PTR [rdx+40]
        mov     r10, QWORD PTR [r8+40]
        pext    r10, r10, r9
        mov     QWORD PTR [rcx+32], r11
        sbb     r12, r10
        mov     QWORD PTR [rcx+40], r12
        sbb     rax, 0
        pop     r12
        ret
sp_384_cond_sub_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
;  *
;  * A mask is derived from bit 0 of a; each word of m is passed through PEXT
;  * with that mask (kept when a is odd, zero when even) and added to a.  The
;  * sum is written to r, then read back and shifted right by one bit with
;  * the carry of the addition entering at the top.
;  *
;  * rcx  r  Result of division by 2.
;  * rdx  a  Number to divide.
;  * r8   m  Modulus (prime).
;  */
_text SEGMENT READONLY PARA
sp_384_div2_avx2_6 PROC
        push    r12
        push    r13
        xor     r12, r12                        ; carry out of the addition
        mov     r13, QWORD PTR [rdx]
        mov     r10, r13                        ; r10 is reloaded from [rdx] below
        and     r13, 1
        neg     r13                             ; r13 = -1 when a is odd, else 0
        ; words 0 and 1: r = a + (m & mask)
        mov     rax, QWORD PTR [r8]
        mov     r9, QWORD PTR [r8+8]
        mov     r10, QWORD PTR [rdx]
        mov     r11, QWORD PTR [rdx+8]
        pext    rax, rax, r13
        pext    r9, r9, r13
        add     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx], r10
        mov     QWORD PTR [rcx+8], r11
        ; words 2 and 3
        mov     rax, QWORD PTR [r8+16]
        mov     r9, QWORD PTR [r8+24]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        ; words 4 and 5
        mov     rax, QWORD PTR [r8+32]
        mov     r9, QWORD PTR [r8+40]
        mov     r10, QWORD PTR [rdx+32]
        mov     r11, QWORD PTR [rdx+40]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+32], r10
        mov     QWORD PTR [rcx+40], r11
        adc     r12, 0
        ; r = (r12:r) >> 1
        mov     r10, QWORD PTR [rcx]
        mov     r11, QWORD PTR [rcx+8]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx], r10
        mov     r10, QWORD PTR [rcx+16]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+8], r11
        mov     r11, QWORD PTR [rcx+24]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+16], r10
        mov     r10, QWORD PTR [rcx+32]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+24], r11
        mov     r11, QWORD PTR [rcx+40]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+32], r10
        shrd    r11, r12, 1
        mov     QWORD PTR [rcx+40], r11
        pop     r13
        pop     r12
        ret
sp_384_div2_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
IFNDEF WC_NO_CACHE_RESISTANT
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * Entries 1..63 of the table (96 bytes each) are all loaded; each is AND-ed
;  * with a mask that is all ones only when the running counter equals idx,
;  * and OR-ed into the accumulators.  idx 0 therefore yields zeros.  The
;  * first 48 bytes of the selected entry go to r+0, the second 48 to r+96.
;  *
;  * xmm6-xmm15 are used as scratch.  They are non-volatile in the Microsoft
;  * x64 calling convention, so they are saved on entry and restored on exit.
;  *
;  * rcx  r      Point to copy into.
;  * rdx  table  Table - start of the entries to access
;  * r8d  idx    Index of entry to retrieve.
;  */
_text SEGMENT READONLY PARA
sp_384_get_entry_64_6 PROC
        ; Save callee-saved XMM registers (unaligned stores: rsp is not
        ; 16-byte aligned here).
        sub     rsp, 160
        movdqu  OWORD PTR [rsp], xmm6
        movdqu  OWORD PTR [rsp+16], xmm7
        movdqu  OWORD PTR [rsp+32], xmm8
        movdqu  OWORD PTR [rsp+48], xmm9
        movdqu  OWORD PTR [rsp+64], xmm10
        movdqu  OWORD PTR [rsp+80], xmm11
        movdqu  OWORD PTR [rsp+96], xmm12
        movdqu  OWORD PTR [rsp+112], xmm13
        movdqu  OWORD PTR [rsp+128], xmm14
        movdqu  OWORD PTR [rsp+144], xmm15
        mov     rax, 1
        movd    xmm13, r8d
        add     rdx, 96                         ; start at entry 1
        movd    xmm15, eax
        mov     rax, 63                         ; entries 1..63
        pshufd  xmm15, xmm15, 0                 ; xmm15 = {1,1,1,1}
        pshufd  xmm13, xmm13, 0                 ; xmm13 = {idx,idx,idx,idx}
        pxor    xmm14, xmm14
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        pxor    xmm4, xmm4
        pxor    xmm5, xmm5
        movdqa  xmm14, xmm15                    ; counter = 1
L_384_get_entry_64_6_start:
        movdqa  xmm12, xmm14
        paddd   xmm14, xmm15
        pcmpeqd xmm12, xmm13                    ; mask = (counter == idx)
        movdqu  xmm6, [rdx]
        movdqu  xmm7, [rdx+16]
        movdqu  xmm8, [rdx+32]
        movdqu  xmm9, [rdx+48]
        movdqu  xmm10, [rdx+64]
        movdqu  xmm11, [rdx+80]
        add     rdx, 96
        pand    xmm6, xmm12
        pand    xmm7, xmm12
        pand    xmm8, xmm12
        pand    xmm9, xmm12
        pand    xmm10, xmm12
        pand    xmm11, xmm12
        por     xmm0, xmm6
        por     xmm1, xmm7
        por     xmm2, xmm8
        por     xmm3, xmm9
        por     xmm4, xmm10
        por     xmm5, xmm11
        dec     rax
        jnz     L_384_get_entry_64_6_start
        movdqu  [rcx], xmm0
        movdqu  [rcx+16], xmm1
        movdqu  [rcx+32], xmm2
        movdqu  [rcx+96], xmm3
        movdqu  [rcx+112], xmm4
        movdqu  [rcx+128], xmm5
        ; Restore callee-saved XMM registers.
        movdqu  xmm6, OWORD PTR [rsp]
        movdqu  xmm7, OWORD PTR [rsp+16]
        movdqu  xmm8, OWORD PTR [rsp+32]
        movdqu  xmm9, OWORD PTR [rsp+48]
        movdqu  xmm10, OWORD PTR [rsp+64]
        movdqu  xmm11, OWORD PTR [rsp+80]
        movdqu  xmm12, OWORD PTR [rsp+96]
        movdqu  xmm13, OWORD PTR [rsp+112]
        movdqu  xmm14, OWORD PTR [rsp+128]
        movdqu  xmm15, OWORD PTR [rsp+144]
        add     rsp, 160
        ret
sp_384_get_entry_64_6 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * AVX2 variant.  Entries 1..63 of the table (96 bytes each) are all loaded,
;  * masked with (counter == idx) and OR-ed into the accumulators.  idx 0
;  * yields zeros.  The first 48 bytes of the selected entry go to r+0, the
;  * second 48 to r+96.
;  *
;  * The loop runs 63 times, matching sp_384_get_entry_64_6: starting at
;  * entry 1, a 64th iteration would load the 96 bytes following entry 63.
;  * NOTE(review): assumes the table holds exactly 64 entries, as the name
;  * and the non-AVX2 variant indicate - confirm against the callers.
;  *
;  * xmm6-xmm11 are used as scratch.  They are non-volatile in the Microsoft
;  * x64 calling convention, so they are saved on entry and restored on exit.
;  *
;  * rcx  r      Point to copy into.
;  * rdx  table  Table - start of the entries to access
;  * r8d  idx    Index of entry to retrieve.
;  */
_text SEGMENT READONLY PARA
sp_384_get_entry_64_avx2_6 PROC
        ; Save callee-saved XMM registers (unaligned stores: rsp is not
        ; 16-byte aligned here).
        sub     rsp, 96
        vmovdqu OWORD PTR [rsp], xmm6
        vmovdqu OWORD PTR [rsp+16], xmm7
        vmovdqu OWORD PTR [rsp+32], xmm8
        vmovdqu OWORD PTR [rsp+48], xmm9
        vmovdqu OWORD PTR [rsp+64], xmm10
        vmovdqu OWORD PTR [rsp+80], xmm11
        mov     rax, 1
        movd    xmm9, r8d
        add     rdx, 96                         ; start at entry 1
        movd    xmm11, eax
        mov     rax, 63                         ; entries 1..63
        vpxor   ymm10, ymm10, ymm10
        vpermd  ymm9, ymm10, ymm9               ; broadcast idx to all lanes
        vpermd  ymm11, ymm10, ymm11             ; broadcast 1 to all lanes
        vpxor   ymm0, ymm0, ymm0
        vpxor   xmm1, xmm1, xmm1
        vpxor   ymm2, ymm2, ymm2
        vpxor   xmm3, xmm3, xmm3
        vmovdqa ymm10, ymm11                    ; counter = 1
L_384_get_entry_64_avx2_6_start:
        vpcmpeqd        ymm8, ymm10, ymm9       ; mask = (counter == idx)
        vpaddd  ymm10, ymm10, ymm11
        vmovupd ymm4, [rdx]
        vmovdqu xmm5, OWORD PTR [rdx+32]
        vmovupd ymm6, [rdx+48]
        vmovdqu xmm7, OWORD PTR [rdx+80]
        add     rdx, 96
        vpand   ymm4, ymm4, ymm8
        vpand   xmm5, xmm5, xmm8
        vpand   ymm6, ymm6, ymm8
        vpand   xmm7, xmm7, xmm8
        vpor    ymm0, ymm0, ymm4
        vpor    xmm1, xmm1, xmm5
        vpor    ymm2, ymm2, ymm6
        vpor    xmm3, xmm3, xmm7
        dec     rax
        jnz     L_384_get_entry_64_avx2_6_start
        vmovupd YMMWORD PTR [rcx], ymm0
        vmovdqu [rcx+32], xmm1
        vmovupd YMMWORD PTR [rcx+96], ymm2
        vmovdqu [rcx+128], xmm3
        ; Restore callee-saved XMM registers.
        vmovdqu xmm6, OWORD PTR [rsp]
        vmovdqu xmm7, OWORD PTR [rsp+16]
        vmovdqu xmm8, OWORD PTR [rsp+32]
        vmovdqu xmm9, OWORD PTR [rsp+48]
        vmovdqu xmm10, OWORD PTR [rsp+64]
        vmovdqu xmm11, OWORD PTR [rsp+80]
        add     rsp, 96
        ret
sp_384_get_entry_64_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
IFNDEF WC_NO_CACHE_RESISTANT
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * Entries 1..64 of the table (96 bytes each) are all loaded; each is AND-ed
;  * with a mask that is all ones only when the running counter equals idx,
;  * and OR-ed into the accumulators.  idx 0 therefore yields zeros.  The
;  * first 48 bytes of the selected entry go to r+0, the second 48 to r+96.
;  *
;  * xmm6-xmm15 are used as scratch.  They are non-volatile in the Microsoft
;  * x64 calling convention, so they are saved on entry and restored on exit.
;  *
;  * rcx  r      Point to copy into.
;  * rdx  table  Table - start of the entries to access
;  * r8d  idx    Index of entry to retrieve.
;  */
_text SEGMENT READONLY PARA
sp_384_get_entry_65_6 PROC
        ; Save callee-saved XMM registers (unaligned stores: rsp is not
        ; 16-byte aligned here).
        sub     rsp, 160
        movdqu  OWORD PTR [rsp], xmm6
        movdqu  OWORD PTR [rsp+16], xmm7
        movdqu  OWORD PTR [rsp+32], xmm8
        movdqu  OWORD PTR [rsp+48], xmm9
        movdqu  OWORD PTR [rsp+64], xmm10
        movdqu  OWORD PTR [rsp+80], xmm11
        movdqu  OWORD PTR [rsp+96], xmm12
        movdqu  OWORD PTR [rsp+112], xmm13
        movdqu  OWORD PTR [rsp+128], xmm14
        movdqu  OWORD PTR [rsp+144], xmm15
        mov     rax, 1
        movd    xmm13, r8d
        add     rdx, 96                         ; start at entry 1
        movd    xmm15, eax
        mov     rax, 64                         ; entries 1..64
        pshufd  xmm15, xmm15, 0                 ; xmm15 = {1,1,1,1}
        pshufd  xmm13, xmm13, 0                 ; xmm13 = {idx,idx,idx,idx}
        pxor    xmm14, xmm14
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        pxor    xmm4, xmm4
        pxor    xmm5, xmm5
        movdqa  xmm14, xmm15                    ; counter = 1
L_384_get_entry_65_6_start:
        movdqa  xmm12, xmm14
        paddd   xmm14, xmm15
        pcmpeqd xmm12, xmm13                    ; mask = (counter == idx)
        movdqu  xmm6, [rdx]
        movdqu  xmm7, [rdx+16]
        movdqu  xmm8, [rdx+32]
        movdqu  xmm9, [rdx+48]
        movdqu  xmm10, [rdx+64]
        movdqu  xmm11, [rdx+80]
        add     rdx, 96
        pand    xmm6, xmm12
        pand    xmm7, xmm12
        pand    xmm8, xmm12
        pand    xmm9, xmm12
        pand    xmm10, xmm12
        pand    xmm11, xmm12
        por     xmm0, xmm6
        por     xmm1, xmm7
        por     xmm2, xmm8
        por     xmm3, xmm9
        por     xmm4, xmm10
        por     xmm5, xmm11
        dec     rax
        jnz     L_384_get_entry_65_6_start
        movdqu  [rcx], xmm0
        movdqu  [rcx+16], xmm1
        movdqu  [rcx+32], xmm2
        movdqu  [rcx+96], xmm3
        movdqu  [rcx+112], xmm4
        movdqu  [rcx+128], xmm5
        ; Restore callee-saved XMM registers.
        movdqu  xmm6, OWORD PTR [rsp]
        movdqu  xmm7, OWORD PTR [rsp+16]
        movdqu  xmm8, OWORD PTR [rsp+32]
        movdqu  xmm9, OWORD PTR [rsp+48]
        movdqu  xmm10, OWORD PTR [rsp+64]
        movdqu  xmm11, OWORD PTR [rsp+80]
        movdqu  xmm12, OWORD PTR [rsp+96]
        movdqu  xmm13, OWORD PTR [rsp+112]
        movdqu  xmm14, OWORD PTR [rsp+128]
        movdqu  xmm15, OWORD PTR [rsp+144]
        add     rsp, 160
        ret
sp_384_get_entry_65_6 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Touch each possible entry that could be being copied.
;  *
;  * AVX2 variant.  Entries 1..64 of the table (96 bytes each) are all loaded,
;  * masked with (counter == idx) and OR-ed into the accumulators.  idx 0
;  * yields zeros.  The first 48 bytes of the selected entry go to r+0, the
;  * second 48 to r+96.
;  *
;  * The loop runs 64 times, matching sp_384_get_entry_65_6: starting at
;  * entry 1, a 65th iteration would load the 96 bytes following entry 64.
;  * NOTE(review): assumes the table holds exactly 65 entries, as the name
;  * and the non-AVX2 variant indicate - confirm against the callers.
;  *
;  * xmm6-xmm11 are used as scratch.  They are non-volatile in the Microsoft
;  * x64 calling convention, so they are saved on entry and restored on exit.
;  *
;  * rcx  r      Point to copy into.
;  * rdx  table  Table - start of the entries to access
;  * r8d  idx    Index of entry to retrieve.
;  */
_text SEGMENT READONLY PARA
sp_384_get_entry_65_avx2_6 PROC
        ; Save callee-saved XMM registers (unaligned stores: rsp is not
        ; 16-byte aligned here).
        sub     rsp, 96
        vmovdqu OWORD PTR [rsp], xmm6
        vmovdqu OWORD PTR [rsp+16], xmm7
        vmovdqu OWORD PTR [rsp+32], xmm8
        vmovdqu OWORD PTR [rsp+48], xmm9
        vmovdqu OWORD PTR [rsp+64], xmm10
        vmovdqu OWORD PTR [rsp+80], xmm11
        mov     rax, 1
        movd    xmm9, r8d
        add     rdx, 96                         ; start at entry 1
        movd    xmm11, eax
        mov     rax, 64                         ; entries 1..64
        vpxor   ymm10, ymm10, ymm10
        vpermd  ymm9, ymm10, ymm9               ; broadcast idx to all lanes
        vpermd  ymm11, ymm10, ymm11             ; broadcast 1 to all lanes
        vpxor   ymm0, ymm0, ymm0
        vpxor   xmm1, xmm1, xmm1
        vpxor   ymm2, ymm2, ymm2
        vpxor   xmm3, xmm3, xmm3
        vmovdqa ymm10, ymm11                    ; counter = 1
L_384_get_entry_65_avx2_6_start:
        vpcmpeqd        ymm8, ymm10, ymm9       ; mask = (counter == idx)
        vpaddd  ymm10, ymm10, ymm11
        vmovupd ymm4, [rdx]
        vmovdqu xmm5, OWORD PTR [rdx+32]
        vmovupd ymm6, [rdx+48]
        vmovdqu xmm7, OWORD PTR [rdx+80]
        add     rdx, 96
        vpand   ymm4, ymm4, ymm8
        vpand   xmm5, xmm5, xmm8
        vpand   ymm6, ymm6, ymm8
        vpand   xmm7, xmm7, xmm8
        vpor    ymm0, ymm0, ymm4
        vpor    xmm1, xmm1, xmm5
        vpor    ymm2, ymm2, ymm6
        vpor    xmm3, xmm3, xmm7
        dec     rax
        jnz     L_384_get_entry_65_avx2_6_start
        vmovupd YMMWORD PTR [rcx], ymm0
        vmovdqu [rcx+32], xmm1
        vmovupd YMMWORD PTR [rcx+96], ymm2
        vmovdqu [rcx+128], xmm3
        ; Restore callee-saved XMM registers.
        vmovdqu xmm6, OWORD PTR [rsp]
        vmovdqu xmm7, OWORD PTR [rsp+16]
        vmovdqu xmm8, OWORD PTR [rsp+32]
        vmovdqu xmm9, OWORD PTR [rsp+48]
        vmovdqu xmm10, OWORD PTR [rsp+64]
        vmovdqu xmm11, OWORD PTR [rsp+80]
        add     rsp, 96
        ret
sp_384_get_entry_65_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
; /* Add 1 to a. (a = a + 1)
;  *
;  * The six 64-bit words at a are incremented as one 384-bit little-endian
;  * value. A carry out of the top word is discarded.
;  *
;  * a  A single precision integer. Passed in rcx.
;  */
_text SEGMENT READONLY PARA
sp_384_add_one_6 PROC
        ; Load all six words.
        mov     rax, QWORD PTR [rcx]
        mov     rdx, QWORD PTR [rcx+8]
        mov     r8, QWORD PTR [rcx+16]
        mov     r9, QWORD PTR [rcx+24]
        mov     r10, QWORD PTR [rcx+32]
        mov     r11, QWORD PTR [rcx+40]
        ; Add one to the lowest word and ripple the carry upwards.
        add     rax, 1
        adc     rdx, 0
        adc     r8, 0
        adc     r9, 0
        adc     r10, 0
        adc     r11, 0
        ; Write the result back in place.
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], rdx
        mov     QWORD PTR [rcx+16], r8
        mov     QWORD PTR [rcx+24], r9
        mov     QWORD PTR [rcx+32], r10
        mov     QWORD PTR [rcx+40], r11
        ret
sp_384_add_one_6 ENDP
_text ENDS
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the bswap instruction.
;  *
;  * The array is consumed from its last byte backwards, eight bytes per
;  * output word, so r receives the value least significant word first. Any
;  * leading bytes that do not fill a whole word form the final word, and the
;  * remaining words up to r + 48 are set to zero.
;  *
;  * r     A single precision integer. Passed in rcx.
;  * size  Maximum number of bytes to convert. Passed in rdx; its value is not
;  *       read by this routine.
;  * a     Byte array. Passed in r8.
;  * n     Number of bytes in array to read. Passed in r9.
;  */
_text SEGMENT READONLY PARA
sp_384_from_bin_bswap PROC
        lea     r11, [r8+r9]            ; r11 = a + n, one past the last byte
        lea     rdx, [rcx+48]           ; rdx = r + 48, one past the last word
        ; Convert 64 bytes at a time while more than 63 bytes remain.
        cmp     r9, 63
        jle     L_384_from_bin_bswap_blk8_check
L_384_from_bin_bswap_blk64:
        sub     r11, 64
        mov     rax, QWORD PTR [r11+56]
        mov     r10, QWORD PTR [r11+48]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     rax, QWORD PTR [r11+40]
        mov     r10, QWORD PTR [r11+32]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        mov     rax, QWORD PTR [r11+24]
        mov     r10, QWORD PTR [r11+16]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        mov     rax, QWORD PTR [r11+8]
        mov     r10, QWORD PTR [r11]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
        cmp     r9, 63
        jg      L_384_from_bin_bswap_blk64
L_384_from_bin_bswap_blk8_check:
        ; Convert 8 bytes at a time while more than 7 bytes remain.
        cmp     r9, 7
        jle     L_384_from_bin_bswap_tail_check
L_384_from_bin_bswap_blk8:
        sub     r11, 8
        mov     rax, QWORD PTR [r11]
        bswap   rax
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
        cmp     r9, 7
        jg      L_384_from_bin_bswap_blk8
L_384_from_bin_bswap_tail_check:
        ; Fewer than 8 bytes left: they are the leading bytes of the array.
        test    r9, r9
        jz      L_384_from_bin_bswap_pad_check
        xor     r10d, r10d              ; r10 = word being assembled
L_384_from_bin_bswap_tail:
        movzx   eax, BYTE PTR [r8]      ; next byte, most significant first
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_384_from_bin_bswap_tail
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_384_from_bin_bswap_pad_check:
        ; Zero the words between the write position and r + 48.
        cmp     rcx, rdx
        je      L_384_from_bin_bswap_done
L_384_from_bin_bswap_pad:
        mov     QWORD PTR [rcx], 0
        add     rcx, 8
        cmp     rcx, rdx
        jl      L_384_from_bin_bswap_pad
L_384_from_bin_bswap_done:
        ret
sp_384_from_bin_bswap ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the movbe instruction which is an optional instruction.
;  *
;  * The array is consumed from its last byte backwards, eight bytes per
;  * output word, so r receives the value least significant word first. Any
;  * leading bytes that do not fill a whole word form the final word, and the
;  * remaining words up to r + 48 are set to zero.
;  *
;  * r     A single precision integer. Passed in rcx.
;  * size  Maximum number of bytes to convert. Passed in rdx; its value is not
;  *       read by this routine.
;  * a     Byte array. Passed in r8.
;  * n     Number of bytes in array to read. Passed in r9.
;  */
_text SEGMENT READONLY PARA
sp_384_from_bin_movbe PROC
        lea     r11, [r8+r9]            ; r11 = a + n, one past the last byte
        lea     rdx, [rcx+48]           ; rdx = r + 48, one past the last word
        ; Convert 64 bytes at a time while more than 63 bytes remain.
        cmp     r9, 63
        jle     L_384_from_bin_movbe_blk8_check
L_384_from_bin_movbe_blk64:
        sub     r11, 64
        movbe   rax, QWORD PTR [r11+56]
        movbe   r10, QWORD PTR [r11+48]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        movbe   rax, QWORD PTR [r11+40]
        movbe   r10, QWORD PTR [r11+32]
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        movbe   rax, QWORD PTR [r11+24]
        movbe   r10, QWORD PTR [r11+16]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        movbe   rax, QWORD PTR [r11+8]
        movbe   r10, QWORD PTR [r11]
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
        cmp     r9, 63
        jg      L_384_from_bin_movbe_blk64
L_384_from_bin_movbe_blk8_check:
        ; Convert 8 bytes at a time while more than 7 bytes remain.
        cmp     r9, 7
        jle     L_384_from_bin_movbe_tail_check
L_384_from_bin_movbe_blk8:
        sub     r11, 8
        movbe   rax, QWORD PTR [r11]
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
        cmp     r9, 7
        jg      L_384_from_bin_movbe_blk8
L_384_from_bin_movbe_tail_check:
        ; Fewer than 8 bytes left: they are the leading bytes of the array.
        test    r9, r9
        jz      L_384_from_bin_movbe_pad_check
        xor     r10d, r10d              ; r10 = word being assembled
L_384_from_bin_movbe_tail:
        movzx   eax, BYTE PTR [r8]      ; next byte, most significant first
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_384_from_bin_movbe_tail
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_384_from_bin_movbe_pad_check:
        ; Zero the words between the write position and r + 48.
        cmp     rcx, rdx
        je      L_384_from_bin_movbe_done
L_384_from_bin_movbe_pad:
        mov     QWORD PTR [rcx], 0
        add     rcx, 8
        cmp     rcx, rdx
        jl      L_384_from_bin_movbe_pad
L_384_from_bin_movbe_done:
        ret
sp_384_from_bin_movbe ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Write r as big endian to byte array.
;  * Fixed length number of bytes written: 48
;  * Uses the bswap instruction.
;  *
;  * The words of r are emitted from the highest (r + 40) down to the lowest,
;  * each with its bytes reversed.
;  *
;  * r  A single precision integer. Passed in rcx.
;  * a  Byte array. Passed in rdx.
;  */
_text SEGMENT READONLY PARA
sp_384_to_bin_bswap_6 PROC
        ; Words 5 and 4 -> bytes 0..15
        mov     r8, QWORD PTR [rcx+40]
        mov     r9, QWORD PTR [rcx+32]
        bswap   r8
        bswap   r9
        mov     QWORD PTR [rdx], r8
        mov     QWORD PTR [rdx+8], r9
        ; Words 3 and 2 -> bytes 16..31
        mov     r8, QWORD PTR [rcx+24]
        mov     r9, QWORD PTR [rcx+16]
        bswap   r8
        bswap   r9
        mov     QWORD PTR [rdx+16], r8
        mov     QWORD PTR [rdx+24], r9
        ; Words 1 and 0 -> bytes 32..47
        mov     r8, QWORD PTR [rcx+8]
        mov     r9, QWORD PTR [rcx]
        bswap   r8
        bswap   r9
        mov     QWORD PTR [rdx+32], r8
        mov     QWORD PTR [rdx+40], r9
        ret
sp_384_to_bin_bswap_6 ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
|
|
; /* Write r as big endian to byte array.
;  * Fixed length number of bytes written: 48
;  * Uses the movbe instruction which is optional.
;  *
;  * The words of r are emitted from the highest (r + 40) down to the lowest,
;  * each with its bytes reversed.
;  *
;  * r  A single precision integer. Passed in rcx.
;  * a  Byte array. Passed in rdx.
;  */
_text SEGMENT READONLY PARA
sp_384_to_bin_movbe_6 PROC
        ; Words 5 and 4 -> bytes 0..15 (byte swap happens on the store)
        mov     r8, QWORD PTR [rcx+40]
        mov     r9, QWORD PTR [rcx+32]
        movbe   QWORD PTR [rdx], r8
        movbe   QWORD PTR [rdx+8], r9
        ; Words 3 and 2 -> bytes 16..31
        mov     r8, QWORD PTR [rcx+24]
        mov     r9, QWORD PTR [rcx+16]
        movbe   QWORD PTR [rdx+16], r8
        movbe   QWORD PTR [rdx+24], r9
        ; Words 1 and 0 -> bytes 32..47
        mov     r8, QWORD PTR [rcx+8]
        mov     r9, QWORD PTR [rcx]
        movbe   QWORD PTR [rdx+32], r8
        movbe   QWORD PTR [rdx+40], r9
        ret
sp_384_to_bin_movbe_6 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Sub b from a into a. (a -= b)
;  *
;  * a  A single precision integer and result. Passed in rcx.
;  * b  A single precision integer. Passed in rdx.
;  *
;  * Returns, in rax, 0 when there was no borrow out of the top word and -1
;  * (all bits set) when there was.
;  */
_text SEGMENT READONLY PARA
sp_384_sub_in_place_6 PROC
        ; r12 and r13 are callee-saved; these pushes pair with the pops below.
        push    r12
        push    r13
        xor     rax, rax
        ; Load all of b before a is modified.
        mov     r8, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        mov     r12, QWORD PTR [rdx+32]
        mov     r13, QWORD PTR [rdx+40]
        ; Subtract word by word, propagating the borrow.
        sub     QWORD PTR [rcx], r8
        sbb     QWORD PTR [rcx+8], r9
        sbb     QWORD PTR [rcx+16], r10
        sbb     QWORD PTR [rcx+24], r11
        sbb     QWORD PTR [rcx+32], r12
        sbb     QWORD PTR [rcx+40], r13
        ; rax = 0 - borrow
        sbb     rax, 0
        pop     r13
        pop     r12
        ret
sp_384_sub_in_place_6 ENDP
_text ENDS
|
|
; /* Mul a by digit b into r. (r = a * b)
;  *
;  * Seven words are written to r: the six-word product plus the final carry
;  * word at r + 48.
;  *
;  * r  A single precision integer. Passed in rcx.
;  * a  A single precision integer. Passed in rdx.
;  * b  A single precision digit. Passed in r8.
;  */
_text SEGMENT READONLY PARA
sp_384_mul_d_6 PROC
        mov     r9, rdx                 ; r9 = a (mul overwrites rdx)
        ; A[0] * B
        mov     rax, r8
        mul     QWORD PTR [r9]
        mov     QWORD PTR [rcx], rax
        mov     r10, rdx                ; r10 = word carried into the next step
        ; A[1] * B
        mov     rax, r8
        mul     QWORD PTR [r9+8]
        add     rax, r10
        adc     rdx, 0                  ; high word plus carry cannot overflow
        mov     QWORD PTR [rcx+8], rax
        mov     r10, rdx
        ; A[2] * B
        mov     rax, r8
        mul     QWORD PTR [r9+16]
        add     rax, r10
        adc     rdx, 0
        mov     QWORD PTR [rcx+16], rax
        mov     r10, rdx
        ; A[3] * B
        mov     rax, r8
        mul     QWORD PTR [r9+24]
        add     rax, r10
        adc     rdx, 0
        mov     QWORD PTR [rcx+24], rax
        mov     r10, rdx
        ; A[4] * B
        mov     rax, r8
        mul     QWORD PTR [r9+32]
        add     rax, r10
        adc     rdx, 0
        mov     QWORD PTR [rcx+32], rax
        mov     r10, rdx
        ; A[5] * B
        mov     rax, r8
        mul     QWORD PTR [r9+40]
        add     rax, r10
        adc     rdx, 0
        mov     QWORD PTR [rcx+40], rax
        mov     QWORD PTR [rcx+48], rdx
        ret
sp_384_mul_d_6 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Mul a by digit b into r. (r = a * b)
;  *
;  * Variant built on mulx. Seven words are written to r: the six-word
;  * product plus the final carry word at r + 48.
;  *
;  * r  A single precision integer. Passed in rcx.
;  * a  A single precision integer. Passed in rdx.
;  * b  A single precision digit. Passed in r8.
;  */
_text SEGMENT READONLY PARA
sp_384_mul_d_avx2_6 PROC
        mov     rax, rdx                ; rax = a
        mov     rdx, r8                 ; rdx = b, the implicit mulx operand
        ; mulx and mov leave the flags alone, so a single carry chain runs
        ; from the add below to the final adc.
        ; A[0] * B
        mulx    r10, r9, QWORD PTR [rax]
        mov     QWORD PTR [rcx], r9
        ; A[1] * B
        mulx    r11, r9, QWORD PTR [rax+8]
        add     r9, r10
        mov     QWORD PTR [rcx+8], r9
        ; A[2] * B
        mulx    r10, r9, QWORD PTR [rax+16]
        adc     r9, r11
        mov     QWORD PTR [rcx+16], r9
        ; A[3] * B
        mulx    r11, r9, QWORD PTR [rax+24]
        adc     r9, r10
        mov     QWORD PTR [rcx+24], r9
        ; A[4] * B
        mulx    r10, r9, QWORD PTR [rax+32]
        adc     r9, r11
        mov     QWORD PTR [rcx+32], r9
        ; A[5] * B
        mulx    r11, r9, QWORD PTR [rax+40]
        adc     r9, r10
        mov     QWORD PTR [rcx+40], r9
        adc     r11, 0                  ; fold the last carry into the top word
        mov     QWORD PTR [rcx+48], r11
        ret
sp_384_mul_d_avx2_6 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF _WIN64
|
|
; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
;  *
;  * d1   The high order half of the number to divide. Passed in rcx.
;  * d0   The low order half of the number to divide. Passed in rdx.
;  * div  The divisor. Passed in r8.
;  * returns the quotient of the division in rax.
;  */
_text SEGMENT READONLY PARA
div_384_word_asm_6 PROC
        ; div takes its 128-bit dividend in rdx:rax.
        mov     rax, rdx                ; low half
        mov     rdx, rcx                ; high half
        div     r8
        ret
div_384_word_asm_6 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Shift number right by 1 bit. (r = a >> 1)
;  *
;  * All six words of a are loaded before any word of r is stored. The top
;  * bit of the result is zero.
;  *
;  * r  Result of right shift by 1. Passed in rcx.
;  * a  Number to shift. Passed in rdx.
;  */
_text SEGMENT READONLY PARA
sp_384_rshift1_6 PROC
        mov     rax, QWORD PTR [rdx]
        mov     r8, QWORD PTR [rdx+8]
        mov     r9, QWORD PTR [rdx+16]
        mov     r10, QWORD PTR [rdx+24]
        mov     r11, QWORD PTR [rdx+32]
        mov     rdx, QWORD PTR [rdx+40] ; pointer no longer needed after this
        ; Each word takes its new top bit from the word above it.
        shrd    rax, r8, 1
        shrd    r8, r9, 1
        shrd    r9, r10, 1
        shrd    r10, r11, 1
        shrd    r11, rdx, 1
        shr     rdx, 1
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r8
        mov     QWORD PTR [rcx+16], r9
        mov     QWORD PTR [rcx+24], r10
        mov     QWORD PTR [rcx+32], r11
        mov     QWORD PTR [rcx+40], rdx
        ret
sp_384_rshift1_6 ENDP
_text ENDS
|
|
; /* Divide the number by 2 mod the prime. (r = a / 2 % m)
;  *
;  * When a is odd, m is added first and the carry out of that addition
;  * becomes the top bit of the result; the 385-bit value is then shifted
;  * right by one bit. When a is even it is only shifted.
;  *
;  * r  Result of division by 2. Passed in rcx.
;  * a  Number to divide. Passed in rdx.
;  * m  Modulus. Passed in r8.
;  */
_text SEGMENT READONLY PARA
sp_384_div2_mod_6 PROC
        push    r12
        push    r13
        mov     rax, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        mov     r12, QWORD PTR [rdx+32]
        mov     r13, QWORD PTR [rdx+40]
        xor     edx, edx                ; rdx = bit 384 of the sum, 0 if no add
        test    al, 1
        jz      L_384_div2_mod_6_halve
        ; a is odd: a += m, keeping the carry out in rdx.
        add     rax, QWORD PTR [r8]
        adc     r9, QWORD PTR [r8+8]
        adc     r10, QWORD PTR [r8+16]
        adc     r11, QWORD PTR [r8+24]
        adc     r12, QWORD PTR [r8+32]
        adc     r13, QWORD PTR [r8+40]
        adc     rdx, 0
L_384_div2_mod_6_halve:
        ; Shift right by one; each word takes its top bit from the word above.
        shrd    rax, r9, 1
        shrd    r9, r10, 1
        shrd    r10, r11, 1
        shrd    r11, r12, 1
        shrd    r12, r13, 1
        shrd    r13, rdx, 1
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        mov     QWORD PTR [rcx+32], r12
        mov     QWORD PTR [rcx+40], r13
        pop     r13
        pop     r12
        ret
sp_384_div2_mod_6 ENDP
_text ENDS
|
|
; /* Count the bits needed to represent a six-word number.
;  *
;  * The words are examined from the highest (a + 40) down to the lowest. For
;  * the first non-zero word the result is the index of its highest set bit
;  * plus one, plus 64 for every word below it.
;  *
;  * a  The number to examine. Passed in rcx.
;  * returns the bit count in rax; 0 when all six words are zero.
;  */
_text SEGMENT READONLY PARA
sp_384_num_bits_6 PROC
        ; r8 holds (64 * word index) + 1 for the word currently in rdx.
        mov     rdx, QWORD PTR [rcx+40]
        mov     r8d, 321
        test    rdx, rdx
        jnz     L_384_num_bits_6_found
        mov     rdx, QWORD PTR [rcx+32]
        mov     r8d, 257
        test    rdx, rdx
        jnz     L_384_num_bits_6_found
        mov     rdx, QWORD PTR [rcx+24]
        mov     r8d, 193
        test    rdx, rdx
        jnz     L_384_num_bits_6_found
        mov     rdx, QWORD PTR [rcx+16]
        mov     r8d, 129
        test    rdx, rdx
        jnz     L_384_num_bits_6_found
        mov     rdx, QWORD PTR [rcx+8]
        mov     r8d, 65
        test    rdx, rdx
        jnz     L_384_num_bits_6_found
        mov     rdx, QWORD PTR [rcx]
        mov     r8d, 1
        test    rdx, rdx
        jnz     L_384_num_bits_6_found
        ; Every word is zero.
        xor     eax, eax
        ret
L_384_num_bits_6_found:
        bsr     rax, rdx                ; index of the highest set bit
        add     rax, r8
        ret
sp_384_num_bits_6 ENDP
_text ENDS
|
|
ENDIF
|
|
IFDEF WOLFSSL_SP_1024
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mul_16 PROC
|
|
push r12
|
|
mov r9, rdx
|
|
sub rsp, 128
|
|
; A[0] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
mov QWORD PTR [rsp], rax
|
|
mov r11, rdx
|
|
; A[0] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+8], r11
|
|
; A[0] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+16], r12
|
|
; A[0] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+24], r10
|
|
; A[0] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+32], r11
|
|
; A[0] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+40], r12
|
|
; A[0] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+48], r10
|
|
; A[0] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+56], r11
|
|
; A[0] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+64], r12
|
|
; A[0] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+72], r10
|
|
; A[0] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+80], r11
|
|
; A[0] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+88], r12
|
|
; A[0] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+96], r10
|
|
; A[0] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[1] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+8]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+104], r11
|
|
; A[0] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+8]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[2] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+16]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+112], r12
|
|
; A[0] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[1] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[2] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+16]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[3] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+24]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rsp+120], r10
|
|
; A[1] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+8]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[2] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+16]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[3] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+24]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[4] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+32]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+128], r11
|
|
; A[2] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+16]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[3] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+24]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[4] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+32]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[5] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+40]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+136], r12
|
|
; A[3] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+24]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[4] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+32]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[5] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+40]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[6] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+48]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+144], r10
|
|
; A[4] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+32]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[5] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+40]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[6] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+48]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[7] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+56]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+152], r11
|
|
; A[5] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+40]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[6] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+48]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[7] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+56]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[8] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+64]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+160], r12
|
|
; A[6] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+48]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[7] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+56]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[8] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+64]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[9] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+72]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+168], r10
|
|
; A[7] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+56]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[8] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+64]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[9] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+72]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[10] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+80]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+176], r11
|
|
; A[8] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+64]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[9] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+72]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[10] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+80]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[11] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+88]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+184], r12
|
|
; A[9] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+72]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[10] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+80]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[11] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+88]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[12] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+96]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+192], r10
|
|
; A[10] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+80]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[11] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+88]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[12] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+96]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[13] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+104]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+200], r11
|
|
; A[11] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+88]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[12] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+96]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+104]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[14] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+112]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+208], r12
|
|
; A[12] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+96]
|
|
xor r12, r12
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[13] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+104]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[14] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+112]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
; A[15] * B[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r12, 0
|
|
mov QWORD PTR [rcx+216], r10
|
|
; A[13] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+104]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[14] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+112]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
; A[15] * B[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r9+120]
|
|
add r11, rax
|
|
adc r12, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+224], r11
|
|
; A[14] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+112]
|
|
xor r11, r11
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[15] * B[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r9+120]
|
|
add r12, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+232], r12
|
|
; A[15] * B[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r9+120]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
mov QWORD PTR [rcx+240], r10
|
|
mov QWORD PTR [rcx+248], r11
|
|
mov rax, QWORD PTR [rsp]
|
|
mov rdx, QWORD PTR [rsp+8]
|
|
mov r10, QWORD PTR [rsp+16]
|
|
mov r11, QWORD PTR [rsp+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], rdx
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov rax, QWORD PTR [rsp+32]
|
|
mov rdx, QWORD PTR [rsp+40]
|
|
mov r10, QWORD PTR [rsp+48]
|
|
mov r11, QWORD PTR [rsp+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], rdx
|
|
mov QWORD PTR [rcx+48], r10
|
|
mov QWORD PTR [rcx+56], r11
|
|
mov rax, QWORD PTR [rsp+64]
|
|
mov rdx, QWORD PTR [rsp+72]
|
|
mov r10, QWORD PTR [rsp+80]
|
|
mov r11, QWORD PTR [rsp+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], rdx
|
|
mov QWORD PTR [rcx+80], r10
|
|
mov QWORD PTR [rcx+88], r11
|
|
mov rax, QWORD PTR [rsp+96]
|
|
mov rdx, QWORD PTR [rsp+104]
|
|
mov r10, QWORD PTR [rsp+112]
|
|
mov r11, QWORD PTR [rsp+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], rdx
|
|
mov QWORD PTR [rcx+112], r10
|
|
mov QWORD PTR [rcx+120], r11
|
|
add rsp, 128
|
|
pop r12
|
|
ret
|
|
sp_1024_mul_16 ENDP
|
|
_text ENDS
|
|
; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r A single precision integer that receives the result (a * a).
|
|
; * a A single precision integer to be squared.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_sqr_16 PROC
|
|
push r12
|
|
push r13
|
|
push r14
|
|
mov r8, rdx
|
|
sub rsp, 128
|
|
; A[0] * A[0]
|
|
mov rax, QWORD PTR [r8]
|
|
mul rax
|
|
xor r11, r11
|
|
mov QWORD PTR [rsp], rax
|
|
mov r10, rdx
|
|
; A[0] * A[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+8], r10
|
|
; A[0] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[1] * A[1]
|
|
mov rax, QWORD PTR [r8+8]
|
|
mul rax
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rsp+16], r11
|
|
; A[0] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[1] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul QWORD PTR [r8+8]
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rsp+24], r9
|
|
; A[0] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[1] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8+8]
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[2] * A[2]
|
|
mov rax, QWORD PTR [r8+16]
|
|
mul rax
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rsp+32], r10
|
|
; A[0] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+40], r11
|
|
; A[0] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[3]
|
|
mov rax, QWORD PTR [r8+24]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+48], r9
|
|
; A[0] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+56], r10
|
|
; A[0] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[4]
|
|
mov rax, QWORD PTR [r8+32]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+64], r11
|
|
; A[0] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+72], r9
|
|
; A[0] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[5]
|
|
mov rax, QWORD PTR [r8+40]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+80], r10
|
|
; A[0] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+88], r11
|
|
; A[0] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[6]
|
|
mov rax, QWORD PTR [r8+48]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+96], r9
|
|
; A[0] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rsp+104], r10
|
|
; A[0] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[7]
|
|
mov rax, QWORD PTR [r8+56]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rsp+112], r11
|
|
; A[0] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[1] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+8]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[2] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rsp+120], r9
|
|
; A[1] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+8]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[2] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+16]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[3] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[8]
|
|
mov rax, QWORD PTR [r8+64]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+128], r10
|
|
; A[2] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+16]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[3] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+24]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[4] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+136], r11
|
|
; A[3] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+24]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[4] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+32]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[5] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[9]
|
|
mov rax, QWORD PTR [r8+72]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+144], r9
|
|
; A[4] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+32]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[5] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+40]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[6] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+152], r10
|
|
; A[5] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+40]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[6] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+48]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[7] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[10]
|
|
mov rax, QWORD PTR [r8+80]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+160], r11
|
|
; A[6] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+48]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[7] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+56]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[8] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+168], r9
|
|
; A[7] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+56]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[8] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+64]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[9] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[11] * A[11]
|
|
mov rax, QWORD PTR [r8+88]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+176], r10
|
|
; A[8] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+64]
|
|
xor r10, r10
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[9] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+72]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[10] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[11] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul QWORD PTR [r8+88]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r11, r12
|
|
adc r9, r13
|
|
adc r10, r14
|
|
mov QWORD PTR [rcx+184], r11
|
|
; A[9] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+72]
|
|
xor r11, r11
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[10] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+80]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[11] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+88]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[12] * A[12]
|
|
mov rax, QWORD PTR [r8+96]
|
|
mul rax
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r9, r12
|
|
adc r10, r13
|
|
adc r11, r14
|
|
mov QWORD PTR [rcx+192], r9
|
|
; A[10] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+80]
|
|
xor r9, r9
|
|
xor r14, r14
|
|
mov r12, rax
|
|
mov r13, rdx
|
|
; A[11] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+88]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
; A[12] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul QWORD PTR [r8+96]
|
|
add r12, rax
|
|
adc r13, rdx
|
|
adc r14, 0
|
|
add r12, r12
|
|
adc r13, r13
|
|
adc r14, r14
|
|
add r10, r12
|
|
adc r11, r13
|
|
adc r9, r14
|
|
mov QWORD PTR [rcx+200], r10
|
|
; A[11] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+88]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[12] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+96]
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
; A[13] * A[13]
|
|
mov rax, QWORD PTR [r8+104]
|
|
mul rax
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+208], r11
|
|
; A[12] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+96]
|
|
xor r11, r11
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
; A[13] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul QWORD PTR [r8+104]
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
add r9, rax
|
|
adc r10, rdx
|
|
adc r11, 0
|
|
mov QWORD PTR [rcx+216], r9
|
|
; A[13] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+104]
|
|
xor r9, r9
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
; A[14] * A[14]
|
|
mov rax, QWORD PTR [r8+112]
|
|
mul rax
|
|
add r10, rax
|
|
adc r11, rdx
|
|
adc r9, 0
|
|
mov QWORD PTR [rcx+224], r10
|
|
; A[14] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul QWORD PTR [r8+112]
|
|
xor r10, r10
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
add r11, rax
|
|
adc r9, rdx
|
|
adc r10, 0
|
|
mov QWORD PTR [rcx+232], r11
|
|
; A[15] * A[15]
|
|
mov rax, QWORD PTR [r8+120]
|
|
mul rax
|
|
add r9, rax
|
|
adc r10, rdx
|
|
mov QWORD PTR [rcx+240], r9
|
|
mov QWORD PTR [rcx+248], r10
|
|
mov rax, QWORD PTR [rsp]
|
|
mov rdx, QWORD PTR [rsp+8]
|
|
mov r12, QWORD PTR [rsp+16]
|
|
mov r13, QWORD PTR [rsp+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], rdx
|
|
mov QWORD PTR [rcx+16], r12
|
|
mov QWORD PTR [rcx+24], r13
|
|
mov rax, QWORD PTR [rsp+32]
|
|
mov rdx, QWORD PTR [rsp+40]
|
|
mov r12, QWORD PTR [rsp+48]
|
|
mov r13, QWORD PTR [rsp+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], rdx
|
|
mov QWORD PTR [rcx+48], r12
|
|
mov QWORD PTR [rcx+56], r13
|
|
mov rax, QWORD PTR [rsp+64]
|
|
mov rdx, QWORD PTR [rsp+72]
|
|
mov r12, QWORD PTR [rsp+80]
|
|
mov r13, QWORD PTR [rsp+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], rdx
|
|
mov QWORD PTR [rcx+80], r12
|
|
mov QWORD PTR [rcx+88], r13
|
|
mov rax, QWORD PTR [rsp+96]
|
|
mov rdx, QWORD PTR [rsp+104]
|
|
mov r12, QWORD PTR [rsp+112]
|
|
mov r13, QWORD PTR [rsp+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], rdx
|
|
mov QWORD PTR [rcx+112], r12
|
|
mov QWORD PTR [rcx+120], r13
|
|
add rsp, 128
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_1024_sqr_16 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Multiply a and b into r. (r = a * b)
|
|
; *
|
|
; * r Result of multiplication. May be the same buffer as a or b; the low words are then accumulated in a stack temporary.
|
|
; * a First number to multiply.
|
|
; * b Second number to multiply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mul_avx2_16 PROC
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
mov rbp, r8
|
|
mov r8, rcx
|
|
mov r9, rdx
|
|
sub rsp, 128
|
|
cmp r9, r8
|
|
mov rbx, rsp
|
|
cmovne rbx, r8
|
|
cmp rbp, r8
|
|
cmove rbx, rsp
|
|
add r8, 128
|
|
xor rdi, rdi
|
|
mov rdx, QWORD PTR [r9]
|
|
; A[0] * B[0]
|
|
mulx r11, r10, QWORD PTR [rbp]
|
|
; A[0] * B[1]
|
|
mulx r12, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx], r10
|
|
adcx r11, rax
|
|
; A[0] * B[2]
|
|
mulx r13, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+8], r11
|
|
adcx r12, rax
|
|
; A[0] * B[3]
|
|
mulx r14, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r13, rax
|
|
mov QWORD PTR [rbx+24], r13
|
|
; A[0] * B[4]
|
|
mulx r10, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
; A[0] * B[5]
|
|
mulx r11, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
; A[0] * B[6]
|
|
mulx r12, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
; A[0] * B[7]
|
|
mulx r13, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
mov QWORD PTR [rbx+56], r12
|
|
; A[0] * B[8]
|
|
mulx r14, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
; A[0] * B[9]
|
|
mulx r10, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
; A[0] * B[10]
|
|
mulx r11, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
; A[0] * B[11]
|
|
mulx r12, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
mov QWORD PTR [rbx+88], r11
|
|
; A[0] * B[12]
|
|
mulx r13, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
; A[0] * B[13]
|
|
mulx r14, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
; A[0] * B[14]
|
|
mulx r10, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
; A[0] * B[15]
|
|
mulx r11, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adcx r11, rdi
|
|
mov r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov QWORD PTR [r8], r11
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mov r11, QWORD PTR [rbx+8]
|
|
mov r12, QWORD PTR [rbx+16]
|
|
mov r13, QWORD PTR [rbx+24]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
; A[1] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[1] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[1] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[1] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+24], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+32], r14
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
; A[1] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[1] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[1] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+64], r13
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
; A[1] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[1] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[1] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+96], r12
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[1] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[1] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[1] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[1] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, rdi
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8], r11
|
|
mov QWORD PTR [r8+8], r12
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mov r12, QWORD PTR [rbx+16]
|
|
mov r13, QWORD PTR [rbx+24]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
; A[2] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[2] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+16], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[2] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+24], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[2] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+40], r10
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
; A[2] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[2] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[2] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+72], r14
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
; A[2] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[2] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[2] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+104], r13
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[2] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[2] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[2] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[2] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, rdi
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
adcx r13, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+8], r12
|
|
mov QWORD PTR [r8+16], r13
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mov r13, QWORD PTR [rbx+24]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
; A[3] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[3] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+24], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[3] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+48], r11
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
; A[3] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[3] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[3] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[3] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+80], r10
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
; A[3] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[3] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[3] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[3] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+112], r14
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[3] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[3] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[3] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, rdi
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
adcx r14, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+16], r13
|
|
mov QWORD PTR [r8+24], r14
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mov r14, QWORD PTR [rbx+32]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
; A[4] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[4] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+32], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[4] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+56], r12
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
; A[4] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[4] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[4] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+88], r11
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[4] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[4] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[4] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[4] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[4] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[4] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[4] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[4] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, rdi
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
adcx r10, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+24], r14
|
|
mov QWORD PTR [r8+32], r10
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mov r10, QWORD PTR [rbx+40]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
; A[5] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+40], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[5] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[5] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+64], r13
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
; A[5] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[5] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[5] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+96], r12
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[5] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[5] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[5] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[5] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[5] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[5] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[5] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+24], r14
|
|
mov r11, rdi
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+32], r10
|
|
mov QWORD PTR [r8+40], r11
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mov r11, QWORD PTR [rbx+48]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
; A[6] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+48], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[6] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[6] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+72], r14
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
; A[6] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[6] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+104], r13
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[6] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[6] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[6] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[6] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[6] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, rdi
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+40], r11
|
|
mov QWORD PTR [r8+48], r12
|
|
mov rdx, QWORD PTR [r9+56]
|
|
mov r12, QWORD PTR [rbx+56]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
; A[7] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[7] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+56], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[7] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[7] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+80], r10
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
; A[7] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[7] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[7] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+112], r14
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[7] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[7] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[7] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
; A[7] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[7] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[7] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r13, rdi
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
adcx r13, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+48], r12
|
|
mov QWORD PTR [r8+56], r13
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mov r13, QWORD PTR [rbx+64]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
; A[8] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[8] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+64], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[8] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbx+88], r11
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[8] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[8] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[8] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[8] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[8] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[8] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[8] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r14
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
; A[8] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[8] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r14, rdi
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
adcx r14, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+56], r13
|
|
mov QWORD PTR [r8+64], r14
|
|
mov rdx, QWORD PTR [r9+72]
|
|
mov r14, QWORD PTR [rbx+72]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
; A[9] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[9] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+72], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[9] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [rbx+96], r12
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[9] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[9] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[9] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[9] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[9] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[9] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[9] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
; A[9] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[9] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[9] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[9] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+56], r13
|
|
mov r10, rdi
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
adcx r10, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+64], r14
|
|
mov QWORD PTR [r8+72], r10
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mov r10, QWORD PTR [rbx+80]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
; A[10] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+80], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[10] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [rbx+104], r13
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[10] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[10] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
; A[10] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[10] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[10] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[10] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[10] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[10] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[10] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[10] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+64], r14
|
|
mov r11, rdi
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+72], r10
|
|
mov QWORD PTR [r8+80], r11
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mov r11, QWORD PTR [rbx+88]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
; A[11] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+88], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[11] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[11] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbx+112], r14
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[11] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[11] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
; A[11] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[11] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[11] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
; A[11] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[11] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[11] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+72], r10
|
|
mov r12, rdi
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
adcx r12, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+80], r11
|
|
mov QWORD PTR [r8+88], r12
|
|
mov rdx, QWORD PTR [r9+96]
|
|
mov r12, QWORD PTR [rbx+96]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
; A[12] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[12] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+96], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[12] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[12] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbx+120], r10
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
; A[12] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[12] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[12] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r14
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
; A[12] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[12] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[12] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+56], r13
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, QWORD PTR [r8+88]
|
|
; A[12] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[12] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[12] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+80], r11
|
|
mov r13, rdi
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
adcx r13, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+88], r12
|
|
mov QWORD PTR [r8+96], r13
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mov r13, QWORD PTR [rbx+104]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[13] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[13] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+104], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[13] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8], r11
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[13] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[13] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[13] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[13] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[13] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[13] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[13] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+64], r14
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, QWORD PTR [r8+88]
|
|
mov r13, QWORD PTR [r8+96]
|
|
; A[13] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+80], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[13] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+88], r12
|
|
mov r14, rdi
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
adcx r14, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+96], r13
|
|
mov QWORD PTR [r8+104], r14
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mov r14, QWORD PTR [rbx+112]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
; A[14] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[14] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+112], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[14] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov r14, QWORD PTR [r8+24]
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
; A[14] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[14] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+16], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[14] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r13, QWORD PTR [r8+56]
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
; A[14] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[14] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+48], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[14] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[14] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+72], r10
|
|
mov r12, QWORD PTR [r8+88]
|
|
mov r13, QWORD PTR [r8+96]
|
|
mov r14, QWORD PTR [r8+104]
|
|
; A[14] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[14] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+80], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[14] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+88], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[14] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+96], r13
|
|
mov r10, rdi
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
adcx r10, r15
|
|
mov r15, rdi
|
|
adox r15, rdi
|
|
adcx r15, rdi
|
|
mov QWORD PTR [r8+104], r14
|
|
mov QWORD PTR [r8+112], r10
|
|
mov rdx, QWORD PTR [r9+120]
|
|
mov r10, QWORD PTR [rbx+120]
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
mov r13, QWORD PTR [r8+16]
|
|
mov r14, QWORD PTR [r8+24]
|
|
; A[15] * B[0]
|
|
mulx rcx, rax, QWORD PTR [rbp]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] * B[1]
|
|
mulx rcx, rax, QWORD PTR [rbp+8]
|
|
mov QWORD PTR [rbx+120], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] * B[2]
|
|
mulx rcx, rax, QWORD PTR [rbp+16]
|
|
mov QWORD PTR [r8], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[15] * B[3]
|
|
mulx rcx, rax, QWORD PTR [rbp+24]
|
|
mov QWORD PTR [r8+8], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
mov QWORD PTR [r8+16], r13
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r13, QWORD PTR [r8+56]
|
|
; A[15] * B[4]
|
|
mulx rcx, rax, QWORD PTR [rbp+32]
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[15] * B[5]
|
|
mulx rcx, rax, QWORD PTR [rbp+40]
|
|
mov QWORD PTR [r8+24], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] * B[6]
|
|
mulx rcx, rax, QWORD PTR [rbp+48]
|
|
mov QWORD PTR [r8+32], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] * B[7]
|
|
mulx rcx, rax, QWORD PTR [rbp+56]
|
|
mov QWORD PTR [r8+40], r11
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r14, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, QWORD PTR [r8+88]
|
|
; A[15] * B[8]
|
|
mulx rcx, rax, QWORD PTR [rbp+64]
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[15] * B[9]
|
|
mulx rcx, rax, QWORD PTR [rbp+72]
|
|
mov QWORD PTR [r8+56], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[15] * B[10]
|
|
mulx rcx, rax, QWORD PTR [rbp+80]
|
|
mov QWORD PTR [r8+64], r14
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] * B[11]
|
|
mulx rcx, rax, QWORD PTR [rbp+88]
|
|
mov QWORD PTR [r8+72], r10
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+80], r11
|
|
mov r13, QWORD PTR [r8+96]
|
|
mov r14, QWORD PTR [r8+104]
|
|
mov r10, QWORD PTR [r8+112]
|
|
; A[15] * B[12]
|
|
mulx rcx, rax, QWORD PTR [rbp+96]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
; A[15] * B[13]
|
|
mulx rcx, rax, QWORD PTR [rbp+104]
|
|
mov QWORD PTR [r8+88], r12
|
|
adcx r13, rax
|
|
adox r14, rcx
|
|
; A[15] * B[14]
|
|
mulx rcx, rax, QWORD PTR [rbp+112]
|
|
mov QWORD PTR [r8+96], r13
|
|
adcx r14, rax
|
|
adox r10, rcx
|
|
; A[15] * B[15]
|
|
mulx rcx, rax, QWORD PTR [rbp+120]
|
|
mov QWORD PTR [r8+104], r14
|
|
mov r11, rdi
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
adcx r11, r15
|
|
mov QWORD PTR [r8+112], r10
|
|
mov QWORD PTR [r8+120], r11
|
|
sub r8, 128
|
|
cmp r9, r8
|
|
je L_start_1024_mul_avx2_16
|
|
cmp rbp, r8
|
|
jne L_end_1024_mul_avx2_16
|
|
L_start_1024_mul_avx2_16:
|
|
vmovdqu xmm0, OWORD PTR [rbx]
|
|
vmovups OWORD PTR [r8], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+16]
|
|
vmovups OWORD PTR [r8+16], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+32]
|
|
vmovups OWORD PTR [r8+32], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+48]
|
|
vmovups OWORD PTR [r8+48], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+64]
|
|
vmovups OWORD PTR [r8+64], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+80]
|
|
vmovups OWORD PTR [r8+80], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+96]
|
|
vmovups OWORD PTR [r8+96], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbx+112]
|
|
vmovups OWORD PTR [r8+112], xmm0
|
|
L_end_1024_mul_avx2_16:
|
|
add rsp, 128
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
sp_1024_mul_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
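; /* Square a and put result in r. (r = a * a)
|
|
; *
|
|
; * r A single precision integer that receives the result (pointer passed in rcx).
|
|
; * a A single precision integer to square, read as 16 64-bit words A[0]..A[15] (pointer passed in rdx).
|
|
; */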
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_sqr_avx2_16 PROC
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
mov r8, rcx
|
|
mov r9, rdx
|
|
sub rsp, 128
|
|
cmp r9, r8
|
|
mov rbp, rsp
|
|
cmovne rbp, r8
|
|
add r8, 128
|
|
xor r13, r13
|
|
; Diagonal 1
|
|
xor r12, r12
|
|
; A[1] x A[0]
|
|
mov rdx, QWORD PTR [r9]
|
|
mulx r11, r10, QWORD PTR [r9+8]
|
|
; A[2] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+16]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+8], r10
|
|
mov QWORD PTR [rbp+16], r11
|
|
mov r10, r13
|
|
mov r11, r13
|
|
; A[3] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[4] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+24], r12
|
|
mov QWORD PTR [rbp+32], r10
|
|
mov r12, r13
|
|
mov r10, r13
|
|
; A[5] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[6] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+40], r11
|
|
mov QWORD PTR [rbp+48], r12
|
|
mov r11, r13
|
|
mov r12, r13
|
|
; A[7] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[8] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+56], r10
|
|
mov QWORD PTR [rbp+64], r11
|
|
mov r10, r13
|
|
mov r11, r13
|
|
; A[9] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[10] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+72], r12
|
|
mov QWORD PTR [rbp+80], r10
|
|
mov r12, r13
|
|
mov r10, r13
|
|
; A[11] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+88], r11
|
|
mov r15, r12
|
|
mov r11, r13
|
|
mov r12, r13
|
|
; A[13] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov rdi, r10
|
|
mov rsi, r11
|
|
mov r10, r13
|
|
; A[15] x A[0]
|
|
mulx rcx, rax, QWORD PTR [r9+120]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov rbx, r12
|
|
; Carry
|
|
adcx r10, r13
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8], r10
|
|
; Diagonal 2
|
|
mov r10, QWORD PTR [rbp+24]
|
|
mov r11, QWORD PTR [rbp+32]
|
|
mov r12, QWORD PTR [rbp+40]
|
|
; A[2] x A[1]
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mulx rcx, rax, QWORD PTR [r9+16]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[3] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+24], r10
|
|
mov QWORD PTR [rbp+32], r11
|
|
mov r10, QWORD PTR [rbp+48]
|
|
mov r11, QWORD PTR [rbp+56]
|
|
; A[4] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[5] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+40], r12
|
|
mov QWORD PTR [rbp+48], r10
|
|
mov r12, QWORD PTR [rbp+64]
|
|
mov r10, QWORD PTR [rbp+72]
|
|
; A[6] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[7] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+56], r11
|
|
mov QWORD PTR [rbp+64], r12
|
|
mov r11, QWORD PTR [rbp+80]
|
|
mov r12, QWORD PTR [rbp+88]
|
|
; A[8] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[9] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+72], r10
|
|
mov QWORD PTR [rbp+80], r11
|
|
; No load %r13 - %r8
|
|
; No load %r14 - %r9
|
|
; A[10] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r12, rax
|
|
adox r15, rcx
|
|
; A[11] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r12
|
|
; No store %r13
|
|
; No load %r15 - %r10
|
|
; No load %rbx - %r8
|
|
; A[12] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[13] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, r13
|
|
; A[14] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; A[15] x A[1]
|
|
mulx rcx, rax, QWORD PTR [r9+120]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r11
|
|
mov r10, r13
|
|
; A[15] x A[2]
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mulx rcx, rax, QWORD PTR [r9+120]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+16], r10
|
|
; Diagonal 3
|
|
mov r10, QWORD PTR [rbp+40]
|
|
mov r11, QWORD PTR [rbp+48]
|
|
mov r12, QWORD PTR [rbp+56]
|
|
; A[3] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[4] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+40], r10
|
|
mov QWORD PTR [rbp+48], r11
|
|
mov r10, QWORD PTR [rbp+64]
|
|
mov r11, QWORD PTR [rbp+72]
|
|
; A[5] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[6] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+56], r12
|
|
mov QWORD PTR [rbp+64], r10
|
|
mov r12, QWORD PTR [rbp+80]
|
|
mov r10, QWORD PTR [rbp+88]
|
|
; A[7] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[8] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [rbp+72], r11
|
|
mov QWORD PTR [rbp+80], r12
|
|
; No load %r13 - %r9
|
|
; No load %r14 - %r10
|
|
; A[9] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r15, rcx
|
|
; A[10] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r10
|
|
; No store %r13
|
|
; No load %r15 - %r8
|
|
; No load %rbx - %r9
|
|
; A[11] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[12] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r12, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [r8+8]
|
|
; A[13] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx rbx, rax
|
|
adox r12, rcx
|
|
; A[14] x A[2]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r12
|
|
mov r11, QWORD PTR [r8+16]
|
|
mov r12, r13
|
|
; A[14] x A[3]
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r10
|
|
mov QWORD PTR [r8+16], r11
|
|
mov r10, r13
|
|
; A[14] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+32], r10
|
|
; Diagonal 4
|
|
mov r10, QWORD PTR [rbp+56]
|
|
mov r11, QWORD PTR [rbp+64]
|
|
mov r12, QWORD PTR [rbp+72]
|
|
; A[4] x A[3]
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[5] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+56], r10
|
|
mov QWORD PTR [rbp+64], r11
|
|
mov r10, QWORD PTR [rbp+80]
|
|
mov r11, QWORD PTR [rbp+88]
|
|
; A[6] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[7] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [rbp+72], r12
|
|
mov QWORD PTR [rbp+80], r10
|
|
; No load %r13 - %r10
|
|
; No load %r14 - %r8
|
|
; A[8] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r11, rax
|
|
adox r15, rcx
|
|
; A[9] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r11
|
|
; No store %r13
|
|
; No load %r15 - %r9
|
|
; No load %rbx - %r10
|
|
; A[10] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[11] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[12] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx rbx, rax
|
|
adox r10, rcx
|
|
; A[13] x A[3]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[13] x A[4]
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
mov r12, r13
|
|
; A[13] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, r13
|
|
; A[13] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+48], r10
|
|
; Diagonal 5
|
|
mov r10, QWORD PTR [rbp+72]
|
|
mov r11, QWORD PTR [rbp+80]
|
|
mov r12, QWORD PTR [rbp+88]
|
|
; A[5] x A[4]
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[6] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [rbp+72], r10
|
|
mov QWORD PTR [rbp+80], r11
|
|
; No load %r13 - %r8
|
|
; No load %r14 - %r9
|
|
; A[7] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r15, rcx
|
|
; A[8] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r12
|
|
; No store %r13
|
|
; No load %r15 - %r10
|
|
; No load %rbx - %r8
|
|
; A[9] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[10] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[11] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; A[12] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[12] x A[5]
|
|
mov rdx, QWORD PTR [r9+96]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[12] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov QWORD PTR [r8+16], r10
|
|
mov r12, QWORD PTR [r8+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
; A[12] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[12] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r11
|
|
mov QWORD PTR [r8+32], r12
|
|
mov r11, QWORD PTR [r8+48]
|
|
mov r12, r13
|
|
; A[12] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[12] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r10
|
|
mov QWORD PTR [r8+48], r11
|
|
mov r10, r13
|
|
; A[12] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+56], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+64], r10
|
|
; Diagonal 6
|
|
mov r10, QWORD PTR [rbp+88]
|
|
; No load %r13 - %r9
|
|
; No load %r14 - %r10
|
|
; A[6] x A[5]
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r15, rcx
|
|
; A[7] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
mov QWORD PTR [rbp+88], r10
|
|
; No store %r13
|
|
; No load %r15 - %r8
|
|
; No load %rbx - %r9
|
|
; A[8] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[9] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r12, QWORD PTR [r8]
|
|
mov r10, QWORD PTR [r8+8]
|
|
; A[10] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx rbx, rax
|
|
adox r12, rcx
|
|
; A[11] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r12
|
|
mov r11, QWORD PTR [r8+16]
|
|
mov r12, QWORD PTR [r8+24]
|
|
; A[11] x A[6]
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[11] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+8], r10
|
|
mov QWORD PTR [r8+16], r11
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[11] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[11] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+24], r12
|
|
mov QWORD PTR [r8+32], r10
|
|
mov r12, QWORD PTR [r8+48]
|
|
mov r10, QWORD PTR [r8+56]
|
|
; A[11] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[13] x A[9]
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+40], r11
|
|
mov QWORD PTR [r8+48], r12
|
|
mov r11, QWORD PTR [r8+64]
|
|
mov r12, r13
|
|
; A[13] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[13] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+56], r10
|
|
mov QWORD PTR [r8+64], r11
|
|
mov r10, r13
|
|
; A[13] x A[12]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+72], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+80], r10
|
|
; Diagonal 7
|
|
; No load %r14 - %r8
|
|
; No load %r15 - %r9
|
|
; No load %rbx - %r10
|
|
; A[7] x A[6]
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; A[8] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; No store %r14
|
|
; No store %r15
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[9] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx rbx, rax
|
|
adox r10, rcx
|
|
; A[10] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r10
|
|
mov r12, QWORD PTR [r8+16]
|
|
mov r10, QWORD PTR [r8+24]
|
|
; A[10] x A[7]
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[10] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+8], r11
|
|
mov QWORD PTR [r8+16], r12
|
|
mov r11, QWORD PTR [r8+32]
|
|
mov r12, QWORD PTR [r8+40]
|
|
; A[10] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[6]
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+24], r10
|
|
mov QWORD PTR [r8+32], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[14] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[14] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+40], r12
|
|
mov QWORD PTR [r8+48], r10
|
|
mov r12, QWORD PTR [r8+64]
|
|
mov r10, QWORD PTR [r8+72]
|
|
; A[14] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[14] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+56], r11
|
|
mov QWORD PTR [r8+64], r12
|
|
mov r11, QWORD PTR [r8+80]
|
|
mov r12, r13
|
|
; A[14] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[14] x A[12]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+72], r10
|
|
mov QWORD PTR [r8+80], r11
|
|
mov r10, r13
|
|
; A[14] x A[13]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+88], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+96], r10
|
|
; Diagonal 8
|
|
; No load %rbx - %r8
|
|
mov r11, QWORD PTR [r8]
|
|
mov r12, QWORD PTR [r8+8]
|
|
; A[8] x A[7]
|
|
mov rdx, QWORD PTR [r9+56]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx rbx, rax
|
|
adox r11, rcx
|
|
; A[9] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; No store %rbx
|
|
mov QWORD PTR [r8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[9] x A[8]
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[15] x A[3]
|
|
mov rdx, QWORD PTR [r9+120]
|
|
mulx rcx, rax, QWORD PTR [r9+24]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+8], r12
|
|
mov QWORD PTR [r8+16], r10
|
|
mov r12, QWORD PTR [r8+32]
|
|
mov r10, QWORD PTR [r8+40]
|
|
; A[15] x A[4]
|
|
mulx rcx, rax, QWORD PTR [r9+32]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] x A[5]
|
|
mulx rcx, rax, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+24], r11
|
|
mov QWORD PTR [r8+32], r12
|
|
mov r11, QWORD PTR [r8+48]
|
|
mov r12, QWORD PTR [r8+56]
|
|
; A[15] x A[6]
|
|
mulx rcx, rax, QWORD PTR [r9+48]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] x A[7]
|
|
mulx rcx, rax, QWORD PTR [r9+56]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+40], r10
|
|
mov QWORD PTR [r8+48], r11
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
; A[15] x A[8]
|
|
mulx rcx, rax, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
; A[15] x A[9]
|
|
mulx rcx, rax, QWORD PTR [r9+72]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
mov QWORD PTR [r8+56], r12
|
|
mov QWORD PTR [r8+64], r10
|
|
mov r12, QWORD PTR [r8+80]
|
|
mov r10, QWORD PTR [r8+88]
|
|
; A[15] x A[10]
|
|
mulx rcx, rax, QWORD PTR [r9+80]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
; A[15] x A[11]
|
|
mulx rcx, rax, QWORD PTR [r9+88]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+72], r11
|
|
mov QWORD PTR [r8+80], r12
|
|
mov r11, QWORD PTR [r8+96]
|
|
mov r12, r13
|
|
; A[15] x A[12]
|
|
mulx rcx, rax, QWORD PTR [r9+96]
|
|
adcx r10, rax
|
|
adox r11, rcx
|
|
; A[15] x A[13]
|
|
mulx rcx, rax, QWORD PTR [r9+104]
|
|
adcx r11, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r8+88], r10
|
|
mov QWORD PTR [r8+96], r11
|
|
mov r10, r13
|
|
; A[15] x A[14]
|
|
mulx rcx, rax, QWORD PTR [r9+112]
|
|
adcx r12, rax
|
|
adox r10, rcx
|
|
mov QWORD PTR [r8+104], r12
|
|
; Carry
|
|
adcx r10, r14
|
|
mov r14, r13
|
|
adcx r14, r13
|
|
adox r14, r13
|
|
mov QWORD PTR [r8+112], r10
|
|
mov QWORD PTR [r8+120], r14
|
|
; Double and Add in A[i] x A[i]
|
|
mov r11, QWORD PTR [rbp+8]
|
|
; A[0] x A[0]
|
|
mov rdx, QWORD PTR [r9]
|
|
mulx rcx, rax, rdx
|
|
mov QWORD PTR [rbp], rax
|
|
adox r11, r11
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+8], r11
|
|
mov r10, QWORD PTR [rbp+16]
|
|
mov r11, QWORD PTR [rbp+24]
|
|
; A[1] x A[1]
|
|
mov rdx, QWORD PTR [r9+8]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+16], r10
|
|
mov QWORD PTR [rbp+24], r11
|
|
mov r10, QWORD PTR [rbp+32]
|
|
mov r11, QWORD PTR [rbp+40]
|
|
; A[2] x A[2]
|
|
mov rdx, QWORD PTR [r9+16]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+32], r10
|
|
mov QWORD PTR [rbp+40], r11
|
|
mov r10, QWORD PTR [rbp+48]
|
|
mov r11, QWORD PTR [rbp+56]
|
|
; A[3] x A[3]
|
|
mov rdx, QWORD PTR [r9+24]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+48], r10
|
|
mov QWORD PTR [rbp+56], r11
|
|
mov r10, QWORD PTR [rbp+64]
|
|
mov r11, QWORD PTR [rbp+72]
|
|
; A[4] x A[4]
|
|
mov rdx, QWORD PTR [r9+32]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+64], r10
|
|
mov QWORD PTR [rbp+72], r11
|
|
mov r10, QWORD PTR [rbp+80]
|
|
mov r11, QWORD PTR [rbp+88]
|
|
; A[5] x A[5]
|
|
mov rdx, QWORD PTR [r9+40]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [rbp+80], r10
|
|
mov QWORD PTR [rbp+88], r11
|
|
; A[6] x A[6]
|
|
mov rdx, QWORD PTR [r9+48]
|
|
mulx rcx, rax, rdx
|
|
adox r15, r15
|
|
adox rdi, rdi
|
|
adcx r15, rax
|
|
adcx rdi, rcx
|
|
; A[7] x A[7]
|
|
mov rdx, QWORD PTR [r9+56]
|
|
mulx rcx, rax, rdx
|
|
adox rsi, rsi
|
|
adox rbx, rbx
|
|
adcx rsi, rax
|
|
adcx rbx, rcx
|
|
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
; A[8] x A[8]
|
|
mov rdx, QWORD PTR [r9+64]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8], r10
|
|
mov QWORD PTR [r8+8], r11
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
; A[9] x A[9]
|
|
mov rdx, QWORD PTR [r9+72]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+16], r10
|
|
mov QWORD PTR [r8+24], r11
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
; A[10] x A[10]
|
|
mov rdx, QWORD PTR [r9+80]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+32], r10
|
|
mov QWORD PTR [r8+40], r11
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
; A[11] x A[11]
|
|
mov rdx, QWORD PTR [r9+88]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+48], r10
|
|
mov QWORD PTR [r8+56], r11
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
; A[12] x A[12]
|
|
mov rdx, QWORD PTR [r9+96]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+64], r10
|
|
mov QWORD PTR [r8+72], r11
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
; A[13] x A[13]
|
|
mov rdx, QWORD PTR [r9+104]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+80], r10
|
|
mov QWORD PTR [r8+88], r11
|
|
mov r10, QWORD PTR [r8+96]
|
|
mov r11, QWORD PTR [r8+104]
|
|
; A[14] x A[14]
|
|
mov rdx, QWORD PTR [r9+112]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+96], r10
|
|
mov QWORD PTR [r8+104], r11
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r11, QWORD PTR [r8+120]
|
|
; A[15] x A[15]
|
|
mov rdx, QWORD PTR [r9+120]
|
|
mulx rcx, rax, rdx
|
|
adox r10, r10
|
|
adox r11, r11
|
|
adcx r10, rax
|
|
adcx r11, rcx
|
|
mov QWORD PTR [r8+112], r10
|
|
mov QWORD PTR [r8+120], r11
|
|
mov QWORD PTR [r8+-32], r15
|
|
mov QWORD PTR [r8+-24], rdi
|
|
mov QWORD PTR [r8+-16], rsi
|
|
mov QWORD PTR [r8+-8], rbx
|
|
sub r8, 128
|
|
cmp r9, r8
|
|
jne L_end_1024_sqr_avx2_16
|
|
vmovdqu xmm0, OWORD PTR [rbp]
|
|
vmovups OWORD PTR [r8], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+16]
|
|
vmovups OWORD PTR [r8+16], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+32]
|
|
vmovups OWORD PTR [r8+32], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+48]
|
|
vmovups OWORD PTR [r8+48], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+64]
|
|
vmovups OWORD PTR [r8+64], xmm0
|
|
vmovdqu xmm0, OWORD PTR [rbp+80]
|
|
vmovups OWORD PTR [r8+80], xmm0
|
|
L_end_1024_sqr_avx2_16:
|
|
add rsp, 128
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
ret
|
|
sp_1024_sqr_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Add b to a into r. (r = a + b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_add_16 PROC
        ; r = a + b over 16 limbs of 64 bits.
        ;   rcx -> r (written), rdx -> a (read), r8 -> b (read)
        ;   rax <- carry out of the top limb (0 or 1)
        ; Only volatile registers (rax, r9, r10) are used; no stack.
        xor     eax, eax                        ; rax = 0, done before the carry chain starts
        mov     r9, QWORD PTR [rdx]
        add     r9, QWORD PTR [r8]              ; limb 0 starts the CF chain

        ; Limbs 1..14, two per expansion. The sum of the previous limb is
        ; stored only after the next a-limb has been loaded; mov leaves CF alone.
        add16_off = 8
        REPEAT 7
        mov     r10, QWORD PTR [rdx+add16_off]
        mov     QWORD PTR [rcx+add16_off-8], r9
        adc     r10, QWORD PTR [r8+add16_off]
        mov     r9, QWORD PTR [rdx+add16_off+8]
        mov     QWORD PTR [rcx+add16_off], r10
        adc     r9, QWORD PTR [r8+add16_off+8]
        add16_off = add16_off + 16
        ENDM

        ; Limb 15 and the final carry.
        mov     r10, QWORD PTR [rdx+120]
        mov     QWORD PTR [rcx+112], r9
        adc     r10, QWORD PTR [r8+120]
        mov     QWORD PTR [rcx+120], r10
        adc     rax, 0                          ; rax = carry out
        ret
sp_1024_add_16 ENDP
|
|
_text ENDS
|
|
; /* Sub b from a into a. (a -= b)
|
|
; *
|
|
; * a A single precision integer and result.
|
|
; * b A single precision integer.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_sub_in_place_16 PROC
        ; a -= b over 16 limbs of 64 bits.
        ;   rcx -> a (read and written), rdx -> b (read)
        ;   rax <- 0 when there is no borrow out, all ones (-1) when there is
        ; Only volatile registers (rax, r8, r9) are used; no stack.
        xor     eax, eax                        ; rax = 0, done before the borrow chain starts
        mov     r8, QWORD PTR [rcx]
        sub     r8, QWORD PTR [rdx]             ; limb 0 starts the CF (borrow) chain

        ; Limbs 1..14, two per expansion; mov does not disturb CF.
        sub16_off = 8
        REPEAT 7
        mov     r9, QWORD PTR [rcx+sub16_off]
        mov     QWORD PTR [rcx+sub16_off-8], r8
        sbb     r9, QWORD PTR [rdx+sub16_off]
        mov     r8, QWORD PTR [rcx+sub16_off+8]
        mov     QWORD PTR [rcx+sub16_off], r9
        sbb     r8, QWORD PTR [rdx+sub16_off+8]
        sub16_off = sub16_off + 16
        ENDM

        ; Limb 15 and the final borrow.
        mov     r9, QWORD PTR [rcx+120]
        mov     QWORD PTR [rcx+112], r8
        sbb     r9, QWORD PTR [rdx+120]
        mov     QWORD PTR [rcx+120], r9
        sbb     rax, 0                          ; rax = 0 - borrow
        ret
sp_1024_sub_in_place_16 ENDP
|
|
_text ENDS
|
|
; /* Conditionally subtract b from a using the mask m.
|
|
; * m is -1 to subtract and 0 when not subtracting.
|
|
; *
|
|
; * r A single precision number representing the conditional subtract result.
|
|
; * a A single precision number to subtract from.
|
|
; * b A single precision number to subtract.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_cond_sub_16 PROC FRAME
        ; r = a - (b AND m) over 16 limbs of 64 bits.
        ;   rcx -> r (written), rdx -> a (read), r8 -> b (read), r9 = m
        ;   rax <- 0 when there is no borrow out, all ones (-1) when there is
        ; With m = -1 this is r = a - b; with m = 0 it is r = a.
        ;
        ; The function moves rsp, so it is declared FRAME and describes the
        ; allocation to the Windows x64 unwinder; without unwind data it would
        ; be treated as a leaf whose return address sits at [rsp].
        sub     rsp, 128
        .allocstack 128
        .endprolog

        xor     eax, eax                        ; rax = 0; flags are not live yet

        ; Pass 1: mask b into the 128-byte stack buffer. AND destroys CF,
        ; so the masking cannot be interleaved with the borrow chain below.
        csub16_off = 0
        REPEAT 8
        mov     r10, QWORD PTR [r8+csub16_off]
        mov     r11, QWORD PTR [r8+csub16_off+8]
        and     r10, r9
        and     r11, r9
        mov     QWORD PTR [rsp+csub16_off], r10
        mov     QWORD PTR [rsp+csub16_off+8], r11
        csub16_off = csub16_off + 16
        ENDM

        ; Pass 2: subtract the masked limbs. Each result limb is stored only
        ; after the next a-limb has been loaded; mov leaves CF untouched.
        mov     r10, QWORD PTR [rdx]
        sub     r10, QWORD PTR [rsp]
        csub16_off = 8
        REPEAT 7
        mov     r11, QWORD PTR [rdx+csub16_off]
        sbb     r11, QWORD PTR [rsp+csub16_off]
        mov     QWORD PTR [rcx+csub16_off-8], r10
        mov     r10, QWORD PTR [rdx+csub16_off+8]
        sbb     r10, QWORD PTR [rsp+csub16_off+8]
        mov     QWORD PTR [rcx+csub16_off], r11
        csub16_off = csub16_off + 16
        ENDM
        mov     r11, QWORD PTR [rdx+120]
        sbb     r11, QWORD PTR [rsp+120]
        mov     QWORD PTR [rcx+112], r10
        mov     QWORD PTR [rcx+120], r11

        sbb     rax, 0                          ; rax = 0 - borrow (taken before add rsp clobbers CF)
        add     rsp, 128
        ret
sp_1024_cond_sub_16 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Conditionally subtract b from a using the mask m.
|
|
; * m is -1 to subtract and 0 when not subtracting.
|
|
; *
|
|
; * r A single precision number representing the conditional subtract result.
|
|
; * a A single precision number to subtract from.
|
|
; * b A single precision number to subtract.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_cond_sub_avx2_16 PROC
        ; r = a - (b masked by m) over 16 limbs of 64 bits.
        ;   rcx -> r (written), rdx -> a (read), r8 -> b (read), r9 = m
        ;   rax <- 0 when there is no borrow out, all ones (-1) when there is
        ;
        ; PEXT does the masking: with m = -1 it returns the limb unchanged and
        ; with m = 0 it returns 0, and it leaves CF alone, so masking can sit
        ; inside the SBB chain. Other values of m do not act as a plain AND.
        ;
        ; rax is the scratch for the masked limb, and the borrow is turned
        ; into the return value at the very end. That keeps the routine to
        ; volatile registers only: no push/pop and no stack frame.
        mov     rax, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx]
        pext    rax, rax, r9
        sub     r10, rax                        ; limb 0 starts the borrow chain

        ; Limbs 1..14, two per expansion; results alternate between r11 and r10.
        csubx16_off = 8
        REPEAT 7
        mov     rax, QWORD PTR [r8+csubx16_off]
        mov     r11, QWORD PTR [rdx+csubx16_off]
        pext    rax, rax, r9
        mov     QWORD PTR [rcx+csubx16_off-8], r10
        sbb     r11, rax
        mov     rax, QWORD PTR [r8+csubx16_off+8]
        mov     r10, QWORD PTR [rdx+csubx16_off+8]
        pext    rax, rax, r9
        mov     QWORD PTR [rcx+csubx16_off], r11
        sbb     r10, rax
        csubx16_off = csubx16_off + 16
        ENDM

        ; Limb 15.
        mov     rax, QWORD PTR [r8+120]
        mov     r11, QWORD PTR [rdx+120]
        pext    rax, rax, r9
        mov     QWORD PTR [rcx+112], r10
        sbb     r11, rax
        mov     QWORD PTR [rcx+120], r11

        sbb     rax, rax                        ; rax = 0 - borrow: 0 or -1
        ret
sp_1024_cond_sub_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Mul a by digit b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision digit.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mul_d_16 PROC
        ; r = a * b, where a has 16 limbs of 64 bits and b is one 64-bit digit.
        ;   rcx -> r (17 limbs are written: offsets 0..128)
        ;   rdx -> a (read), r8 = b
        ;
        ; Each step computes rdx:rax = a[i] * b, adds the high word carried
        ; from the previous step into rax, and folds the carry into rdx. The
        ; high word of a 64x64 product is at most 2^64 - 2, so that fold
        ; cannot overflow and a single carry register (r10) is enough.
        ; Only volatile registers are used: no push/pop and no stack frame.
        mov     r9, rdx                         ; MUL overwrites rdx, keep a in r9

        ; A[0] * B
        mov     rax, r8
        mul     QWORD PTR [r9]
        mov     QWORD PTR [rcx], rax
        mov     r10, rdx                        ; r10 = word carried into limb 1

        ; A[1..15] * B
        muld16_off = 8
        REPEAT 15
        mov     rax, r8
        mul     QWORD PTR [r9+muld16_off]
        add     rax, r10
        adc     rdx, 0
        mov     QWORD PTR [rcx+muld16_off], rax
        mov     r10, rdx
        muld16_off = muld16_off + 8
        ENDM

        mov     QWORD PTR [rcx+128], r10        ; top (17th) limb
        ret
sp_1024_mul_d_16 ENDP
|
|
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Mul a by digit b into r. (r = a * b)
|
|
; *
|
|
; * r A single precision integer.
|
|
; * a A single precision integer.
|
|
; * b A single precision digit.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mul_d_avx2_16 PROC
        ; r = a * b, where a has 16 limbs of 64 bits and b is one 64-bit digit.
        ;   rcx -> r (17 limbs are written: offsets 0..128)
        ;   rdx -> a (read), r8 = b
        ;
        ; MULX takes its multiplicand from rdx and does not touch the flags,
        ; so one ADC chain can run across all the products:
        ;   r[i] = lo(a[i]*b) + hi(a[i-1]*b) + carry
        ; The high words alternate between r10 and r11; r9 holds the low word.
        ; Only volatile registers are used: no push/pop and no stack frame.
        mov     rax, rdx                        ; rax -> a (rdx is needed for MULX)
        mov     rdx, r8                         ; rdx = b

        ; A[0] * B
        mulx    r10, r9, QWORD PTR [rax]        ; r10:r9 = a[0] * b
        mov     QWORD PTR [rcx], r9

        ; A[1] * B starts the carry chain
        mulx    r11, r9, QWORD PTR [rax+8]
        add     r9, r10
        mov     QWORD PTR [rcx+8], r9

        ; A[2..15] * B, two limbs per expansion
        muldx16_off = 16
        REPEAT 7
        mulx    r10, r9, QWORD PTR [rax+muldx16_off]
        adc     r9, r11
        mov     QWORD PTR [rcx+muldx16_off], r9
        mulx    r11, r9, QWORD PTR [rax+muldx16_off+8]
        adc     r9, r10
        mov     QWORD PTR [rcx+muldx16_off+8], r9
        muldx16_off = muldx16_off + 16
        ENDM

        adc     r11, 0                          ; fold the last carry into the top word
        mov     QWORD PTR [rcx+128], r11
        ret
sp_1024_mul_d_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF _WIN64
|
|
; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
|
|
; *
|
|
; * d1 The high order half of the number to divide.
|
|
; * d0 The low order half of the number to divide.
|
|
; * div The divisor.
|
|
; * returns the result of the division.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
div_1024_word_asm_16 PROC
        ; Unsigned 128-by-64-bit division.
        ;   rcx = d1 (high half), rdx = d0 (low half), r8 = div
        ;   rax <- quotient of (d1:d0) / div; DIV also leaves the remainder in rdx
        ; NOTE(review): DIV faults when div is 0 or the quotient does not fit
        ; in 64 bits (d1 >= div); presumably callers guarantee d1 < div - confirm.
        mov     rax, rdx                        ; low half first, rdx is overwritten next
        mov     rdx, rcx                        ; high half
        div     r8
        ret
div_1024_word_asm_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
; /* Compare a with b in constant time.
|
|
; *
|
|
; * a A single precision integer.
|
|
; * b A single precision integer.
|
|
; * return -ve, 0 or +ve if a is less than, equal to or greater than b
|
|
; * respectively.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_cmp_16 PROC FRAME
        ; Compare the 16-limb numbers a and b without branching.
        ;   rcx -> a (read), rdx -> b (read)
        ;   rax <- 1 when a > b, 0 when a == b, -1 when a < b
        ;
        ; Register roles:
        ;   r8  = mask: -1 while every limb seen so far was equal, 0 afterwards
        ;   r9  = constant 0, r10 = constant 1
        ;   r11 = masked limb of a, r12 = masked limb of b
        ;
        ; r12 is callee-saved, so the procedure is declared FRAME and the push
        ; is described to the Windows x64 unwinder; without unwind data the
        ; function would be treated as a leaf with its return address at [rsp].
        push    r12
        .pushreg r12
        .endprolog

        xor     r9d, r9d
        mov     r8, -1
        mov     rax, -1
        mov     r10, 1

        ; Walk from the most significant limb (offset 120) down to limb 0.
        ; Once a difference has been found the mask is 0, both operands
        ; become 0, and none of the three CMOVs changes anything any more.
        cmp16_off = 120
        REPEAT 16
        mov     r11, QWORD PTR [rcx+cmp16_off]
        mov     r12, QWORD PTR [rdx+cmp16_off]
        and     r11, r8
        and     r12, r8
        sub     r11, r12
        cmova   rax, r10                        ; a limb above b limb -> 1
        cmovc   rax, r8                         ; a limb below b limb -> -1 (mask is still -1 here)
        cmovnz  r8, r9                          ; limbs differ -> stop comparing
        cmp16_off = cmp16_off - 8
        ENDM

        ; All limbs equal: rax = -1 and r8 = -1, so the XOR yields 0.
        ; Otherwise r8 = 0 and rax keeps its 1 / -1.
        xor     rax, r8
        pop     r12
        ret
sp_1024_cmp_16 ENDP
|
|
_text ENDS
|
|
; /* Conditionally copy a into r using the mask m.
|
|
; * m is -1 to copy and 0 when not.
|
|
; *
|
|
; * r A single precision number to copy over.
|
|
; * a A single precision number to copy.
|
|
; * m Mask value to apply.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_cond_copy_16 PROC
        ; Conditionally copy the 16-limb number a over r.
        ;   rcx -> r (read and written), rdx -> a (read), r8 = m
        ; For every limb: r ^= (r ^ a) & m, so m = -1 makes r equal to a and
        ; m = 0 leaves r unchanged. There is no data-dependent branch.
        ; Works on four limbs (32 bytes) per expansion using rax, r9, r10, r11.
        ccpy16_off = 0
        REPEAT 4
        mov     rax, QWORD PTR [rcx+ccpy16_off]
        mov     r9,  QWORD PTR [rcx+ccpy16_off+8]
        mov     r10, QWORD PTR [rcx+ccpy16_off+16]
        mov     r11, QWORD PTR [rcx+ccpy16_off+24]
        xor     rax, QWORD PTR [rdx+ccpy16_off]
        xor     r9,  QWORD PTR [rdx+ccpy16_off+8]
        xor     r10, QWORD PTR [rdx+ccpy16_off+16]
        xor     r11, QWORD PTR [rdx+ccpy16_off+24]
        and     rax, r8
        and     r9,  r8
        and     r10, r8
        and     r11, r8
        xor     QWORD PTR [rcx+ccpy16_off], rax
        xor     QWORD PTR [rcx+ccpy16_off+8], r9
        xor     QWORD PTR [rcx+ccpy16_off+16], r10
        xor     QWORD PTR [rcx+ccpy16_off+24], r11
        ccpy16_off = ccpy16_off + 32
        ENDM
        ret
sp_1024_cond_copy_16 ENDP
|
|
_text ENDS
|
|
; /* Reduce the number back to 1024 bits using Montgomery reduction.
;  *
;  * a   A single precision number to reduce in place (rcx).
;  * m   The single precision number representing the modulus (rdx).
;  * mp  The digit representing the negative inverse of m mod 2^n (r8).
;  *
;  * Register use inside the loop:
;  *   r9  = m, r10 = remaining iterations, r13 = mu,
;  *   r15 = a[i] (kept in a register, not stored until after the loop),
;  *   rdi = a[i+1] (likewise), r11/r12 = alternating carry words,
;  *   rsi = carry out of the top limb from the previous iteration.
;  * r12-r15, rdi and rsi are saved and restored; rax, rdx, r8-r11 and
;  * flags are clobbered.
;  *
;  * After 16 iterations rcx points at a[16]; the upper 16 limbs are then
;  * passed to sp_1024_cond_sub_16 together with m and a mask that is all
;  * ones when the loop carried out or the top limb is >= m[15].
;  */
_text SEGMENT READONLY PARA
sp_1024_mont_reduce_16 PROC
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi
        push    rsi
        mov     r9, rdx                         ; r9 = m (rdx is needed by mul)
        xor     rsi, rsi                        ; no carry yet
        ; i = 16
        mov     r10, 16
        mov     r15, QWORD PTR [rcx]
        mov     rdi, QWORD PTR [rcx+8]
L_1024_mont_loop_16:
        ; mu = a[i] * mp
        mov     r13, r15
        imul    r13, r8
        ; a[i+0] += m[0] * mu  (low word is discarded, only the carry is kept)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9]
        add     r15, rax
        adc     r12, rdx
        ; a[i+1] += m[1] * mu  (result becomes next iteration's a[i] in r15)
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+8]
        mov     r15, rdi
        add     r15, rax
        adc     r11, rdx
        add     r15, r12
        adc     r11, 0
        ; a[i+2] += m[2] * mu  (result becomes next iteration's a[i+1] in rdi)
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+16]
        mov     rdi, QWORD PTR [rcx+16]
        add     rdi, rax
        adc     r12, rdx
        add     rdi, r11
        adc     r12, 0
        ; a[i+3] .. a[i+14] += m[3] .. m[14] * mu, two limbs per expansion.
        ; The first limb of each pair takes its carry from r12 and leaves it
        ; in r11; the second takes it from r11 and leaves it in r12.
        FOR     ofs, <24,40,56,72,88,104>
        mov     rax, r13
        xor     r11, r11
        mul     QWORD PTR [r9+ofs]
        mov     r14, QWORD PTR [rcx+ofs]
        add     r14, rax
        adc     r11, rdx
        add     r14, r12
        mov     QWORD PTR [rcx+ofs], r14
        adc     r11, 0
        mov     rax, r13
        xor     r12, r12
        mul     QWORD PTR [r9+ofs+8]
        mov     r14, QWORD PTR [rcx+ofs+8]
        add     r14, rax
        adc     r12, rdx
        add     r14, r11
        mov     QWORD PTR [rcx+ofs+8], r14
        adc     r12, 0
        ENDM
        ; a[i+15] += m[15] * mu, folding in the carry saved in rsi and
        ; collecting the new carry out of a[i+16] back into rsi
        mov     rax, r13
        mul     QWORD PTR [r9+120]
        mov     r14, QWORD PTR [rcx+120]
        add     r12, rax
        adc     rdx, rsi
        mov     rsi, 0                          ; mov keeps CF for the adc below
        adc     rsi, 0
        add     r14, r12
        mov     QWORD PTR [rcx+120], r14
        adc     QWORD PTR [rcx+128], rdx
        adc     rsi, 0
        ; i -= 1
        add     rcx, 8
        dec     r10
        jnz     L_1024_mont_loop_16
        ; Flush the two register-held limbs and build the subtract mask:
        ; rsi = -(carry) | ~(-(a[31] < m[15]))
        mov     r14, QWORD PTR [rcx+120]
        mov     QWORD PTR [rcx], r15
        sub     r14, QWORD PTR [r9+120]
        mov     QWORD PTR [rcx+8], rdi
        sbb     r14, r14
        neg     rsi
        not     r14
        or      rsi, r14
        ; Arguments for sp_1024_cond_sub_16 in rcx, rdx, r8, r9
        ; (presumably r, a, b, mask - confirm against its definition).
        ; r8 must be loaded from r9 BEFORE r9 is overwritten with the mask;
        ; the former non-_WIN64 arm did it the other way round and passed the
        ; mask in both registers.
        mov     r8, r9                          ; m
        mov     r9, rsi                         ; mask
        mov     rdx, rcx                        ; upper half, a[16..31]
        sub     rcx, 128                        ; result goes to a[0..15]
        ; NOTE(review): rsp is not 16-byte aligned here and no shadow space
        ; is reserved; confirm the callee does not depend on either.
        call    sp_1024_cond_sub_16
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret
sp_1024_mont_reduce_16 ENDP
_text ENDS
|
|
; /* Add two Montgomery form numbers (r = a + b % m).
;  *
;  * r  Result of addition (rcx).
;  * a  First number to add in Montgomery form (rdx).
;  * b  Second number to add in Montgomery form (r8).
;  * m  Modulus (prime) (r9).
;  *
;  * Branch-free: the sum is written to r, then (m AND mask) is staged in
;  * 128 bytes of stack and subtracted from r. The mask is all ones when
;  * the addition carried out of limb 15 or when the top limb of the sum
;  * is >= m[15], otherwise zero.
;  * r12 and r13 are saved and restored; rax, r10, r11 and flags are
;  * clobbered.
;  */
_text SEGMENT READONLY PARA
sp_1024_mont_add_16 PROC
        push    r12
        push    r13
        sub     rsp, 128
        xor     r13, r13                        ; carry mask, filled in below
        ; limbs 0..3 open the carry chain
        mov     rax, QWORD PTR [rdx]
        mov     r10, QWORD PTR [rdx+8]
        mov     r11, QWORD PTR [rdx+16]
        mov     r12, QWORD PTR [rdx+24]
        add     rax, QWORD PTR [r8]
        adc     r10, QWORD PTR [r8+8]
        adc     r11, QWORD PTR [r8+16]
        adc     r12, QWORD PTR [r8+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], r11
        mov     QWORD PTR [rcx+24], r12
        ; limbs 4..15, four at a time, continuing the carry chain
        FOR     grp, <32,64,96>
        mov     rax, QWORD PTR [rdx+grp]
        mov     r10, QWORD PTR [rdx+grp+8]
        mov     r11, QWORD PTR [rdx+grp+16]
        mov     r12, QWORD PTR [rdx+grp+24]
        adc     rax, QWORD PTR [r8+grp]
        adc     r10, QWORD PTR [r8+grp+8]
        adc     r11, QWORD PTR [r8+grp+16]
        adc     r12, QWORD PTR [r8+grp+24]
        mov     QWORD PTR [rcx+grp], rax
        mov     QWORD PTR [rcx+grp+8], r10
        mov     QWORD PTR [rcx+grp+16], r11
        mov     QWORD PTR [rcx+grp+24], r12
        ENDM
        ; r13 = -(carry out) | ~(-(r[15] < m[15]))
        sbb     r13, 0
        sub     r12, QWORD PTR [r9+120]         ; r12 still holds r[15]
        sbb     r12, r12
        not     r12
        or      r13, r12
        ; stack[0..15] = m[0..15] AND mask
        FOR     pair, <0,16,32,48,64,80,96,112>
        mov     r11, QWORD PTR [r9+pair]
        mov     r12, QWORD PTR [r9+pair+8]
        and     r11, r13
        and     r12, r13
        mov     QWORD PTR [rsp+pair], r11
        mov     QWORD PTR [rsp+pair+8], r12
        ENDM
        ; r -= stack, limbs 0..1 open the borrow chain
        mov     rax, QWORD PTR [rcx]
        mov     r10, QWORD PTR [rcx+8]
        sub     rax, QWORD PTR [rsp]
        sbb     r10, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        FOR     pair, <16,32,48,64,80,96,112>
        mov     rax, QWORD PTR [rcx+pair]
        mov     r10, QWORD PTR [rcx+pair+8]
        sbb     rax, QWORD PTR [rsp+pair]
        sbb     r10, QWORD PTR [rsp+pair+8]
        mov     QWORD PTR [rcx+pair], rax
        mov     QWORD PTR [rcx+pair+8], r10
        ENDM
        add     rsp, 128
        pop     r13
        pop     r12
        ret
sp_1024_mont_add_16 ENDP
_text ENDS
|
|
; /* Double a Montgomery form number (r = a + a % m).
;  *
;  * r  Result of doubling (rcx).
;  * a  Number to double in Montgomery form (rdx).
;  * m  Modulus (prime) (r8).
;  *
;  * Branch-free: a + a is written to r, then (m AND mask) is staged in
;  * 128 bytes of stack and subtracted from r. The mask is all ones when
;  * the addition carried out of limb 15 or when the top limb of the sum
;  * is >= m[15], otherwise zero.
;  * r12 is saved and restored; rax, r9, r10, r11 and flags are clobbered.
;  */
_text SEGMENT READONLY PARA
sp_1024_mont_dbl_16 PROC
        push    r12
        sub     rsp, 128
        xor     r12, r12                        ; carry mask, filled in below
        ; limbs 0..3 open the carry chain
        mov     rax, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        add     rax, QWORD PTR [rdx]
        adc     r9, QWORD PTR [rdx+8]
        adc     r10, QWORD PTR [rdx+16]
        adc     r11, QWORD PTR [rdx+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        ; limbs 4..15, four at a time, continuing the carry chain
        FOR     grp, <32,64,96>
        mov     rax, QWORD PTR [rdx+grp]
        mov     r9, QWORD PTR [rdx+grp+8]
        mov     r10, QWORD PTR [rdx+grp+16]
        mov     r11, QWORD PTR [rdx+grp+24]
        adc     rax, QWORD PTR [rdx+grp]
        adc     r9, QWORD PTR [rdx+grp+8]
        adc     r10, QWORD PTR [rdx+grp+16]
        adc     r11, QWORD PTR [rdx+grp+24]
        mov     QWORD PTR [rcx+grp], rax
        mov     QWORD PTR [rcx+grp+8], r9
        mov     QWORD PTR [rcx+grp+16], r10
        mov     QWORD PTR [rcx+grp+24], r11
        ENDM
        ; r12 = -(carry out) | ~(-(r[15] < m[15]))
        sbb     r12, 0
        sub     r11, QWORD PTR [r8+120]         ; r11 still holds r[15]
        sbb     r11, r11
        not     r11
        or      r12, r11
        ; stack[0..15] = m[0..15] AND mask
        FOR     pair, <0,16,32,48,64,80,96,112>
        mov     r10, QWORD PTR [r8+pair]
        mov     r11, QWORD PTR [r8+pair+8]
        and     r10, r12
        and     r11, r12
        mov     QWORD PTR [rsp+pair], r10
        mov     QWORD PTR [rsp+pair+8], r11
        ENDM
        ; r -= stack, limbs 0..1 open the borrow chain
        mov     rax, QWORD PTR [rcx]
        mov     r9, QWORD PTR [rcx+8]
        sub     rax, QWORD PTR [rsp]
        sbb     r9, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        FOR     pair, <16,32,48,64,80,96,112>
        mov     rax, QWORD PTR [rcx+pair]
        mov     r9, QWORD PTR [rcx+pair+8]
        sbb     rax, QWORD PTR [rsp+pair]
        sbb     r9, QWORD PTR [rsp+pair+8]
        mov     QWORD PTR [rcx+pair], rax
        mov     QWORD PTR [rcx+pair+8], r9
        ENDM
        add     rsp, 128
        pop     r12
        ret
sp_1024_mont_dbl_16 ENDP
_text ENDS
|
|
; /* Triple a Montgomery form number (r = a + a + a % m).
;  *
;  * r  Result of tripling (rcx).
;  * a  Number to triple in Montgomery form (rdx).
;  * m  Modulus (prime) (r8).
;  *
;  * Two passes, each followed by the same branch-free correction:
;  *   1. r = a + a, then r -= (m AND mask)
;  *   2. r = r + a, then r -= (m AND mask)
;  * In each pass the mask is all ones when the addition carried out of
;  * limb 15 or when the top limb of the sum is >= m[15], otherwise zero.
;  * (m AND mask) is staged in 128 bytes of stack.
;  * r12 is saved and restored; rax, r9, r10, r11 and flags are clobbered.
;  */
_text SEGMENT READONLY PARA
sp_1024_mont_tpl_16 PROC
        push    r12
        sub     rsp, 128
        ; ---- pass 1: r = a + a ----
        xor     r12, r12                        ; carry mask, filled in below
        mov     rax, QWORD PTR [rdx]
        mov     r9, QWORD PTR [rdx+8]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        add     rax, QWORD PTR [rdx]
        adc     r9, QWORD PTR [rdx+8]
        adc     r10, QWORD PTR [rdx+16]
        adc     r11, QWORD PTR [rdx+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        FOR     grp, <32,64,96>
        mov     rax, QWORD PTR [rdx+grp]
        mov     r9, QWORD PTR [rdx+grp+8]
        mov     r10, QWORD PTR [rdx+grp+16]
        mov     r11, QWORD PTR [rdx+grp+24]
        adc     rax, QWORD PTR [rdx+grp]
        adc     r9, QWORD PTR [rdx+grp+8]
        adc     r10, QWORD PTR [rdx+grp+16]
        adc     r11, QWORD PTR [rdx+grp+24]
        mov     QWORD PTR [rcx+grp], rax
        mov     QWORD PTR [rcx+grp+8], r9
        mov     QWORD PTR [rcx+grp+16], r10
        mov     QWORD PTR [rcx+grp+24], r11
        ENDM
        ; r12 = -(carry out) | ~(-(r[15] < m[15]))
        sbb     r12, 0
        sub     r11, QWORD PTR [r8+120]         ; r11 still holds r[15]
        sbb     r11, r11
        not     r11
        or      r12, r11
        ; stack[0..15] = m[0..15] AND mask
        FOR     pair, <0,16,32,48,64,80,96,112>
        mov     r10, QWORD PTR [r8+pair]
        mov     r11, QWORD PTR [r8+pair+8]
        and     r10, r12
        and     r11, r12
        mov     QWORD PTR [rsp+pair], r10
        mov     QWORD PTR [rsp+pair+8], r11
        ENDM
        ; r -= stack
        mov     rax, QWORD PTR [rcx]
        mov     r9, QWORD PTR [rcx+8]
        sub     rax, QWORD PTR [rsp]
        sbb     r9, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        FOR     pair, <16,32,48,64,80,96,112>
        mov     rax, QWORD PTR [rcx+pair]
        mov     r9, QWORD PTR [rcx+pair+8]
        sbb     rax, QWORD PTR [rsp+pair]
        sbb     r9, QWORD PTR [rsp+pair+8]
        mov     QWORD PTR [rcx+pair], rax
        mov     QWORD PTR [rcx+pair+8], r9
        ENDM
        ; ---- pass 2: r = r + a ----
        mov     rax, QWORD PTR [rcx]
        mov     r9, QWORD PTR [rcx+8]
        mov     r10, QWORD PTR [rcx+16]
        mov     r11, QWORD PTR [rcx+24]
        xor     r12, r12                        ; pass-1 mask no longer needed
        add     rax, QWORD PTR [rdx]
        adc     r9, QWORD PTR [rdx+8]
        adc     r10, QWORD PTR [rdx+16]
        adc     r11, QWORD PTR [rdx+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        FOR     grp, <32,64,96>
        mov     rax, QWORD PTR [rcx+grp]
        mov     r9, QWORD PTR [rcx+grp+8]
        mov     r10, QWORD PTR [rcx+grp+16]
        mov     r11, QWORD PTR [rcx+grp+24]
        adc     rax, QWORD PTR [rdx+grp]
        adc     r9, QWORD PTR [rdx+grp+8]
        adc     r10, QWORD PTR [rdx+grp+16]
        adc     r11, QWORD PTR [rdx+grp+24]
        mov     QWORD PTR [rcx+grp], rax
        mov     QWORD PTR [rcx+grp+8], r9
        mov     QWORD PTR [rcx+grp+16], r10
        mov     QWORD PTR [rcx+grp+24], r11
        ENDM
        ; r12 = -(carry out) | ~(-(r[15] < m[15]))
        sbb     r12, 0
        sub     r11, QWORD PTR [r8+120]
        sbb     r11, r11
        not     r11
        or      r12, r11
        ; stack[0..15] = m[0..15] AND mask
        FOR     pair, <0,16,32,48,64,80,96,112>
        mov     r10, QWORD PTR [r8+pair]
        mov     r11, QWORD PTR [r8+pair+8]
        and     r10, r12
        and     r11, r12
        mov     QWORD PTR [rsp+pair], r10
        mov     QWORD PTR [rsp+pair+8], r11
        ENDM
        ; r -= stack
        mov     rax, QWORD PTR [rcx]
        mov     r9, QWORD PTR [rcx+8]
        sub     rax, QWORD PTR [rsp]
        sbb     r9, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r9
        FOR     pair, <16,32,48,64,80,96,112>
        mov     rax, QWORD PTR [rcx+pair]
        mov     r9, QWORD PTR [rcx+pair+8]
        sbb     rax, QWORD PTR [rsp+pair]
        sbb     r9, QWORD PTR [rsp+pair+8]
        mov     QWORD PTR [rcx+pair], rax
        mov     QWORD PTR [rcx+pair+8], r9
        ENDM
        add     rsp, 128
        pop     r12
        ret
sp_1024_mont_tpl_16 ENDP
_text ENDS
|
|
; /* Subtract two Montgomery form numbers (r = a - b % m).
;  *
;  * r  Result of subtraction (rcx).
;  * a  Number to subtract from in Montgomery form (rdx).
;  * b  Number to subtract in Montgomery form (r8).
;  * m  Modulus (prime) (r9).
;  *
;  * Branch-free: a - b is written to r, then (m AND mask) is staged in
;  * 128 bytes of stack and added back to r. The mask is all ones when
;  * the subtraction borrowed out of limb 15, otherwise zero.
;  * r12 and r13 are saved and restored; rax, r10, r11 and flags are
;  * clobbered.
;  */
_text SEGMENT READONLY PARA
sp_1024_mont_sub_16 PROC
        push    r12
        push    r13
        sub     rsp, 128
        xor     r13, r13                        ; borrow mask, filled in below
        ; limbs 0..3 open the borrow chain
        mov     rax, QWORD PTR [rdx]
        mov     r10, QWORD PTR [rdx+8]
        mov     r11, QWORD PTR [rdx+16]
        mov     r12, QWORD PTR [rdx+24]
        sub     rax, QWORD PTR [r8]
        sbb     r10, QWORD PTR [r8+8]
        sbb     r11, QWORD PTR [r8+16]
        sbb     r12, QWORD PTR [r8+24]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     QWORD PTR [rcx+16], r11
        mov     QWORD PTR [rcx+24], r12
        ; limbs 4..15, four at a time, continuing the borrow chain
        FOR     grp, <32,64,96>
        mov     rax, QWORD PTR [rdx+grp]
        mov     r10, QWORD PTR [rdx+grp+8]
        mov     r11, QWORD PTR [rdx+grp+16]
        mov     r12, QWORD PTR [rdx+grp+24]
        sbb     rax, QWORD PTR [r8+grp]
        sbb     r10, QWORD PTR [r8+grp+8]
        sbb     r11, QWORD PTR [r8+grp+16]
        sbb     r12, QWORD PTR [r8+grp+24]
        mov     QWORD PTR [rcx+grp], rax
        mov     QWORD PTR [rcx+grp+8], r10
        mov     QWORD PTR [rcx+grp+16], r11
        mov     QWORD PTR [rcx+grp+24], r12
        ENDM
        sbb     r13, 0                          ; r13 = -(borrow out)
        ; stack[0..15] = m[0..15] AND mask
        FOR     pair, <0,16,32,48,64,80,96,112>
        mov     r11, QWORD PTR [r9+pair]
        mov     r12, QWORD PTR [r9+pair+8]
        and     r11, r13
        and     r12, r13
        mov     QWORD PTR [rsp+pair], r11
        mov     QWORD PTR [rsp+pair+8], r12
        ENDM
        ; r += stack, limbs 0..1 open the carry chain
        mov     rax, QWORD PTR [rcx]
        mov     r10, QWORD PTR [rcx+8]
        add     rax, QWORD PTR [rsp]
        adc     r10, QWORD PTR [rsp+8]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        FOR     pair, <16,32,48,64,80,96,112>
        mov     rax, QWORD PTR [rcx+pair]
        mov     r10, QWORD PTR [rcx+pair+8]
        adc     rax, QWORD PTR [rsp+pair]
        adc     r10, QWORD PTR [rsp+pair+8]
        mov     QWORD PTR [rcx+pair], rax
        mov     QWORD PTR [rcx+pair+8], r10
        ENDM
        add     rsp, 128
        pop     r13
        pop     r12
        ret
sp_1024_mont_sub_16 ENDP
_text ENDS
|
|
; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
;  *
;  * r  Result of division by 2 (rcx).
;  * a  Number to divide (rdx).
;  * m  Modulus (prime) (r8).
;  *
;  * Branch-free: a mask built from bit 0 of a[0] selects m (a odd) or
;  * zero (a even); a plus the masked modulus is accumulated in 128 bytes
;  * of stack with the carry out kept in r12, and the 1025-bit value is
;  * then shifted right by one bit into r.
;  * r12 and r13 are saved and restored; rax, r9, r10 and flags are
;  * clobbered.
;  */
_text SEGMENT READONLY PARA
sp_1024_div2_16 PROC
        push    r12
        push    r13
        sub     rsp, 128
        mov     rax, QWORD PTR [rdx]            ; rax = a[0], reused below
        xor     r12, r12                        ; will receive the carry out
        mov     r13, rax
        and     r13, 1
        neg     r13                             ; r13 = all ones when a is odd
        ; stack[0..15] = m[0..15] AND mask
        FOR     limb, <0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120>
        mov     r10, QWORD PTR [r8+limb]
        and     r10, r13
        mov     QWORD PTR [rsp+limb], r10
        ENDM
        ; stack += a, carry out collected in r12
        add     QWORD PTR [rsp], rax
        FOR     limb, <8,16,24,32,40,48,56,64,72,80,88,96,104,112,120>
        mov     rax, QWORD PTR [rdx+limb]
        adc     QWORD PTR [rsp+limb], rax
        ENDM
        adc     r12, 0
        ; r = (r12:stack) >> 1; rax and r9 alternate as current/next limb
        mov     rax, QWORD PTR [rsp]
        mov     r9, QWORD PTR [rsp+8]
        shrd    rax, r9, 1
        mov     QWORD PTR [rcx], rax
        FOR     pair, <16,32,48,64,80,96,112>
        mov     rax, QWORD PTR [rsp+pair]
        shrd    r9, rax, 1
        mov     QWORD PTR [rcx+pair-8], r9
        mov     r9, QWORD PTR [rsp+pair+8]
        shrd    rax, r9, 1
        mov     QWORD PTR [rcx+pair], rax
        ENDM
        shrd    r9, r12, 1                      ; top limb takes the carry bit
        mov     QWORD PTR [rcx+120], r9
        add     rsp, 128
        pop     r13
        pop     r12
        ret
sp_1024_div2_16 ENDP
_text ENDS
|
|
; /* Sub b from a into r. (r = a - b)
;  *
;  * r  A single precision integer, receives the difference (rcx).
;  * a  A single precision integer (rdx).
;  * b  A single precision integer (r8).
;  *
;  * Returns in rax zero when there was no borrow out of limb 15 and
;  * all ones when there was.
;  * Leaf routine: no stack use; r9, r10 and flags are clobbered.
;  */
_text SEGMENT READONLY PARA
sp_1024_sub_16 PROC
        xor     eax, eax                        ; rax = 0, becomes -borrow
        ; limbs 0..1 open the borrow chain; r9/r10 alternate so the store
        ; of one limb overlaps the subtract of the next
        mov     r9, QWORD PTR [rdx]
        sub     r9, QWORD PTR [r8]
        mov     r10, QWORD PTR [rdx+8]
        mov     QWORD PTR [rcx], r9
        sbb     r10, QWORD PTR [r8+8]
        ; limbs 2..15, two per expansion (mov does not disturb CF)
        FOR     pair, <16,32,48,64,80,96,112>
        mov     r9, QWORD PTR [rdx+pair]
        mov     QWORD PTR [rcx+pair-8], r10
        sbb     r9, QWORD PTR [r8+pair]
        mov     r10, QWORD PTR [rdx+pair+8]
        mov     QWORD PTR [rcx+pair], r9
        sbb     r10, QWORD PTR [r8+pair+8]
        ENDM
        mov     QWORD PTR [rcx+120], r10
        sbb     rax, 0                          ; 0 - borrow
        ret
sp_1024_sub_16 ENDP
_text ENDS
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Reduce the number back to 1024 bits using Montgomery reduction.
|
|
; *
|
|
; * a A single precision number to reduce in place.
|
|
; * m The single precision number representing the modulus.
|
|
; * mp The digit representing the negative inverse of m mod 2^n.
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mont_reduce_avx2_16 PROC
; Register use, as read from the code below:
;   rcx = a on entry (copied to r9), rdx = m on entry (copied to r10),
;   r8  = mp (multiplied with the current low word to form mu).
;   r15, rdi, rsi, rbx = the four lowest live words of a, kept in registers.
;   r12, r13 = alternating scratch words for the rest of the row.
;   r14 = zero inside the loop, rbp = carry handed from one row to the next.
;   r11 = remaining row count; each loop pass handles two rows.
; Uses mulx/adcx/adox/pext. The result is written to a[0..15].
|
|
; Preserve every non-volatile register that is modified below.
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
push rdi
|
|
push rsi
|
|
push rbx
|
|
push rbp
|
|
; rcx and rdx are needed for mulx (rdx is its implicit source, rcx receives
; the high half), so the pointers move to r9 and r10.
mov r9, rcx
|
|
mov r10, rdx
|
|
xor rbp, rbp
|
|
; i = 16
|
|
mov r11, 16
|
|
; Load a[0..3] into the register window.
mov r15, QWORD PTR [r9]
|
|
mov rdi, QWORD PTR [r9+8]
|
|
mov rsi, QWORD PTR [r9+16]
|
|
mov rbx, QWORD PTR [r9+24]
|
|
; Bias the pointer by 8 words: the loop addresses a[] with displacements
; from -32 to +72 relative to r9.
add r9, 64
|
|
; rbp was already zeroed above and has not been written since; this second
; xor leaves it at 0.
xor rbp, rbp
|
|
L_1024_mont_loop_avx2_16:
|
|
; mu = a[i] * mp
|
|
mov rdx, r15
|
|
mov r12, r15
|
|
imul rdx, r8
|
|
; r14 = 0, and CF and OF are cleared so both carry chains start empty.
; Low product halves are added with adcx (CF chain), high halves with
; adox (OF chain).
xor r14, r14
|
|
; a[i+0] += m[0] * mu
|
|
mulx rcx, rax, QWORD PTR [r10]
|
|
mov r15, rdi
|
|
; The sum in r12 is never stored; only the carry it produces is used.
adcx r12, rax
|
|
adox r15, rcx
|
|
; a[i+1] += m[1] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+8]
|
|
mov rdi, rsi
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
; a[i+2] += m[2] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+16]
|
|
mov rsi, rbx
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; a[i+3] += m[3] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+24]
|
|
mov rbx, QWORD PTR [r9+-32]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; a[i+4] += m[4] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+32]
|
|
mov r13, QWORD PTR [r9+-24]
|
|
adcx rbx, rax
|
|
adox r13, rcx
|
|
; a[i+5] += m[5] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+40]
|
|
mov r12, QWORD PTR [r9+-16]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-24], r13
|
|
; a[i+6] += m[6] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+48]
|
|
mov r13, QWORD PTR [r9+-8]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-16], r12
|
|
; a[i+7] += m[7] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+56]
|
|
mov r12, QWORD PTR [r9]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-8], r13
|
|
; a[i+8] += m[8] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+64]
|
|
mov r13, QWORD PTR [r9+8]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9], r12
|
|
; a[i+9] += m[9] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+72]
|
|
mov r12, QWORD PTR [r9+16]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+8], r13
|
|
; a[i+10] += m[10] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+80]
|
|
mov r13, QWORD PTR [r9+24]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+16], r12
|
|
; a[i+11] += m[11] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+88]
|
|
mov r12, QWORD PTR [r9+32]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+24], r13
|
|
; a[i+12] += m[12] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+96]
|
|
mov r13, QWORD PTR [r9+40]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+32], r12
|
|
; a[i+13] += m[13] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+104]
|
|
mov r12, QWORD PTR [r9+48]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+40], r13
|
|
; a[i+14] += m[14] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+112]
|
|
mov r13, QWORD PTR [r9+56]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+48], r12
|
|
; a[i+15] += m[15] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+120]
|
|
mov r12, QWORD PTR [r9+64]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+56], r13
|
|
; Add the carry left by the previous row (plus CF) into a[i+16], then
; rebuild rbp = OF + CF as the carry for the next row (r14 is zero).
adcx r12, rbp
|
|
mov rbp, r14
|
|
mov QWORD PTR [r9+64], r12
|
|
adox rbp, r14
|
|
adcx rbp, r14
|
|
; Second row of this pass: same sequence, with every displacement 8 bytes
; higher because r9 is only advanced once per pass.
; mu = a[i] * mp
|
|
mov rdx, r15
|
|
mov r12, r15
|
|
imul rdx, r8
|
|
xor r14, r14
|
|
; a[i+0] += m[0] * mu
|
|
mulx rcx, rax, QWORD PTR [r10]
|
|
mov r15, rdi
|
|
adcx r12, rax
|
|
adox r15, rcx
|
|
; a[i+1] += m[1] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+8]
|
|
mov rdi, rsi
|
|
adcx r15, rax
|
|
adox rdi, rcx
|
|
; a[i+2] += m[2] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+16]
|
|
mov rsi, rbx
|
|
adcx rdi, rax
|
|
adox rsi, rcx
|
|
; a[i+3] += m[3] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+24]
|
|
mov rbx, QWORD PTR [r9+-24]
|
|
adcx rsi, rax
|
|
adox rbx, rcx
|
|
; a[i+4] += m[4] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+32]
|
|
mov r13, QWORD PTR [r9+-16]
|
|
adcx rbx, rax
|
|
adox r13, rcx
|
|
; a[i+5] += m[5] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+40]
|
|
mov r12, QWORD PTR [r9+-8]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+-16], r13
|
|
; a[i+6] += m[6] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+48]
|
|
mov r13, QWORD PTR [r9]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+-8], r12
|
|
; a[i+7] += m[7] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+56]
|
|
mov r12, QWORD PTR [r9+8]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9], r13
|
|
; a[i+8] += m[8] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+64]
|
|
mov r13, QWORD PTR [r9+16]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+8], r12
|
|
; a[i+9] += m[9] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+72]
|
|
mov r12, QWORD PTR [r9+24]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+16], r13
|
|
; a[i+10] += m[10] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+80]
|
|
mov r13, QWORD PTR [r9+32]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+24], r12
|
|
; a[i+11] += m[11] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+88]
|
|
mov r12, QWORD PTR [r9+40]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+32], r13
|
|
; a[i+12] += m[12] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+96]
|
|
mov r13, QWORD PTR [r9+48]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+40], r12
|
|
; a[i+13] += m[13] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+104]
|
|
mov r12, QWORD PTR [r9+56]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+48], r13
|
|
; a[i+14] += m[14] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+112]
|
|
mov r13, QWORD PTR [r9+64]
|
|
adcx r12, rax
|
|
adox r13, rcx
|
|
mov QWORD PTR [r9+56], r12
|
|
; a[i+15] += m[15] * mu
|
|
mulx rcx, rax, QWORD PTR [r10+120]
|
|
mov r12, QWORD PTR [r9+72]
|
|
adcx r13, rax
|
|
adox r12, rcx
|
|
mov QWORD PTR [r9+64], r13
|
|
; Same carry hand-over as at the end of the first row.
adcx r12, rbp
|
|
mov rbp, r14
|
|
mov QWORD PTR [r9+72], r12
|
|
adox rbp, r14
|
|
adcx rbp, r14
|
|
; a += 2
|
|
add r9, 16
|
|
; i -= 2
|
|
sub r11, 2
|
|
jnz L_1024_mont_loop_avx2_16
|
|
; Undo the bias: after 8 passes r9 is 192 bytes past the original pointer,
; so this leaves r9 = &a[16], the start of the upper half.
sub r9, 64
|
|
; Build the subtraction mask in rbp. r12 still holds the last word stored
; by the loop (the top word of a). sub/sbb/not turn it into all ones when
; that word is >= m[15] (no borrow) and 0 otherwise; neg turns the final
; row carry into 0 or all ones; the two are OR-ed together.
; NOTE(review): this assumes the carry in rbp is 0 or 1, and it compares
; only the most significant word against m - confirm both are intended.
sub r12, QWORD PTR [r10+120]
|
|
; r8 (mp) is no longer needed; reuse it as the read pointer to a[16..31].
mov r8, r9
|
|
sbb r12, r12
|
|
neg rbp
|
|
not r12
|
|
or rbp, r12
|
|
; r9 = &a[0], the write pointer for the reduced result.
sub r9, 128
|
|
; a[0..15] = a[16..31] - (m & mask). pext with an all-ones mask returns its
; source unchanged and with a zero mask returns 0; pext and mov leave CF
; untouched, so the borrow runs through all 16 words. The first four
; source words come from the register window (r15, rdi, rsi, rbx), the
; rest from memory through r8.
mov rcx, QWORD PTR [r10]
|
|
mov rdx, r15
|
|
pext rcx, rcx, rbp
|
|
sub rdx, rcx
|
|
mov rcx, QWORD PTR [r10+8]
|
|
mov rax, rdi
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+16]
|
|
mov rcx, rsi
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+8], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+24]
|
|
mov rdx, rbx
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+16], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+32]
|
|
mov rax, QWORD PTR [r8+32]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+24], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+40]
|
|
mov rcx, QWORD PTR [r8+40]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+32], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+48]
|
|
mov rdx, QWORD PTR [r8+48]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+40], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+56]
|
|
mov rax, QWORD PTR [r8+56]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+48], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+64]
|
|
mov rcx, QWORD PTR [r8+64]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+56], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+72]
|
|
mov rdx, QWORD PTR [r8+72]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+64], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+80]
|
|
mov rax, QWORD PTR [r8+80]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+72], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+88]
|
|
mov rcx, QWORD PTR [r8+88]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+80], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+96]
|
|
mov rdx, QWORD PTR [r8+96]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+88], rcx
|
|
sbb rdx, rax
|
|
mov rcx, QWORD PTR [r10+104]
|
|
mov rax, QWORD PTR [r8+104]
|
|
pext rcx, rcx, rbp
|
|
mov QWORD PTR [r9+96], rdx
|
|
sbb rax, rcx
|
|
mov rdx, QWORD PTR [r10+112]
|
|
mov rcx, QWORD PTR [r8+112]
|
|
pext rdx, rdx, rbp
|
|
mov QWORD PTR [r9+104], rax
|
|
sbb rcx, rdx
|
|
mov rax, QWORD PTR [r10+120]
|
|
mov rdx, QWORD PTR [r8+120]
|
|
pext rax, rax, rbp
|
|
mov QWORD PTR [r9+112], rcx
|
|
sbb rdx, rax
|
|
mov QWORD PTR [r9+120], rdx
|
|
; Restore the saved registers in reverse push order.
pop rbp
|
|
pop rbx
|
|
pop rsi
|
|
pop rdi
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_1024_mont_reduce_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Add two Montgomery form numbers (r = a + b % m).
|
|
; *
|
|
; * r Result of addition.
|
|
; * a First number to add in Montgomery form.
|
|
; * b Second number to add in Montgomery form.
|
|
; * m Modulus (prime).
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mont_add_avx2_16 PROC
; Register use, as read from the code below:
;   rcx = r (written, then re-read for the correction), rdx = a, r8 = b,
;   r9  = m. All operands are 16 x 64-bit words.
;   rax, r10, r11, r12 = four-word working set; r13 = subtraction mask.
; Step 1 stores the raw sum a + b into r; step 2 subtracts (m & mask).
|
|
push r12
|
|
push r13
|
|
mov rax, QWORD PTR [rdx]
|
|
mov r10, QWORD PTR [rdx+8]
|
|
mov r11, QWORD PTR [rdx+16]
|
|
mov r12, QWORD PTR [rdx+24]
|
|
add rax, QWORD PTR [r8]
|
|
; mov rather than xor: it leaves CF from the add intact for the adc chain.
mov r13, 0
|
|
adc r10, QWORD PTR [r8+8]
|
|
adc r11, QWORD PTR [r8+16]
|
|
adc r12, QWORD PTR [r8+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r10
|
|
mov QWORD PTR [rcx+16], r11
|
|
mov QWORD PTR [rcx+24], r12
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov r11, QWORD PTR [rdx+48]
|
|
mov r12, QWORD PTR [rdx+56]
|
|
adc rax, QWORD PTR [r8+32]
|
|
adc r10, QWORD PTR [r8+40]
|
|
adc r11, QWORD PTR [r8+48]
|
|
adc r12, QWORD PTR [r8+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r10
|
|
mov QWORD PTR [rcx+48], r11
|
|
mov QWORD PTR [rcx+56], r12
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov r10, QWORD PTR [rdx+72]
|
|
mov r11, QWORD PTR [rdx+80]
|
|
mov r12, QWORD PTR [rdx+88]
|
|
adc rax, QWORD PTR [r8+64]
|
|
adc r10, QWORD PTR [r8+72]
|
|
adc r11, QWORD PTR [r8+80]
|
|
adc r12, QWORD PTR [r8+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r10
|
|
mov QWORD PTR [rcx+80], r11
|
|
mov QWORD PTR [rcx+88], r12
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov r10, QWORD PTR [rdx+104]
|
|
mov r11, QWORD PTR [rdx+112]
|
|
mov r12, QWORD PTR [rdx+120]
|
|
adc rax, QWORD PTR [r8+96]
|
|
adc r10, QWORD PTR [r8+104]
|
|
adc r11, QWORD PTR [r8+112]
|
|
adc r12, QWORD PTR [r8+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r10
|
|
mov QWORD PTR [rcx+112], r11
|
|
mov QWORD PTR [rcx+120], r12
|
|
; r13 = 0 - CF: all ones if the 16-word sum carried out, else 0.
sbb r13, 0
|
|
; r12 still holds the top word of the sum. sub/sbb/not yield all ones when
; it is >= m[15] (no borrow) and 0 otherwise; OR that into the mask.
; NOTE(review): only the most significant word is compared against m, not
; the whole value - confirm this is sufficient for the modulus in use.
sub r12, QWORD PTR [r9+120]
|
|
sbb r12, r12
|
|
not r12
|
|
or r13, r12
|
|
; r -= m & mask, two words per group. pext with an all-ones mask returns
; its source unchanged and with a zero mask returns 0; pext and mov do not
; touch CF, so the borrow started by the sub runs through every sbb.
mov r11, QWORD PTR [r9]
|
|
mov r12, QWORD PTR [r9+8]
|
|
mov rax, QWORD PTR [rcx]
|
|
mov r10, QWORD PTR [rcx+8]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sub rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r10
|
|
mov r11, QWORD PTR [r9+16]
|
|
mov r12, QWORD PTR [r9+24]
|
|
mov rax, QWORD PTR [rcx+16]
|
|
mov r10, QWORD PTR [rcx+24]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sbb rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx+16], rax
|
|
mov QWORD PTR [rcx+24], r10
|
|
mov r11, QWORD PTR [r9+32]
|
|
mov r12, QWORD PTR [r9+40]
|
|
mov rax, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [rcx+40]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sbb rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r10
|
|
mov r11, QWORD PTR [r9+48]
|
|
mov r12, QWORD PTR [r9+56]
|
|
mov rax, QWORD PTR [rcx+48]
|
|
mov r10, QWORD PTR [rcx+56]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sbb rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx+48], rax
|
|
mov QWORD PTR [rcx+56], r10
|
|
mov r11, QWORD PTR [r9+64]
|
|
mov r12, QWORD PTR [r9+72]
|
|
mov rax, QWORD PTR [rcx+64]
|
|
mov r10, QWORD PTR [rcx+72]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sbb rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r10
|
|
mov r11, QWORD PTR [r9+80]
|
|
mov r12, QWORD PTR [r9+88]
|
|
mov rax, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [rcx+88]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sbb rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx+80], rax
|
|
mov QWORD PTR [rcx+88], r10
|
|
mov r11, QWORD PTR [r9+96]
|
|
mov r12, QWORD PTR [r9+104]
|
|
mov rax, QWORD PTR [rcx+96]
|
|
mov r10, QWORD PTR [rcx+104]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sbb rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r10
|
|
mov r11, QWORD PTR [r9+112]
|
|
mov r12, QWORD PTR [r9+120]
|
|
mov rax, QWORD PTR [rcx+112]
|
|
mov r10, QWORD PTR [rcx+120]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
sbb rax, r11
|
|
sbb r10, r12
|
|
mov QWORD PTR [rcx+112], rax
|
|
mov QWORD PTR [rcx+120], r10
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_1024_mont_add_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Double a Montgomery form number (r = a + a % m).
|
|
; *
|
|
; * r Result of doubling.
|
|
; * a Number to double in Montgomery form.
|
|
; * m Modulus (prime).
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mont_dbl_avx2_16 PROC
; Register use, as read from the code below:
;   rcx = r (written, then re-read for the correction), rdx = a, r8 = m.
;   All operands are 16 x 64-bit words.
;   rax, r9, r10, r11 = four-word working set; r12 = subtraction mask.
; Step 1 stores a + a into r; step 2 subtracts (m & mask).
|
|
push r12
|
|
mov rax, QWORD PTR [rdx]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov r11, QWORD PTR [rdx+24]
|
|
add rax, QWORD PTR [rdx]
|
|
; mov rather than xor: it leaves CF from the add intact for the adc chain.
mov r12, 0
|
|
adc r9, QWORD PTR [rdx+8]
|
|
adc r10, QWORD PTR [rdx+16]
|
|
adc r11, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r9
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov r9, QWORD PTR [rdx+40]
|
|
mov r10, QWORD PTR [rdx+48]
|
|
mov r11, QWORD PTR [rdx+56]
|
|
adc rax, QWORD PTR [rdx+32]
|
|
adc r9, QWORD PTR [rdx+40]
|
|
adc r10, QWORD PTR [rdx+48]
|
|
adc r11, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r9
|
|
mov QWORD PTR [rcx+48], r10
|
|
mov QWORD PTR [rcx+56], r11
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov r9, QWORD PTR [rdx+72]
|
|
mov r10, QWORD PTR [rdx+80]
|
|
mov r11, QWORD PTR [rdx+88]
|
|
adc rax, QWORD PTR [rdx+64]
|
|
adc r9, QWORD PTR [rdx+72]
|
|
adc r10, QWORD PTR [rdx+80]
|
|
adc r11, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r9
|
|
mov QWORD PTR [rcx+80], r10
|
|
mov QWORD PTR [rcx+88], r11
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov r11, QWORD PTR [rdx+120]
|
|
adc rax, QWORD PTR [rdx+96]
|
|
adc r9, QWORD PTR [rdx+104]
|
|
adc r10, QWORD PTR [rdx+112]
|
|
adc r11, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r9
|
|
mov QWORD PTR [rcx+112], r10
|
|
mov QWORD PTR [rcx+120], r11
|
|
; r12 = 0 - CF: all ones if the doubling carried out of the top word.
sbb r12, 0
|
|
; r11 still holds the top word of the sum. sub/sbb/not yield all ones when
; it is >= m[15] (no borrow) and 0 otherwise; OR that into the mask.
; NOTE(review): only the most significant word is compared against m, not
; the whole value - confirm this is sufficient for the modulus in use.
sub r11, QWORD PTR [r8+120]
|
|
sbb r11, r11
|
|
not r11
|
|
or r12, r11
|
|
; r -= m & mask, two words per group. pext with an all-ones mask returns
; its source unchanged and with a zero mask returns 0; pext and mov do not
; touch CF, so the borrow started by the sub runs through every sbb.
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
mov rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [rcx+8]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sub rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r9
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
mov rax, QWORD PTR [rcx+16]
|
|
mov r9, QWORD PTR [rcx+24]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+16], rax
|
|
mov QWORD PTR [rcx+24], r9
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov rax, QWORD PTR [rcx+32]
|
|
mov r9, QWORD PTR [rcx+40]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r9
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
mov rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [rcx+56]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+48], rax
|
|
mov QWORD PTR [rcx+56], r9
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
mov rax, QWORD PTR [rcx+64]
|
|
mov r9, QWORD PTR [rcx+72]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r9
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
mov rax, QWORD PTR [rcx+80]
|
|
mov r9, QWORD PTR [rcx+88]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+80], rax
|
|
mov QWORD PTR [rcx+88], r9
|
|
mov r10, QWORD PTR [r8+96]
|
|
mov r11, QWORD PTR [r8+104]
|
|
mov rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [rcx+104]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r9
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r11, QWORD PTR [r8+120]
|
|
mov rax, QWORD PTR [rcx+112]
|
|
mov r9, QWORD PTR [rcx+120]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+112], rax
|
|
mov QWORD PTR [rcx+120], r9
|
|
pop r12
|
|
ret
|
|
sp_1024_mont_dbl_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Triple a Montgomery form number (r = a + a + a % m).
|
|
; *
|
|
; * r Result of tripling.
|
|
; * a Number to triple in Montgomery form.
|
|
; * m Modulus (prime).
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mont_tpl_avx2_16 PROC
; Register use, as read from the code below:
;   rcx = r (written and re-read several times), rdx = a, r8 = m.
;   All operands are 16 x 64-bit words.
;   rax, r9, r10, r11 = four-word working set; r12 = subtraction mask.
; Two passes: (1) r = a + a, then r -= m & mask;
;             (2) r = r + a, then r -= m & mask.
; NOTE(review): pass 2 reads a from [rdx] after r has already been written
; through rcx, so the result differs if r and a are the same buffer -
; confirm callers never alias them.
|
|
push r12
|
|
; Pass 1: r = a + a.
mov rax, QWORD PTR [rdx]
|
|
mov r9, QWORD PTR [rdx+8]
|
|
mov r10, QWORD PTR [rdx+16]
|
|
mov r11, QWORD PTR [rdx+24]
|
|
add rax, QWORD PTR [rdx]
|
|
; mov rather than xor: it leaves CF from the add intact for the adc chain.
mov r12, 0
|
|
adc r9, QWORD PTR [rdx+8]
|
|
adc r10, QWORD PTR [rdx+16]
|
|
adc r11, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r9
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov r9, QWORD PTR [rdx+40]
|
|
mov r10, QWORD PTR [rdx+48]
|
|
mov r11, QWORD PTR [rdx+56]
|
|
adc rax, QWORD PTR [rdx+32]
|
|
adc r9, QWORD PTR [rdx+40]
|
|
adc r10, QWORD PTR [rdx+48]
|
|
adc r11, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r9
|
|
mov QWORD PTR [rcx+48], r10
|
|
mov QWORD PTR [rcx+56], r11
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov r9, QWORD PTR [rdx+72]
|
|
mov r10, QWORD PTR [rdx+80]
|
|
mov r11, QWORD PTR [rdx+88]
|
|
adc rax, QWORD PTR [rdx+64]
|
|
adc r9, QWORD PTR [rdx+72]
|
|
adc r10, QWORD PTR [rdx+80]
|
|
adc r11, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r9
|
|
mov QWORD PTR [rcx+80], r10
|
|
mov QWORD PTR [rcx+88], r11
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov r9, QWORD PTR [rdx+104]
|
|
mov r10, QWORD PTR [rdx+112]
|
|
mov r11, QWORD PTR [rdx+120]
|
|
adc rax, QWORD PTR [rdx+96]
|
|
adc r9, QWORD PTR [rdx+104]
|
|
adc r10, QWORD PTR [rdx+112]
|
|
adc r11, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r9
|
|
mov QWORD PTR [rcx+112], r10
|
|
mov QWORD PTR [rcx+120], r11
|
|
; r12 = 0 - CF: all ones if the doubling carried out of the top word.
sbb r12, 0
|
|
; r11 still holds the top word of the sum. sub/sbb/not yield all ones when
; it is >= m[15] (no borrow) and 0 otherwise; OR that into the mask.
; NOTE(review): only the most significant word is compared against m, not
; the whole value - confirm this is sufficient for the modulus in use.
sub r11, QWORD PTR [r8+120]
|
|
sbb r11, r11
|
|
not r11
|
|
or r12, r11
|
|
; r -= m & mask. pext with an all-ones mask returns its source unchanged
; and with a zero mask returns 0; pext and mov do not touch CF, so the
; borrow started by the sub runs through every sbb.
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
mov rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [rcx+8]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sub rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r9
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
mov rax, QWORD PTR [rcx+16]
|
|
mov r9, QWORD PTR [rcx+24]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+16], rax
|
|
mov QWORD PTR [rcx+24], r9
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov rax, QWORD PTR [rcx+32]
|
|
mov r9, QWORD PTR [rcx+40]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r9
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
mov rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [rcx+56]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+48], rax
|
|
mov QWORD PTR [rcx+56], r9
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
mov rax, QWORD PTR [rcx+64]
|
|
mov r9, QWORD PTR [rcx+72]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r9
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
mov rax, QWORD PTR [rcx+80]
|
|
mov r9, QWORD PTR [rcx+88]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+80], rax
|
|
mov QWORD PTR [rcx+88], r9
|
|
mov r10, QWORD PTR [r8+96]
|
|
mov r11, QWORD PTR [r8+104]
|
|
mov rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [rcx+104]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r9
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r11, QWORD PTR [r8+120]
|
|
mov rax, QWORD PTR [rcx+112]
|
|
mov r9, QWORD PTR [rcx+120]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+112], rax
|
|
mov QWORD PTR [rcx+120], r9
|
|
; Pass 2: r = r + a. The left operand now comes from r (via rcx), the
; right operand from a (via rdx).
mov rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [rcx+8]
|
|
mov r10, QWORD PTR [rcx+16]
|
|
mov r11, QWORD PTR [rcx+24]
|
|
add rax, QWORD PTR [rdx]
|
|
; Reset the mask without disturbing CF from the add.
mov r12, 0
|
|
adc r9, QWORD PTR [rdx+8]
|
|
adc r10, QWORD PTR [rdx+16]
|
|
adc r11, QWORD PTR [rdx+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r9
|
|
mov QWORD PTR [rcx+16], r10
|
|
mov QWORD PTR [rcx+24], r11
|
|
mov rax, QWORD PTR [rcx+32]
|
|
mov r9, QWORD PTR [rcx+40]
|
|
mov r10, QWORD PTR [rcx+48]
|
|
mov r11, QWORD PTR [rcx+56]
|
|
adc rax, QWORD PTR [rdx+32]
|
|
adc r9, QWORD PTR [rdx+40]
|
|
adc r10, QWORD PTR [rdx+48]
|
|
adc r11, QWORD PTR [rdx+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r9
|
|
mov QWORD PTR [rcx+48], r10
|
|
mov QWORD PTR [rcx+56], r11
|
|
mov rax, QWORD PTR [rcx+64]
|
|
mov r9, QWORD PTR [rcx+72]
|
|
mov r10, QWORD PTR [rcx+80]
|
|
mov r11, QWORD PTR [rcx+88]
|
|
adc rax, QWORD PTR [rdx+64]
|
|
adc r9, QWORD PTR [rdx+72]
|
|
adc r10, QWORD PTR [rdx+80]
|
|
adc r11, QWORD PTR [rdx+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r9
|
|
mov QWORD PTR [rcx+80], r10
|
|
mov QWORD PTR [rcx+88], r11
|
|
mov rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [rcx+104]
|
|
mov r10, QWORD PTR [rcx+112]
|
|
mov r11, QWORD PTR [rcx+120]
|
|
adc rax, QWORD PTR [rdx+96]
|
|
adc r9, QWORD PTR [rdx+104]
|
|
adc r10, QWORD PTR [rdx+112]
|
|
adc r11, QWORD PTR [rdx+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r9
|
|
mov QWORD PTR [rcx+112], r10
|
|
mov QWORD PTR [rcx+120], r11
|
|
; Same mask construction as after pass 1: carry-out OR (top word >= m[15]).
sbb r12, 0
|
|
sub r11, QWORD PTR [r8+120]
|
|
sbb r11, r11
|
|
not r11
|
|
or r12, r11
|
|
; r -= m & mask, as after pass 1.
mov r10, QWORD PTR [r8]
|
|
mov r11, QWORD PTR [r8+8]
|
|
mov rax, QWORD PTR [rcx]
|
|
mov r9, QWORD PTR [rcx+8]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sub rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r9
|
|
mov r10, QWORD PTR [r8+16]
|
|
mov r11, QWORD PTR [r8+24]
|
|
mov rax, QWORD PTR [rcx+16]
|
|
mov r9, QWORD PTR [rcx+24]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+16], rax
|
|
mov QWORD PTR [rcx+24], r9
|
|
mov r10, QWORD PTR [r8+32]
|
|
mov r11, QWORD PTR [r8+40]
|
|
mov rax, QWORD PTR [rcx+32]
|
|
mov r9, QWORD PTR [rcx+40]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r9
|
|
mov r10, QWORD PTR [r8+48]
|
|
mov r11, QWORD PTR [r8+56]
|
|
mov rax, QWORD PTR [rcx+48]
|
|
mov r9, QWORD PTR [rcx+56]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+48], rax
|
|
mov QWORD PTR [rcx+56], r9
|
|
mov r10, QWORD PTR [r8+64]
|
|
mov r11, QWORD PTR [r8+72]
|
|
mov rax, QWORD PTR [rcx+64]
|
|
mov r9, QWORD PTR [rcx+72]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r9
|
|
mov r10, QWORD PTR [r8+80]
|
|
mov r11, QWORD PTR [r8+88]
|
|
mov rax, QWORD PTR [rcx+80]
|
|
mov r9, QWORD PTR [rcx+88]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+80], rax
|
|
mov QWORD PTR [rcx+88], r9
|
|
mov r10, QWORD PTR [r8+96]
|
|
mov r11, QWORD PTR [r8+104]
|
|
mov rax, QWORD PTR [rcx+96]
|
|
mov r9, QWORD PTR [rcx+104]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r9
|
|
mov r10, QWORD PTR [r8+112]
|
|
mov r11, QWORD PTR [r8+120]
|
|
mov rax, QWORD PTR [rcx+112]
|
|
mov r9, QWORD PTR [rcx+120]
|
|
pext r10, r10, r12
|
|
pext r11, r11, r12
|
|
sbb rax, r10
|
|
sbb r9, r11
|
|
mov QWORD PTR [rcx+112], rax
|
|
mov QWORD PTR [rcx+120], r9
|
|
pop r12
|
|
ret
|
|
sp_1024_mont_tpl_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Subtract two Montgomery form numbers (r = a - b % m).
|
|
; *
|
|
; * r Result of subtraction.
|
|
; * a Number to subtract from in Montgomery form.
|
|
; * b Number to subtract in Montgomery form.
|
|
; * m Modulus (prime).
|
|
; */
|
|
_text SEGMENT READONLY PARA
|
|
sp_1024_mont_sub_avx2_16 PROC
; Register use, as read from the code below:
;   rcx = r (written, then re-read for the correction), rdx = a, r8 = b,
;   r9  = m. All operands are 16 x 64-bit words.
;   rax, r10, r11, r12 = four-word working set; r13 = add-back mask.
; Step 1 stores the raw difference a - b into r; step 2 adds (m & mask),
; where mask is all ones only if step 1 borrowed out of the top word.
|
|
push r12
|
|
push r13
|
|
mov rax, QWORD PTR [rdx]
|
|
mov r10, QWORD PTR [rdx+8]
|
|
mov r11, QWORD PTR [rdx+16]
|
|
mov r12, QWORD PTR [rdx+24]
|
|
sub rax, QWORD PTR [r8]
|
|
; mov rather than xor: it leaves CF from the sub intact for the sbb chain.
mov r13, 0
|
|
sbb r10, QWORD PTR [r8+8]
|
|
sbb r11, QWORD PTR [r8+16]
|
|
sbb r12, QWORD PTR [r8+24]
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r10
|
|
mov QWORD PTR [rcx+16], r11
|
|
mov QWORD PTR [rcx+24], r12
|
|
mov rax, QWORD PTR [rdx+32]
|
|
mov r10, QWORD PTR [rdx+40]
|
|
mov r11, QWORD PTR [rdx+48]
|
|
mov r12, QWORD PTR [rdx+56]
|
|
sbb rax, QWORD PTR [r8+32]
|
|
sbb r10, QWORD PTR [r8+40]
|
|
sbb r11, QWORD PTR [r8+48]
|
|
sbb r12, QWORD PTR [r8+56]
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r10
|
|
mov QWORD PTR [rcx+48], r11
|
|
mov QWORD PTR [rcx+56], r12
|
|
mov rax, QWORD PTR [rdx+64]
|
|
mov r10, QWORD PTR [rdx+72]
|
|
mov r11, QWORD PTR [rdx+80]
|
|
mov r12, QWORD PTR [rdx+88]
|
|
sbb rax, QWORD PTR [r8+64]
|
|
sbb r10, QWORD PTR [r8+72]
|
|
sbb r11, QWORD PTR [r8+80]
|
|
sbb r12, QWORD PTR [r8+88]
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r10
|
|
mov QWORD PTR [rcx+80], r11
|
|
mov QWORD PTR [rcx+88], r12
|
|
mov rax, QWORD PTR [rdx+96]
|
|
mov r10, QWORD PTR [rdx+104]
|
|
mov r11, QWORD PTR [rdx+112]
|
|
mov r12, QWORD PTR [rdx+120]
|
|
sbb rax, QWORD PTR [r8+96]
|
|
sbb r10, QWORD PTR [r8+104]
|
|
sbb r11, QWORD PTR [r8+112]
|
|
sbb r12, QWORD PTR [r8+120]
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r10
|
|
mov QWORD PTR [rcx+112], r11
|
|
mov QWORD PTR [rcx+120], r12
|
|
; r13 = 0 - CF: all ones if a - b borrowed out of the top word, else 0.
sbb r13, 0
|
|
; r += m & mask, two words per group. pext with an all-ones mask returns
; its source unchanged and with a zero mask returns 0; pext and mov do not
; touch CF, so the carry started by the add runs through every adc.
mov r11, QWORD PTR [r9]
|
|
mov r12, QWORD PTR [r9+8]
|
|
mov rax, QWORD PTR [rcx]
|
|
mov r10, QWORD PTR [rcx+8]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
add rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx], rax
|
|
mov QWORD PTR [rcx+8], r10
|
|
mov r11, QWORD PTR [r9+16]
|
|
mov r12, QWORD PTR [r9+24]
|
|
mov rax, QWORD PTR [rcx+16]
|
|
mov r10, QWORD PTR [rcx+24]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
adc rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx+16], rax
|
|
mov QWORD PTR [rcx+24], r10
|
|
mov r11, QWORD PTR [r9+32]
|
|
mov r12, QWORD PTR [r9+40]
|
|
mov rax, QWORD PTR [rcx+32]
|
|
mov r10, QWORD PTR [rcx+40]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
adc rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx+32], rax
|
|
mov QWORD PTR [rcx+40], r10
|
|
mov r11, QWORD PTR [r9+48]
|
|
mov r12, QWORD PTR [r9+56]
|
|
mov rax, QWORD PTR [rcx+48]
|
|
mov r10, QWORD PTR [rcx+56]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
adc rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx+48], rax
|
|
mov QWORD PTR [rcx+56], r10
|
|
mov r11, QWORD PTR [r9+64]
|
|
mov r12, QWORD PTR [r9+72]
|
|
mov rax, QWORD PTR [rcx+64]
|
|
mov r10, QWORD PTR [rcx+72]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
adc rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx+64], rax
|
|
mov QWORD PTR [rcx+72], r10
|
|
mov r11, QWORD PTR [r9+80]
|
|
mov r12, QWORD PTR [r9+88]
|
|
mov rax, QWORD PTR [rcx+80]
|
|
mov r10, QWORD PTR [rcx+88]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
adc rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx+80], rax
|
|
mov QWORD PTR [rcx+88], r10
|
|
mov r11, QWORD PTR [r9+96]
|
|
mov r12, QWORD PTR [r9+104]
|
|
mov rax, QWORD PTR [rcx+96]
|
|
mov r10, QWORD PTR [rcx+104]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
adc rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx+96], rax
|
|
mov QWORD PTR [rcx+104], r10
|
|
mov r11, QWORD PTR [r9+112]
|
|
mov r12, QWORD PTR [r9+120]
|
|
mov rax, QWORD PTR [rcx+112]
|
|
mov r10, QWORD PTR [rcx+120]
|
|
pext r11, r11, r13
|
|
pext r12, r12, r13
|
|
adc rax, r11
|
|
adc r10, r12
|
|
mov QWORD PTR [rcx+112], rax
|
|
mov QWORD PTR [rcx+120], r10
|
|
pop r13
|
|
pop r12
|
|
ret
|
|
sp_1024_mont_sub_avx2_16 ENDP
|
|
_text ENDS
|
|
ENDIF
|
|
IFDEF HAVE_INTEL_AVX2
|
|
; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
;  *
;  * When a is odd, m is added to it first; the 16-word sum plus its carry
;  * is then shifted right by one bit.  (m is documented as prime, so it is
;  * presumably odd and the sum is then even.)
;  *
;  * r  Result of division by 2.
;  * a  Number to divide.
;  * m  Modulus (prime).
;  *
;  * Registers: rcx = r, rdx = a, r8 = m.
;  * Scratch: rax, r9, r10, r11.  r12 and r13 are saved and restored.
;  */
_text SEGMENT READONLY PARA
sp_1024_div2_avx2_16 PROC
        push    r12
        push    r13
        ; r13 = mask: all ones when the low bit of a[0] is set, otherwise zero.
        mov     r13, QWORD PTR [rdx]
        xor     r12, r12                        ; r12 receives the carry out of word 15
        and     r13, 1
        neg     r13
        ; r = a + (m & mask), two words per step.
        ; pext with an all-ones / all-zero mask yields m[i] or 0 and, unlike
        ; "and", does not modify the flags, so CF survives between the adc's.
        ; words 0-1 (the only step that starts the chain with add)
        mov     rax, QWORD PTR [r8]
        mov     r9, QWORD PTR [r8+8]
        mov     r10, QWORD PTR [rdx]
        mov     r11, QWORD PTR [rdx+8]
        pext    rax, rax, r13
        pext    r9, r9, r13
        add     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx], r10
        mov     QWORD PTR [rcx+8], r11
        ; words 2-3
        mov     rax, QWORD PTR [r8+16]
        mov     r9, QWORD PTR [r8+24]
        mov     r10, QWORD PTR [rdx+16]
        mov     r11, QWORD PTR [rdx+24]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+16], r10
        mov     QWORD PTR [rcx+24], r11
        ; words 4-5
        mov     rax, QWORD PTR [r8+32]
        mov     r9, QWORD PTR [r8+40]
        mov     r10, QWORD PTR [rdx+32]
        mov     r11, QWORD PTR [rdx+40]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+32], r10
        mov     QWORD PTR [rcx+40], r11
        ; words 6-7
        mov     rax, QWORD PTR [r8+48]
        mov     r9, QWORD PTR [r8+56]
        mov     r10, QWORD PTR [rdx+48]
        mov     r11, QWORD PTR [rdx+56]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+48], r10
        mov     QWORD PTR [rcx+56], r11
        ; words 8-9
        mov     rax, QWORD PTR [r8+64]
        mov     r9, QWORD PTR [r8+72]
        mov     r10, QWORD PTR [rdx+64]
        mov     r11, QWORD PTR [rdx+72]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+64], r10
        mov     QWORD PTR [rcx+72], r11
        ; words 10-11
        mov     rax, QWORD PTR [r8+80]
        mov     r9, QWORD PTR [r8+88]
        mov     r10, QWORD PTR [rdx+80]
        mov     r11, QWORD PTR [rdx+88]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+80], r10
        mov     QWORD PTR [rcx+88], r11
        ; words 12-13
        mov     rax, QWORD PTR [r8+96]
        mov     r9, QWORD PTR [r8+104]
        mov     r10, QWORD PTR [rdx+96]
        mov     r11, QWORD PTR [rdx+104]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+96], r10
        mov     QWORD PTR [rcx+104], r11
        ; words 14-15
        mov     rax, QWORD PTR [r8+112]
        mov     r9, QWORD PTR [r8+120]
        mov     r10, QWORD PTR [rdx+112]
        mov     r11, QWORD PTR [rdx+120]
        pext    rax, rax, r13
        pext    r9, r9, r13
        adc     r10, rax
        adc     r11, r9
        mov     QWORD PTR [rcx+112], r10
        mov     QWORD PTR [rcx+120], r11
        adc     r12, 0                          ; r12 = carry out of the 1024-bit sum
        ; Shift the sum in r right by one bit.  r10 and r11 alternate as the
        ; current word and the next-higher word supplying the incoming top bit.
        mov     r10, QWORD PTR [rcx]
        mov     r11, QWORD PTR [rcx+8]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx], r10
        mov     r10, QWORD PTR [rcx+16]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+8], r11
        mov     r11, QWORD PTR [rcx+24]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+16], r10
        mov     r10, QWORD PTR [rcx+32]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+24], r11
        mov     r11, QWORD PTR [rcx+40]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+32], r10
        mov     r10, QWORD PTR [rcx+48]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+40], r11
        mov     r11, QWORD PTR [rcx+56]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+48], r10
        mov     r10, QWORD PTR [rcx+64]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+56], r11
        mov     r11, QWORD PTR [rcx+72]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+64], r10
        mov     r10, QWORD PTR [rcx+80]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+72], r11
        mov     r11, QWORD PTR [rcx+88]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+80], r10
        mov     r10, QWORD PTR [rcx+96]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+88], r11
        mov     r11, QWORD PTR [rcx+104]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+96], r10
        mov     r10, QWORD PTR [rcx+112]
        shrd    r11, r10, 1
        mov     QWORD PTR [rcx+104], r11
        mov     r11, QWORD PTR [rcx+120]
        shrd    r10, r11, 1
        mov     QWORD PTR [rcx+112], r10
        shrd    r11, r12, 1                     ; top word takes the saved carry as bit 63
        mov     QWORD PTR [rcx+120], r11
        pop     r13
        pop     r12
        ret
sp_1024_div2_avx2_16 ENDP
_text ENDS
|
|
ENDIF
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the bswap instruction.
;  *
;  * The array is consumed from its end (least significant bytes first) in
;  * 64-byte and then 8-byte steps.  Any remaining leading bytes (fewer than
;  * 8) are packed into one more word, and the rest of the 16-word result
;  * is zero filled.
;  *
;  * r     A single precision integer (16 64-bit words are written).
;  * size  Maximum number of bytes to convert (not read by this routine;
;  *       its register is reused as scratch).
;  * a     Byte array.
;  * n     Number of bytes in array to read (expected to be at most 128).
;  *
;  * Registers: rcx = r, rdx = size, r8 = a, r9 = n.
;  * Clobbers only volatile registers (rax, rdx, r10, r11) and the flags,
;  * so nothing is pushed and the stack pointer is never modified.
;  */
_text SEGMENT READONLY PARA
sp_1024_from_bin_bswap PROC
        mov     r11, r8
        mov     rdx, rcx
        add     r11, r9                         ; r11 = a + n, one past the last input byte
        add     rdx, 128                        ; rdx = r + 128, one past the last result word
        jmp     L_1024_from_bin_bswap_64_end
        ; 64 bytes per pass: eight words taken from the tail of the input.
L_1024_from_bin_bswap_64_start:
        sub     r11, 64
        mov     rax, QWORD PTR [r11+56]
        mov     r10, QWORD PTR [r11+48]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        mov     rax, QWORD PTR [r11+40]
        mov     r10, QWORD PTR [r11+32]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        mov     rax, QWORD PTR [r11+24]
        mov     r10, QWORD PTR [r11+16]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        mov     rax, QWORD PTR [r11+8]
        mov     r10, QWORD PTR [r11]
        bswap   rax
        bswap   r10
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_1024_from_bin_bswap_64_end:
        cmp     r9, 63
        jg      L_1024_from_bin_bswap_64_start  ; signed: a negative count skips the loop
        jmp     L_1024_from_bin_bswap_8_end
        ; 8 bytes per pass.
L_1024_from_bin_bswap_8_start:
        sub     r11, 8
        mov     rax, QWORD PTR [r11]
        bswap   rax
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_1024_from_bin_bswap_8_end:
        cmp     r9, 7
        jg      L_1024_from_bin_bswap_8_start
        test    r9, r9
        jz      L_1024_from_bin_bswap_hi_end
        ; Leading bytes: accumulate a[0], a[1], ... most significant first.
        xor     r10d, r10d
L_1024_from_bin_bswap_hi_start:
        movzx   eax, BYTE PTR [r8]              ; zero-extended load, no partial-register write
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_1024_from_bin_bswap_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_1024_from_bin_bswap_hi_end:
        ; Zero fill the unused high words.  Pointer compares are unsigned, and
        ; jae (not je) keeps a pointer already past the end out of the loop.
        cmp     rcx, rdx
        jae     L_1024_from_bin_bswap_zero_end
L_1024_from_bin_bswap_zero_start:
        mov     QWORD PTR [rcx], 0
        add     rcx, 8
        cmp     rcx, rdx
        jb      L_1024_from_bin_bswap_zero_start
L_1024_from_bin_bswap_zero_end:
        ret
sp_1024_from_bin_bswap ENDP
_text ENDS
|
|
IFNDEF NO_MOVBE_SUPPORT
|
|
; /* Read big endian unsigned byte array into r.
;  * Uses the movbe instruction which is an optional instruction.
;  *
;  * The array is consumed from its end (least significant bytes first) in
;  * 64-byte and then 8-byte steps.  Any remaining leading bytes (fewer than
;  * 8) are packed into one more word, and the rest of the 16-word result
;  * is zero filled.
;  *
;  * r     A single precision integer (16 64-bit words are written).
;  * size  Maximum number of bytes to convert (not read by this routine;
;  *       its register is reused as scratch).
;  * a     Byte array.
;  * n     Number of bytes in array to read (expected to be at most 128).
;  *
;  * Registers: rcx = r, rdx = size, r8 = a, r9 = n.
;  * Clobbers only volatile registers (rax, rdx, r10, r11) and the flags,
;  * so nothing is pushed and the stack pointer is never modified.
;  */
_text SEGMENT READONLY PARA
sp_1024_from_bin_movbe PROC
        mov     r11, r8
        mov     rdx, rcx
        add     r11, r9                         ; r11 = a + n, one past the last input byte
        add     rdx, 128                        ; rdx = r + 128, one past the last result word
        jmp     L_1024_from_bin_movbe_64_end
        ; 64 bytes per pass: eight byte-swapped loads from the tail of the input.
L_1024_from_bin_movbe_64_start:
        sub     r11, 64
        movbe   rax, QWORD PTR [r11+56]
        movbe   r10, QWORD PTR [r11+48]
        mov     QWORD PTR [rcx], rax
        mov     QWORD PTR [rcx+8], r10
        movbe   rax, QWORD PTR [r11+40]
        movbe   r10, QWORD PTR [r11+32]
        mov     QWORD PTR [rcx+16], rax
        mov     QWORD PTR [rcx+24], r10
        movbe   rax, QWORD PTR [r11+24]
        movbe   r10, QWORD PTR [r11+16]
        mov     QWORD PTR [rcx+32], rax
        mov     QWORD PTR [rcx+40], r10
        movbe   rax, QWORD PTR [r11+8]
        movbe   r10, QWORD PTR [r11]
        mov     QWORD PTR [rcx+48], rax
        mov     QWORD PTR [rcx+56], r10
        add     rcx, 64
        sub     r9, 64
L_1024_from_bin_movbe_64_end:
        cmp     r9, 63
        jg      L_1024_from_bin_movbe_64_start  ; signed: a negative count skips the loop
        jmp     L_1024_from_bin_movbe_8_end
        ; 8 bytes per pass.
L_1024_from_bin_movbe_8_start:
        sub     r11, 8
        movbe   rax, QWORD PTR [r11]
        mov     QWORD PTR [rcx], rax
        add     rcx, 8
        sub     r9, 8
L_1024_from_bin_movbe_8_end:
        cmp     r9, 7
        jg      L_1024_from_bin_movbe_8_start
        test    r9, r9
        jz      L_1024_from_bin_movbe_hi_end
        ; Leading bytes: accumulate a[0], a[1], ... most significant first.
        xor     r10d, r10d
L_1024_from_bin_movbe_hi_start:
        movzx   eax, BYTE PTR [r8]              ; zero-extended load, no partial-register write
        shl     r10, 8
        inc     r8
        add     r10, rax
        dec     r9
        jg      L_1024_from_bin_movbe_hi_start
        mov     QWORD PTR [rcx], r10
        add     rcx, 8
L_1024_from_bin_movbe_hi_end:
        ; Zero fill the unused high words.  Pointer compares are unsigned, and
        ; jae (not je) keeps a pointer already past the end out of the loop.
        cmp     rcx, rdx
        jae     L_1024_from_bin_movbe_zero_end
L_1024_from_bin_movbe_zero_start:
        mov     QWORD PTR [rcx], 0
        add     rcx, 8
        cmp     rcx, rdx
        jb      L_1024_from_bin_movbe_zero_start
L_1024_from_bin_movbe_zero_end:
        ret
sp_1024_from_bin_movbe ENDP
_text ENDS
|
|
ENDIF
|
|
ENDIF
|
|
END
|