Merge pull request #3877 from SparkiDev/sakke_eccsi_fixup

This commit is contained in:
David Garske
2021-03-15 20:18:09 -07:00
committed by GitHub
2 changed files with 134 additions and 26 deletions

View File

@ -39668,6 +39668,13 @@ _sp_256_mont_add_4:
/* Fragment of _sp_256_mont_add_4: two masked subtraction passes over the
 * four-limb value held in rax (lowest), rcx, r8, r9, then the start of the
 * store to the buffer at rdi.
 * NOTE(review): presumably rsi is a 0 / all-ones mask and r10, r11 hold
 * modulus limbs masked by it; that setup is above this fragment - confirm. */
andq %rsi, %r11
/* Pass 1: subtract rsi, r10, 0, r11 limb by limb, propagating the borrow. */
subq %rsi, %rax
sbbq %r10, %rcx
sbbq $0x00, %r8
sbbq %r11, %r9
/* Fold the borrow out of pass 1 into rsi: an all-ones rsi plus a borrow
 * wraps to 0, which turns pass 2 into a subtraction of zero. */
adcq $0x00, %rsi
/* Re-mask the subtrahend limbs with the updated rsi. */
andq %rsi, %r10
andq %rsi, %r11
/* Pass 2: same chain. movq does not touch the flags, so the stores are
 * interleaved with the remaining sbbq instructions. */
subq %rsi, %rax
sbbq %r10, %rcx
movq %rax, (%rdi)
sbbq $0x00, %r8
movq %rcx, 8(%rdi)
@ -39712,6 +39719,13 @@ _sp_256_mont_dbl_4:
/* Fragment of _sp_256_mont_dbl_4: two masked subtraction passes over the
 * four-limb value held in rdx (lowest), rax, rcx, r8, then the start of the
 * store to the buffer at rdi.
 * NOTE(review): presumably r11 is a 0 / all-ones mask and r9, r10 hold
 * modulus limbs masked by it; that setup is above this fragment - confirm. */
andq %r11, %r10
/* Pass 1: subtract r11, r9, 0, r10 limb by limb, propagating the borrow. */
subq %r11, %rdx
sbbq %r9, %rax
sbbq $0x00, %rcx
sbbq %r10, %r8
/* Fold the borrow out of pass 1 into r11: an all-ones r11 plus a borrow
 * wraps to 0, which turns pass 2 into a subtraction of zero. */
adcq $0x00, %r11
/* Re-mask the subtrahend limbs with the updated r11. */
andq %r11, %r9
andq %r11, %r10
/* Pass 2: same chain. movq does not touch the flags, so the stores are
 * interleaved with the remaining sbbq instructions. */
subq %r11, %rdx
sbbq %r9, %rax
movq %rdx, (%rdi)
sbbq $0x00, %rcx
movq %rax, 8(%rdi)
@ -39758,6 +39772,13 @@ _sp_256_mont_tpl_4:
/* Fragment of _sp_256_mont_tpl_4 (first reduction). The value is held in
 * rdx (lowest), rax, rcx, r8. The fragment opens part-way through the
 * first masked subtraction pass; its opening subq is above this view. */
sbbq %r9, %rax
sbbq $0x00, %rcx
sbbq %r10, %r8
/* Fold the borrow out of pass 1 into r11: an all-ones r11 plus a borrow
 * wraps to 0, which turns pass 2 into a subtraction of zero.
 * NOTE(review): presumably r11 is a 0 / all-ones mask at this point - confirm. */
adcq $0x00, %r11
/* Re-mask the subtrahend limbs with the updated r11. */
andq %r11, %r9
andq %r11, %r10
/* Pass 2: subtract r11, r9, 0, r10 limb by limb, propagating the borrow. */
subq %r11, %rdx
sbbq %r9, %rax
sbbq $0x00, %rcx
sbbq %r10, %r8
/* Reload the unmasked constants that the masking above overwrote. */
movq $0xffffffff, %r9
movq $0xffffffff00000001, %r10
/* Begin adding the four-limb operand read from the buffer at rsi. */
addq (%rsi), %rdx
@ -39770,6 +39791,13 @@ _sp_256_mont_tpl_4:
/* Fragment of _sp_256_mont_tpl_4 (second reduction): two masked subtraction
 * passes over the four-limb value held in rdx (lowest), rax, rcx, r8, then
 * the start of the store to the buffer at rdi.
 * NOTE(review): presumably r11 is a 0 / all-ones mask derived from the
 * preceding addition, which is above this fragment - confirm. */
andq %r11, %r10
/* Pass 1: subtract r11, r9, 0, r10 limb by limb, propagating the borrow. */
subq %r11, %rdx
sbbq %r9, %rax
sbbq $0x00, %rcx
sbbq %r10, %r8
/* Fold the borrow out of pass 1 into r11: an all-ones r11 plus a borrow
 * wraps to 0, which turns pass 2 into a subtraction of zero. */
adcq $0x00, %r11
/* Re-mask the subtrahend limbs with the updated r11. */
andq %r11, %r9
andq %r11, %r10
/* Pass 2: same chain. movq does not touch the flags, so the stores are
 * interleaved with the remaining sbbq instructions. */
subq %r11, %rdx
sbbq %r9, %rax
movq %rdx, (%rdi)
sbbq $0x00, %rcx
movq %rax, 8(%rdi)
@ -39815,6 +39843,13 @@ _sp_256_mont_sub_4:
/* Fragment of _sp_256_mont_sub_4: two masked addition passes over the
 * four-limb value held in rax (lowest), rcx, r8, r9, then the start of the
 * store to the buffer at rdi.
 * NOTE(review): presumably rsi is a 0 / all-ones mask derived from the
 * borrow of the preceding subtraction and r10, r11 hold modulus limbs
 * masked by it; that setup is above this fragment - confirm. */
andq %rsi, %r11
/* Pass 1: add rsi, r10, 0, r11 limb by limb, propagating the carry. */
addq %rsi, %rax
adcq %r10, %rcx
adcq $0x00, %r8
adcq %r11, %r9
/* Fold the carry out of pass 1 into rsi: an all-ones rsi plus a carry
 * wraps to 0, which turns pass 2 into an addition of zero. */
adcq $0x00, %rsi
/* Re-mask the addend limbs with the updated rsi. */
andq %rsi, %r10
andq %rsi, %r11
/* Pass 2: same chain. movq does not touch the flags, so the stores are
 * interleaved with the remaining adcq instructions. */
addq %rsi, %rax
adcq %r10, %rcx
movq %rax, (%rdi)
adcq $0x00, %r8
movq %rcx, 8(%rdi)
@ -42251,6 +42286,13 @@ L_256_mod_inv_avx2_4_usubv_sub_shr1:
/* Fragment of the usubv_sub_shr1 path: finishes one carry-propagation round
 * over the dword lanes of ymm0/ymm1, runs a second identical round, then
 * starts extracting lanes into general-purpose registers.
 * NOTE(review): presumably ymm14 is a per-lane 26-bit mask and ymm13 holds
 * vpermd indices that move each ymm1 carry to the next ymm0 lane; both are
 * set up above this fragment - confirm. */
vpand %ymm14, %ymm1, %ymm1
vpaddd %ymm5, %ymm0, %ymm0
vpaddd %ymm4, %ymm1, %ymm1
/* Second round: arithmetic shift right by 26 takes the signed overflow of each lane. */
vpsrad $26, %ymm1, %ymm5
vpsrad $26, %ymm0, %ymm4
/* Reorder the ymm1 carries by the indices in ymm13 before adding them to ymm0. */
vpermd %ymm5, %ymm13, %ymm5
/* Mask each lane with ymm14 ... */
vpand %ymm14, %ymm0, %ymm0
vpand %ymm14, %ymm1, %ymm1
/* ... and add the carries: permuted ymm1 carries into ymm0, ymm0 carries into ymm1. */
vpaddd %ymm5, %ymm0, %ymm0
vpaddd %ymm4, %ymm1, %ymm1
/* Extract dword lanes 0, 1 and 2 of xmm0. */
vpextrd $0x00, %xmm0, %eax
vpextrd $0x01, %xmm0, %r8d
vpextrd $2, %xmm0, %r10d
@ -42311,6 +42353,13 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
/* Fragment of the vsubu_sub_shr1 path: finishes one carry-propagation round
 * over the dword lanes of ymm2/ymm3, runs a second identical round, then
 * starts extracting lanes into general-purpose registers.
 * NOTE(review): presumably ymm14 is a per-lane 26-bit mask and ymm13 holds
 * vpermd indices that move each ymm3 carry to the next ymm2 lane; both are
 * set up above this fragment - confirm. */
vpand %ymm14, %ymm3, %ymm3
vpaddd %ymm5, %ymm2, %ymm2
vpaddd %ymm4, %ymm3, %ymm3
/* Second round: arithmetic shift right by 26 takes the signed overflow of each lane. */
vpsrad $26, %ymm3, %ymm5
vpsrad $26, %ymm2, %ymm4
/* Reorder the ymm3 carries by the indices in ymm13 before adding them to ymm2. */
vpermd %ymm5, %ymm13, %ymm5
/* Mask each lane with ymm14 ... */
vpand %ymm14, %ymm2, %ymm2
vpand %ymm14, %ymm3, %ymm3
/* ... and add the carries: permuted ymm3 carries into ymm2, ymm2 carries into ymm3. */
vpaddd %ymm5, %ymm2, %ymm2
vpaddd %ymm4, %ymm3, %ymm3
/* Extract dword lanes 0, 1 and 2 of xmm2. */
vpextrd $0x00, %xmm2, %eax
vpextrd $0x01, %xmm2, %r8d
vpextrd $2, %xmm2, %r10d
@ -42324,14 +42373,19 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
/* Tail of the vsubu path: extract dword lane 0 of xmm2 and xmm3. */
vpextrd $0x00, %xmm2, %r14d
vpextrd $0x00, %xmm3, %r15d
L_256_mod_inv_avx2_4_store_done:
/* Combine five register pairs into five 64-bit values:
 *   low = sign_extend32(low) + (high << 26)
 * The low half is sign-extended first because a write to a 32-bit register
 * zero-extends into the 64-bit register, which would turn a negative lane
 * value into a large positive one.
 * NOTE(review): presumably each register holds one 26-bit limb extracted by
 * the vpextrd sequences above - confirm. */
movslq %eax, %rax
shlq $26, %rcx
addq %rcx, %rax
movslq %r8d, %r8
shlq $26, %r9
addq %r9, %r8
movslq %r10d, %r10
shlq $26, %r11
addq %r11, %r10
movslq %r12d, %r12
shlq $26, %r13
addq %r13, %r12
movslq %r14d, %r14
shlq $26, %r15
addq %r15, %r14
/* Copy the second combined value into rcx; how it is used is below this fragment. */
movq %r8, %rcx
@ -44817,12 +44871,12 @@ L_384_get_entry_65_6_start:
/* Fragment of the L_384_get_entry_65_6_start loop body.
 * Build a per-dword equality mask for the current position:
 * xmm12 = (xmm14 == xmm13), then step xmm14 by xmm15.
 * NOTE(review): presumably xmm14 is the running entry index and xmm13 the
 * wanted index; both are set up above this fragment - confirm. */
movdqa %xmm14, %xmm12
paddd %xmm15, %xmm14
pcmpeqd %xmm13, %xmm12
/* Load the 96-byte entry at rsi with unaligned loads only. The aligned
 * movdqa loads of the same addresses that used to precede these were dead
 * (immediately overwritten) and fault when rsi is not 16-byte aligned,
 * so they are removed. */
movdqu (%rsi), %xmm6
movdqu 16(%rsi), %xmm7
movdqu 32(%rsi), %xmm8
movdqu 48(%rsi), %xmm9
movdqu 64(%rsi), %xmm10
movdqu 80(%rsi), %xmm11
/* Advance to the next 96-byte entry. */
addq $0x60, %rsi
/* Keep the loaded data only where the mask is set. */
pand %xmm12, %xmm6
pand %xmm12, %xmm7

View File

@ -38455,6 +38455,13 @@ sp_256_mont_add_4 PROC
; Fragment of sp_256_mont_add_4: two masked subtraction passes over the
; four-limb value held in rax (lowest), r9, r10, r11, then the start of the
; store to the buffer at rcx.
; NOTE(review): presumably rdx is a 0 / all-ones mask and r12, r13 hold
; modulus limbs masked by it; that setup is above this fragment - confirm.
and r13, rdx
; Pass 1: subtract rdx, r12, 0, r13 limb by limb, propagating the borrow.
sub rax, rdx
sbb r9, r12
sbb r10, 0
sbb r11, r13
; Fold the borrow out of pass 1 into rdx: an all-ones rdx plus a borrow
; wraps to 0, which turns pass 2 into a subtraction of zero.
adc rdx, 0
; Re-mask the subtrahend limbs with the updated rdx.
and r12, rdx
and r13, rdx
; Pass 2: same chain. mov does not touch the flags, so the stores are
; interleaved with the remaining sbb instructions.
sub rax, rdx
sbb r9, r12
mov QWORD PTR [rcx], rax
sbb r10, 0
mov QWORD PTR [rcx+8], r9
@ -38492,6 +38499,13 @@ sp_256_mont_dbl_4 PROC
; Fragment of sp_256_mont_dbl_4: two masked subtraction passes over the
; four-limb value held in rax (lowest), r8, r9, r10, then the start of the
; store to the buffer at rcx.
; NOTE(review): presumably r13 is a 0 / all-ones mask and r11, r12 hold
; modulus limbs masked by it; that setup is above this fragment - confirm.
and r12, r13
; Pass 1: subtract r13, r11, 0, r12 limb by limb, propagating the borrow.
sub rax, r13
sbb r8, r11
sbb r9, 0
sbb r10, r12
; Fold the borrow out of pass 1 into r13: an all-ones r13 plus a borrow
; wraps to 0, which turns pass 2 into a subtraction of zero.
adc r13, 0
; Re-mask the subtrahend limbs with the updated r13.
and r11, r13
and r12, r13
; Pass 2: same chain. mov does not touch the flags, so the stores are
; interleaved with the remaining sbb instructions.
sub rax, r13
sbb r8, r11
mov QWORD PTR [rcx], rax
sbb r9, 0
mov QWORD PTR [rcx+8], r8
@ -38531,6 +38545,13 @@ sp_256_mont_tpl_4 PROC
; Fragment of sp_256_mont_tpl_4 (first reduction). The value is held in
; rax (lowest), r8, r9, r10. The fragment opens part-way through the first
; masked subtraction pass; its opening sub is above this view.
sbb r8, r11
sbb r9, 0
sbb r10, r12
; Fold the borrow out of pass 1 into r13: an all-ones r13 plus a borrow
; wraps to 0, which turns pass 2 into a subtraction of zero.
; NOTE(review): presumably r13 is a 0 / all-ones mask at this point - confirm.
adc r13, 0
; Re-mask the subtrahend limbs with the updated r13.
and r11, r13
and r12, r13
; Pass 2: subtract r13, r11, 0, r12 limb by limb, propagating the borrow.
sub rax, r13
sbb r8, r11
sbb r9, 0
sbb r10, r12
; Reload the unmasked constants that the masking above overwrote
; (0xffffffff and 0xffffffff00000001).
mov r11, 4294967295
mov r12, 18446744069414584321
; Begin adding the four-limb operand read from the buffer at rdx.
add rax, QWORD PTR [rdx]
@ -38543,6 +38564,13 @@ sp_256_mont_tpl_4 PROC
; Fragment of sp_256_mont_tpl_4 (second reduction): two masked subtraction
; passes over the four-limb value held in rax (lowest), r8, r9, r10, then
; the start of the store to the buffer at rcx.
; NOTE(review): presumably r13 is a 0 / all-ones mask derived from the
; preceding addition, which is above this fragment - confirm.
and r12, r13
; Pass 1: subtract r13, r11, 0, r12 limb by limb, propagating the borrow.
sub rax, r13
sbb r8, r11
sbb r9, 0
sbb r10, r12
; Fold the borrow out of pass 1 into r13: an all-ones r13 plus a borrow
; wraps to 0, which turns pass 2 into a subtraction of zero.
adc r13, 0
; Re-mask the subtrahend limbs with the updated r13.
and r11, r13
and r12, r13
; Pass 2: same chain. mov does not touch the flags, so the stores are
; interleaved with the remaining sbb instructions.
sub rax, r13
sbb r8, r11
mov QWORD PTR [rcx], rax
sbb r9, 0
mov QWORD PTR [rcx+8], r8
@ -38581,6 +38609,13 @@ sp_256_mont_sub_4 PROC
; Fragment of sp_256_mont_sub_4: two masked addition passes over the
; four-limb value held in rax (lowest), r9, r10, r11, then the start of the
; store to the buffer at rcx.
; NOTE(review): presumably rdx is a 0 / all-ones mask derived from the
; borrow of the preceding subtraction and r12, r13 hold modulus limbs
; masked by it; that setup is above this fragment - confirm.
and r13, rdx
; Pass 1: add rdx, r12, 0, r13 limb by limb, propagating the carry.
add rax, rdx
adc r9, r12
adc r10, 0
adc r11, r13
; Fold the carry out of pass 1 into rdx: an all-ones rdx plus a carry
; wraps to 0, which turns pass 2 into an addition of zero.
adc rdx, 0
; Re-mask the addend limbs with the updated rdx.
and r12, rdx
and r13, rdx
; Pass 2: same chain. mov does not touch the flags, so the stores are
; interleaved with the remaining adc instructions.
add rax, rdx
adc r9, r12
mov QWORD PTR [rcx], rax
adc r10, 0
mov QWORD PTR [rcx+8], r9
@ -39345,10 +39380,10 @@ L_256_get_entry_64_4_start:
; Fragment of the L_256_get_entry_64_4_start loop body.
; Build a per-dword equality mask for the current position:
; xmm8 = (xmm10 == xmm9), then step xmm10 by xmm11.
; NOTE(review): presumably xmm10 is the running entry index and xmm9 the
; wanted index; both are set up above this fragment - confirm.
movdqa xmm8, xmm10
paddd xmm10, xmm11
pcmpeqd xmm8, xmm9
; Load the 64-byte entry at rdx with unaligned loads only. The aligned
; movdqa loads of the same addresses that used to precede these were dead
; (immediately overwritten) and fault when rdx is not 16-byte aligned,
; so they are removed.
movdqu xmm4, [rdx]
movdqu xmm5, [rdx+16]
movdqu xmm6, [rdx+32]
movdqu xmm7, [rdx+48]
; Advance to the next 64-byte entry.
add rdx, 64
; Keep the loaded data only where the mask is set.
pand xmm4, xmm8
pand xmm5, xmm8
@ -39432,10 +39467,10 @@ L_256_get_entry_65_4_start:
; Fragment of the L_256_get_entry_65_4_start loop body.
; Build a per-dword equality mask for the current position:
; xmm8 = (xmm10 == xmm9), then step xmm10 by xmm11.
; NOTE(review): presumably xmm10 is the running entry index and xmm9 the
; wanted index; both are set up above this fragment - confirm.
movdqa xmm8, xmm10
paddd xmm10, xmm11
pcmpeqd xmm8, xmm9
; Load the 64-byte entry at rdx with unaligned loads only. The aligned
; movdqa loads of the same addresses that used to precede these were dead
; (immediately overwritten) and fault when rdx is not 16-byte aligned,
; so they are removed.
movdqu xmm4, [rdx]
movdqu xmm5, [rdx+16]
movdqu xmm6, [rdx+32]
movdqu xmm7, [rdx+48]
; Advance to the next 64-byte entry.
add rdx, 64
; Keep the loaded data only where the mask is set.
pand xmm4, xmm8
pand xmm5, xmm8
@ -40774,6 +40809,13 @@ L_256_mod_inv_avx2_4_usubv_sub_shr1:
; Fragment of the usubv_sub_shr1 path: finishes one carry-propagation round
; over the dword lanes of ymm0/ymm1, runs a second identical round, then
; starts extracting lanes into general-purpose registers.
; NOTE(review): presumably ymm14 is a per-lane 26-bit mask and ymm13 holds
; vpermd indices that move each ymm1 carry to the next ymm0 lane; both are
; set up above this fragment - confirm.
vpand ymm1, ymm1, ymm14
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm4
; Second round: arithmetic shift right by 26 takes the signed overflow of each lane.
vpsrad ymm5, ymm1, 26
vpsrad ymm4, ymm0, 26
; Reorder the ymm1 carries by the indices in ymm13 before adding them to ymm0.
vpermd ymm5, ymm13, ymm5
; Mask each lane with ymm14 ...
vpand ymm0, ymm0, ymm14
vpand ymm1, ymm1, ymm14
; ... and add the carries: permuted ymm1 carries into ymm0, ymm0 carries into ymm1.
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm4
; Extract dword lanes 0, 1 and 2 of xmm0.
vpextrd eax, xmm0, 0
vpextrd r10d, xmm0, 1
vpextrd r12d, xmm0, 2
@ -40834,6 +40876,13 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
; Fragment of the vsubu_sub_shr1 path: finishes one carry-propagation round
; over the dword lanes of ymm2/ymm3, runs a second identical round, then
; starts extracting lanes into general-purpose registers.
; NOTE(review): presumably ymm14 is a per-lane 26-bit mask and ymm13 holds
; vpermd indices that move each ymm3 carry to the next ymm2 lane; both are
; set up above this fragment - confirm.
vpand ymm3, ymm3, ymm14
vpaddd ymm2, ymm2, ymm5
vpaddd ymm3, ymm3, ymm4
; Second round: arithmetic shift right by 26 takes the signed overflow of each lane.
vpsrad ymm5, ymm3, 26
vpsrad ymm4, ymm2, 26
; Reorder the ymm3 carries by the indices in ymm13 before adding them to ymm2.
vpermd ymm5, ymm13, ymm5
; Mask each lane with ymm14 ...
vpand ymm2, ymm2, ymm14
vpand ymm3, ymm3, ymm14
; ... and add the carries: permuted ymm3 carries into ymm2, ymm2 carries into ymm3.
vpaddd ymm2, ymm2, ymm5
vpaddd ymm3, ymm3, ymm4
; Extract dword lanes 0, 1 and 2 of xmm2.
vpextrd eax, xmm2, 0
vpextrd r10d, xmm2, 1
vpextrd r12d, xmm2, 2
@ -40847,14 +40896,19 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
; Tail of the vsubu path: extract dword lane 0 of xmm2 and xmm3.
vpextrd edi, xmm2, 0
vpextrd esi, xmm3, 0
L_256_mod_inv_avx2_4_store_done:
; Combine five register pairs into five 64-bit values:
;   low = sign_extend32(low) + (high << 26)
; The low half is sign-extended first because a write to a 32-bit register
; zero-extends into the 64-bit register, which would turn a negative lane
; value into a large positive one.
; The Intel-syntax mnemonic for this sign extension is movsxd; movslq is
; the AT&T spelling and is not accepted in this syntax.
; NOTE(review): presumably each register holds one 26-bit limb extracted by
; the vpextrd sequences above - confirm.
movsxd rax, eax
shl r9, 26
add rax, r9
movsxd r10, r10d
shl r11, 26
add r10, r11
movsxd r12, r12d
shl r13, 26
add r12, r13
movsxd r14, r14d
shl r15, 26
add r14, r15
movsxd rdi, edi
shl rsi, 26
add rdi, rsi
; Copy the second combined value into r9; how it is used is below this fragment.
mov r9, r10
@ -43037,12 +43091,12 @@ L_384_get_entry_64_6_start:
; Fragment of the L_384_get_entry_64_6_start loop body.
; Build a per-dword equality mask for the current position:
; xmm12 = (xmm14 == xmm13), then step xmm14 by xmm15.
; NOTE(review): presumably xmm14 is the running entry index and xmm13 the
; wanted index; both are set up above this fragment - confirm.
movdqa xmm12, xmm14
paddd xmm14, xmm15
pcmpeqd xmm12, xmm13
; Load the 96-byte entry at rdx with unaligned loads only. The aligned
; movdqa loads of the same addresses that used to precede these were dead
; (immediately overwritten) and fault when rdx is not 16-byte aligned,
; so they are removed.
movdqu xmm6, [rdx]
movdqu xmm7, [rdx+16]
movdqu xmm8, [rdx+32]
movdqu xmm9, [rdx+48]
movdqu xmm10, [rdx+64]
movdqu xmm11, [rdx+80]
; Advance to the next 96-byte entry.
add rdx, 96
; Keep the loaded data only where the mask is set.
pand xmm6, xmm12
pand xmm7, xmm12
@ -43144,12 +43198,12 @@ L_384_get_entry_65_6_start:
; Fragment of the L_384_get_entry_65_6_start loop body.
; Build a per-dword equality mask for the current position:
; xmm12 = (xmm14 == xmm13), then step xmm14 by xmm15.
; NOTE(review): presumably xmm14 is the running entry index and xmm13 the
; wanted index; both are set up above this fragment - confirm.
movdqa xmm12, xmm14
paddd xmm14, xmm15
pcmpeqd xmm12, xmm13
; Load the 96-byte entry at rdx with unaligned loads only. The aligned
; movdqa loads of the same addresses that used to precede these were dead
; (immediately overwritten) and fault when rdx is not 16-byte aligned,
; so they are removed.
movdqu xmm6, [rdx]
movdqu xmm7, [rdx+16]
movdqu xmm8, [rdx+32]
movdqu xmm9, [rdx+48]
movdqu xmm10, [rdx+64]
movdqu xmm11, [rdx+80]
; Advance to the next 96-byte entry.
add rdx, 96
; Keep the loaded data only where the mask is set.
pand xmm6, xmm12
pand xmm7, xmm12