; /* poly1305_asm.asm */
; /*
; * Copyright (C) 2006-2025 wolfSSL Inc.
; *
; * This file is part of wolfSSL.
; *
; * wolfSSL is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * wolfSSL is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
; */
IF @Version LT 1200
; AVX2 instructions not recognized by old versions of MASM
IFNDEF NO_AVX2_SUPPORT
NO_AVX2_SUPPORT = 1
ENDIF
; MOVBE instruction not recognized by old versions of MASM
IFNDEF NO_MOVBE_SUPPORT
NO_MOVBE_SUPPORT = 1
ENDIF
ENDIF

IFNDEF HAVE_INTEL_AVX1
HAVE_INTEL_AVX1 = 1
ENDIF
IFNDEF NO_AVX2_SUPPORT
HAVE_INTEL_AVX2 = 1
ENDIF

IFNDEF _WIN64
_WIN64 = 1
ENDIF

IFDEF HAVE_INTEL_AVX1
_text SEGMENT READONLY PARA
poly1305_setkey_avx PROC
push r12
push r13
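; Clamp r as required by Poly1305 (RFC 8439, section 2.5.1):
; r &= 0x0ffffffc0ffffffc0ffffffc0fffffff
; 1152921487695413247 = 0x0ffffffc0fffffff (mask for r[0])
; 1152921487695413244 = 0x0ffffffc0ffffffc (mask for r[1])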
mov r12, 1152921487695413247
mov r13, 1152921487695413244
mov rax, QWORD PTR [rdx]
mov r8, QWORD PTR [rdx+8]
mov r9, QWORD PTR [rdx+16]
mov r10, QWORD PTR [rdx+24]
and rax, r12
and r8, r13
mov r12, rax
mov r13, r8
xor r11, r11
mov QWORD PTR [rcx], rax
mov QWORD PTR [rcx+8], r8
mov QWORD PTR [rcx+24], r11
mov QWORD PTR [rcx+32], r11
mov QWORD PTR [rcx+40], r11
mov QWORD PTR [rcx+48], r9
mov QWORD PTR [rcx+56], r10
mov QWORD PTR [rcx+352], r11
mov QWORD PTR [rcx+408], r11
mov QWORD PTR [rcx+360], rax
mov QWORD PTR [rcx+416], r8
add r12, rax
add r13, r8
mov QWORD PTR [rcx+368], r12
mov QWORD PTR [rcx+424], r13
add r12, rax
add r13, r8
mov QWORD PTR [rcx+376], r12
mov QWORD PTR [rcx+432], r13
add r12, rax
add r13, r8
mov QWORD PTR [rcx+384], r12
mov QWORD PTR [rcx+440], r13
add r12, rax
add r13, r8
mov QWORD PTR [rcx+392], r12
mov QWORD PTR [rcx+448], r13
add r12, rax
add r13, r8
mov QWORD PTR [rcx+400], r12
mov QWORD PTR [rcx+456], r13
mov QWORD PTR [rcx+608], r11
mov BYTE PTR [rcx+616], 1
pop r13
pop r12
ret
poly1305_setkey_avx ENDP
_text ENDS
_text SEGMENT READONLY PARA
poly1305_block_avx PROC
push r15
push rbx
push r12
push r13
push r14
mov r15, QWORD PTR [rcx]
mov rbx, QWORD PTR [rcx+8]
mov r8, QWORD PTR [rcx+24]
mov r9, QWORD PTR [rcx+32]
mov r10, QWORD PTR [rcx+40]
xor r14, r14
mov r14b, BYTE PTR [rcx+616]
; h += m
mov r11, QWORD PTR [rdx]
mov r12, QWORD PTR [rdx+8]
add r8, r11
adc r9, r12
mov rax, rbx
adc r10, r14
; r[1] * h[0] => rdx, rax ==> t2, t1
mul r8
mov r12, rax
mov r13, rdx
; r[0] * h[1] => rdx, rax ++> t2, t1
mov rax, r15
mul r9
add r12, rax
mov rax, r15
adc r13, rdx
; r[0] * h[0] => rdx, rax ==> t4, t0
mul r8
mov r11, rax
mov r8, rdx
; r[1] * h[1] => rdx, rax =+> t3, t2
mov rax, rbx
mul r9
; r[0] * h[2] +> t2
add r13, QWORD PTR [rcx+8*r10+352]
mov r14, rdx
add r12, r8
adc r13, rax
; r[1] * h[2] +> t3
adc r14, QWORD PTR [rcx+8*r10+408]
; r * h in r14, r13, r12, r11
; h = (r * h) mod 2^130 - 5
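; 2^130 == 5 (mod 2^130 - 5), so the product bits above bit 130 are
; folded back in multiplied by 5: c = (r14:r13) & ~3 == 4*(t >> 130),
; and h = (t mod 2^130) + c + (c >> 2) == (t mod 2^130) + 5*(t >> 130).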
mov r10, r13
and r13, -4
and r10, 3
add r11, r13
mov r8, r13
adc r12, r14
adc r10, 0
shrd r8, r14, 2
shr r14, 2
add r8, r11
adc r12, r14
mov r9, r12
adc r10, 0
; h in r10, r9, r8
; Store h to ctx
mov QWORD PTR [rcx+24], r8
mov QWORD PTR [rcx+32], r9
mov QWORD PTR [rcx+40], r10
pop r14
pop r13
pop r12
pop rbx
pop r15
ret
poly1305_block_avx ENDP
_text ENDS
_text SEGMENT READONLY PARA
poly1305_blocks_avx PROC
push rdi
push rsi
push r15
push rbx
push r12
push r13
push r14
mov rdi, rcx
mov rsi, rdx
mov rcx, r8
mov r15, QWORD PTR [rdi]
mov rbx, QWORD PTR [rdi+8]
mov r8, QWORD PTR [rdi+24]
mov r9, QWORD PTR [rdi+32]
mov r10, QWORD PTR [rdi+40]
L_poly1305_avx_blocks_start:
; h += m
mov r11, QWORD PTR [rsi]
mov r12, QWORD PTR [rsi+8]
add r8, r11
adc r9, r12
mov rax, rbx
adc r10, 0
; r[1] * h[0] => rdx, rax ==> t2, t1
mul r8
mov r12, rax
mov r13, rdx
; r[0] * h[1] => rdx, rax ++> t2, t1
mov rax, r15
mul r9
add r12, rax
mov rax, r15
adc r13, rdx
; r[0] * h[0] => rdx, rax ==> t4, t0
mul r8
mov r11, rax
mov r8, rdx
; r[1] * h[1] => rdx, rax =+> t3, t2
mov rax, rbx
mul r9
; r[0] * h[2] +> t2
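; Table offset 360 (vs 352 in poly1305_block_avx) reads (h[2]+1)*r[0]:
; the implicit high bit of a full 16-byte block is folded into the
; lookup instead of being added into h[2] above.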
add r13, QWORD PTR [rdi+8*r10+360]
mov r14, rdx
add r12, r8
adc r13, rax
; r[1] * h[2] +> t3
adc r14, QWORD PTR [rdi+8*r10+416]
; r * h in r14, r13, r12, r11
; h = (r * h) mod 2^130 - 5
mov r10, r13
and r13, -4
and r10, 3
add r11, r13
mov r8, r13
adc r12, r14
adc r10, 0
shrd r8, r14, 2
shr r14, 2
add r8, r11
adc r12, r14
mov r9, r12
adc r10, 0
; h in r10, r9, r8
; Next block from message
add rsi, 16
sub rcx, 16
jg L_poly1305_avx_blocks_start
; Store h to ctx
mov QWORD PTR [rdi+24], r8
mov QWORD PTR [rdi+32], r9
mov QWORD PTR [rdi+40], r10
pop r14
pop r13
pop r12
pop rbx
pop r15
pop rsi
pop rdi
ret
poly1305_blocks_avx ENDP
_text ENDS
_text SEGMENT READONLY PARA
poly1305_final_avx PROC
push rdi
push rbx
push r12
mov rdi, rcx
mov rbx, rdx
mov rax, QWORD PTR [rdi+608]
test rax, rax
je L_poly1305_avx_final_no_more
mov BYTE PTR [rdi+rax+480], 1
jmp L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
mov BYTE PTR [rdi+rax+480], 0
L_poly1305_avx_final_cmp_rem:
inc al
cmp rax, 16
jl L_poly1305_avx_final_zero_rem
mov BYTE PTR [rdi+616], 0
lea rdx, QWORD PTR [rdi+480]
call poly1305_block_avx
L_poly1305_avx_final_no_more:
mov rax, QWORD PTR [rdi+24]
mov rdx, QWORD PTR [rdi+32]
mov rcx, QWORD PTR [rdi+40]
mov r11, QWORD PTR [rdi+48]
mov r12, QWORD PTR [rdi+56]
; h %= p
; h = (h + pad)
; mod 2^130 - 5
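; Split h[2]: keep its low 2 bits (h mod 2^130) and fold the rest
; back in multiplied by 5; lea r8, [r8+4*r8] computes 5*r8 in one
; instruction.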
mov r8, rcx
and rcx, 3
shr r8, 2
; Multiply by 5
lea r8, QWORD PTR [r8+4*r8+0]
add rax, r8
adc rdx, 0
adc rcx, 0
; Fixup when between (1 << 130) - 5 and (1 << 130) - 1
mov r8, rax
mov r9, rdx
mov r10, rcx
add r8, 5
adc r9, 0
adc r10, 0
cmp r10, 4
cmove rax, r8
cmove rdx, r9
; h += pad
add rax, r11
adc rdx, r12
mov QWORD PTR [rbx], rax
mov QWORD PTR [rbx+8], rdx
; Zero out r
mov QWORD PTR [rdi], 0
mov QWORD PTR [rdi+8], 0
; Zero out h
mov QWORD PTR [rdi+24], 0
mov QWORD PTR [rdi+32], 0
mov QWORD PTR [rdi+40], 0
; Zero out pad
mov QWORD PTR [rdi+48], 0
mov QWORD PTR [rdi+56], 0
pop r12
pop rbx
pop rdi
ret
poly1305_final_avx ENDP
_text ENDS
ENDIF
IFDEF HAVE_INTEL_AVX2
_text SEGMENT READONLY PARA
poly1305_calc_powers_avx2 PROC
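; Compute r^2, r^3 and r^4 from the clamped r and store r..r^4 in the
; 26-bits-in-32 form used by the vectorized multiply, at [rcx+224],
; [rcx+256], [rcx+288] and [rcx+320].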
push r12
push r13
push r14
push r15
push rdi
push rsi
push rbx
push rbp
mov r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+8]
xor r10, r10
; Convert to 26 bits in 32
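; Split the 130-bit value r10:r9:r8 into five 26-bit limbs (at bit
; offsets 0, 26, 52, 78 and 104), each stored in a 32-bit lane;
; 67108863 = (1 << 26) - 1.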
mov rax, r8
mov rdx, r8
mov rsi, r8
mov rbx, r9
mov rbp, r9
shr rdx, 26
shrd rsi, r9, 52
shr rbx, 14
shrd rbp, r10, 40
and rax, 67108863
and rdx, 67108863
and rsi, 67108863
and rbx, 67108863
and rbp, 67108863
mov DWORD PTR [rcx+224], eax
mov DWORD PTR [rcx+228], edx
mov DWORD PTR [rcx+232], esi
mov DWORD PTR [rcx+236], ebx
mov DWORD PTR [rcx+240], ebp
mov DWORD PTR [rcx+244], 0
; Square 128-bit
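; (r1:r0)^2 = r0^2 + 2*r0*r1*2^64 + r1^2*2^128: compute the cross
; product once and add it in twice.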
mov rax, r9
mul r8
xor r14, r14
mov r12, rax
mov r13, rdx
add r12, rax
adc r13, rdx
adc r14, 0
mov rax, r8
mul rax
mov r11, rax
mov rdi, rdx
mov rax, r9
mul rax
add r12, rdi
adc r13, rax
adc r14, rdx
; Reduce 256-bit to 130-bit
mov rax, r13
mov rdx, r14
and rax, -4
and r13, 3
add r11, rax
adc r12, rdx
adc r13, 0
shrd rax, rdx, 2
shr rdx, 2
add r11, rax
adc r12, rdx
adc r13, 0
mov rax, r13
shr rax, 2
lea rax, QWORD PTR [rax+4*rax+0]
and r13, 3
add r11, rax
adc r12, 0
adc r13, 0
; Convert to 26 bits in 32
mov rax, r11
mov rdx, r11
mov rsi, r11
mov rbx, r12
mov rbp, r12
shr rdx, 26
shrd rsi, r12, 52
shr rbx, 14
shrd rbp, r13, 40
and rax, 67108863
and rdx, 67108863
and rsi, 67108863
and rbx, 67108863
and rbp, 67108863
mov DWORD PTR [rcx+256], eax
mov DWORD PTR [rcx+260], edx
mov DWORD PTR [rcx+264], esi
mov DWORD PTR [rcx+268], ebx
mov DWORD PTR [rcx+272], ebp
mov DWORD PTR [rcx+276], 0
; Multiply 128-bit by 130-bit
; r1[0] * r2[0]
mov rax, r8
mul r11
mov r14, rax
mov r15, rdx
; r1[0] * r2[1]
mov rax, r8
mul r12
mov rdi, 0
add r15, rax
adc rdi, rdx
; r1[1] * r2[0]
mov rax, r9
mul r11
mov rsi, 0
add r15, rax
adc rdi, rdx
adc rsi, 0
; r1[0] * r2[2]
mov rax, r8
mul r13
add rdi, rax
adc rsi, rdx
; r1[1] * r2[1]
mov rax, r9
mul r12
mov rbx, 0
add rdi, rax
adc rsi, rdx
adc rbx, 0
; r1[1] * r2[2]
mov rax, r9
mul r13
add rsi, rax
adc rbx, rdx
; Reduce 260-bit to 130-bit
mov rax, rdi
mov rdx, rsi
mov rbx, rbx
and rax, -4
and rdi, 3
add r14, rax
adc r15, rdx
adc rdi, rbx
shrd rax, rdx, 2
shrd rdx, rbx, 2
shr rbx, 2
add r14, rax
adc r15, rdx
adc rdi, rbx
mov rax, rdi
and rdi, 3
shr rax, 2
lea rax, QWORD PTR [rax+4*rax+0]
add r14, rax
adc r15, 0
adc rdi, 0
; Convert to 26 bits in 32
mov rax, r14
mov rdx, r14
mov rsi, r14
mov rbx, r15
mov rbp, r15
shr rdx, 26
shrd rsi, r15, 52
shr rbx, 14
shrd rbp, rdi, 40
and rax, 67108863
and rdx, 67108863
and rsi, 67108863
and rbx, 67108863
and rbp, 67108863
mov DWORD PTR [rcx+288], eax
mov DWORD PTR [rcx+292], edx
mov DWORD PTR [rcx+296], esi
mov DWORD PTR [rcx+300], ebx
mov DWORD PTR [rcx+304], ebp
mov DWORD PTR [rcx+308], 0
; Square 130-bit
mov rax, r12
mul r11
xor r14, r14
mov r9, rax
mov r10, rdx
add r9, rax
adc r10, rdx
adc r14, 0
mov rax, r11
mul rax
mov r8, rax
mov rdi, rdx
mov rax, r12
mul rax
add r9, rdi
adc r10, rax
adc r14, rdx
mov rax, r13
mul rax
mov r15, rax
mov rax, r13
mul r11
add r10, rax
adc r14, rdx
adc r15, 0
add r10, rax
adc r14, rdx
adc r15, 0
mov rax, r13
mul r12
add r14, rax
adc r15, rdx
add r14, rax
adc r15, rdx
; Reduce 260-bit to 130-bit
mov rax, r10
mov rdx, r14
mov rdi, r15
and rax, -4
and r10, 3
add r8, rax
adc r9, rdx
adc r10, rdi
shrd rax, rdx, 2
shrd rdx, rdi, 2
shr rdi, 2
add r8, rax
adc r9, rdx
adc r10, rdi
mov rax, r10
and r10, 3
shr rax, 2
lea rax, QWORD PTR [rax+4*rax+0]
add r8, rax
adc r9, 0
adc r10, 0
; Convert to 26 bits in 32
mov rax, r8
mov rdx, r8
mov rsi, r8
mov rbx, r9
mov rbp, r9
shr rdx, 26
shrd rsi, r9, 52
shr rbx, 14
shrd rbp, r10, 40
and rax, 67108863
and rdx, 67108863
and rsi, 67108863
and rbx, 67108863
and rbp, 67108863
mov DWORD PTR [rcx+320], eax
mov DWORD PTR [rcx+324], edx
mov DWORD PTR [rcx+328], esi
mov DWORD PTR [rcx+332], ebx
mov DWORD PTR [rcx+336], ebp
mov DWORD PTR [rcx+340], 0
pop rbp
pop rbx
pop rsi
pop rdi
pop r15
pop r14
pop r13
pop r12
ret
poly1305_calc_powers_avx2 ENDP
_text ENDS
_text SEGMENT READONLY PARA
poly1305_setkey_avx2 PROC
call poly1305_setkey_avx
vpxor ymm0, ymm0, ymm0
vmovdqu YMMWORD PTR [rcx+64], ymm0
vmovdqu YMMWORD PTR [rcx+96], ymm0
vmovdqu YMMWORD PTR [rcx+128], ymm0
vmovdqu YMMWORD PTR [rcx+160], ymm0
vmovdqu YMMWORD PTR [rcx+192], ymm0
mov QWORD PTR [rcx+608], 0
mov WORD PTR [rcx+616], 0
ret
poly1305_setkey_avx2 ENDP
_text ENDS
_DATA SEGMENT
ALIGN 16
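; 67108863 = (1 << 26) - 1: mask for one 26-bit limb per 64-bit lane.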
L_poly1305_avx2_blocks_mask QWORD 67108863, 67108863, 67108863, 67108863
ptr_L_poly1305_avx2_blocks_mask QWORD L_poly1305_avx2_blocks_mask
_DATA ENDS
_DATA SEGMENT
ALIGN 16
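; 16777216 = 1 << 24: the implicit 2^128 bit of each full block,
; positioned in the fifth 26-bit limb (128 - 4*26 = 24).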
L_poly1305_avx2_blocks_hibit QWORD 16777216, 16777216, 16777216, 16777216
ptr_L_poly1305_avx2_blocks_hibit QWORD L_poly1305_avx2_blocks_hibit
_DATA ENDS
_text SEGMENT READONLY PARA
poly1305_blocks_avx2 PROC
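; Process the message four 16-byte blocks at a time, one block per
; ymm lane: each pass multiplies the four accumulators by r^4; the
; final pass (BYTE [rdi+616] set by poly1305_final_avx2) multiplies
; the lanes by r^4, r^3, r^2, r^1 and sums them.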
push r12
push rdi
push rsi
push rbx
push r13
push r14
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
sub rsp, 480
vmovdqu OWORD PTR [rsp+320], xmm6
vmovdqu OWORD PTR [rsp+336], xmm7
vmovdqu OWORD PTR [rsp+352], xmm8
vmovdqu OWORD PTR [rsp+368], xmm9
vmovdqu OWORD PTR [rsp+384], xmm10
vmovdqu OWORD PTR [rsp+400], xmm11
vmovdqu OWORD PTR [rsp+416], xmm12
vmovdqu OWORD PTR [rsp+432], xmm13
vmovdqu OWORD PTR [rsp+448], xmm14
vmovdqu OWORD PTR [rsp+464], xmm15
mov r13, QWORD PTR [ptr_L_poly1305_avx2_blocks_mask]
mov r14, QWORD PTR [ptr_L_poly1305_avx2_blocks_hibit]
mov rcx, rsp
and rcx, -32
add rcx, 32
vpxor ymm15, ymm15, ymm15
mov rbx, rcx
lea rax, QWORD PTR [rdi+64]
add rbx, 160
cmp WORD PTR [rdi+616], 0
jne L_poly1305_avx2_blocks_begin_h
; Load the message data
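; Deinterleave four 16-byte blocks into five ymm registers, one
; 26-bit limb per 64-bit lane; ymm4 starts with the implicit 2^128
; bit from L_poly1305_avx2_blocks_hibit.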
vmovdqu ymm0, YMMWORD PTR [rsi]
vmovdqu ymm1, YMMWORD PTR [rsi+32]
vperm2i128 ymm2, ymm0, ymm1, 32
vperm2i128 ymm0, ymm0, ymm1, 49
vpunpckldq ymm1, ymm2, ymm0
vpunpckhdq ymm3, ymm2, ymm0
vpunpckldq ymm0, ymm1, ymm15
vpunpckhdq ymm1, ymm1, ymm15
vpunpckldq ymm2, ymm3, ymm15
vpunpckhdq ymm3, ymm3, ymm15
vmovdqu ymm4, YMMWORD PTR [r14]
vpsllq ymm1, ymm1, 6
vpsllq ymm2, ymm2, 12
vpsllq ymm3, ymm3, 18
vmovdqu ymm14, YMMWORD PTR [r13]
; Reduce, in place, the message data
vpsrlq ymm10, ymm0, 26
vpsrlq ymm11, ymm3, 26
vpand ymm0, ymm0, ymm14
vpand ymm3, ymm3, ymm14
vpaddq ymm1, ymm10, ymm1
vpaddq ymm4, ymm11, ymm4
vpsrlq ymm10, ymm1, 26
vpsrlq ymm11, ymm4, 26
vpand ymm1, ymm1, ymm14
vpand ymm4, ymm4, ymm14
vpaddq ymm2, ymm10, ymm2
vpslld ymm12, ymm11, 2
vpaddd ymm12, ymm11, ymm12
vpsrlq ymm10, ymm2, 26
vpaddq ymm0, ymm12, ymm0
vpsrlq ymm11, ymm0, 26
vpand ymm2, ymm2, ymm14
vpand ymm0, ymm0, ymm14
vpaddq ymm3, ymm10, ymm3
vpaddq ymm1, ymm11, ymm1
vpsrlq ymm10, ymm3, 26
vpand ymm3, ymm3, ymm14
vpaddq ymm4, ymm10, ymm4
add rsi, 64
sub rdx, 64
jz L_poly1305_avx2_blocks_store
jmp L_poly1305_avx2_blocks_load_r4
L_poly1305_avx2_blocks_begin_h:
; Load the H values.
vmovdqu ymm0, YMMWORD PTR [rax]
vmovdqu ymm1, YMMWORD PTR [rax+32]
vmovdqu ymm2, YMMWORD PTR [rax+64]
vmovdqu ymm3, YMMWORD PTR [rax+96]
vmovdqu ymm4, YMMWORD PTR [rax+128]
; Check if there is a power of r to load - otherwise use r^4.
cmp BYTE PTR [rdi+616], 0
je L_poly1305_avx2_blocks_load_r4
; Load the 4 powers of r - r^4, r^3, r^2, r^1.
vmovdqu ymm8, YMMWORD PTR [rdi+224]
vmovdqu ymm7, YMMWORD PTR [rdi+256]
vmovdqu ymm6, YMMWORD PTR [rdi+288]
vmovdqu ymm5, YMMWORD PTR [rdi+320]
vpermq ymm5, ymm5, 216
vpermq ymm6, ymm6, 216
vpermq ymm7, ymm7, 216
vpermq ymm8, ymm8, 216
vpunpcklqdq ymm10, ymm5, ymm6
vpunpckhqdq ymm11, ymm5, ymm6
vpunpcklqdq ymm12, ymm7, ymm8
vpunpckhqdq ymm13, ymm7, ymm8
vperm2i128 ymm5, ymm10, ymm12, 32
vperm2i128 ymm7, ymm10, ymm12, 49
vperm2i128 ymm9, ymm11, ymm13, 32
vpsrlq ymm6, ymm5, 32
vpsrlq ymm8, ymm7, 32
jmp L_poly1305_avx2_blocks_mul_5
L_poly1305_avx2_blocks_load_r4:
; Load r^4 into all four positions.
vmovdqu ymm13, YMMWORD PTR [rdi+320]
vpermq ymm5, ymm13, 0
vpsrlq ymm14, ymm13, 32
vpermq ymm7, ymm13, 85
vpermq ymm9, ymm13, 170
vpermq ymm6, ymm14, 0
vpermq ymm8, ymm14, 85
L_poly1305_avx2_blocks_mul_5:
; Multiply top 4 26-bit values of all four H by 5
vpslld ymm10, ymm6, 2
vpslld ymm11, ymm7, 2
vpslld ymm12, ymm8, 2
vpslld ymm13, ymm9, 2
vpaddq ymm10, ymm6, ymm10
vpaddq ymm11, ymm7, ymm11
vpaddq ymm12, ymm8, ymm12
vpaddq ymm13, ymm9, ymm13
; Store powers of r and multiple of 5 for use in multiply.
vmovdqa YMMWORD PTR [rbx], ymm10
vmovdqa YMMWORD PTR [rbx+32], ymm11
vmovdqa YMMWORD PTR [rbx+64], ymm12
vmovdqa YMMWORD PTR [rbx+96], ymm13
vmovdqa YMMWORD PTR [rcx], ymm5
vmovdqa YMMWORD PTR [rcx+32], ymm6
vmovdqa YMMWORD PTR [rcx+64], ymm7
vmovdqa YMMWORD PTR [rcx+96], ymm8
vmovdqa YMMWORD PTR [rcx+128], ymm9
vmovdqu ymm14, YMMWORD PTR [r13]
; If not finished then loop over data
cmp BYTE PTR [rdi+616], 1
jne L_poly1305_avx2_blocks_start
; Do last multiply, reduce, add the four H together and move to
; 32-bit registers
vpmuludq ymm5, ymm4, [rbx]
vpmuludq ymm10, ymm3, [rbx+32]
vpmuludq ymm6, ymm4, [rbx+32]
vpmuludq ymm11, ymm3, [rbx+64]
vpmuludq ymm7, ymm4, [rbx+64]
vpaddq ymm5, ymm10, ymm5
vpmuludq ymm12, ymm2, [rbx+64]
vpmuludq ymm8, ymm4, [rbx+96]
vpaddq ymm6, ymm11, ymm6
vpmuludq ymm13, ymm1, [rbx+96]
vpmuludq ymm10, ymm2, [rbx+96]
vpaddq ymm5, ymm12, ymm5
vpmuludq ymm11, ymm3, [rbx+96]
vpmuludq ymm12, ymm3, [rcx]
vpaddq ymm5, ymm13, ymm5
vpmuludq ymm9, ymm4, [rcx]
vpaddq ymm6, ymm10, ymm6
vpmuludq ymm13, ymm0, [rcx]
vpaddq ymm7, ymm11, ymm7
vpmuludq ymm10, ymm1, [rcx]
vpaddq ymm8, ymm12, ymm8
vpmuludq ymm11, ymm2, [rcx]
vpmuludq ymm12, ymm2, [rcx+32]
vpaddq ymm5, ymm13, ymm5
vpmuludq ymm13, ymm3, [rcx+32]
vpaddq ymm6, ymm10, ymm6
vpmuludq ymm10, ymm0, [rcx+32]
vpaddq ymm7, ymm11, ymm7
vpmuludq ymm11, ymm1, [rcx+32]
vpaddq ymm8, ymm12, ymm8
vpmuludq ymm12, ymm1, [rcx+64]
vpaddq ymm9, ymm13, ymm9
vpmuludq ymm13, ymm2, [rcx+64]
vpaddq ymm6, ymm10, ymm6
vpmuludq ymm10, ymm0, [rcx+64]
vpaddq ymm7, ymm11, ymm7
vpmuludq ymm11, ymm0, [rcx+96]
vpaddq ymm8, ymm12, ymm8
vpmuludq ymm12, ymm1, [rcx+96]
vpaddq ymm9, ymm13, ymm9
vpaddq ymm7, ymm10, ymm7
vpmuludq ymm13, ymm0, [rcx+128]
vpaddq ymm8, ymm11, ymm8
vpaddq ymm9, ymm12, ymm9
vpaddq ymm9, ymm13, ymm9
vpsrlq ymm10, ymm5, 26
vpsrlq ymm11, ymm8, 26
vpand ymm5, ymm5, ymm14
vpand ymm8, ymm8, ymm14
vpaddq ymm6, ymm10, ymm6
vpaddq ymm9, ymm11, ymm9
vpsrlq ymm10, ymm6, 26
vpsrlq ymm11, ymm9, 26
vpand ymm1, ymm6, ymm14
vpand ymm4, ymm9, ymm14
vpaddq ymm7, ymm10, ymm7
vpslld ymm12, ymm11, 2
vpaddd ymm12, ymm11, ymm12
vpsrlq ymm10, ymm7, 26
vpaddq ymm5, ymm12, ymm5
vpsrlq ymm11, ymm5, 26
vpand ymm2, ymm7, ymm14
vpand ymm0, ymm5, ymm14
vpaddq ymm8, ymm10, ymm8
vpaddq ymm1, ymm11, ymm1
vpsrlq ymm10, ymm8, 26
vpand ymm3, ymm8, ymm14
vpaddq ymm4, ymm10, ymm4
vpsrldq ymm5, ymm0, 8
vpsrldq ymm6, ymm1, 8
vpsrldq ymm7, ymm2, 8
vpsrldq ymm8, ymm3, 8
vpsrldq ymm9, ymm4, 8
vpaddq ymm0, ymm5, ymm0
vpaddq ymm1, ymm6, ymm1
vpaddq ymm2, ymm7, ymm2
vpaddq ymm3, ymm8, ymm3
vpaddq ymm4, ymm9, ymm4
vpermq ymm5, ymm0, 2
vpermq ymm6, ymm1, 2
vpermq ymm7, ymm2, 2
vpermq ymm8, ymm3, 2
vpermq ymm9, ymm4, 2
vpaddq ymm0, ymm5, ymm0
vpaddq ymm1, ymm6, ymm1
vpaddq ymm2, ymm7, ymm2
vpaddq ymm3, ymm8, ymm3
vpaddq ymm4, ymm9, ymm4
vmovd r8d, xmm0
vmovd r9d, xmm1
vmovd r10d, xmm2
vmovd r11d, xmm3
vmovd r12d, xmm4
jmp L_poly1305_avx2_blocks_end_calc
L_poly1305_avx2_blocks_start:
vmovdqu ymm5, YMMWORD PTR [rsi]
vmovdqu ymm6, YMMWORD PTR [rsi+32]
vperm2i128 ymm7, ymm5, ymm6, 32
vperm2i128 ymm5, ymm5, ymm6, 49
vpunpckldq ymm6, ymm7, ymm5
vpunpckhdq ymm8, ymm7, ymm5
vpunpckldq ymm5, ymm6, ymm15
vpunpckhdq ymm6, ymm6, ymm15
vpunpckldq ymm7, ymm8, ymm15
vpunpckhdq ymm8, ymm8, ymm15
vmovdqu ymm9, YMMWORD PTR [r14]
vpsllq ymm6, ymm6, 6
vpsllq ymm7, ymm7, 12
vpsllq ymm8, ymm8, 18
vpmuludq ymm10, ymm4, [rbx]
vpaddq ymm5, ymm10, ymm5
vpmuludq ymm10, ymm3, [rbx+32]
vpmuludq ymm11, ymm4, [rbx+32]
vpaddq ymm6, ymm11, ymm6
vpmuludq ymm11, ymm3, [rbx+64]
vpmuludq ymm12, ymm4, [rbx+64]
vpaddq ymm7, ymm12, ymm7
vpaddq ymm5, ymm10, ymm5
vpmuludq ymm12, ymm2, [rbx+64]
vpmuludq ymm13, ymm4, [rbx+96]
vpaddq ymm8, ymm13, ymm8
vpaddq ymm6, ymm11, ymm6
vpmuludq ymm13, ymm1, [rbx+96]
vpmuludq ymm10, ymm2, [rbx+96]
vpaddq ymm5, ymm12, ymm5
vpmuludq ymm11, ymm3, [rbx+96]
vpmuludq ymm12, ymm3, [rcx]
vpaddq ymm5, ymm13, ymm5
vpmuludq ymm13, ymm4, [rcx]
vpaddq ymm9, ymm13, ymm9
vpaddq ymm6, ymm10, ymm6
vpmuludq ymm13, ymm0, [rcx]
vpaddq ymm7, ymm11, ymm7
vpmuludq ymm10, ymm1, [rcx]
vpaddq ymm8, ymm12, ymm8
vpmuludq ymm11, ymm2, [rcx]
vpmuludq ymm12, ymm2, [rcx+32]
vpaddq ymm5, ymm13, ymm5
vpmuludq ymm13, ymm3, [rcx+32]
vpaddq ymm6, ymm10, ymm6
vpmuludq ymm10, ymm0, [rcx+32]
vpaddq ymm7, ymm11, ymm7
vpmuludq ymm11, ymm1, [rcx+32]
vpaddq ymm8, ymm12, ymm8
vpmuludq ymm12, ymm1, [rcx+64]
vpaddq ymm9, ymm13, ymm9
vpmuludq ymm13, ymm2, [rcx+64]
vpaddq ymm6, ymm10, ymm6
vpmuludq ymm10, ymm0, [rcx+64]
vpaddq ymm7, ymm11, ymm7
vpmuludq ymm11, ymm0, [rcx+96]
vpaddq ymm8, ymm12, ymm8
vpmuludq ymm12, ymm1, [rcx+96]
vpaddq ymm9, ymm13, ymm9
vpaddq ymm7, ymm10, ymm7
vpmuludq ymm13, ymm0, [rcx+128]
vpaddq ymm8, ymm11, ymm8
vpaddq ymm9, ymm12, ymm9
vpaddq ymm9, ymm13, ymm9
vpsrlq ymm10, ymm5, 26
vpsrlq ymm11, ymm8, 26
vpand ymm5, ymm5, ymm14
vpand ymm8, ymm8, ymm14
vpaddq ymm6, ymm10, ymm6
vpaddq ymm9, ymm11, ymm9
vpsrlq ymm10, ymm6, 26
vpsrlq ymm11, ymm9, 26
vpand ymm1, ymm6, ymm14
vpand ymm4, ymm9, ymm14
vpaddq ymm7, ymm10, ymm7
vpslld ymm12, ymm11, 2
vpaddd ymm12, ymm11, ymm12
vpsrlq ymm10, ymm7, 26
vpaddq ymm5, ymm12, ymm5
vpsrlq ymm11, ymm5, 26
vpand ymm2, ymm7, ymm14
vpand ymm0, ymm5, ymm14
vpaddq ymm8, ymm10, ymm8
vpaddq ymm1, ymm11, ymm1
vpsrlq ymm10, ymm8, 26
vpand ymm3, ymm8, ymm14
vpaddq ymm4, ymm10, ymm4
add rsi, 64
sub rdx, 64
jnz L_poly1305_avx2_blocks_start
L_poly1305_avx2_blocks_store:
; Store four H values - state
vmovdqu YMMWORD PTR [rax], ymm0
vmovdqu YMMWORD PTR [rax+32], ymm1
vmovdqu YMMWORD PTR [rax+64], ymm2
vmovdqu YMMWORD PTR [rax+96], ymm3
vmovdqu YMMWORD PTR [rax+128], ymm4
L_poly1305_avx2_blocks_end_calc:
cmp BYTE PTR [rdi+616], 0
je L_poly1305_avx2_blocks_complete
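; Recombine the five 26-bit limbs into the 64-bit form used by the
; AVX code: h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104, then
; fold anything above 2^130 back in multiplied by 5.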
mov rax, r8
mov rdx, r10
mov rcx, r12
shr rdx, 12
shr rcx, 24
shl r9, 26
shl r10, 52
shl r11, 14
shl r12, 40
add rax, r9
adc rax, r10
adc rdx, r11
adc rdx, r12
adc rcx, 0
mov r8, rcx
and rcx, 3
shr r8, 2
lea r8, QWORD PTR [r8+4*r8+0]
add rax, r8
adc rdx, 0
adc rcx, 0
mov QWORD PTR [rdi+24], rax
mov QWORD PTR [rdi+32], rdx
mov QWORD PTR [rdi+40], rcx
L_poly1305_avx2_blocks_complete:
mov BYTE PTR [rdi+617], 1
vzeroupper
vmovdqu xmm6, OWORD PTR [rsp+320]
vmovdqu xmm7, OWORD PTR [rsp+336]
vmovdqu xmm8, OWORD PTR [rsp+352]
vmovdqu xmm9, OWORD PTR [rsp+368]
vmovdqu xmm10, OWORD PTR [rsp+384]
vmovdqu xmm11, OWORD PTR [rsp+400]
vmovdqu xmm12, OWORD PTR [rsp+416]
vmovdqu xmm13, OWORD PTR [rsp+432]
vmovdqu xmm14, OWORD PTR [rsp+448]
vmovdqu xmm15, OWORD PTR [rsp+464]
add rsp, 480
pop r14
pop r13
pop rbx
pop rsi
pop rdi
pop r12
ret
poly1305_blocks_avx2 ENDP
_text ENDS
_text SEGMENT READONLY PARA
poly1305_final_avx2 PROC
push rdi
push rsi
mov rdi, rcx
mov rsi, rdx
mov BYTE PTR [rdi+616], 1
mov cl, BYTE PTR [rdi+617]
cmp cl, 0
je L_poly1305_avx2_final_done_blocks_X4
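; Flush the 4-way accumulator: with BYTE [rdi+616] set above,
; poly1305_blocks_avx2 only combines the four H values, so the
; message pointer (rdx = 0) is never dereferenced.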
push rsi
mov r8, 64
xor rdx, rdx
mov rcx, rdi
call poly1305_blocks_avx2
pop rsi
L_poly1305_avx2_final_done_blocks_X4:
mov rax, QWORD PTR [rdi+608]
mov rcx, rax
and rcx, -16
cmp cl, 0
je L_poly1305_avx2_final_done_blocks
push rcx
push rax
push rsi
mov r8, rcx
lea rdx, QWORD PTR [rdi+480]
mov rcx, rdi
call poly1305_blocks_avx
pop rsi
pop rax
pop rcx
L_poly1305_avx2_final_done_blocks:
sub QWORD PTR [rdi+608], rcx
xor rdx, rdx
jmp L_poly1305_avx2_final_cmp_copy
L_poly1305_avx2_final_start_copy:
mov r8b, BYTE PTR [rdi+rcx+480]
mov BYTE PTR [rdi+rdx+480], r8b
inc cl
inc dl
L_poly1305_avx2_final_cmp_copy:
cmp al, cl
jne L_poly1305_avx2_final_start_copy
mov rcx, rdi
mov rdx, rsi
call poly1305_final_avx
vpxor ymm0, ymm0, ymm0
vmovdqu YMMWORD PTR [rdi+64], ymm0
vmovdqu YMMWORD PTR [rdi+96], ymm0
vmovdqu YMMWORD PTR [rdi+128], ymm0
vmovdqu YMMWORD PTR [rdi+160], ymm0
vmovdqu YMMWORD PTR [rdi+192], ymm0
vmovdqu YMMWORD PTR [rdi+224], ymm0
vmovdqu YMMWORD PTR [rdi+256], ymm0
vmovdqu YMMWORD PTR [rdi+288], ymm0
vmovdqu YMMWORD PTR [rdi+320], ymm0
mov QWORD PTR [rdi+608], 0
mov WORD PTR [rdi+616], 0
vzeroupper
pop rsi
pop rdi
ret
poly1305_final_avx2 ENDP
_text ENDS
ENDIF
END