Merge pull request #10728 from SparkiDev/intel_asm_fixup

Intel x86/x64 assembly fixes
This commit is contained in:
David Garske
2026-06-25 21:41:08 -07:00
committed by GitHub
15 changed files with 137 additions and 204 deletions
+6 -2
View File
@@ -1885,7 +1885,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c
if BUILD_CURVE25519_INTELASM
if !BUILD_X86_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
endif !BUILD_X86_ASM
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c
endif BUILD_X86_ASM
else
if BUILD_ARMASM
if !BUILD_FIPS_V6_PLUS
@@ -1946,7 +1948,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c
if BUILD_CURVE25519_INTELASM
if !BUILD_X86_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
endif !BUILD_X86_ASM
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c
endif BUILD_X86_ASM
else
if !BUILD_FIPS_V6_PLUS
if BUILD_ARMASM
+16 -16
View File
@@ -15778,7 +15778,7 @@ int wc_AesXtsDecryptSector(XtsAes* aes, byte* out, const byte* in, word32 sz,
}
#endif
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
#if defined(USE_INTEL_SPEEDUP_FOR_AES) && !defined(USE_INTEL_SPEEDUP)
#define USE_INTEL_SPEEDUP
@@ -15841,7 +15841,7 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo
#endif /* HAVE_INTEL_AVX1 */
#endif /* HAVE_AES_DECRYPT */
#endif /* WOLFSSL_AESNI */
#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
#ifdef HAVE_AES_ECB
#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \
@@ -16094,7 +16094,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
AES_XTS_encrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key,
(byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds);
ret = 0;
#elif defined(WOLFSSL_AESNI)
#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#if defined(HAVE_INTEL_AVX1)
@@ -16196,7 +16196,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz,
stream->bytes_crypted_with_this_tweak = 0;
{
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#if defined(HAVE_INTEL_AVX1)
@@ -16217,7 +16217,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz,
RESTORE_VECTOR_REGISTERS();
}
else
#endif /* WOLFSSL_AESNI */
#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
{
ret = AesXtsInitTweak_sw(xaes, stream->tweak_block);
}
@@ -16247,7 +16247,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
{
int ret;
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
Aes *aes;
#endif
@@ -16255,7 +16255,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
return BAD_FUNC_ARG;
}
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
aes = &xaes->aes;
#endif
@@ -16291,7 +16291,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
}
#endif
{
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#if defined(HAVE_INTEL_AVX1)
@@ -16314,7 +16314,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
RESTORE_VECTOR_REGISTERS();
}
else
#endif /* WOLFSSL_AESNI */
#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
{
ret = AesXtsEncryptUpdate_sw(xaes, out, in, sz, stream->tweak_block);
}
@@ -16575,7 +16575,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
AES_XTS_decrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key,
(byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds);
ret = 0;
#elif defined(WOLFSSL_AESNI)
#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#if defined(HAVE_INTEL_AVX1)
@@ -16680,7 +16680,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz,
stream->bytes_crypted_with_this_tweak = 0;
{
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#if defined(HAVE_INTEL_AVX1)
@@ -16701,7 +16701,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz,
RESTORE_VECTOR_REGISTERS();
}
else
#endif /* WOLFSSL_AESNI */
#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
{
ret = AesXtsInitTweak_sw(xaes, stream->tweak_block);
}
@@ -16729,7 +16729,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
struct XtsAesStreamData *stream)
{
int ret;
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
Aes *aes;
#endif
@@ -16737,7 +16737,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
return BAD_FUNC_ARG;
}
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
#ifdef WC_AES_XTS_SUPPORT_SIMULTANEOUS_ENC_AND_DEC_KEYS
aes = &xaes->aes_decrypt;
#else
@@ -16767,7 +16767,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
#endif
{
#ifdef WOLFSSL_AESNI
#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#if defined(HAVE_INTEL_AVX1)
@@ -16790,7 +16790,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
RESTORE_VECTOR_REGISTERS();
}
else
#endif /* WOLFSSL_AESNI */
#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
{
ret = AesXtsDecryptUpdate_sw(xaes, out, in, sz,
stream->tweak_block);
+5 -5
View File
@@ -1831,11 +1831,11 @@ _AES_ECB_decrypt_AESNI:
push %edi
push %esi
push %ebx
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %edx
movl 32(%esp), %ecx
movl 36(%esp), %eax
movl 16(%esp), %edi
movl 20(%esp), %esi
movl 24(%esp), %edx
movl 28(%esp), %ecx
movl 32(%esp), %eax
movl %edx, %ebx
-2
View File
@@ -3485,7 +3485,6 @@ L_AES_GCM_decrypt_aesni_last_block_start:
movdqa %xmm1, %xmm12
pclmulqdq $0x00, %xmm0, %xmm12
aesenc 80(%r15), %xmm8
movdqa %xmm1, %xmm1
pclmulqdq $0x11, %xmm0, %xmm1
aesenc 96(%r15), %xmm8
pxor %xmm11, %xmm10
@@ -6303,7 +6302,6 @@ L_AES_GCM_decrypt_update_aesni_last_block_start:
movdqa %xmm1, %xmm12
pclmulqdq $0x00, %xmm0, %xmm12
aesenc 80(%rdi), %xmm8
movdqa %xmm1, %xmm1
pclmulqdq $0x11, %xmm0, %xmm1
aesenc 96(%rdi), %xmm8
pxor %xmm11, %xmm10
+49 -86
View File
@@ -750,6 +750,9 @@ L_AES_GCM_encrypt_aesni_calc_aad_done:
# First 64 bytes of input
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm4
movdqu %xmm4, %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm3
movdqa %xmm4, %xmm5
movdqa %xmm4, %xmm6
@@ -761,9 +764,6 @@ L_AES_GCM_encrypt_aesni_calc_aad_done:
pshufb %xmm3, %xmm6
paddd L_aes_gcm_three, %xmm7
pshufb %xmm3, %xmm7
movdqu 64(%esp), %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa (%ebp), %xmm3
pxor %xmm3, %xmm4
pxor %xmm3, %xmm5
@@ -867,6 +867,9 @@ L_AES_GCM_encrypt_aesni_ghash_64:
leal (%edi,%ebx,1), %edx
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm4
movdqu %xmm4, %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm3
movdqa %xmm4, %xmm5
movdqa %xmm4, %xmm6
@@ -878,9 +881,6 @@ L_AES_GCM_encrypt_aesni_ghash_64:
pshufb %xmm3, %xmm6
paddd L_aes_gcm_three, %xmm7
pshufb %xmm3, %xmm7
movdqu 64(%esp), %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa (%ebp), %xmm3
pxor %xmm3, %xmm4
pxor %xmm3, %xmm5
@@ -2146,6 +2146,9 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace:
leal (%edi,%ebx,1), %edx
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm4
movdqu %xmm4, %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm3
movdqa %xmm4, %xmm5
movdqa %xmm4, %xmm6
@@ -2157,9 +2160,6 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace:
pshufb %xmm3, %xmm6
paddd L_aes_gcm_three, %xmm7
pshufb %xmm3, %xmm7
movdqu 64(%esp), %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa (%ebp), %xmm3
pxor %xmm3, %xmm4
pxor %xmm3, %xmm5
@@ -2359,6 +2359,9 @@ L_AES_GCM_decrypt_aesni_ghash_64:
leal (%edi,%ebx,1), %edx
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm4
movdqu %xmm4, %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm3
movdqa %xmm4, %xmm5
movdqa %xmm4, %xmm6
@@ -2370,9 +2373,6 @@ L_AES_GCM_decrypt_aesni_ghash_64:
pshufb %xmm3, %xmm6
paddd L_aes_gcm_three, %xmm7
pshufb %xmm3, %xmm7
movdqu 64(%esp), %xmm3
paddd L_aes_gcm_four, %xmm3
movdqu %xmm3, 64(%esp)
movdqa (%ebp), %xmm3
pxor %xmm3, %xmm4
pxor %xmm3, %xmm5
@@ -2455,8 +2455,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done:
movdqu 16(%ecx), %xmm1
pxor %xmm0, %xmm4
pxor %xmm1, %xmm5
movdqu %xmm0, (%ecx)
movdqu %xmm1, 16(%ecx)
movdqu %xmm4, (%edx)
movdqu %xmm5, 16(%edx)
aesenclast %xmm3, %xmm6
@@ -2465,8 +2463,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done:
movdqu 48(%ecx), %xmm1
pxor %xmm0, %xmm6
pxor %xmm1, %xmm7
movdqu %xmm0, 32(%ecx)
movdqu %xmm1, 48(%ecx)
movdqu %xmm6, 32(%edx)
movdqu %xmm7, 48(%edx)
# ghash encrypted counter
@@ -3536,6 +3532,9 @@ AES_GCM_encrypt_update_aesni:
# First 64 bytes of input
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm0
movdqu %xmm0, %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm7
movdqa %xmm0, %xmm1
movdqa %xmm0, %xmm2
@@ -3547,9 +3546,6 @@ AES_GCM_encrypt_update_aesni:
pshufb %xmm7, %xmm2
paddd L_aes_gcm_three, %xmm3
pshufb %xmm7, %xmm3
movdqu 64(%esp), %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa (%ebp), %xmm7
pxor %xmm7, %xmm0
pxor %xmm7, %xmm1
@@ -3644,6 +3640,8 @@ L_AES_GCM_encrypt_update_aesni_enc_done:
movdqu %xmm3, 48(%edi)
cmpl $0x40, %eax
movl $0x40, %ebx
movl %esi, %ecx
movl %edi, %edx
jle L_AES_GCM_encrypt_update_aesni_end_64
# More 64 bytes of input
L_AES_GCM_encrypt_update_aesni_ghash_64:
@@ -3651,6 +3649,9 @@ L_AES_GCM_encrypt_update_aesni_ghash_64:
leal (%edi,%ebx,1), %edx
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm0
movdqu %xmm0, %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm7
movdqa %xmm0, %xmm1
movdqa %xmm0, %xmm2
@@ -3662,9 +3663,6 @@ L_AES_GCM_encrypt_update_aesni_ghash_64:
pshufb %xmm7, %xmm2
paddd L_aes_gcm_three, %xmm3
pshufb %xmm7, %xmm3
movdqu 64(%esp), %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa (%ebp), %xmm7
pxor %xmm7, %xmm0
pxor %xmm7, %xmm1
@@ -4406,6 +4404,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
leal (%edi,%ebx,1), %edx
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm0
movdqu %xmm0, %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm7
movdqa %xmm0, %xmm1
movdqa %xmm0, %xmm2
@@ -4417,9 +4418,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
pshufb %xmm7, %xmm2
paddd L_aes_gcm_three, %xmm3
pshufb %xmm7, %xmm3
movdqu 64(%esp), %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa (%ebp), %xmm7
pxor %xmm7, %xmm0
pxor %xmm7, %xmm1
@@ -4619,6 +4617,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64:
leal (%edi,%ebx,1), %edx
# Encrypt 64 bytes of counter
movdqu 64(%esp), %xmm0
movdqu %xmm0, %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa L_aes_gcm_bswap_epi64, %xmm7
movdqa %xmm0, %xmm1
movdqa %xmm0, %xmm2
@@ -4630,9 +4631,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64:
pshufb %xmm7, %xmm2
paddd L_aes_gcm_three, %xmm3
pshufb %xmm7, %xmm3
movdqu 64(%esp), %xmm7
paddd L_aes_gcm_four, %xmm7
movdqu %xmm7, 64(%esp)
movdqa (%ebp), %xmm7
pxor %xmm7, %xmm0
pxor %xmm7, %xmm1
@@ -4715,8 +4713,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
movdqu 16(%ecx), %xmm5
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
movdqu %xmm4, (%ecx)
movdqu %xmm5, 16(%ecx)
movdqu %xmm0, (%edx)
movdqu %xmm1, 16(%edx)
aesenclast %xmm7, %xmm2
@@ -4725,8 +4721,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
movdqu 48(%ecx), %xmm5
pxor %xmm4, %xmm2
pxor %xmm5, %xmm3
movdqu %xmm4, 32(%ecx)
movdqu %xmm5, 48(%ecx)
movdqu %xmm2, 32(%edx)
movdqu %xmm3, 48(%edx)
# ghash encrypted counter
@@ -5556,6 +5550,8 @@ L_AES_GCM_encrypt_avx1_calc_aad_done:
vmovdqu %xmm3, 48(%esp)
# First 64 bytes of input
vmovdqu 64(%esp), %xmm4
vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3
vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5
vpshufb %xmm3, %xmm5, %xmm5
@@ -5564,9 +5560,6 @@ L_AES_GCM_encrypt_avx1_calc_aad_done:
vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7
vpshufb %xmm3, %xmm7, %xmm7
vpshufb %xmm3, %xmm4, %xmm4
vmovdqu 64(%esp), %xmm3
vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa (%ebp), %xmm3
vpxor %xmm3, %xmm4, %xmm4
vpxor %xmm3, %xmm5, %xmm5
@@ -5649,8 +5642,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
vmovdqu 16(%esi), %xmm1
vpxor %xmm0, %xmm4, %xmm4
vpxor %xmm1, %xmm5, %xmm5
vmovdqu %xmm0, (%esi)
vmovdqu %xmm1, 16(%esi)
vmovdqu %xmm4, (%edi)
vmovdqu %xmm5, 16(%edi)
vaesenclast %xmm3, %xmm6, %xmm6
@@ -5659,8 +5650,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
vmovdqu 48(%esi), %xmm1
vpxor %xmm0, %xmm6, %xmm6
vpxor %xmm1, %xmm7, %xmm7
vmovdqu %xmm0, 32(%esi)
vmovdqu %xmm1, 48(%esi)
vmovdqu %xmm6, 32(%edi)
vmovdqu %xmm7, 48(%edi)
cmpl $0x40, %eax
@@ -5673,6 +5662,8 @@ L_AES_GCM_encrypt_avx1_ghash_64:
leal (%esi,%ebx,1), %ecx
leal (%edi,%ebx,1), %edx
vmovdqu 64(%esp), %xmm4
vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3
vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5
vpshufb %xmm3, %xmm5, %xmm5
@@ -5681,9 +5672,6 @@ L_AES_GCM_encrypt_avx1_ghash_64:
vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7
vpshufb %xmm3, %xmm7, %xmm7
vpshufb %xmm3, %xmm4, %xmm4
vmovdqu 64(%esp), %xmm3
vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa (%ebp), %xmm3
vpxor %xmm3, %xmm4, %xmm4
vpxor %xmm3, %xmm5, %xmm5
@@ -5864,7 +5852,7 @@ L_AES_GCM_encrypt_avx1_end_64:
vmovdqu 96(%esp), %xmm2
# Block 1
vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4
vmovdqa (%edx), %xmm1
vmovdqu (%edx), %xmm1
vpshufb %xmm4, %xmm1, %xmm1
vmovdqu 48(%esp), %xmm3
vpxor %xmm2, %xmm1, %xmm1
@@ -5886,7 +5874,7 @@ L_AES_GCM_encrypt_avx1_end_64:
vpxor %xmm5, %xmm2, %xmm2
# Block 2
vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4
vmovdqa 16(%edx), %xmm1
vmovdqu 16(%edx), %xmm1
vpshufb %xmm4, %xmm1, %xmm1
vmovdqu 32(%esp), %xmm3
# ghash_gfmul_xor_avx
@@ -5907,7 +5895,7 @@ L_AES_GCM_encrypt_avx1_end_64:
vpxor %xmm5, %xmm2, %xmm2
# Block 3
vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4
vmovdqa 32(%edx), %xmm1
vmovdqu 32(%edx), %xmm1
vpshufb %xmm4, %xmm1, %xmm1
vmovdqu 16(%esp), %xmm3
# ghash_gfmul_xor_avx
@@ -5928,7 +5916,7 @@ L_AES_GCM_encrypt_avx1_end_64:
vpxor %xmm5, %xmm2, %xmm2
# Block 4
vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4
vmovdqa 48(%edx), %xmm1
vmovdqu 48(%edx), %xmm1
vpshufb %xmm4, %xmm1, %xmm1
vmovdqu (%esp), %xmm3
# ghash_gfmul_xor_avx
@@ -6776,6 +6764,8 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace:
leal (%esi,%ebx,1), %ecx
leal (%edi,%ebx,1), %edx
vmovdqu 64(%esp), %xmm4
vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3
vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5
vpshufb %xmm3, %xmm5, %xmm5
@@ -6784,9 +6774,6 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace:
vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7
vpshufb %xmm3, %xmm7, %xmm7
vpshufb %xmm3, %xmm4, %xmm4
vmovdqu 64(%esp), %xmm3
vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa (%ebp), %xmm3
vpxor %xmm3, %xmm4, %xmm4
vpxor %xmm3, %xmm5, %xmm5
@@ -6972,6 +6959,8 @@ L_AES_GCM_decrypt_avx1_ghash_64:
leal (%esi,%ebx,1), %ecx
leal (%edi,%ebx,1), %edx
vmovdqu 64(%esp), %xmm4
vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3
vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5
vpshufb %xmm3, %xmm5, %xmm5
@@ -6980,9 +6969,6 @@ L_AES_GCM_decrypt_avx1_ghash_64:
vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7
vpshufb %xmm3, %xmm7, %xmm7
vpshufb %xmm3, %xmm4, %xmm4
vmovdqu 64(%esp), %xmm3
vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3
vmovdqu %xmm3, 64(%esp)
vmovdqa (%ebp), %xmm3
vpxor %xmm3, %xmm4, %xmm4
vpxor %xmm3, %xmm5, %xmm5
@@ -7065,8 +7051,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
vmovdqu 16(%ecx), %xmm1
vpxor %xmm0, %xmm4, %xmm4
vpxor %xmm1, %xmm5, %xmm5
vmovdqu %xmm0, (%ecx)
vmovdqu %xmm1, 16(%ecx)
vmovdqu %xmm4, (%edx)
vmovdqu %xmm5, 16(%edx)
vaesenclast %xmm3, %xmm6, %xmm6
@@ -7075,8 +7059,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
vmovdqu 48(%ecx), %xmm1
vpxor %xmm0, %xmm6, %xmm6
vpxor %xmm1, %xmm7, %xmm7
vmovdqu %xmm0, 32(%ecx)
vmovdqu %xmm1, 48(%ecx)
vmovdqu %xmm6, 32(%edx)
vmovdqu %xmm7, 48(%edx)
# ghash encrypted counter
@@ -7181,7 +7163,6 @@ L_AES_GCM_decrypt_avx1_last_block_start:
pshufb L_aes_gcm_avx1_bswap_mask, %xmm7
pxor %xmm2, %xmm7
vmovdqu 64(%esp), %xmm5
vmovdqu %xmm7, %xmm7
vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
vpaddd L_aes_gcm_avx1_one, %xmm5, %xmm5
vmovdqu %xmm5, 64(%esp)
@@ -7995,6 +7976,8 @@ AES_GCM_encrypt_update_avx1:
vmovdqu %xmm7, 48(%esp)
# First 64 bytes of input
vmovdqu 64(%esp), %xmm0
vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7
vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1
vpshufb %xmm7, %xmm1, %xmm1
@@ -8003,9 +7986,6 @@ AES_GCM_encrypt_update_avx1:
vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufb %xmm7, %xmm0, %xmm0
vmovdqu 64(%esp), %xmm7
vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa (%ebp), %xmm7
vpxor %xmm7, %xmm0, %xmm0
vpxor %xmm7, %xmm1, %xmm1
@@ -8088,8 +8068,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
vmovdqu 16(%esi), %xmm5
vpxor %xmm4, %xmm0, %xmm0
vpxor %xmm5, %xmm1, %xmm1
vmovdqu %xmm4, (%esi)
vmovdqu %xmm5, 16(%esi)
vmovdqu %xmm0, (%edi)
vmovdqu %xmm1, 16(%edi)
vaesenclast %xmm7, %xmm2, %xmm2
@@ -8098,8 +8076,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
vmovdqu 48(%esi), %xmm5
vpxor %xmm4, %xmm2, %xmm2
vpxor %xmm5, %xmm3, %xmm3
vmovdqu %xmm4, 32(%esi)
vmovdqu %xmm5, 48(%esi)
vmovdqu %xmm2, 32(%edi)
vmovdqu %xmm3, 48(%edi)
cmpl $0x40, %eax
@@ -8112,6 +8088,8 @@ L_AES_GCM_encrypt_update_avx1_ghash_64:
leal (%esi,%ebx,1), %ecx
leal (%edi,%ebx,1), %edx
vmovdqu 64(%esp), %xmm0
vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7
vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1
vpshufb %xmm7, %xmm1, %xmm1
@@ -8120,9 +8098,6 @@ L_AES_GCM_encrypt_update_avx1_ghash_64:
vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufb %xmm7, %xmm0, %xmm0
vmovdqu 64(%esp), %xmm7
vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa (%ebp), %xmm7
vpxor %xmm7, %xmm0, %xmm0
vpxor %xmm7, %xmm1, %xmm1
@@ -8754,6 +8729,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
leal (%esi,%ebx,1), %ecx
leal (%edi,%ebx,1), %edx
vmovdqu 64(%esp), %xmm0
vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7
vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1
vpshufb %xmm7, %xmm1, %xmm1
@@ -8762,9 +8739,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufb %xmm7, %xmm0, %xmm0
vmovdqu 64(%esp), %xmm7
vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa (%ebp), %xmm7
vpxor %xmm7, %xmm0, %xmm0
vpxor %xmm7, %xmm1, %xmm1
@@ -8950,6 +8924,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64:
leal (%esi,%ebx,1), %ecx
leal (%edi,%ebx,1), %edx
vmovdqu 64(%esp), %xmm0
vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7
vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1
vpshufb %xmm7, %xmm1, %xmm1
@@ -8958,9 +8934,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64:
vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufb %xmm7, %xmm0, %xmm0
vmovdqu 64(%esp), %xmm7
vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7
vmovdqu %xmm7, 64(%esp)
vmovdqa (%ebp), %xmm7
vpxor %xmm7, %xmm0, %xmm0
vpxor %xmm7, %xmm1, %xmm1
@@ -9043,8 +9016,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
vmovdqu 16(%ecx), %xmm5
vpxor %xmm4, %xmm0, %xmm0
vpxor %xmm5, %xmm1, %xmm1
vmovdqu %xmm4, (%ecx)
vmovdqu %xmm5, 16(%ecx)
vmovdqu %xmm0, (%edx)
vmovdqu %xmm1, 16(%edx)
vaesenclast %xmm7, %xmm2, %xmm2
@@ -9053,8 +9024,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
vmovdqu 48(%ecx), %xmm5
vpxor %xmm4, %xmm2, %xmm2
vpxor %xmm5, %xmm3, %xmm3
vmovdqu %xmm4, 32(%ecx)
vmovdqu %xmm5, 48(%ecx)
vmovdqu %xmm2, 32(%edx)
vmovdqu %xmm3, 48(%edx)
# ghash encrypted counter
@@ -9155,12 +9124,10 @@ L_AES_GCM_decrypt_update_avx1_done_64:
L_AES_GCM_decrypt_update_avx1_last_block_start:
leal (%esi,%ebx,1), %ecx
leal (%edi,%ebx,1), %edx
vmovdqu (%ecx), %xmm1
vpshufb L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
vpxor %xmm6, %xmm1, %xmm1
vmovdqu %xmm1, (%esp)
vmovdqu (%ecx), %xmm3
vpshufb L_aes_gcm_avx1_bswap_mask, %xmm3, %xmm3
vpxor %xmm6, %xmm3, %xmm3
vmovdqu 64(%esp), %xmm1
vmovdqu (%esp), %xmm3
vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
vpaddd L_aes_gcm_avx1_one, %xmm1, %xmm1
vmovdqu %xmm1, 64(%esp)
@@ -11036,8 +11003,6 @@ L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
vmovdqu 16(%ecx), %xmm4
vpxor %xmm7, %xmm0, %xmm0
vpxor %xmm4, %xmm1, %xmm1
vmovdqu %xmm7, (%ecx)
vmovdqu %xmm4, 16(%ecx)
vmovdqu %xmm0, (%edx)
vmovdqu %xmm1, 16(%edx)
vmovdqu 32(%ecx), %xmm7
@@ -12733,8 +12698,6 @@ L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
vmovdqu 16(%ecx), %xmm4
vpxor %xmm7, %xmm0, %xmm0
vpxor %xmm4, %xmm1, %xmm1
vmovdqu %xmm7, (%ecx)
vmovdqu %xmm4, 16(%ecx)
vmovdqu %xmm0, (%edx)
vmovdqu %xmm1, 16(%edx)
vmovdqu 32(%ecx), %xmm7
-1
View File
@@ -504,7 +504,6 @@ _poly1305_calc_powers_avx2:
# Reduce 260-bit to 130-bit
movq %r15, %rax
movq %rsi, %rdx
movq %rbx, %rbx
andq $-4, %rax
andq $3, %r15
addq %rax, %r13
-1
View File
@@ -454,7 +454,6 @@ poly1305_calc_powers_avx2 PROC
; Reduce 260-bit to 130-bit
mov rax, rdi
mov rdx, rsi
mov rbx, rbx
and rax, -4
and rdi, 3
add r14, rax
+3
View File
@@ -45,6 +45,9 @@
#undef WOLFSSL_ARMASM
#undef WOLFSSL_RISCV_ASM
#endif
/* When WOLFSSL_X86_BUILD is defined, drop any definition of
 * USE_INTEL_SPEEDUP made before this point.
 * NOTE(review): presumably the USE_INTEL_SPEEDUP code paths depend on
 * x86_64-only assembly - confirm against the build configuration. */
#ifdef WOLFSSL_X86_BUILD
#undef USE_INTEL_SPEEDUP
#endif
#if defined(WOLFSSL_PSOC6_CRYPTO)
#include <wolfssl/wolfcrypt/port/cypress/psoc6_crypto.h>
+4 -4
View File
@@ -770,7 +770,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
"mull %[a] \n\t" \
"movl %%eax, %[l] \n\t" \
"movl %%edx, %[h] \n\t" \
: [h] "+r" (vh), [l] "+r" (vl) \
: [h] "+rm" (vh), [l] "+rm" (vl) \
: [a] "rm" (va), [b] "rm" (vb) \
: "eax", "edx", "cc" \
)
@@ -794,7 +794,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
"addl %%eax, %[l] \n\t" \
"adcl %%edx, %[h] \n\t" \
"adcl $0 , %[o] \n\t" \
: [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
: [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
: [a] "rm" (va), [b] "rm" (vb) \
: "eax", "edx", "cc" \
)
@@ -820,7 +820,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
"addl %%eax, %[l] \n\t" \
"adcl %%edx, %[h] \n\t" \
"adcl $0 , %[o] \n\t" \
: [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
: [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
: [a] "rm" (va), [b] "rm" (vb) \
: "eax", "edx", "cc" \
)
@@ -859,7 +859,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
"addl %%eax, %[l] \n\t" \
"adcl %%edx, %[h] \n\t" \
"adcl $0 , %[o] \n\t" \
: [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
: [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
: [a] "rm" (va) \
: "eax", "edx", "cc" \
)
+21 -43
View File
@@ -7656,7 +7656,7 @@ _sp_2048_sqr_32:
subq $0x110, %rsp
movq %rdi, 256(%rsp)
movq %rsi, 264(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 128(%rsi), %r9
movq (%rsi), %rdx
@@ -7820,7 +7820,7 @@ _sp_2048_sqr_32:
movq 256(%rsp), %rsi
leaq 128(%rsp), %r8
addq $0x180, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -128(%r8), %rax
subq -128(%rsi), %rax
movq -120(%r8), %rdx
@@ -8197,7 +8197,7 @@ _sp_2048_sqr_avx2_32:
subq $0x110, %rsp
movq %rdi, 256(%rsp)
movq %rsi, 264(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 128(%rsi), %r9
movq (%rsi), %rdx
@@ -8361,7 +8361,7 @@ _sp_2048_sqr_avx2_32:
movq 256(%rsp), %rsi
leaq 128(%rsp), %r8
addq $0x180, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -128(%r8), %rax
subq -128(%rsi), %rax
movq -120(%r8), %rdx
@@ -9405,7 +9405,6 @@ L_2048_mont_reduce_16_loop:
movq %rsi, %rdx
#endif /* _WIN64 */
movq %rdi, %rsi
movq %rdi, %rdi
subq $0x80, %rdi
#ifndef __APPLE__
callq sp_2048_cond_sub_16@plt
@@ -10017,7 +10016,6 @@ _sp_2048_mont_reduce_avx2_16:
movq 16(%rdi), %r14
movq 24(%rdi), %r15
addq $0x40, %rdi
xorq %rbp, %rbp
L_2048_mont_reduce_avx2_16_loop:
# mu = a[i] * mp
movq %r12, %rdx
@@ -11482,7 +11480,6 @@ L_2048_mont_reduce_32_loop:
movq %rsi, %rdx
#endif /* _WIN64 */
movq %rdi, %rsi
movq %rdi, %rdi
subq $0x100, %rdi
#ifndef __APPLE__
callq sp_2048_cond_sub_32@plt
@@ -12368,7 +12365,6 @@ _sp_2048_mont_reduce_avx2_32:
movq 16(%rdi), %r14
movq 24(%rdi), %r15
addq $0x80, %rdi
xorq %rbp, %rbp
L_2048_mont_reduce_avx2_32_loop:
# mu = a[i] * mp
movq %r12, %rdx
@@ -15173,7 +15169,7 @@ sp_2048_lshift_32:
_sp_2048_lshift_32:
#endif /* __APPLE__ */
movb %dl, %cl
movq $0x00, %r10
xorq %r10, %r10
movq 216(%rsi), %r11
movq 224(%rsi), %rdx
movq 232(%rsi), %rax
@@ -22716,7 +22712,7 @@ _sp_3072_sqr_24:
subq $0xd0, %rsp
movq %rdi, 192(%rsp)
movq %rsi, 200(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 96(%rsi), %r9
movq (%rsi), %rdx
@@ -22848,7 +22844,7 @@ _sp_3072_sqr_24:
movq 192(%rsp), %rsi
leaq 96(%rsp), %r8
addq $0x120, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -96(%r8), %rax
subq -96(%rsi), %rax
movq -88(%r8), %rdx
@@ -23141,7 +23137,7 @@ _sp_3072_sqr_avx2_24:
subq $0xd0, %rsp
movq %rdi, 192(%rsp)
movq %rsi, 200(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 96(%rsi), %r9
movq (%rsi), %rdx
@@ -23273,7 +23269,7 @@ _sp_3072_sqr_avx2_24:
movq 192(%rsp), %rsi
leaq 96(%rsp), %r8
addq $0x120, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -96(%r8), %rax
subq -96(%rsi), %rax
movq -88(%r8), %rdx
@@ -23566,7 +23562,7 @@ _sp_3072_sqr_48:
subq $0x190, %rsp
movq %rdi, 384(%rsp)
movq %rsi, 392(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 192(%rsi), %r9
movq (%rsi), %rdx
@@ -23794,7 +23790,7 @@ _sp_3072_sqr_48:
movq 384(%rsp), %rsi
leaq 192(%rsp), %r8
addq $0x240, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -192(%r8), %rax
subq -192(%rsi), %rax
movq -184(%r8), %rdx
@@ -24339,7 +24335,7 @@ _sp_3072_sqr_avx2_48:
subq $0x190, %rsp
movq %rdi, 384(%rsp)
movq %rsi, 392(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 192(%rsi), %r9
movq (%rsi), %rdx
@@ -24567,7 +24563,7 @@ _sp_3072_sqr_avx2_48:
movq 384(%rsp), %rsi
leaq 192(%rsp), %r8
addq $0x240, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -192(%r8), %rax
subq -192(%rsi), %rax
movq -184(%r8), %rdx
@@ -25973,7 +25969,6 @@ L_3072_mont_reduce_24_loop:
movq %rsi, %rdx
#endif /* _WIN64 */
movq %rdi, %rsi
movq %rdi, %rdi
subq $0xc0, %rdi
#ifndef __APPLE__
callq sp_3072_cond_sub_24@plt
@@ -26801,7 +26796,6 @@ _sp_3072_mont_reduce_avx2_24:
movq 16(%rdi), %r14
movq 24(%rdi), %r15
addq $0x60, %rdi
xorq %rbp, %rbp
L_3072_mont_reduce_avx2_24_loop:
# mu = a[i] * mp
movq %r12, %rdx
@@ -28885,7 +28879,6 @@ L_3072_mont_reduce_48_loop:
movq %rsi, %rdx
#endif /* _WIN64 */
movq %rdi, %rsi
movq %rdi, %rdi
subq $0x180, %rdi
#ifndef __APPLE__
callq sp_3072_cond_sub_48@plt
@@ -30123,7 +30116,6 @@ _sp_3072_mont_reduce_avx2_48:
movq 16(%rdi), %r14
movq 24(%rdi), %r15
addq $0xc0, %rdi
xorq %rbp, %rbp
L_3072_mont_reduce_avx2_48_loop:
# mu = a[i] * mp
movq %r12, %rdx
@@ -31900,7 +31892,7 @@ sp_3072_lshift_48:
_sp_3072_lshift_48:
#endif /* __APPLE__ */
movb %dl, %cl
movq $0x00, %r10
xorq %r10, %r10
movq 344(%rsi), %r11
movq 352(%rsi), %rdx
movq 360(%rsi), %rax
@@ -35658,7 +35650,7 @@ _sp_4096_sqr_64:
subq $0x210, %rsp
movq %rdi, 512(%rsp)
movq %rsi, 520(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 256(%rsi), %r9
movq (%rsi), %rdx
@@ -35950,7 +35942,7 @@ _sp_4096_sqr_64:
movq 512(%rsp), %rsi
leaq 256(%rsp), %r8
addq $0x300, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -256(%r8), %rax
subq -256(%rsi), %rax
movq -248(%r8), %rdx
@@ -36663,7 +36655,7 @@ _sp_4096_sqr_avx2_64:
subq $0x210, %rsp
movq %rdi, 512(%rsp)
movq %rsi, 520(%rsp)
movq $0x00, %rcx
xorq %rcx, %rcx
movq %rsp, %r8
leaq 256(%rsi), %r9
movq (%rsi), %rdx
@@ -36955,7 +36947,7 @@ _sp_4096_sqr_avx2_64:
movq 512(%rsp), %rsi
leaq 256(%rsp), %r8
addq $0x300, %rsi
movq $0x00, %rcx
xorq %rcx, %rcx
movq -256(%r8), %rax
subq -256(%rsi), %rax
movq -248(%r8), %rdx
@@ -39337,7 +39329,6 @@ L_4096_mont_reduce_64_loop:
movq %rsi, %rdx
#endif /* _WIN64 */
movq %rdi, %rsi
movq %rdi, %rdi
subq $0x200, %rdi
#ifndef __APPLE__
callq sp_4096_cond_sub_64@plt
@@ -40927,7 +40918,6 @@ _sp_4096_mont_reduce_avx2_64:
movq 16(%rdi), %r14
movq 24(%rdi), %r15
addq $0x100, %rdi
xorq %rbp, %rbp
L_4096_mont_reduce_avx2_64_loop:
# mu = a[i] * mp
movq %r12, %rdx
@@ -43260,7 +43250,7 @@ sp_4096_lshift_64:
_sp_4096_lshift_64:
#endif /* __APPLE__ */
movb %dl, %cl
movq $0x00, %r10
xorq %r10, %r10
movq 472(%rsi), %r11
movq 480(%rsi), %rdx
movq 488(%rsi), %rax
@@ -44326,15 +44316,11 @@ _sp_256_mont_sqr_4:
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %rax
movq %rdx, %rdx
movq %rax, %r8
movq %rdx, %rbx
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
movq %rax, %rax
movq %rdx, %rdx
addq %rbx, %r9
adcq %rax, %r10
adcq $0x00, %rdx
@@ -44342,8 +44328,6 @@ _sp_256_mont_sqr_4:
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
movq %rax, %rax
movq %rdx, %rdx
addq %rbx, %r11
adcq %rax, %r12
adcq $0x00, %rdx
@@ -44351,8 +44335,6 @@ _sp_256_mont_sqr_4:
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
movq %rax, %rax
movq %rdx, %rdx
addq %rbx, %r13
adcq %rax, %r14
adcq %rdx, %r15
@@ -48981,7 +48963,6 @@ L_384_mont_reduce_order_6_loop:
movq %rsi, %rdx
#endif /* _WIN64 */
movq %rdi, %rsi
movq %rdi, %rdi
subq $48, %rdi
#ifndef __APPLE__
callq sp_384_cond_sub_6@plt
@@ -56409,7 +56390,6 @@ _sp_521_mont_reduce_order_avx2_9:
movq 16(%rdi), %r14
movq 24(%rdi), %r15
addq $32, %rdi
xorq %rbp, %rbp
L_521_mont_reduce_order_avx2_9_loop:
# mu = a[i] * mp
movq %r12, %rdx
@@ -57531,7 +57511,7 @@ sp_521_lshift_9:
_sp_521_lshift_9:
#endif /* __APPLE__ */
movb %dl, %cl
movq $0x00, %r10
xorq %r10, %r10
movq 32(%rsi), %r11
movq 40(%rsi), %rdx
movq 48(%rsi), %rax
@@ -57584,7 +57564,7 @@ sp_521_lshift_18:
_sp_521_lshift_18:
#endif /* __APPLE__ */
movb %dl, %cl
movq $0x00, %r10
xorq %r10, %r10
movq 104(%rsi), %r11
movq 112(%rsi), %rdx
movq 120(%rsi), %rax
@@ -64747,7 +64727,6 @@ L_1024_mont_reduce_16_loop:
movq %rsi, %rdx
#endif /* _WIN64 */
movq %rdi, %rsi
movq %rdi, %rdi
subq $0x80, %rdi
#ifndef __APPLE__
callq sp_1024_cond_sub_16@plt
@@ -65797,7 +65776,6 @@ _sp_1024_mont_reduce_avx2_16:
movq 16(%rdi), %r14
movq 24(%rdi), %r15
addq $0x40, %rdi
xorq %rbp, %rbp
L_1024_mont_reduce_avx2_16_loop:
# mu = a[i] * mp
movq %r12, %rdx
+21 -43
View File
@@ -7505,7 +7505,7 @@ sp_2048_sqr_32 PROC
sub rsp, 272
mov QWORD PTR [rsp+256], rcx
mov QWORD PTR [rsp+264], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+128]
mov rax, QWORD PTR [rdx]
@@ -7657,7 +7657,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+256]
lea r10, QWORD PTR [rsp+128]
add rdx, 384
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-128]
sub r8, QWORD PTR [rdx+-128]
mov rax, QWORD PTR [r10+-120]
@@ -8023,7 +8023,7 @@ sp_2048_sqr_avx2_32 PROC
sub rsp, 272
mov QWORD PTR [rsp+256], rcx
mov QWORD PTR [rsp+264], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+128]
mov rax, QWORD PTR [rdx]
@@ -8175,7 +8175,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+256]
lea r10, QWORD PTR [rsp+128]
add rdx, 384
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-128]
sub r8, QWORD PTR [rdx+-128]
mov rax, QWORD PTR [r10+-120]
@@ -9179,7 +9179,6 @@ ELSE
mov r8, r9
ENDIF
mov rdx, rcx
mov rcx, rcx
sub rcx, 128
call sp_2048_cond_sub_16
pop rsi
@@ -9736,7 +9735,6 @@ sp_2048_mont_reduce_avx2_16 PROC
mov rdi, QWORD PTR [r9+16]
mov rsi, QWORD PTR [r9+24]
add r9, 64
xor rbp, rbp
L_2048_mont_reduce_avx2_16_loop:
; mu = a[i] * mp
mov rdx, r14
@@ -11190,7 +11188,6 @@ ELSE
mov r8, r9
ENDIF
mov rdx, rcx
mov rcx, rcx
sub rcx, 256
call sp_2048_cond_sub_32
pop rsi
@@ -12019,7 +12016,6 @@ sp_2048_mont_reduce_avx2_32 PROC
mov rdi, QWORD PTR [r9+16]
mov rsi, QWORD PTR [r9+24]
add r9, 128
xor rbp, rbp
L_2048_mont_reduce_avx2_32_loop:
; mu = a[i] * mp
mov rdx, r14
@@ -14805,7 +14801,7 @@ sp_2048_lshift_32 PROC
push r13
mov rax, rcx
mov cl, r8b
mov r12, 0
xor r12, r12
mov r13, QWORD PTR [rdx+216]
mov r8, QWORD PTR [rdx+224]
mov r9, QWORD PTR [rdx+232]
@@ -22145,7 +22141,7 @@ sp_3072_sqr_24 PROC
sub rsp, 208
mov QWORD PTR [rsp+192], rcx
mov QWORD PTR [rsp+200], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+96]
mov rax, QWORD PTR [rdx]
@@ -22265,7 +22261,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+192]
lea r10, QWORD PTR [rsp+96]
add rdx, 288
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-96]
sub r8, QWORD PTR [rdx+-96]
mov rax, QWORD PTR [r10+-88]
@@ -22547,7 +22543,7 @@ sp_3072_sqr_avx2_24 PROC
sub rsp, 208
mov QWORD PTR [rsp+192], rcx
mov QWORD PTR [rsp+200], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+96]
mov rax, QWORD PTR [rdx]
@@ -22667,7 +22663,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+192]
lea r10, QWORD PTR [rsp+96]
add rdx, 288
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-96]
sub r8, QWORD PTR [rdx+-96]
mov rax, QWORD PTR [r10+-88]
@@ -22949,7 +22945,7 @@ sp_3072_sqr_48 PROC
sub rsp, 400
mov QWORD PTR [rsp+384], rcx
mov QWORD PTR [rsp+392], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+192]
mov rax, QWORD PTR [rdx]
@@ -23165,7 +23161,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+384]
lea r10, QWORD PTR [rsp+192]
add rdx, 576
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-192]
sub r8, QWORD PTR [rdx+-192]
mov rax, QWORD PTR [r10+-184]
@@ -23699,7 +23695,7 @@ sp_3072_sqr_avx2_48 PROC
sub rsp, 400
mov QWORD PTR [rsp+384], rcx
mov QWORD PTR [rsp+392], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+192]
mov rax, QWORD PTR [rdx]
@@ -23915,7 +23911,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+384]
lea r10, QWORD PTR [rsp+192]
add rdx, 576
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-192]
sub r8, QWORD PTR [rdx+-192]
mov rax, QWORD PTR [r10+-184]
@@ -25292,7 +25288,6 @@ ELSE
mov r8, r9
ENDIF
mov rdx, rcx
mov rcx, rcx
sub rcx, 192
call sp_3072_cond_sub_24
pop rsi
@@ -26065,7 +26060,6 @@ sp_3072_mont_reduce_avx2_24 PROC
mov rdi, QWORD PTR [r9+16]
mov rsi, QWORD PTR [r9+24]
add r9, 96
xor rbp, rbp
L_3072_mont_reduce_avx2_24_loop:
; mu = a[i] * mp
mov rdx, r14
@@ -28138,7 +28132,6 @@ ELSE
mov r8, r9
ENDIF
mov rdx, rcx
mov rcx, rcx
sub rcx, 384
call sp_3072_cond_sub_48
pop rsi
@@ -29319,7 +29312,6 @@ sp_3072_mont_reduce_avx2_48 PROC
mov rdi, QWORD PTR [r9+16]
mov rsi, QWORD PTR [r9+24]
add r9, 192
xor rbp, rbp
L_3072_mont_reduce_avx2_48_loop:
; mu = a[i] * mp
mov rdx, r14
@@ -31077,7 +31069,7 @@ sp_3072_lshift_48 PROC
push r13
mov rax, rcx
mov cl, r8b
mov r12, 0
xor r12, r12
mov r13, QWORD PTR [rdx+344]
mov r8, QWORD PTR [rdx+352]
mov r9, QWORD PTR [rdx+360]
@@ -34728,7 +34720,7 @@ sp_4096_sqr_64 PROC
sub rsp, 528
mov QWORD PTR [rsp+512], rcx
mov QWORD PTR [rsp+520], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+256]
mov rax, QWORD PTR [rdx]
@@ -35008,7 +35000,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+512]
lea r10, QWORD PTR [rsp+256]
add rdx, 768
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-256]
sub r8, QWORD PTR [rdx+-256]
mov rax, QWORD PTR [r10+-248]
@@ -35710,7 +35702,7 @@ sp_4096_sqr_avx2_64 PROC
sub rsp, 528
mov QWORD PTR [rsp+512], rcx
mov QWORD PTR [rsp+520], rdx
mov r9, 0
xor r9, r9
mov r10, rsp
lea r11, QWORD PTR [rdx+256]
mov rax, QWORD PTR [rdx]
@@ -35990,7 +35982,7 @@ ENDIF
mov rdx, QWORD PTR [rsp+512]
lea r10, QWORD PTR [rsp+256]
add rdx, 768
mov r9, 0
xor r9, r9
mov r8, QWORD PTR [r10+-256]
sub r8, QWORD PTR [rdx+-256]
mov rax, QWORD PTR [r10+-248]
@@ -38343,7 +38335,6 @@ ELSE
mov r8, r9
ENDIF
mov rdx, rcx
mov rcx, rcx
sub rcx, 512
call sp_4096_cond_sub_64
pop rsi
@@ -39876,7 +39867,6 @@ sp_4096_mont_reduce_avx2_64 PROC
mov rdi, QWORD PTR [r9+16]
mov rsi, QWORD PTR [r9+24]
add r9, 256
xor rbp, rbp
L_4096_mont_reduce_avx2_64_loop:
; mu = a[i] * mp
mov rdx, r14
@@ -42190,7 +42180,7 @@ sp_4096_lshift_64 PROC
push r13
mov rax, rcx
mov cl, r8b
mov r12, 0
xor r12, r12
mov r13, QWORD PTR [rdx+472]
mov r8, QWORD PTR [rdx+480]
mov r9, QWORD PTR [rdx+488]
@@ -43187,15 +43177,11 @@ sp_256_mont_sqr_4 PROC
; A[0] * A[0]
mov rax, QWORD PTR [r8]
mul rax
mov rax, rax
mov rdx, rdx
mov r10, rax
mov rbx, rdx
; A[1] * A[1]
mov rax, QWORD PTR [r8+8]
mul rax
mov rax, rax
mov rdx, rdx
add r11, rbx
adc r12, rax
adc rdx, 0
@@ -43203,8 +43189,6 @@ sp_256_mont_sqr_4 PROC
; A[2] * A[2]
mov rax, QWORD PTR [r8+16]
mul rax
mov rax, rax
mov rdx, rdx
add r13, rbx
adc r14, rax
adc rdx, 0
@@ -43212,8 +43196,6 @@ sp_256_mont_sqr_4 PROC
; A[3] * A[3]
mov rax, QWORD PTR [r8+24]
mul rax
mov rax, rax
mov rdx, rdx
add r15, rbx
adc rdi, rax
adc rsi, rdx
@@ -47531,7 +47513,6 @@ ELSE
mov r8, r9
ENDIF
mov rdx, rcx
mov rcx, rcx
sub rcx, 48
call sp_384_cond_sub_6
pop rsi
@@ -54689,7 +54670,6 @@ sp_521_mont_reduce_order_avx2_9 PROC
mov rdi, QWORD PTR [r9+16]
mov rsi, QWORD PTR [r9+24]
add r9, 32
xor rbp, rbp
L_521_mont_reduce_order_avx2_9_loop:
; mu = a[i] * mp
mov rdx, r14
@@ -55781,7 +55761,7 @@ sp_521_lshift_9 PROC
push r13
mov rax, rcx
mov cl, r8b
mov r12, 0
xor r12, r12
mov r13, QWORD PTR [rdx+32]
mov r8, QWORD PTR [rdx+40]
mov r9, QWORD PTR [rdx+48]
@@ -55828,7 +55808,7 @@ sp_521_lshift_18 PROC
push r13
mov rax, rcx
mov cl, r8b
mov r12, 0
xor r12, r12
mov r13, QWORD PTR [rdx+104]
mov r8, QWORD PTR [rdx+112]
mov r9, QWORD PTR [rdx+120]
@@ -62803,7 +62783,6 @@ ELSE
mov r8, r9
ENDIF
mov rdx, rcx
mov rcx, rcx
sub rcx, 128
call sp_1024_cond_sub_16
pop rsi
@@ -63804,7 +63783,6 @@ sp_1024_mont_reduce_avx2_16 PROC
mov rdi, QWORD PTR [r9+16]
mov rsi, QWORD PTR [r9+24]
add r9, 64
xor rbp, rbp
L_1024_mont_reduce_avx2_16_loop:
; mu = a[i] * mp
mov rdx, r14
+4
View File
@@ -166,6 +166,10 @@
#include <wolfcrypt/src/misc.c>
#endif
/* Turn off USE_INTEL_SPEEDUP for the rest of this file when WOLFSSL_X86_BUILD
 * is defined, so code guarded by USE_INTEL_SPEEDUP below is not compiled.
 * NOTE(review): presumably WOLFSSL_X86_BUILD marks a 32-bit x86 target for
 * which the Intel speedup assembly is not available - confirm. */
#ifdef WOLFSSL_X86_BUILD
#undef USE_INTEL_SPEEDUP
#endif
#if defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM_PRECALC) && \
!defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM)
#define WOLFSSL_MLDSA_SIGN_SMALL_MEM
+3
View File
@@ -74,6 +74,9 @@
#undef WOLFSSL_ARMASM
#undef WOLFSSL_RISCV_ASM
#endif
/* When WOLFSSL_X86_BUILD is defined, drop USE_INTEL_SPEEDUP before the
 * headers below are included, so anything keyed on that macro from here on
 * sees it as undefined.
 * NOTE(review): presumably the Intel speedup assembly is x86_64-only and
 * WOLFSSL_X86_BUILD indicates a 32-bit x86 build - confirm. */
#ifdef WOLFSSL_X86_BUILD
#undef USE_INTEL_SPEEDUP
#endif
#include <wolfssl/wolfcrypt/wc_mlkem.h>
#include <wolfssl/wolfcrypt/sha3.h>
+3
View File
@@ -52,6 +52,9 @@
#undef WOLFSSL_ARMASM
#undef WOLFSSL_RISCV_ASM
#endif
/* When WOLFSSL_X86_BUILD is defined, undefine USE_INTEL_SPEEDUP so the
 * USE_INTEL_SPEEDUP-guarded code that follows is excluded from the build.
 * NOTE(review): presumably WOLFSSL_X86_BUILD denotes a 32-bit x86 target
 * that cannot use the Intel speedup assembly - confirm. */
#ifdef WOLFSSL_X86_BUILD
#undef USE_INTEL_SPEEDUP
#endif
#if defined(USE_INTEL_SPEEDUP)
/* CPU information for Intel. */
+2 -1
View File
@@ -29,7 +29,8 @@
#include <wolfssl/wolfcrypt/types.h>
#if defined(USE_INTEL_SPEEDUP) && !defined(NO_CURVED25519_X64)
#if defined(USE_INTEL_SPEEDUP) && defined(WOLFSSL_X86_64_BUILD) && \
!defined(NO_CURVED25519_X64)
#define CURVED25519_X64
#elif defined(HAVE___UINT128_T) && !defined(NO_CURVED25519_128BIT)
#define CURVED25519_128BIT