Merge pull request #10728 from SparkiDev/intel_asm_fixup

Intel x86/x64 assembly fixes
2026-07-05 10:40:52 +02:00 · 2026-06-25 21:41:08 -07:00
parent 23bfe9b65e fc946d6327
commit 39c0336cb1
15 changed files with 137 additions and 204 deletions
@@ -1885,7 +1885,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c
 if BUILD_CURVE25519_INTELASM
 if !BUILD_X86_ASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
-endif !BUILD_X86_ASM
+else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c
+endif BUILD_X86_ASM
 else
 if BUILD_ARMASM
 if !BUILD_FIPS_V6_PLUS
@@ -1946,7 +1948,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c
 if BUILD_CURVE25519_INTELASM
 if !BUILD_X86_ASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
-endif !BUILD_X86_ASM
+else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c
+endif BUILD_X86_ASM
 else
 if !BUILD_FIPS_V6_PLUS
 if BUILD_ARMASM
@@ -15778,7 +15778,7 @@ int wc_AesXtsDecryptSector(XtsAes* aes, byte* out, const byte* in, word32 sz,
 }
 #endif

-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)

 #if defined(USE_INTEL_SPEEDUP_FOR_AES) && !defined(USE_INTEL_SPEEDUP)
    #define USE_INTEL_SPEEDUP
@@ -15841,7 +15841,7 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo
 #endif /* HAVE_INTEL_AVX1 */
 #endif /* HAVE_AES_DECRYPT */

-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */

 #ifdef HAVE_AES_ECB
 #if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \
@@ -16094,7 +16094,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
    AES_XTS_encrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key,
        (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds);
    ret = 0;
-#elif defined(WOLFSSL_AESNI)
+#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
    if (aes->use_aesni) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16196,7 +16196,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz,
    stream->bytes_crypted_with_this_tweak = 0;

    {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
        if (aes->use_aesni) {
            SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16217,7 +16217,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz,
            RESTORE_VECTOR_REGISTERS();
        }
        else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
        {
            ret = AesXtsInitTweak_sw(xaes, stream->tweak_block);
        }
@@ -16247,7 +16247,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
 {
    int ret;

-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
    Aes *aes;
 #endif

@@ -16255,7 +16255,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
        return BAD_FUNC_ARG;
    }

-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
    aes = &xaes->aes;
 #endif

@@ -16291,7 +16291,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
    }
 #endif
    {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
        if (aes->use_aesni) {
            SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16314,7 +16314,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
            RESTORE_VECTOR_REGISTERS();
        }
        else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
        {
            ret = AesXtsEncryptUpdate_sw(xaes, out, in, sz, stream->tweak_block);
        }
@@ -16575,7 +16575,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
    AES_XTS_decrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key,
        (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds);
    ret = 0;
-#elif defined(WOLFSSL_AESNI)
+#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
    if (aes->use_aesni) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16680,7 +16680,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz,
    stream->bytes_crypted_with_this_tweak = 0;

    {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
        if (aes->use_aesni) {
            SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16701,7 +16701,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz,
            RESTORE_VECTOR_REGISTERS();
        }
        else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
        {
            ret = AesXtsInitTweak_sw(xaes, stream->tweak_block);
        }
@@ -16729,7 +16729,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
                           struct XtsAesStreamData *stream)
 {
    int ret;
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
    Aes *aes;
 #endif

@@ -16737,7 +16737,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
        return BAD_FUNC_ARG;
    }

-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
 #ifdef WC_AES_XTS_SUPPORT_SIMULTANEOUS_ENC_AND_DEC_KEYS
    aes = &xaes->aes_decrypt;
 #else
@@ -16767,7 +16767,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
 #endif

    {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
        if (aes->use_aesni) {
            SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16790,7 +16790,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
            RESTORE_VECTOR_REGISTERS();
        }
        else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
        {
            ret = AesXtsDecryptUpdate_sw(xaes, out, in, sz,
                                         stream->tweak_block);
@@ -1831,11 +1831,11 @@ _AES_ECB_decrypt_AESNI:
        push	%edi
        push	%esi
        push	%ebx
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        movl	32(%esp), %ecx
-        movl	36(%esp), %eax
+        movl	16(%esp), %edi
+        movl	20(%esp), %esi
+        movl	24(%esp), %edx
+        movl	28(%esp), %ecx
+        movl	32(%esp), %eax


        movl    %edx, %ebx
@@ -3485,7 +3485,6 @@ L_AES_GCM_decrypt_aesni_last_block_start:
        movdqa	%xmm1, %xmm12
        pclmulqdq	$0x00, %xmm0, %xmm12
        aesenc	80(%r15), %xmm8
-        movdqa	%xmm1, %xmm1
        pclmulqdq	$0x11, %xmm0, %xmm1
        aesenc	96(%r15), %xmm8
        pxor	%xmm11, %xmm10
@@ -6303,7 +6302,6 @@ L_AES_GCM_decrypt_update_aesni_last_block_start:
        movdqa	%xmm1, %xmm12
        pclmulqdq	$0x00, %xmm0, %xmm12
        aesenc	80(%rdi), %xmm8
-        movdqa	%xmm1, %xmm1
        pclmulqdq	$0x11, %xmm0, %xmm1
        aesenc	96(%rdi), %xmm8
        pxor	%xmm11, %xmm10
@@ -750,6 +750,9 @@ L_AES_GCM_encrypt_aesni_calc_aad_done:
        # First 64 bytes of input
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm3
        movdqa	%xmm4, %xmm5
        movdqa	%xmm4, %xmm6
@@ -761,9 +764,6 @@ L_AES_GCM_encrypt_aesni_calc_aad_done:
        pshufb	%xmm3, %xmm6
        paddd	L_aes_gcm_three, %xmm7
        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
        movdqa	(%ebp), %xmm3
        pxor	%xmm3, %xmm4
        pxor	%xmm3, %xmm5
@@ -867,6 +867,9 @@ L_AES_GCM_encrypt_aesni_ghash_64:
        leal	(%edi,%ebx,1), %edx
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm3
        movdqa	%xmm4, %xmm5
        movdqa	%xmm4, %xmm6
@@ -878,9 +881,6 @@ L_AES_GCM_encrypt_aesni_ghash_64:
        pshufb	%xmm3, %xmm6
        paddd	L_aes_gcm_three, %xmm7
        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
        movdqa	(%ebp), %xmm3
        pxor	%xmm3, %xmm4
        pxor	%xmm3, %xmm5
@@ -2146,6 +2146,9 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace:
        leal	(%edi,%ebx,1), %edx
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm3
        movdqa	%xmm4, %xmm5
        movdqa	%xmm4, %xmm6
@@ -2157,9 +2160,6 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace:
        pshufb	%xmm3, %xmm6
        paddd	L_aes_gcm_three, %xmm7
        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
        movdqa	(%ebp), %xmm3
        pxor	%xmm3, %xmm4
        pxor	%xmm3, %xmm5
@@ -2359,6 +2359,9 @@ L_AES_GCM_decrypt_aesni_ghash_64:
        leal	(%edi,%ebx,1), %edx
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm3
        movdqa	%xmm4, %xmm5
        movdqa	%xmm4, %xmm6
@@ -2370,9 +2373,6 @@ L_AES_GCM_decrypt_aesni_ghash_64:
        pshufb	%xmm3, %xmm6
        paddd	L_aes_gcm_three, %xmm7
        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
        movdqa	(%ebp), %xmm3
        pxor	%xmm3, %xmm4
        pxor	%xmm3, %xmm5
@@ -2455,8 +2455,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done:
        movdqu	16(%ecx), %xmm1
        pxor	%xmm0, %xmm4
        pxor	%xmm1, %xmm5
-        movdqu	%xmm0, (%ecx)
-        movdqu	%xmm1, 16(%ecx)
        movdqu	%xmm4, (%edx)
        movdqu	%xmm5, 16(%edx)
        aesenclast	%xmm3, %xmm6
@@ -2465,8 +2463,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done:
        movdqu	48(%ecx), %xmm1
        pxor	%xmm0, %xmm6
        pxor	%xmm1, %xmm7
-        movdqu	%xmm0, 32(%ecx)
-        movdqu	%xmm1, 48(%ecx)
        movdqu	%xmm6, 32(%edx)
        movdqu	%xmm7, 48(%edx)
        # ghash encrypted counter
@@ -3536,6 +3532,9 @@ AES_GCM_encrypt_update_aesni:
        # First 64 bytes of input
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm7
        movdqa	%xmm0, %xmm1
        movdqa	%xmm0, %xmm2
@@ -3547,9 +3546,6 @@ AES_GCM_encrypt_update_aesni:
        pshufb	%xmm7, %xmm2
        paddd	L_aes_gcm_three, %xmm3
        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
        movdqa	(%ebp), %xmm7
        pxor	%xmm7, %xmm0
        pxor	%xmm7, %xmm1
@@ -3644,6 +3640,8 @@ L_AES_GCM_encrypt_update_aesni_enc_done:
        movdqu	%xmm3, 48(%edi)
        cmpl	$0x40, %eax
        movl	$0x40, %ebx
+        movl	%esi, %ecx
+        movl	%edi, %edx
        jle	L_AES_GCM_encrypt_update_aesni_end_64
        # More 64 bytes of input
 L_AES_GCM_encrypt_update_aesni_ghash_64:
@@ -3651,6 +3649,9 @@ L_AES_GCM_encrypt_update_aesni_ghash_64:
        leal	(%edi,%ebx,1), %edx
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm7
        movdqa	%xmm0, %xmm1
        movdqa	%xmm0, %xmm2
@@ -3662,9 +3663,6 @@ L_AES_GCM_encrypt_update_aesni_ghash_64:
        pshufb	%xmm7, %xmm2
        paddd	L_aes_gcm_three, %xmm3
        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
        movdqa	(%ebp), %xmm7
        pxor	%xmm7, %xmm0
        pxor	%xmm7, %xmm1
@@ -4406,6 +4404,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
        leal	(%edi,%ebx,1), %edx
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm7
        movdqa	%xmm0, %xmm1
        movdqa	%xmm0, %xmm2
@@ -4417,9 +4418,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
        pshufb	%xmm7, %xmm2
        paddd	L_aes_gcm_three, %xmm3
        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
        movdqa	(%ebp), %xmm7
        pxor	%xmm7, %xmm0
        pxor	%xmm7, %xmm1
@@ -4619,6 +4617,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64:
        leal	(%edi,%ebx,1), %edx
        # Encrypt 64 bytes of counter
        movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
        movdqa	L_aes_gcm_bswap_epi64, %xmm7
        movdqa	%xmm0, %xmm1
        movdqa	%xmm0, %xmm2
@@ -4630,9 +4631,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64:
        pshufb	%xmm7, %xmm2
        paddd	L_aes_gcm_three, %xmm3
        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
        movdqa	(%ebp), %xmm7
        pxor	%xmm7, %xmm0
        pxor	%xmm7, %xmm1
@@ -4715,8 +4713,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
        movdqu	16(%ecx), %xmm5
        pxor	%xmm4, %xmm0
        pxor	%xmm5, %xmm1
-        movdqu	%xmm4, (%ecx)
-        movdqu	%xmm5, 16(%ecx)
        movdqu	%xmm0, (%edx)
        movdqu	%xmm1, 16(%edx)
        aesenclast	%xmm7, %xmm2
@@ -4725,8 +4721,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
        movdqu	48(%ecx), %xmm5
        pxor	%xmm4, %xmm2
        pxor	%xmm5, %xmm3
-        movdqu	%xmm4, 32(%ecx)
-        movdqu	%xmm5, 48(%ecx)
        movdqu	%xmm2, 32(%edx)
        movdqu	%xmm3, 48(%edx)
        # ghash encrypted counter
@@ -5556,6 +5550,8 @@ L_AES_GCM_encrypt_avx1_calc_aad_done:
        vmovdqu	%xmm3, 48(%esp)
        # First 64 bytes of input
        vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
        vpshufb	%xmm3, %xmm5, %xmm5
@@ -5564,9 +5560,6 @@ L_AES_GCM_encrypt_avx1_calc_aad_done:
        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
        vpshufb	%xmm3, %xmm7, %xmm7
        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	(%ebp), %xmm3
        vpxor	%xmm3, %xmm4, %xmm4
        vpxor	%xmm3, %xmm5, %xmm5
@@ -5649,8 +5642,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
        vmovdqu	16(%esi), %xmm1
        vpxor	%xmm0, %xmm4, %xmm4
        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm0, (%esi)
-        vmovdqu	%xmm1, 16(%esi)
        vmovdqu	%xmm4, (%edi)
        vmovdqu	%xmm5, 16(%edi)
        vaesenclast	%xmm3, %xmm6, %xmm6
@@ -5659,8 +5650,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
        vmovdqu	48(%esi), %xmm1
        vpxor	%xmm0, %xmm6, %xmm6
        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm0, 32(%esi)
-        vmovdqu	%xmm1, 48(%esi)
        vmovdqu	%xmm6, 32(%edi)
        vmovdqu	%xmm7, 48(%edi)
        cmpl	$0x40, %eax
@@ -5673,6 +5662,8 @@ L_AES_GCM_encrypt_avx1_ghash_64:
        leal	(%esi,%ebx,1), %ecx
        leal	(%edi,%ebx,1), %edx
        vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
        vpshufb	%xmm3, %xmm5, %xmm5
@@ -5681,9 +5672,6 @@ L_AES_GCM_encrypt_avx1_ghash_64:
        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
        vpshufb	%xmm3, %xmm7, %xmm7
        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	(%ebp), %xmm3
        vpxor	%xmm3, %xmm4, %xmm4
        vpxor	%xmm3, %xmm5, %xmm5
@@ -5864,7 +5852,7 @@ L_AES_GCM_encrypt_avx1_end_64:
        vmovdqu	96(%esp), %xmm2
        # Block 1
        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	(%edx), %xmm1
+        vmovdqu	(%edx), %xmm1
        vpshufb	%xmm4, %xmm1, %xmm1
        vmovdqu	48(%esp), %xmm3
        vpxor	%xmm2, %xmm1, %xmm1
@@ -5886,7 +5874,7 @@ L_AES_GCM_encrypt_avx1_end_64:
        vpxor	%xmm5, %xmm2, %xmm2
        # Block 2
        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	16(%edx), %xmm1
+        vmovdqu	16(%edx), %xmm1
        vpshufb	%xmm4, %xmm1, %xmm1
        vmovdqu	32(%esp), %xmm3
        # ghash_gfmul_xor_avx
@@ -5907,7 +5895,7 @@ L_AES_GCM_encrypt_avx1_end_64:
        vpxor	%xmm5, %xmm2, %xmm2
        # Block 3
        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	32(%edx), %xmm1
+        vmovdqu	32(%edx), %xmm1
        vpshufb	%xmm4, %xmm1, %xmm1
        vmovdqu	16(%esp), %xmm3
        # ghash_gfmul_xor_avx
@@ -5928,7 +5916,7 @@ L_AES_GCM_encrypt_avx1_end_64:
        vpxor	%xmm5, %xmm2, %xmm2
        # Block 4
        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	48(%edx), %xmm1
+        vmovdqu	48(%edx), %xmm1
        vpshufb	%xmm4, %xmm1, %xmm1
        vmovdqu	(%esp), %xmm3
        # ghash_gfmul_xor_avx
@@ -6776,6 +6764,8 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace:
        leal	(%esi,%ebx,1), %ecx
        leal	(%edi,%ebx,1), %edx
        vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
        vpshufb	%xmm3, %xmm5, %xmm5
@@ -6784,9 +6774,6 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace:
        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
        vpshufb	%xmm3, %xmm7, %xmm7
        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	(%ebp), %xmm3
        vpxor	%xmm3, %xmm4, %xmm4
        vpxor	%xmm3, %xmm5, %xmm5
@@ -6972,6 +6959,8 @@ L_AES_GCM_decrypt_avx1_ghash_64:
        leal	(%esi,%ebx,1), %ecx
        leal	(%edi,%ebx,1), %edx
        vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
        vpshufb	%xmm3, %xmm5, %xmm5
@@ -6980,9 +6969,6 @@ L_AES_GCM_decrypt_avx1_ghash_64:
        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
        vpshufb	%xmm3, %xmm7, %xmm7
        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
        vmovdqa	(%ebp), %xmm3
        vpxor	%xmm3, %xmm4, %xmm4
        vpxor	%xmm3, %xmm5, %xmm5
@@ -7065,8 +7051,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
        vmovdqu	16(%ecx), %xmm1
        vpxor	%xmm0, %xmm4, %xmm4
        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm0, (%ecx)
-        vmovdqu	%xmm1, 16(%ecx)
        vmovdqu	%xmm4, (%edx)
        vmovdqu	%xmm5, 16(%edx)
        vaesenclast	%xmm3, %xmm6, %xmm6
@@ -7075,8 +7059,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
        vmovdqu	48(%ecx), %xmm1
        vpxor	%xmm0, %xmm6, %xmm6
        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm0, 32(%ecx)
-        vmovdqu	%xmm1, 48(%ecx)
        vmovdqu	%xmm6, 32(%edx)
        vmovdqu	%xmm7, 48(%edx)
        # ghash encrypted counter
@@ -7181,7 +7163,6 @@ L_AES_GCM_decrypt_avx1_last_block_start:
        pshufb	L_aes_gcm_avx1_bswap_mask, %xmm7
        pxor	%xmm2, %xmm7
        vmovdqu	64(%esp), %xmm5
-        vmovdqu	%xmm7, %xmm7
        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
        vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
        vmovdqu	%xmm5, 64(%esp)
@@ -7995,6 +7976,8 @@ AES_GCM_encrypt_update_avx1:
        vmovdqu	%xmm7, 48(%esp)
        # First 64 bytes of input
        vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
        vpshufb	%xmm7, %xmm1, %xmm1
@@ -8003,9 +7986,6 @@ AES_GCM_encrypt_update_avx1:
        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
        vpshufb	%xmm7, %xmm3, %xmm3
        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	(%ebp), %xmm7
        vpxor	%xmm7, %xmm0, %xmm0
        vpxor	%xmm7, %xmm1, %xmm1
@@ -8088,8 +8068,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
        vmovdqu	16(%esi), %xmm5
        vpxor	%xmm4, %xmm0, %xmm0
        vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm4, (%esi)
-        vmovdqu	%xmm5, 16(%esi)
        vmovdqu	%xmm0, (%edi)
        vmovdqu	%xmm1, 16(%edi)
        vaesenclast	%xmm7, %xmm2, %xmm2
@@ -8098,8 +8076,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
        vmovdqu	48(%esi), %xmm5
        vpxor	%xmm4, %xmm2, %xmm2
        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm4, 32(%esi)
-        vmovdqu	%xmm5, 48(%esi)
        vmovdqu	%xmm2, 32(%edi)
        vmovdqu	%xmm3, 48(%edi)
        cmpl	$0x40, %eax
@@ -8112,6 +8088,8 @@ L_AES_GCM_encrypt_update_avx1_ghash_64:
        leal	(%esi,%ebx,1), %ecx
        leal	(%edi,%ebx,1), %edx
        vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
        vpshufb	%xmm7, %xmm1, %xmm1
@@ -8120,9 +8098,6 @@ L_AES_GCM_encrypt_update_avx1_ghash_64:
        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
        vpshufb	%xmm7, %xmm3, %xmm3
        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	(%ebp), %xmm7
        vpxor	%xmm7, %xmm0, %xmm0
        vpxor	%xmm7, %xmm1, %xmm1
@@ -8754,6 +8729,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
        leal	(%esi,%ebx,1), %ecx
        leal	(%edi,%ebx,1), %edx
        vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
        vpshufb	%xmm7, %xmm1, %xmm1
@@ -8762,9 +8739,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
        vpshufb	%xmm7, %xmm3, %xmm3
        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	(%ebp), %xmm7
        vpxor	%xmm7, %xmm0, %xmm0
        vpxor	%xmm7, %xmm1, %xmm1
@@ -8950,6 +8924,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64:
        leal	(%esi,%ebx,1), %ecx
        leal	(%edi,%ebx,1), %edx
        vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
        vpshufb	%xmm7, %xmm1, %xmm1
@@ -8958,9 +8934,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64:
        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
        vpshufb	%xmm7, %xmm3, %xmm3
        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
        vmovdqa	(%ebp), %xmm7
        vpxor	%xmm7, %xmm0, %xmm0
        vpxor	%xmm7, %xmm1, %xmm1
@@ -9043,8 +9016,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
        vmovdqu	16(%ecx), %xmm5
        vpxor	%xmm4, %xmm0, %xmm0
        vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm4, (%ecx)
-        vmovdqu	%xmm5, 16(%ecx)
        vmovdqu	%xmm0, (%edx)
        vmovdqu	%xmm1, 16(%edx)
        vaesenclast	%xmm7, %xmm2, %xmm2
@@ -9053,8 +9024,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
        vmovdqu	48(%ecx), %xmm5
        vpxor	%xmm4, %xmm2, %xmm2
        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm4, 32(%ecx)
-        vmovdqu	%xmm5, 48(%ecx)
        vmovdqu	%xmm2, 32(%edx)
        vmovdqu	%xmm3, 48(%edx)
        # ghash encrypted counter
@@ -9155,12 +9124,10 @@ L_AES_GCM_decrypt_update_avx1_done_64:
 L_AES_GCM_decrypt_update_avx1_last_block_start:
        leal	(%esi,%ebx,1), %ecx
        leal	(%edi,%ebx,1), %edx
-        vmovdqu	(%ecx), %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm1, %xmm1
-        vmovdqu	%xmm1, (%esp)
+        vmovdqu	(%ecx), %xmm3
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm3, %xmm3
+        vpxor	%xmm6, %xmm3, %xmm3
        vmovdqu	64(%esp), %xmm1
-        vmovdqu	(%esp), %xmm3
        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
        vmovdqu	%xmm1, 64(%esp)
@@ -11036,8 +11003,6 @@ L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
        vmovdqu	16(%ecx), %xmm4
        vpxor	%xmm7, %xmm0, %xmm0
        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, (%ecx)
-        vmovdqu	%xmm4, 16(%ecx)
        vmovdqu	%xmm0, (%edx)
        vmovdqu	%xmm1, 16(%edx)
        vmovdqu	32(%ecx), %xmm7
@@ -12733,8 +12698,6 @@ L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
        vmovdqu	16(%ecx), %xmm4
        vpxor	%xmm7, %xmm0, %xmm0
        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, (%ecx)
-        vmovdqu	%xmm4, 16(%ecx)
        vmovdqu	%xmm0, (%edx)
        vmovdqu	%xmm1, 16(%edx)
        vmovdqu	32(%ecx), %xmm7
@@ -504,7 +504,6 @@ _poly1305_calc_powers_avx2:
        # Reduce 260-bit to 130-bit
        movq	%r15, %rax
        movq	%rsi, %rdx
-        movq	%rbx, %rbx
        andq	$-4, %rax
        andq	$3, %r15
        addq	%rax, %r13
@@ -454,7 +454,6 @@ poly1305_calc_powers_avx2 PROC
        ; Reduce 260-bit to 130-bit
        mov	rax, rdi
        mov	rdx, rsi
-        mov	rbx, rbx
        and	rax, -4
        and	rdi, 3
        add	r14, rax
@@ -45,6 +45,9 @@
    #undef WOLFSSL_ARMASM
    #undef WOLFSSL_RISCV_ASM
 #endif
+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif

 #if defined(WOLFSSL_PSOC6_CRYPTO)
    #include <wolfssl/wolfcrypt/port/cypress/psoc6_crypto.h>
@@ -770,7 +770,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
        "mull	%[a]		\n\t"                    \
        "movl	%%eax, %[l]	\n\t"                    \
        "movl	%%edx, %[h]	\n\t"                    \
-        : [h] "+r" (vh), [l] "+r" (vl)                   \
+        : [h] "+rm" (vh), [l] "+rm" (vl)                 \
        : [a] "rm" (va), [b] "rm" (vb)                   \
        : "eax", "edx", "cc"                             \
    )
@@ -794,7 +794,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
        "addl	%%eax, %[l]	\n\t"                    \
        "adcl	%%edx, %[h]	\n\t"                    \
        "adcl	$0   , %[o]	\n\t"                    \
-        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
+        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
        : [a] "rm" (va), [b] "rm" (vb)                   \
        : "eax", "edx", "cc"                             \
    )
@@ -820,7 +820,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
        "addl	%%eax, %[l]	\n\t"                    \
        "adcl	%%edx, %[h]	\n\t"                    \
        "adcl	$0   , %[o]	\n\t"                    \
-        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
+        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
        : [a] "rm" (va), [b] "rm" (vb)                   \
        : "eax", "edx", "cc"                             \
    )
@@ -859,7 +859,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
        "addl	%%eax, %[l]	\n\t"                    \
        "adcl	%%edx, %[h]	\n\t"                    \
        "adcl	$0   , %[o]	\n\t"                    \
-        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
+        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
        : [a] "rm" (va)                                  \
        : "eax", "edx", "cc"                             \
    )
@@ -7656,7 +7656,7 @@ _sp_2048_sqr_32:
        subq	$0x110, %rsp
        movq	%rdi, 256(%rsp)
        movq	%rsi, 264(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	128(%rsi), %r9
        movq	(%rsi), %rdx
@@ -7820,7 +7820,7 @@ _sp_2048_sqr_32:
        movq	256(%rsp), %rsi
        leaq	128(%rsp), %r8
        addq	$0x180, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-128(%r8), %rax
        subq	-128(%rsi), %rax
        movq	-120(%r8), %rdx
@@ -8197,7 +8197,7 @@ _sp_2048_sqr_avx2_32:
        subq	$0x110, %rsp
        movq	%rdi, 256(%rsp)
        movq	%rsi, 264(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	128(%rsi), %r9
        movq	(%rsi), %rdx
@@ -8361,7 +8361,7 @@ _sp_2048_sqr_avx2_32:
        movq	256(%rsp), %rsi
        leaq	128(%rsp), %r8
        addq	$0x180, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-128(%r8), %rax
        subq	-128(%rsi), %rax
        movq	-120(%r8), %rdx
@@ -9405,7 +9405,6 @@ L_2048_mont_reduce_16_loop:
        movq	%rsi, %rdx
 #endif /* _WIN64 */
        movq	%rdi, %rsi
-        movq	%rdi, %rdi
        subq	$0x80, %rdi
 #ifndef __APPLE__
        callq	sp_2048_cond_sub_16@plt
@@ -10017,7 +10016,6 @@ _sp_2048_mont_reduce_avx2_16:
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        addq	$0x40, %rdi
-        xorq	%rbp, %rbp
 L_2048_mont_reduce_avx2_16_loop:
        # mu = a[i] * mp
        movq	%r12, %rdx
@@ -11482,7 +11480,6 @@ L_2048_mont_reduce_32_loop:
        movq	%rsi, %rdx
 #endif /* _WIN64 */
        movq	%rdi, %rsi
-        movq	%rdi, %rdi
        subq	$0x100, %rdi
 #ifndef __APPLE__
        callq	sp_2048_cond_sub_32@plt
@@ -12368,7 +12365,6 @@ _sp_2048_mont_reduce_avx2_32:
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        addq	$0x80, %rdi
-        xorq	%rbp, %rbp
 L_2048_mont_reduce_avx2_32_loop:
        # mu = a[i] * mp
        movq	%r12, %rdx
@@ -15173,7 +15169,7 @@ sp_2048_lshift_32:
 _sp_2048_lshift_32:
 #endif /* __APPLE__ */
        movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
        movq	216(%rsi), %r11
        movq	224(%rsi), %rdx
        movq	232(%rsi), %rax
@@ -22716,7 +22712,7 @@ _sp_3072_sqr_24:
        subq	$0xd0, %rsp
        movq	%rdi, 192(%rsp)
        movq	%rsi, 200(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	96(%rsi), %r9
        movq	(%rsi), %rdx
@@ -22848,7 +22844,7 @@ _sp_3072_sqr_24:
        movq	192(%rsp), %rsi
        leaq	96(%rsp), %r8
        addq	$0x120, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-96(%r8), %rax
        subq	-96(%rsi), %rax
        movq	-88(%r8), %rdx
@@ -23141,7 +23137,7 @@ _sp_3072_sqr_avx2_24:
        subq	$0xd0, %rsp
        movq	%rdi, 192(%rsp)
        movq	%rsi, 200(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	96(%rsi), %r9
        movq	(%rsi), %rdx
@@ -23273,7 +23269,7 @@ _sp_3072_sqr_avx2_24:
        movq	192(%rsp), %rsi
        leaq	96(%rsp), %r8
        addq	$0x120, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-96(%r8), %rax
        subq	-96(%rsi), %rax
        movq	-88(%r8), %rdx
@@ -23566,7 +23562,7 @@ _sp_3072_sqr_48:
        subq	$0x190, %rsp
        movq	%rdi, 384(%rsp)
        movq	%rsi, 392(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	192(%rsi), %r9
        movq	(%rsi), %rdx
@@ -23794,7 +23790,7 @@ _sp_3072_sqr_48:
        movq	384(%rsp), %rsi
        leaq	192(%rsp), %r8
        addq	$0x240, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-192(%r8), %rax
        subq	-192(%rsi), %rax
        movq	-184(%r8), %rdx
@@ -24339,7 +24335,7 @@ _sp_3072_sqr_avx2_48:
        subq	$0x190, %rsp
        movq	%rdi, 384(%rsp)
        movq	%rsi, 392(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	192(%rsi), %r9
        movq	(%rsi), %rdx
@@ -24567,7 +24563,7 @@ _sp_3072_sqr_avx2_48:
        movq	384(%rsp), %rsi
        leaq	192(%rsp), %r8
        addq	$0x240, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-192(%r8), %rax
        subq	-192(%rsi), %rax
        movq	-184(%r8), %rdx
@@ -25973,7 +25969,6 @@ L_3072_mont_reduce_24_loop:
        movq	%rsi, %rdx
 #endif /* _WIN64 */
        movq	%rdi, %rsi
-        movq	%rdi, %rdi
        subq	$0xc0, %rdi
 #ifndef __APPLE__
        callq	sp_3072_cond_sub_24@plt
@@ -26801,7 +26796,6 @@ _sp_3072_mont_reduce_avx2_24:
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        addq	$0x60, %rdi
-        xorq	%rbp, %rbp
 L_3072_mont_reduce_avx2_24_loop:
        # mu = a[i] * mp
        movq	%r12, %rdx
@@ -28885,7 +28879,6 @@ L_3072_mont_reduce_48_loop:
        movq	%rsi, %rdx
 #endif /* _WIN64 */
        movq	%rdi, %rsi
-        movq	%rdi, %rdi
        subq	$0x180, %rdi
 #ifndef __APPLE__
        callq	sp_3072_cond_sub_48@plt
@@ -30123,7 +30116,6 @@ _sp_3072_mont_reduce_avx2_48:
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        addq	$0xc0, %rdi
-        xorq	%rbp, %rbp
 L_3072_mont_reduce_avx2_48_loop:
        # mu = a[i] * mp
        movq	%r12, %rdx
@@ -31900,7 +31892,7 @@ sp_3072_lshift_48:
 _sp_3072_lshift_48:
 #endif /* __APPLE__ */
        movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
        movq	344(%rsi), %r11
        movq	352(%rsi), %rdx
        movq	360(%rsi), %rax
@@ -35658,7 +35650,7 @@ _sp_4096_sqr_64:
        subq	$0x210, %rsp
        movq	%rdi, 512(%rsp)
        movq	%rsi, 520(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	256(%rsi), %r9
        movq	(%rsi), %rdx
@@ -35950,7 +35942,7 @@ _sp_4096_sqr_64:
        movq	512(%rsp), %rsi
        leaq	256(%rsp), %r8
        addq	$0x300, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-256(%r8), %rax
        subq	-256(%rsi), %rax
        movq	-248(%r8), %rdx
@@ -36663,7 +36655,7 @@ _sp_4096_sqr_avx2_64:
        subq	$0x210, %rsp
        movq	%rdi, 512(%rsp)
        movq	%rsi, 520(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	%rsp, %r8
        leaq	256(%rsi), %r9
        movq	(%rsi), %rdx
@@ -36955,7 +36947,7 @@ _sp_4096_sqr_avx2_64:
        movq	512(%rsp), %rsi
        leaq	256(%rsp), %r8
        addq	$0x300, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
        movq	-256(%r8), %rax
        subq	-256(%rsi), %rax
        movq	-248(%r8), %rdx
@@ -39337,7 +39329,6 @@ L_4096_mont_reduce_64_loop:
        movq	%rsi, %rdx
 #endif /* _WIN64 */
        movq	%rdi, %rsi
-        movq	%rdi, %rdi
        subq	$0x200, %rdi
 #ifndef __APPLE__
        callq	sp_4096_cond_sub_64@plt
@@ -40927,7 +40918,6 @@ _sp_4096_mont_reduce_avx2_64:
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        addq	$0x100, %rdi
-        xorq	%rbp, %rbp
 L_4096_mont_reduce_avx2_64_loop:
        # mu = a[i] * mp
        movq	%r12, %rdx
@@ -43260,7 +43250,7 @@ sp_4096_lshift_64:
 _sp_4096_lshift_64:
 #endif /* __APPLE__ */
        movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
        movq	472(%rsi), %r11
        movq	480(%rsi), %rdx
        movq	488(%rsi), %rax
@@ -44326,15 +44316,11 @@ _sp_256_mont_sqr_4:
        #  A[0] * A[0]
        movq	(%rsi), %rax
        mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
        movq	%rax, %r8
        movq	%rdx, %rbx
        #  A[1] * A[1]
        movq	8(%rsi), %rax
        mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
        addq	%rbx, %r9
        adcq	%rax, %r10
        adcq	$0x00, %rdx
@@ -44342,8 +44328,6 @@ _sp_256_mont_sqr_4:
        #  A[2] * A[2]
        movq	16(%rsi), %rax
        mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
        addq	%rbx, %r11
        adcq	%rax, %r12
        adcq	$0x00, %rdx
@@ -44351,8 +44335,6 @@ _sp_256_mont_sqr_4:
        #  A[3] * A[3]
        movq	24(%rsi), %rax
        mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
        addq	%rbx, %r13
        adcq	%rax, %r14
        adcq	%rdx, %r15
@@ -48981,7 +48963,6 @@ L_384_mont_reduce_order_6_loop:
        movq	%rsi, %rdx
 #endif /* _WIN64 */
        movq	%rdi, %rsi
-        movq	%rdi, %rdi
        subq	$48, %rdi
 #ifndef __APPLE__
        callq	sp_384_cond_sub_6@plt
@@ -56409,7 +56390,6 @@ _sp_521_mont_reduce_order_avx2_9:
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        addq	$32, %rdi
-        xorq	%rbp, %rbp
 L_521_mont_reduce_order_avx2_9_loop:
        # mu = a[i] * mp
        movq	%r12, %rdx
@@ -57531,7 +57511,7 @@ sp_521_lshift_9:
 _sp_521_lshift_9:
 #endif /* __APPLE__ */
        movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
        movq	32(%rsi), %r11
        movq	40(%rsi), %rdx
        movq	48(%rsi), %rax
@@ -57584,7 +57564,7 @@ sp_521_lshift_18:
 _sp_521_lshift_18:
 #endif /* __APPLE__ */
        movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
        movq	104(%rsi), %r11
        movq	112(%rsi), %rdx
        movq	120(%rsi), %rax
@@ -64747,7 +64727,6 @@ L_1024_mont_reduce_16_loop:
        movq	%rsi, %rdx
 #endif /* _WIN64 */
        movq	%rdi, %rsi
-        movq	%rdi, %rdi
        subq	$0x80, %rdi
 #ifndef __APPLE__
        callq	sp_1024_cond_sub_16@plt
@@ -65797,7 +65776,6 @@ _sp_1024_mont_reduce_avx2_16:
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        addq	$0x40, %rdi
-        xorq	%rbp, %rbp
 L_1024_mont_reduce_avx2_16_loop:
        # mu = a[i] * mp
        movq	%r12, %rdx
@@ -7505,7 +7505,7 @@ sp_2048_sqr_32 PROC
        sub	rsp, 272
        mov	QWORD PTR [rsp+256], rcx
        mov	QWORD PTR [rsp+264], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+128]
        mov	rax, QWORD PTR [rdx]
@@ -7657,7 +7657,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+256]
        lea	r10, QWORD PTR [rsp+128]
        add	rdx, 384
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-128]
        sub	r8, QWORD PTR [rdx+-128]
        mov	rax, QWORD PTR [r10+-120]
@@ -8023,7 +8023,7 @@ sp_2048_sqr_avx2_32 PROC
        sub	rsp, 272
        mov	QWORD PTR [rsp+256], rcx
        mov	QWORD PTR [rsp+264], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+128]
        mov	rax, QWORD PTR [rdx]
@@ -8175,7 +8175,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+256]
        lea	r10, QWORD PTR [rsp+128]
        add	rdx, 384
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-128]
        sub	r8, QWORD PTR [rdx+-128]
        mov	rax, QWORD PTR [r10+-120]
@@ -9179,7 +9179,6 @@ ELSE
        mov	r8, r9
 ENDIF
        mov	rdx, rcx
-        mov	rcx, rcx
        sub	rcx, 128
        call	sp_2048_cond_sub_16
        pop	rsi
@@ -9736,7 +9735,6 @@ sp_2048_mont_reduce_avx2_16 PROC
        mov	rdi, QWORD PTR [r9+16]
        mov	rsi, QWORD PTR [r9+24]
        add	r9, 64
-        xor	rbp, rbp
 L_2048_mont_reduce_avx2_16_loop:
        ; mu = a[i] * mp
        mov	rdx, r14
@@ -11190,7 +11188,6 @@ ELSE
        mov	r8, r9
 ENDIF
        mov	rdx, rcx
-        mov	rcx, rcx
        sub	rcx, 256
        call	sp_2048_cond_sub_32
        pop	rsi
@@ -12019,7 +12016,6 @@ sp_2048_mont_reduce_avx2_32 PROC
        mov	rdi, QWORD PTR [r9+16]
        mov	rsi, QWORD PTR [r9+24]
        add	r9, 128
-        xor	rbp, rbp
 L_2048_mont_reduce_avx2_32_loop:
        ; mu = a[i] * mp
        mov	rdx, r14
@@ -14805,7 +14801,7 @@ sp_2048_lshift_32 PROC
        push	r13
        mov	rax, rcx
        mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
        mov	r13, QWORD PTR [rdx+216]
        mov	r8, QWORD PTR [rdx+224]
        mov	r9, QWORD PTR [rdx+232]
@@ -22145,7 +22141,7 @@ sp_3072_sqr_24 PROC
        sub	rsp, 208
        mov	QWORD PTR [rsp+192], rcx
        mov	QWORD PTR [rsp+200], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+96]
        mov	rax, QWORD PTR [rdx]
@@ -22265,7 +22261,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+192]
        lea	r10, QWORD PTR [rsp+96]
        add	rdx, 288
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-96]
        sub	r8, QWORD PTR [rdx+-96]
        mov	rax, QWORD PTR [r10+-88]
@@ -22547,7 +22543,7 @@ sp_3072_sqr_avx2_24 PROC
        sub	rsp, 208
        mov	QWORD PTR [rsp+192], rcx
        mov	QWORD PTR [rsp+200], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+96]
        mov	rax, QWORD PTR [rdx]
@@ -22667,7 +22663,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+192]
        lea	r10, QWORD PTR [rsp+96]
        add	rdx, 288
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-96]
        sub	r8, QWORD PTR [rdx+-96]
        mov	rax, QWORD PTR [r10+-88]
@@ -22949,7 +22945,7 @@ sp_3072_sqr_48 PROC
        sub	rsp, 400
        mov	QWORD PTR [rsp+384], rcx
        mov	QWORD PTR [rsp+392], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+192]
        mov	rax, QWORD PTR [rdx]
@@ -23165,7 +23161,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+384]
        lea	r10, QWORD PTR [rsp+192]
        add	rdx, 576
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-192]
        sub	r8, QWORD PTR [rdx+-192]
        mov	rax, QWORD PTR [r10+-184]
@@ -23699,7 +23695,7 @@ sp_3072_sqr_avx2_48 PROC
        sub	rsp, 400
        mov	QWORD PTR [rsp+384], rcx
        mov	QWORD PTR [rsp+392], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+192]
        mov	rax, QWORD PTR [rdx]
@@ -23915,7 +23911,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+384]
        lea	r10, QWORD PTR [rsp+192]
        add	rdx, 576
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-192]
        sub	r8, QWORD PTR [rdx+-192]
        mov	rax, QWORD PTR [r10+-184]
@@ -25292,7 +25288,6 @@ ELSE
        mov	r8, r9
 ENDIF
        mov	rdx, rcx
-        mov	rcx, rcx
        sub	rcx, 192
        call	sp_3072_cond_sub_24
        pop	rsi
@@ -26065,7 +26060,6 @@ sp_3072_mont_reduce_avx2_24 PROC
        mov	rdi, QWORD PTR [r9+16]
        mov	rsi, QWORD PTR [r9+24]
        add	r9, 96
-        xor	rbp, rbp
 L_3072_mont_reduce_avx2_24_loop:
        ; mu = a[i] * mp
        mov	rdx, r14
@@ -28138,7 +28132,6 @@ ELSE
        mov	r8, r9
 ENDIF
        mov	rdx, rcx
-        mov	rcx, rcx
        sub	rcx, 384
        call	sp_3072_cond_sub_48
        pop	rsi
@@ -29319,7 +29312,6 @@ sp_3072_mont_reduce_avx2_48 PROC
        mov	rdi, QWORD PTR [r9+16]
        mov	rsi, QWORD PTR [r9+24]
        add	r9, 192
-        xor	rbp, rbp
 L_3072_mont_reduce_avx2_48_loop:
        ; mu = a[i] * mp
        mov	rdx, r14
@@ -31077,7 +31069,7 @@ sp_3072_lshift_48 PROC
        push	r13
        mov	rax, rcx
        mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
        mov	r13, QWORD PTR [rdx+344]
        mov	r8, QWORD PTR [rdx+352]
        mov	r9, QWORD PTR [rdx+360]
@@ -34728,7 +34720,7 @@ sp_4096_sqr_64 PROC
        sub	rsp, 528
        mov	QWORD PTR [rsp+512], rcx
        mov	QWORD PTR [rsp+520], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+256]
        mov	rax, QWORD PTR [rdx]
@@ -35008,7 +35000,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+512]
        lea	r10, QWORD PTR [rsp+256]
        add	rdx, 768
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-256]
        sub	r8, QWORD PTR [rdx+-256]
        mov	rax, QWORD PTR [r10+-248]
@@ -35710,7 +35702,7 @@ sp_4096_sqr_avx2_64 PROC
        sub	rsp, 528
        mov	QWORD PTR [rsp+512], rcx
        mov	QWORD PTR [rsp+520], rdx
-        mov	r9, 0
+        xor	r9, r9
        mov	r10, rsp
        lea	r11, QWORD PTR [rdx+256]
        mov	rax, QWORD PTR [rdx]
@@ -35990,7 +35982,7 @@ ENDIF
        mov	rdx, QWORD PTR [rsp+512]
        lea	r10, QWORD PTR [rsp+256]
        add	rdx, 768
-        mov	r9, 0
+        xor	r9, r9
        mov	r8, QWORD PTR [r10+-256]
        sub	r8, QWORD PTR [rdx+-256]
        mov	rax, QWORD PTR [r10+-248]
@@ -38343,7 +38335,6 @@ ELSE
        mov	r8, r9
 ENDIF
        mov	rdx, rcx
-        mov	rcx, rcx
        sub	rcx, 512
        call	sp_4096_cond_sub_64
        pop	rsi
@@ -39876,7 +39867,6 @@ sp_4096_mont_reduce_avx2_64 PROC
        mov	rdi, QWORD PTR [r9+16]
        mov	rsi, QWORD PTR [r9+24]
        add	r9, 256
-        xor	rbp, rbp
 L_4096_mont_reduce_avx2_64_loop:
        ; mu = a[i] * mp
        mov	rdx, r14
@@ -42190,7 +42180,7 @@ sp_4096_lshift_64 PROC
        push	r13
        mov	rax, rcx
        mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
        mov	r13, QWORD PTR [rdx+472]
        mov	r8, QWORD PTR [rdx+480]
        mov	r9, QWORD PTR [rdx+488]
@@ -43187,15 +43177,11 @@ sp_256_mont_sqr_4 PROC
        ;  A[0] * A[0]
        mov	rax, QWORD PTR [r8]
        mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
        mov	r10, rax
        mov	rbx, rdx
        ;  A[1] * A[1]
        mov	rax, QWORD PTR [r8+8]
        mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
        add	r11, rbx
        adc	r12, rax
        adc	rdx, 0
@@ -43203,8 +43189,6 @@ sp_256_mont_sqr_4 PROC
        ;  A[2] * A[2]
        mov	rax, QWORD PTR [r8+16]
        mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
        add	r13, rbx
        adc	r14, rax
        adc	rdx, 0
@@ -43212,8 +43196,6 @@ sp_256_mont_sqr_4 PROC
        ;  A[3] * A[3]
        mov	rax, QWORD PTR [r8+24]
        mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
        add	r15, rbx
        adc	rdi, rax
        adc	rsi, rdx
@@ -47531,7 +47513,6 @@ ELSE
        mov	r8, r9
 ENDIF
        mov	rdx, rcx
-        mov	rcx, rcx
        sub	rcx, 48
        call	sp_384_cond_sub_6
        pop	rsi
@@ -54689,7 +54670,6 @@ sp_521_mont_reduce_order_avx2_9 PROC
        mov	rdi, QWORD PTR [r9+16]
        mov	rsi, QWORD PTR [r9+24]
        add	r9, 32
-        xor	rbp, rbp
 L_521_mont_reduce_order_avx2_9_loop:
        ; mu = a[i] * mp
        mov	rdx, r14
@@ -55781,7 +55761,7 @@ sp_521_lshift_9 PROC
        push	r13
        mov	rax, rcx
        mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
        mov	r13, QWORD PTR [rdx+32]
        mov	r8, QWORD PTR [rdx+40]
        mov	r9, QWORD PTR [rdx+48]
@@ -55828,7 +55808,7 @@ sp_521_lshift_18 PROC
        push	r13
        mov	rax, rcx
        mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
        mov	r13, QWORD PTR [rdx+104]
        mov	r8, QWORD PTR [rdx+112]
        mov	r9, QWORD PTR [rdx+120]
@@ -62803,7 +62783,6 @@ ELSE
        mov	r8, r9
 ENDIF
        mov	rdx, rcx
-        mov	rcx, rcx
        sub	rcx, 128
        call	sp_1024_cond_sub_16
        pop	rsi
@@ -63804,7 +63783,6 @@ sp_1024_mont_reduce_avx2_16 PROC
        mov	rdi, QWORD PTR [r9+16]
        mov	rsi, QWORD PTR [r9+24]
        add	r9, 64
-        xor	rbp, rbp
 L_1024_mont_reduce_avx2_16_loop:
        ; mu = a[i] * mp
        mov	rdx, r14
@@ -166,6 +166,10 @@
    #include <wolfcrypt/src/misc.c>
 #endif

+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif
+
 #if defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM_PRECALC) && \
        !defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM)
    #define WOLFSSL_MLDSA_SIGN_SMALL_MEM
@@ -74,6 +74,9 @@
    #undef WOLFSSL_ARMASM
    #undef WOLFSSL_RISCV_ASM
 #endif
+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif

 #include <wolfssl/wolfcrypt/wc_mlkem.h>
 #include <wolfssl/wolfcrypt/sha3.h>
@@ -52,6 +52,9 @@
    #undef WOLFSSL_ARMASM
    #undef WOLFSSL_RISCV_ASM
 #endif
+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif

 #if defined(USE_INTEL_SPEEDUP)
 /* CPU information for Intel. */
@@ -29,7 +29,8 @@

 #include <wolfssl/wolfcrypt/types.h>

-#if defined(USE_INTEL_SPEEDUP) && !defined(NO_CURVED25519_X64)
+#if defined(USE_INTEL_SPEEDUP) && defined(WOLFSSL_X86_64_BUILD) && \
+    !defined(NO_CURVED25519_X64)
    #define CURVED25519_X64
 #elif defined(HAVE___UINT128_T) && !defined(NO_CURVED25519_128BIT)
    #define CURVED25519_128BIT