From f54266c2c6850f4c15f13f29789dd286d500963f Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 17 Dec 2025 13:25:36 +1000 Subject: [PATCH] Curve25519: improved smul Use the Ed25519 base smul in Curve25519 base mul and covert to Montogmery curve for a faster implementation. Only when Ed25519 is compiled in or WOLFSSL_CURVE25519_USE_ED25519 is defined. When compiling Intel x64 assembly and Aarch64 assembly, always define WOLFSSL_CURVE25519_USE_ED25519. Can't use with blinding - normal C implementation. Optimized the Curve25519 smul slightly for Intel x64 and Aarch64. Improved the conditional table lookup on Intel x64 to use AVX2 when available. --- configure.ac | 6 + src/include.am | 3 +- wolfcrypt/src/curve25519.c | 30 + wolfcrypt/src/ed25519.c | 37 +- wolfcrypt/src/fe_x25519_asm.S | 7432 +++++++++++++++-- wolfcrypt/src/ge_operations.c | 21 +- wolfcrypt/src/port/arm/armv8-32-curve25519.S | 12 +- .../src/port/arm/armv8-32-curve25519_c.c | 14 +- wolfcrypt/src/port/arm/armv8-curve25519.S | 3059 ++++++- wolfcrypt/src/port/arm/armv8-curve25519_c.c | 3037 ++++++- wolfcrypt/src/port/arm/thumb2-curve25519.S | 12 +- wolfcrypt/src/port/arm/thumb2-curve25519_c.c | 12 +- wolfssl/wolfcrypt/ed25519.h | 2 +- wolfssl/wolfcrypt/fe_operations.h | 7 + wolfssl/wolfcrypt/ge_operations.h | 4 +- 15 files changed, 12791 insertions(+), 897 deletions(-) diff --git a/configure.ac b/configure.ac index afef3fc59..099dbd8bc 100644 --- a/configure.ac +++ b/configure.ac @@ -10275,6 +10275,12 @@ then AM_CFLAGS="$AM_CFLAGS -DNO_CURVED25519_128BIT" fi + if test "$ENABLED_CURVE25519" = "ed" + then + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_CURVE25519_USE_ED25519" + AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_CURVE25519_USE_ED25519" + fi + AM_CFLAGS="$AM_CFLAGS -DHAVE_CURVE25519" AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_CURVE25519" ENABLED_FEMATH=yes diff --git a/src/include.am b/src/include.am index 742eed461..96736c2b9 100644 --- a/src/include.am +++ b/src/include.am @@ -1401,6 +1401,7 @@ endif !BUILD_FIPS_V6_PLUS if BUILD_FEMATH src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_low_mem.c +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c if BUILD_CURVE25519_INTELASM if !BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S @@ -1460,8 +1461,8 @@ endif BUILD_FEMATH if BUILD_GEMATH src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_low_mem.c -src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c if !BUILD_FEMATH +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c if BUILD_CURVE25519_INTELASM if !BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S diff --git a/wolfcrypt/src/curve25519.c b/wolfcrypt/src/curve25519.c index cbd15ee09..93c74ed42 100644 --- a/wolfcrypt/src/curve25519.c +++ b/wolfcrypt/src/curve25519.c @@ -31,6 +31,7 @@ #ifdef HAVE_CURVE25519 #include +#include #ifdef NO_INLINE #include #else @@ -54,6 +55,8 @@ #error "Blinding not needed nor available for small implementation" #elif defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM) #error "Blinding not needed nor available for assembly implementation" + #elif defined(WOLFSSL_CURVE25519_USE_ED25519) + #error "Ed25519 base scalar mult cannot be used with blinding " #endif #endif @@ -72,6 +75,8 @@ const curve25519_set_type curve25519_sets[] = { } }; +#if !defined(WOLFSSL_CURVE25519_USE_ED25519) || \ + defined(WOLFSSL_CURVE25519_BLINDING) static const word32 kCurve25519BasePoint[CURVE25519_KEYSIZE/sizeof(word32)] = { #ifdef BIG_ENDIAN_ORDER 0x09000000 
@@ -79,6 +84,7 @@ static const word32 kCurve25519BasePoint[CURVE25519_KEYSIZE/sizeof(word32)] = { 9 #endif }; +#endif /* !WOLFSSL_CURVE25519_USE_ED25519 || WOLFSSL_CURVE25519_BLINDING */ /* Curve25519 private key must be less than order */ /* These functions clamp private k and check it */ @@ -154,7 +160,31 @@ int wc_curve25519_make_pub(int public_size, byte* pub, int private_size, SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(WOLFSSL_CURVE25519_USE_ED25519) + { + ge_p3 A; + + ge_scalarmult_base(&A, priv); + #ifndef CURVE25519_SMALL + fe_add(A.X, A.Z, A.Y); + fe_sub(A.T, A.Z, A.Y); + fe_invert(A.T, A.T); + fe_mul(A.T, A.X, A.T); + fe_tobytes(pub, A.T); + #else + lm_add(A.X, A.Z, A.Y); + lm_sub(A.T, A.Z, A.Y); + lm_invert(A.T, A.T); + lm_mul(pub, A.X, A.T); + #endif + ret = 0; + } +#elif defined(CURVED25519_X64) || (defined(WOLFSSL_ARMASM) && \ + defined(__aarch64__)) + ret = curve25519_base(pub, priv); +#else ret = curve25519(pub, priv, (byte*)kCurve25519BasePoint); +#endif RESTORE_VECTOR_REGISTERS(); #else diff --git a/wolfcrypt/src/ed25519.c b/wolfcrypt/src/ed25519.c index 69c7a1b30..a03efb560 100644 --- a/wolfcrypt/src/ed25519.c +++ b/wolfcrypt/src/ed25519.c @@ -88,13 +88,7 @@ static int ed25519_hash_init(ed25519_key* key, wc_Sha512 *sha) { int ret; -#ifndef WOLFSSL_ED25519_PERSISTENT_SHA - /* when not using persistent SHA, we'll zero the sha param */ - XMEMSET(sha, 0, sizeof(wc_Sha512)); -#endif - ret = wc_InitSha512_ex(sha, key->heap, - #if defined(WOLF_CRYPTO_CB) key->devId #else @@ -103,8 +97,9 @@ static int ed25519_hash_init(ed25519_key* key, wc_Sha512 *sha) ); #ifdef WOLFSSL_ED25519_PERSISTENT_SHA - if (ret == 0) + if (ret == 0) { key->sha_clean_flag = 1; + } #endif return ret; @@ -114,8 +109,10 @@ static int ed25519_hash_init(ed25519_key* key, wc_Sha512 *sha) static int ed25519_hash_reset(ed25519_key* key) { int ret; - if (key->sha_clean_flag) + + if (key->sha_clean_flag) { ret = 0; + } else { wc_Sha512Free(&key->sha); ret = wc_InitSha512_ex(&key->sha, key->heap, @@ -128,6 +125,7 @@ static int ed25519_hash_reset(ed25519_key* key) if (ret == 0) key->sha_clean_flag = 1; } + return ret; } #endif /* WOLFSSL_ED25519_PERSISTENT_SHA */ @@ -136,8 +134,9 @@ static int ed25519_hash_update(ed25519_key* key, wc_Sha512 *sha, const byte* data, word32 len) { #ifdef WOLFSSL_ED25519_PERSISTENT_SHA - if (key->sha_clean_flag) + if (key->sha_clean_flag) { key->sha_clean_flag = 0; + } #else (void)key; #endif @@ -148,8 +147,9 @@ static int ed25519_hash_final(ed25519_key* key, wc_Sha512 *sha, byte* hash) { int ret = wc_Sha512Final(sha, hash); #ifdef WOLFSSL_ED25519_PERSISTENT_SHA - if (ret == 0) + if (ret == 0) { key->sha_clean_flag = 1; + } #else (void)key; #endif @@ -187,16 +187,15 @@ static int ed25519_hash(ed25519_key* key, const byte* in, word32 inLen, #else ret = ed25519_hash_init(key, sha); #endif - if (ret < 0) - return ret; + if (ret == 0) { + ret = ed25519_hash_update(key, sha, in, inLen); + if (ret == 0) + ret = ed25519_hash_final(key, sha, hash); - ret = ed25519_hash_update(key, sha, in, inLen); - if (ret == 0) - ret = ed25519_hash_final(key, sha, hash); - -#ifndef WOLFSSL_ED25519_PERSISTENT_SHA - ed25519_hash_free(key, sha); -#endif + #ifndef WOLFSSL_ED25519_PERSISTENT_SHA + ed25519_hash_free(key, sha); + #endif + } return ret; } diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index f0373a09b..2ac32677a 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -84,6 +84,17 @@ L_fe_init_get_flags: andl $0x50, %eax cmpl $0x50, %eax 
jne L_fe_init_flags_done +#ifndef __APPLE__ + movq fe_cmov_table_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_cmov_table_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_cmov_table_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_cmov_table_p(%rip) +#endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_mul_avx2@GOTPCREL(%rip), %rax #else @@ -139,18 +150,6 @@ L_fe_init_get_flags: #else movq %rax, _curve25519_p(%rip) #endif /* __APPLE__ */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ - movq fe_sq2_avx2@GOTPCREL(%rip), %rax -#else - leaq _fe_sq2_avx2(%rip), %rax -#endif /* __APPLE__ */ -#ifndef __APPLE__ - movq fe_sq2_p@GOTPCREL(%rip), %rdx - movq %rax, (%rdx) -#else - movq %rax, _fe_sq2_p(%rip) -#endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_pow22523_avx2@GOTPCREL(%rip), %rax #else @@ -239,6 +238,31 @@ L_fe_init_get_flags: #else movq %rax, _ge_sub_p(%rip) #endif /* __APPLE__ */ +#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#ifndef __APPLE__ + movq curve25519_base_avx2@GOTPCREL(%rip), %rax +#else + leaq _curve25519_base_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq curve25519_base_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _curve25519_base_p(%rip) +#endif /* __APPLE__ */ +#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ + movq fe_sq2_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_sq2_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_sq2_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_sq2_p(%rip) +#endif /* __APPLE__ */ #ifndef __APPLE__ movq sc_reduce_avx2@GOTPCREL(%rip), %rax #else @@ -627,6 +651,598 @@ fe_cmov_table: .globl _fe_cmov_table .p2align 4 _fe_cmov_table: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_cmov_table_p(%rip) +#else + jmpq *_fe_cmov_table_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_cmov_table,.-fe_cmov_table +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_mul +.type fe_mul,@function +.align 16 +fe_mul: +#else +.section __TEXT,__text +.globl _fe_mul +.p2align 4 +_fe_mul: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_mul_p(%rip) +#else + jmpq *_fe_mul_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_mul,.-fe_mul +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq +.type fe_sq,@function +.align 16 +fe_sq: +#else +.section __TEXT,__text +.globl _fe_sq +.p2align 4 +_fe_sq: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_sq_p(%rip) +#else + jmpq *_fe_sq_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_sq,.-fe_sq +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_mul121666 +.type fe_mul121666,@function +.align 16 +fe_mul121666: +#else +.section __TEXT,__text +.globl _fe_mul121666 +.p2align 4 +_fe_mul121666: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_mul121666_p(%rip) +#else + jmpq *_fe_mul121666_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_mul121666,.-fe_mul121666 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_invert +.type fe_invert,@function +.align 16 +fe_invert: +#else +.section __TEXT,__text +.globl _fe_invert +.p2align 4 +_fe_invert: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_invert_p(%rip) +#else + jmpq *_fe_invert_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_invert,.-fe_invert +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl curve25519 +.type curve25519,@function +.align 16 +curve25519: +#else +.section 
__TEXT,__text +.globl _curve25519 +.p2align 4 +_curve25519: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *curve25519_p(%rip) +#else + jmpq *_curve25519_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size curve25519,.-curve25519 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_pow22523 +.type fe_pow22523,@function +.align 16 +fe_pow22523: +#else +.section __TEXT,__text +.globl _fe_pow22523 +.p2align 4 +_fe_pow22523: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_pow22523_p(%rip) +#else + jmpq *_fe_pow22523_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_pow22523,.-fe_pow22523 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_p1p1_to_p2 +.type ge_p1p1_to_p2,@function +.align 16 +ge_p1p1_to_p2: +#else +.section __TEXT,__text +.globl _ge_p1p1_to_p2 +.p2align 4 +_ge_p1p1_to_p2: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_p1p1_to_p2_p(%rip) +#else + jmpq *_ge_p1p1_to_p2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_p1p1_to_p2,.-ge_p1p1_to_p2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_p1p1_to_p3 +.type ge_p1p1_to_p3,@function +.align 16 +ge_p1p1_to_p3: +#else +.section __TEXT,__text +.globl _ge_p1p1_to_p3 +.p2align 4 +_ge_p1p1_to_p3: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_p1p1_to_p3_p(%rip) +#else + jmpq *_ge_p1p1_to_p3_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_p1p1_to_p3,.-ge_p1p1_to_p3 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_p2_dbl +.type ge_p2_dbl,@function +.align 16 +ge_p2_dbl: +#else +.section __TEXT,__text +.globl _ge_p2_dbl +.p2align 4 +_ge_p2_dbl: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_p2_dbl_p(%rip) +#else + jmpq *_ge_p2_dbl_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_p2_dbl,.-ge_p2_dbl +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_madd +.type ge_madd,@function +.align 16 +ge_madd: +#else +.section __TEXT,__text +.globl _ge_madd +.p2align 4 +_ge_madd: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_madd_p(%rip) +#else + jmpq *_ge_madd_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_madd,.-ge_madd +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_msub +.type ge_msub,@function +.align 16 +ge_msub: +#else +.section __TEXT,__text +.globl _ge_msub +.p2align 4 +_ge_msub: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_msub_p(%rip) +#else + jmpq *_ge_msub_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_msub,.-ge_msub +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_add +.type ge_add,@function +.align 16 +ge_add: +#else +.section __TEXT,__text +.globl _ge_add +.p2align 4 +_ge_add: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_add_p(%rip) +#else + jmpq *_ge_add_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_add,.-ge_add +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_sub +.type ge_sub,@function +.align 16 +ge_sub: +#else +.section __TEXT,__text +.globl _ge_sub +.p2align 4 +_ge_sub: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_sub_p(%rip) +#else + jmpq *_ge_sub_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_sub,.-ge_sub +#endif /* __APPLE__ */ +#ifdef HAVE_ED25519 +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl fe_sq2 +.type fe_sq2,@function +.align 16 +fe_sq2: +#else +.section __TEXT,__text +.globl _fe_sq2 +.p2align 4 +_fe_sq2: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_sq2_p(%rip) +#else + jmpq *_fe_sq2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size 
fe_sq2,.-fe_sq2 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl sc_reduce +.type sc_reduce,@function +.align 16 +sc_reduce: +#else +.section __TEXT,__text +.globl _sc_reduce +.p2align 4 +_sc_reduce: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *sc_reduce_p(%rip) +#else + jmpq *_sc_reduce_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size sc_reduce,.-sc_reduce +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl sc_muladd +.type sc_muladd,@function +.align 16 +sc_muladd: +#else +.section __TEXT,__text +.globl _sc_muladd +.p2align 4 +_sc_muladd: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *sc_muladd_p(%rip) +#else + jmpq *_sc_muladd_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size sc_muladd,.-sc_muladd +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#endif /* HAVE_ED25519 */ +#ifndef __APPLE__ +.data +.type cpuFlagsSet, @object +.size cpuFlagsSet,4 +cpuFlagsSet: + .long 0 +#else +.section __DATA,__data +.p2align 3 +_cpuFlagsSet: + .long 0 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type intelFlags, @object +.size intelFlags,4 +intelFlags: + .long 0 +#else +.section __DATA,__data +.p2align 3 +_intelFlags: + .long 0 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_cmov_table_p, @object +.size fe_cmov_table_p,8 +fe_cmov_table_p: + .quad fe_cmov_table_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_cmov_table_p: + .quad _fe_cmov_table_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_mul_p, @object +.size fe_mul_p,8 +fe_mul_p: + .quad fe_mul_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_mul_p: + .quad _fe_mul_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_sq_p, @object +.size fe_sq_p,8 +fe_sq_p: + .quad fe_sq_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_sq_p: + .quad _fe_sq_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_mul121666_p, @object +.size fe_mul121666_p,8 +fe_mul121666_p: + .quad fe_mul121666_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_mul121666_p: + .quad _fe_mul121666_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_invert_p, @object +.size fe_invert_p,8 +fe_invert_p: + .quad fe_invert_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_invert_p: + .quad _fe_invert_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type curve25519_p, @object +.size curve25519_p,8 +curve25519_p: + .quad curve25519_x64 +#else +.section __DATA,__data +.p2align 3 +_curve25519_p: + .quad _curve25519_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_pow22523_p, @object +.size fe_pow22523_p,8 +fe_pow22523_p: + .quad fe_pow22523_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_pow22523_p: + .quad _fe_pow22523_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type ge_p1p1_to_p2_p, @object +.size ge_p1p1_to_p2_p,8 +ge_p1p1_to_p2_p: + .quad ge_p1p1_to_p2_x64 +#else +.section __DATA,__data +.p2align 3 +_ge_p1p1_to_p2_p: + .quad _ge_p1p1_to_p2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type ge_p1p1_to_p3_p, @object +.size ge_p1p1_to_p3_p,8 +ge_p1p1_to_p3_p: + .quad ge_p1p1_to_p3_x64 +#else +.section __DATA,__data +.p2align 3 +_ge_p1p1_to_p3_p: + .quad _ge_p1p1_to_p3_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type ge_p2_dbl_p, @object +.size ge_p2_dbl_p,8 +ge_p2_dbl_p: + .quad ge_p2_dbl_x64 +#else +.section __DATA,__data +.p2align 3 +_ge_p2_dbl_p: + .quad _ge_p2_dbl_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ 
+.data +.type ge_madd_p, @object +.size ge_madd_p,8 +ge_madd_p: + .quad ge_madd_x64 +#else +.section __DATA,__data +.p2align 3 +_ge_madd_p: + .quad _ge_madd_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type ge_msub_p, @object +.size ge_msub_p,8 +ge_msub_p: + .quad ge_msub_x64 +#else +.section __DATA,__data +.p2align 3 +_ge_msub_p: + .quad _ge_msub_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type ge_add_p, @object +.size ge_add_p,8 +ge_add_p: + .quad ge_add_x64 +#else +.section __DATA,__data +.p2align 3 +_ge_add_p: + .quad _ge_add_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type ge_sub_p, @object +.size ge_sub_p,8 +ge_sub_p: + .quad ge_sub_x64 +#else +.section __DATA,__data +.p2align 3 +_ge_sub_p: + .quad _ge_sub_x64 +#endif /* __APPLE__ */ +#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#ifndef __APPLE__ +.data +.type curve25519_base_p, @object +.size curve25519_base_p,8 +curve25519_base_p: + .quad curve25519_base_x64 +#else +.section __DATA,__data +.p2align 3 +_curve25519_base_p: + .quad _curve25519_base_x64 +#endif /* __APPLE__ */ +#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.data +.type fe_sq2_p, @object +.size fe_sq2_p,8 +fe_sq2_p: + .quad fe_sq2_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_sq2_p: + .quad _fe_sq2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type sc_reduce_p, @object +.size sc_reduce_p,8 +sc_reduce_p: + .quad sc_reduce_x64 +#else +.section __DATA,__data +.p2align 3 +_sc_reduce_p: + .quad _sc_reduce_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type sc_muladd_p, @object +.size sc_muladd_p,8 +sc_muladd_p: + .quad sc_muladd_x64 +#else +.section __DATA,__data +.p2align 3 +_sc_muladd_p: + .quad _sc_muladd_x64 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifndef __APPLE__ +.text +.globl fe_cmov_table_x64 +.type fe_cmov_table_x64,@function +.align 16 +fe_cmov_table_x64: +#else +.section __TEXT,__text +.globl _fe_cmov_table_x64 +.p2align 4 +_fe_cmov_table_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 @@ -902,572 +1518,10 @@ _fe_cmov_table: popq %r12 repz retq #ifndef __APPLE__ -.size fe_cmov_table,.-fe_cmov_table +.size fe_cmov_table_x64,.-fe_cmov_table_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_mul -.type fe_mul,@function -.align 16 -fe_mul: -#else -.section __TEXT,__text -.globl _fe_mul -.p2align 4 -_fe_mul: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_mul_p(%rip) -#else - jmpq *_fe_mul_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_mul,.-fe_mul -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_sq -.type fe_sq,@function -.align 16 -fe_sq: -#else -.section __TEXT,__text -.globl _fe_sq -.p2align 4 -_fe_sq: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_sq_p(%rip) -#else - jmpq *_fe_sq_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_sq,.-fe_sq -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_mul121666 -.type fe_mul121666,@function -.align 16 -fe_mul121666: -#else -.section __TEXT,__text -.globl _fe_mul121666 -.p2align 4 -_fe_mul121666: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_mul121666_p(%rip) -#else - jmpq *_fe_mul121666_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_mul121666,.-fe_mul121666 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_invert -.type fe_invert,@function -.align 16 -fe_invert: -#else -.section __TEXT,__text -.globl _fe_invert -.p2align 4 -_fe_invert: -#endif /* __APPLE__ */ -#ifndef 
__APPLE__ - jmpq *fe_invert_p(%rip) -#else - jmpq *_fe_invert_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_invert,.-fe_invert -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl curve25519 -.type curve25519,@function -.align 16 -curve25519: -#else -.section __TEXT,__text -.globl _curve25519 -.p2align 4 -_curve25519: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *curve25519_p(%rip) -#else - jmpq *_curve25519_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size curve25519,.-curve25519 -#endif /* __APPLE__ */ -#ifdef HAVE_ED25519 -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl fe_sq2 -.type fe_sq2,@function -.align 16 -fe_sq2: -#else -.section __TEXT,__text -.globl _fe_sq2 -.p2align 4 -_fe_sq2: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_sq2_p(%rip) -#else - jmpq *_fe_sq2_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_sq2,.-fe_sq2 -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl fe_pow22523 -.type fe_pow22523,@function -.align 16 -fe_pow22523: -#else -.section __TEXT,__text -.globl _fe_pow22523 -.p2align 4 -_fe_pow22523: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_pow22523_p(%rip) -#else - jmpq *_fe_pow22523_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_pow22523,.-fe_pow22523 -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl ge_p1p1_to_p2 -.type ge_p1p1_to_p2,@function -.align 16 -ge_p1p1_to_p2: -#else -.section __TEXT,__text -.globl _ge_p1p1_to_p2 -.p2align 4 -_ge_p1p1_to_p2: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *ge_p1p1_to_p2_p(%rip) -#else - jmpq *_ge_p1p1_to_p2_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size ge_p1p1_to_p2,.-ge_p1p1_to_p2 -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl ge_p1p1_to_p3 -.type ge_p1p1_to_p3,@function -.align 16 -ge_p1p1_to_p3: -#else -.section __TEXT,__text -.globl _ge_p1p1_to_p3 -.p2align 4 -_ge_p1p1_to_p3: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *ge_p1p1_to_p3_p(%rip) -#else - jmpq *_ge_p1p1_to_p3_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size ge_p1p1_to_p3,.-ge_p1p1_to_p3 -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl ge_p2_dbl -.type ge_p2_dbl,@function -.align 16 -ge_p2_dbl: -#else -.section __TEXT,__text -.globl _ge_p2_dbl -.p2align 4 -_ge_p2_dbl: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *ge_p2_dbl_p(%rip) -#else - jmpq *_ge_p2_dbl_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size ge_p2_dbl,.-ge_p2_dbl -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl ge_madd -.type ge_madd,@function -.align 16 -ge_madd: -#else -.section __TEXT,__text -.globl _ge_madd -.p2align 4 -_ge_madd: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *ge_madd_p(%rip) -#else - jmpq *_ge_madd_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size ge_madd,.-ge_madd -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl ge_msub -.type ge_msub,@function -.align 16 -ge_msub: -#else -.section __TEXT,__text -.globl _ge_msub -.p2align 4 -_ge_msub: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *ge_msub_p(%rip) -#else - jmpq *_ge_msub_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size ge_msub,.-ge_msub -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text 
-.globl ge_add -.type ge_add,@function -.align 16 -ge_add: -#else -.section __TEXT,__text -.globl _ge_add -.p2align 4 -_ge_add: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *ge_add_p(%rip) -#else - jmpq *_ge_add_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size ge_add,.-ge_add -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl ge_sub -.type ge_sub,@function -.align 16 -ge_sub: -#else -.section __TEXT,__text -.globl _ge_sub -.p2align 4 -_ge_sub: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *ge_sub_p(%rip) -#else - jmpq *_ge_sub_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size ge_sub,.-ge_sub -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl sc_reduce -.type sc_reduce,@function -.align 16 -sc_reduce: -#else -.section __TEXT,__text -.globl _sc_reduce -.p2align 4 -_sc_reduce: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *sc_reduce_p(%rip) -#else - jmpq *_sc_reduce_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size sc_reduce,.-sc_reduce -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl sc_muladd -.type sc_muladd,@function -.align 16 -sc_muladd: -#else -.section __TEXT,__text -.globl _sc_muladd -.p2align 4 -_sc_muladd: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *sc_muladd_p(%rip) -#else - jmpq *_sc_muladd_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size sc_muladd,.-sc_muladd -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#endif /* HAVE_ED25519 */ -#ifndef __APPLE__ -.data -.type cpuFlagsSet, @object -.size cpuFlagsSet,4 -cpuFlagsSet: - .long 0 -#else -.section __DATA,__data -.p2align 3 -_cpuFlagsSet: - .long 0 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type intelFlags, @object -.size intelFlags,4 -intelFlags: - .long 0 -#else -.section __DATA,__data -.p2align 3 -_intelFlags: - .long 0 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_mul_p, @object -.size fe_mul_p,8 -fe_mul_p: - .quad fe_mul_x64 -#else -.section __DATA,__data -.p2align 3 -_fe_mul_p: - .quad _fe_mul_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_sq_p, @object -.size fe_sq_p,8 -fe_sq_p: - .quad fe_sq_x64 -#else -.section __DATA,__data -.p2align 3 -_fe_sq_p: - .quad _fe_sq_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_mul121666_p, @object -.size fe_mul121666_p,8 -fe_mul121666_p: - .quad fe_mul121666_x64 -#else -.section __DATA,__data -.p2align 3 -_fe_mul121666_p: - .quad _fe_mul121666_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_invert_p, @object -.size fe_invert_p,8 -fe_invert_p: - .quad fe_invert_x64 -#else -.section __DATA,__data -.p2align 3 -_fe_invert_p: - .quad _fe_invert_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type curve25519_p, @object -.size curve25519_p,8 -curve25519_p: - .quad curve25519_x64 -#else -.section __DATA,__data -.p2align 3 -_curve25519_p: - .quad _curve25519_x64 -#endif /* __APPLE__ */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.data -.type fe_sq2_p, @object -.size fe_sq2_p,8 -fe_sq2_p: - .quad fe_sq2_x64 -#else -.section __DATA,__data -.p2align 3 -_fe_sq2_p: - .quad _fe_sq2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_pow22523_p, @object -.size fe_pow22523_p,8 -fe_pow22523_p: - .quad fe_pow22523_x64 -#else -.section __DATA,__data -.p2align 3 -_fe_pow22523_p: - .quad _fe_pow22523_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type ge_p1p1_to_p2_p, @object -.size 
ge_p1p1_to_p2_p,8 -ge_p1p1_to_p2_p: - .quad ge_p1p1_to_p2_x64 -#else -.section __DATA,__data -.p2align 3 -_ge_p1p1_to_p2_p: - .quad _ge_p1p1_to_p2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type ge_p1p1_to_p3_p, @object -.size ge_p1p1_to_p3_p,8 -ge_p1p1_to_p3_p: - .quad ge_p1p1_to_p3_x64 -#else -.section __DATA,__data -.p2align 3 -_ge_p1p1_to_p3_p: - .quad _ge_p1p1_to_p3_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type ge_p2_dbl_p, @object -.size ge_p2_dbl_p,8 -ge_p2_dbl_p: - .quad ge_p2_dbl_x64 -#else -.section __DATA,__data -.p2align 3 -_ge_p2_dbl_p: - .quad _ge_p2_dbl_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type ge_madd_p, @object -.size ge_madd_p,8 -ge_madd_p: - .quad ge_madd_x64 -#else -.section __DATA,__data -.p2align 3 -_ge_madd_p: - .quad _ge_madd_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type ge_msub_p, @object -.size ge_msub_p,8 -ge_msub_p: - .quad ge_msub_x64 -#else -.section __DATA,__data -.p2align 3 -_ge_msub_p: - .quad _ge_msub_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type ge_add_p, @object -.size ge_add_p,8 -ge_add_p: - .quad ge_add_x64 -#else -.section __DATA,__data -.p2align 3 -_ge_add_p: - .quad _ge_add_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type ge_sub_p, @object -.size ge_sub_p,8 -ge_sub_p: - .quad ge_sub_x64 -#else -.section __DATA,__data -.p2align 3 -_ge_sub_p: - .quad _ge_sub_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type sc_reduce_p, @object -.size sc_reduce_p,8 -sc_reduce_p: - .quad sc_reduce_x64 -#else -.section __DATA,__data -.p2align 3 -_sc_reduce_p: - .quad _sc_reduce_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type sc_muladd_p, @object -.size sc_muladd_p,8 -sc_muladd_p: - .quad sc_muladd_x64 -#else -.section __DATA,__data -.p2align 3 -_sc_muladd_p: - .quad _sc_muladd_x64 -#endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifndef __APPLE__ -.text .globl fe_mul_x64 .type fe_mul_x64,@function .align 16 @@ -2214,6 +2268,2378 @@ _fe_invert_x64: movq 128(%rsp), %rdi addq $0x90, %rsp repz retq +#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_curve25519_base_x64_x2: +.quad 0x5cae469cdd684efb, 0x8f3f5ced1e350b5c +.quad 0xd9750c687d157114, 0x20d342d51873f1b7 +#ifndef __APPLE__ +.text +.globl curve25519_base_x64 +.type curve25519_base_x64,@function +.align 16 +curve25519_base_x64: +#else +.section __TEXT,__text +.globl _curve25519_base_x64 +.p2align 4 +_curve25519_base_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + subq $0xa8, %rsp + xorq %r15, %r15 + movq %rdi, 160(%rsp) + # Set base point x + movq $9, (%rdi) + movq $0x00, 8(%rdi) + movq $0x00, 16(%rdi) + movq $0x00, 24(%rdi) + # Set one + movq $0x01, (%rsp) + movq $0x00, 8(%rsp) + movq $0x00, 16(%rsp) + movq $0x00, 24(%rsp) + movq 0+L_curve25519_base_x64_x2(%rip), %rcx + movq 8+L_curve25519_base_x64_x2(%rip), %r8 + movq 16+L_curve25519_base_x64_x2(%rip), %r9 + movq 24+L_curve25519_base_x64_x2(%rip), %r10 + # Set one + movq $0x01, 32(%rsp) + movq $0x00, 40(%rsp) + movq $0x00, 48(%rsp) + movq $0x00, 56(%rsp) + movq %rcx, 64(%rsp) + movq %r8, 72(%rsp) + movq %r9, 80(%rsp) + movq %r10, 88(%rsp) + movq $0xfd, %rbp +L_curve25519_base_x64_bits: + movq %rbp, %r8 + movq %rbp, %rcx + andq $63, %rcx + shrq $6, %r8 + movq (%rsi,%r8,8), %rbx + shrq %cl, %rbx + andq $0x01, %rbx + 
xorq %rbx, %r15 + negq %r15 + # Conditional Swap + movq (%rdi), %rcx + movq 8(%rdi), %r8 + movq 16(%rdi), %r9 + movq 24(%rdi), %r10 + xorq 64(%rsp), %rcx + xorq 72(%rsp), %r8 + xorq 80(%rsp), %r9 + xorq 88(%rsp), %r10 + andq %r15, %rcx + andq %r15, %r8 + andq %r15, %r9 + andq %r15, %r10 + xorq %rcx, (%rdi) + xorq %r8, 8(%rdi) + xorq %r9, 16(%rdi) + xorq %r10, 24(%rdi) + xorq %rcx, 64(%rsp) + xorq %r8, 72(%rsp) + xorq %r9, 80(%rsp) + xorq %r10, 88(%rsp) + # Conditional Swap + movq (%rsp), %rcx + movq 8(%rsp), %r8 + movq 16(%rsp), %r9 + movq 24(%rsp), %r10 + xorq 32(%rsp), %rcx + xorq 40(%rsp), %r8 + xorq 48(%rsp), %r9 + xorq 56(%rsp), %r10 + andq %r15, %rcx + andq %r15, %r8 + andq %r15, %r9 + andq %r15, %r10 + xorq %rcx, (%rsp) + xorq %r8, 8(%rsp) + xorq %r9, 16(%rsp) + xorq %r10, 24(%rsp) + xorq %rcx, 32(%rsp) + xorq %r8, 40(%rsp) + xorq %r9, 48(%rsp) + xorq %r10, 56(%rsp) + movq %rbx, %r15 + # Add-Sub + # Add + movq (%rdi), %rcx + movq 8(%rdi), %r8 + movq 16(%rdi), %r9 + movq 24(%rdi), %r10 + movq %rcx, %r11 + addq (%rsp), %rcx + movq %r8, %r12 + adcq 8(%rsp), %r8 + movq %r9, %r13 + adcq 16(%rsp), %r9 + movq %r10, %r14 + adcq 24(%rsp), %r10 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r10 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Sub + subq (%rsp), %r11 + sbbq 8(%rsp), %r12 + sbbq 16(%rsp), %r13 + sbbq 24(%rsp), %r14 + sbbq %rax, %rax + shldq $0x01, %r14, %rax + imulq $-19, %rax + andq %rdx, %r14 + # Add modulus (if underflow) + subq %rax, %r11 + sbbq $0x00, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 128(%rsp) + movq %r12, 136(%rsp) + movq %r13, 144(%rsp) + movq %r14, 152(%rsp) + # Add-Sub + # Add + movq 64(%rsp), %rcx + movq 72(%rsp), %r8 + movq 80(%rsp), %r9 + movq 88(%rsp), %r10 + movq %rcx, %r11 + addq 32(%rsp), %rcx + movq %r8, %r12 + adcq 40(%rsp), %r8 + movq %r9, %r13 + adcq 48(%rsp), %r9 + movq %r10, %r14 + adcq 56(%rsp), %r10 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r10 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Sub + subq 32(%rsp), %r11 + sbbq 40(%rsp), %r12 + sbbq 48(%rsp), %r13 + sbbq 56(%rsp), %r14 + sbbq %rax, %rax + shldq $0x01, %r14, %rax + imulq $-19, %rax + andq %rdx, %r14 + # Add modulus (if underflow) + subq %rax, %r11 + sbbq $0x00, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + movq %rcx, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 96(%rsp) + movq %r12, 104(%rsp) + movq %r13, 112(%rsp) + movq %r14, 120(%rsp) + # Multiply + # A[0] * B[0] + movq 128(%rsp), %rax + mulq 32(%rsp) + movq %rax, %rcx + movq %rdx, %r8 + # A[0] * B[1] + movq 136(%rsp), %rax + mulq 32(%rsp) + xorq %r9, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + # A[1] * B[0] + movq 128(%rsp), %rax + mulq 40(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r10 + # A[0] * B[2] + movq 144(%rsp), %rax + mulq 32(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[1] + movq 136(%rsp), %rax + mulq 40(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[2] * B[0] + movq 128(%rsp), %rax + mulq 48(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[3] + movq 152(%rsp), %rax + mulq 32(%rsp) + xorq %r12, %r12 + addq %rax, 
%r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[2] + movq 144(%rsp), %rax + mulq 40(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[1] + movq 136(%rsp), %rax + mulq 48(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * B[0] + movq 128(%rsp), %rax + mulq 56(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[3] + movq 152(%rsp), %rax + mulq 40(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[2] + movq 144(%rsp), %rax + mulq 48(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[1] + movq 136(%rsp), %rax + mulq 56(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[3] + movq 152(%rsp), %rax + mulq 48(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[2] + movq 144(%rsp), %rax + mulq 56(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[3] + movq 152(%rsp), %rax + mulq 56(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + # Multiply + # A[0] * B[0] + movq (%rdi), %rax + mulq 96(%rsp) + movq %rax, %rcx + movq %rdx, %r8 + # A[0] * B[1] + movq 8(%rdi), %rax + mulq 96(%rsp) + xorq %r9, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + # A[1] * B[0] + movq (%rdi), %rax + mulq 104(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r10 + # A[0] * B[2] + movq 16(%rdi), %rax + mulq 96(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[1] + movq 8(%rdi), %rax + mulq 104(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[2] * B[0] + movq (%rdi), %rax + mulq 112(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[3] + movq 24(%rdi), %rax + mulq 96(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[2] + movq 16(%rdi), %rax + mulq 104(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[1] + movq 8(%rdi), %rax + mulq 112(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * B[0] + movq (%rdi), %rax + mulq 120(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[3] + movq 24(%rdi), %rax + mulq 104(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[2] + movq 16(%rdi), %rax + mulq 112(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[1] + movq 8(%rdi), %rax + mulq 120(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[3] + movq 24(%rdi), %rax + mulq 112(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[2] + movq 16(%rdi), %rax + mulq 120(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[3] + movq 24(%rdi), %rax + mulq 120(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + 
imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, (%rsp) + movq %r8, 8(%rsp) + movq %r9, 16(%rsp) + movq %r10, 24(%rsp) + # Square + # A[0] * A[1] + movq 128(%rsp), %rax + mulq 136(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq 128(%rsp), %rax + mulq 144(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq 128(%rsp), %rax + mulq 152(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 136(%rsp), %rax + mulq 144(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 136(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 144(%rsp), %rax + mulq 152(%rsp) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq 128(%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbx + # A[1] * A[1] + movq 136(%rsp), %rax + mulq %rax + addq %rbx, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[2] * A[2] + movq 144(%rsp), %rax + mulq %rax + addq %rbx, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[3] * A[3] + movq 152(%rsp), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %rbx, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, 96(%rsp) + movq %r8, 104(%rsp) + movq %r9, 112(%rsp) + movq %r10, 120(%rsp) + # Square + # A[0] * A[1] + movq (%rdi), %rax + mulq 8(%rdi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq (%rdi), %rax + mulq 16(%rdi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq (%rdi), %rax + mulq 24(%rdi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 8(%rdi), %rax + mulq 16(%rdi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 8(%rdi), %rax + mulq 24(%rdi) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 16(%rdi), %rax + mulq 24(%rdi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq (%rdi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbx + # A[1] * A[1] + movq 8(%rdi), %rax + mulq %rax + addq %rbx, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[2] * A[2] + movq 16(%rdi), %rax + mulq %rax + addq %rbx, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[3] * A[3] + movq 24(%rdi), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq 
%rbx, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, 128(%rsp) + movq %r8, 136(%rsp) + movq %r9, 144(%rsp) + movq %r10, 152(%rsp) + # Add-Sub + # Add + movq (%rsp), %rcx + movq 8(%rsp), %r8 + movq 16(%rsp), %r9 + movq 24(%rsp), %r10 + movq %rcx, %r11 + addq 32(%rsp), %rcx + movq %r8, %r12 + adcq 40(%rsp), %r8 + movq %r9, %r13 + adcq 48(%rsp), %r9 + movq %r10, %r14 + adcq 56(%rsp), %r10 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r10 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Sub + subq 32(%rsp), %r11 + sbbq 40(%rsp), %r12 + sbbq 48(%rsp), %r13 + sbbq 56(%rsp), %r14 + sbbq %rax, %rax + shldq $0x01, %r14, %rax + imulq $-19, %rax + andq %rdx, %r14 + # Add modulus (if underflow) + subq %rax, %r11 + sbbq $0x00, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + movq %rcx, 64(%rsp) + movq %r8, 72(%rsp) + movq %r9, 80(%rsp) + movq %r10, 88(%rsp) + movq %r11, 32(%rsp) + movq %r12, 40(%rsp) + movq %r13, 48(%rsp) + movq %r14, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r8 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r9, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r10 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[3] + movq 120(%rsp), %rax + mulq 128(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * B[0] + movq 96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, 
%rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + # Sub + movq 128(%rsp), %rcx + movq 136(%rsp), %r8 + movq 144(%rsp), %r9 + movq 152(%rsp), %r10 + subq 96(%rsp), %rcx + sbbq 104(%rsp), %r8 + sbbq 112(%rsp), %r9 + sbbq 120(%rsp), %r10 + sbbq %rax, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $-19, %rax + andq %rdx, %r10 + # Add modulus (if underflow) + subq %rax, %rcx + sbbq $0x00, %r8 + sbbq $0x00, %r9 + sbbq $0x00, %r10 + movq %rcx, 128(%rsp) + movq %r8, 136(%rsp) + movq %r9, 144(%rsp) + movq %r10, 152(%rsp) + # Square + # A[0] * A[1] + movq 32(%rsp), %rax + mulq 40(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq 32(%rsp), %rax + mulq 48(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq 32(%rsp), %rax + mulq 56(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 40(%rsp), %rax + mulq 48(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 40(%rsp), %rax + mulq 56(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 48(%rsp), %rax + mulq 56(%rsp) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq 32(%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbx + # A[1] * A[1] + movq 40(%rsp), %rax + mulq %rax + addq %rbx, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[2] * A[2] + movq 48(%rsp), %rax + mulq %rax + addq %rbx, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[3] * A[3] + movq 56(%rsp), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %rbx, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + # Multiply by 121666 + movq $0x1db42, %rax + mulq 128(%rsp) + xorq %r9, %r9 + movq %rax, %rcx + movq %rdx, %r8 + movq $0x1db42, %rax + mulq 136(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + movq $0x1db42, %rax + mulq 144(%rsp) + xorq %r12, %r12 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x1db42, %rax + mulq 152(%rsp) + movq $0x7fffffffffffffff, %r11 + addq %rax, %r10 + adcq %rdx, %r12 + shldq $0x01, %r10, %r12 + andq %r11, %r10 + movq $19, %rax + mulq %r12 + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + movq %rcx, (%rsp) + movq %r8, 8(%rsp) + movq %r9, 16(%rsp) + movq %r10, 24(%rsp) + # Square + # A[0] * A[1] + movq 64(%rsp), %rax + mulq 72(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] 
+ movq 64(%rsp), %rax + mulq 80(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq 64(%rsp), %rax + mulq 88(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 72(%rsp), %rax + mulq 80(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 72(%rsp), %rax + mulq 88(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 80(%rsp), %rax + mulq 88(%rsp) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq 64(%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbx + # A[1] * A[1] + movq 72(%rsp), %rax + mulq %rax + addq %rbx, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[2] * A[2] + movq 80(%rsp), %rax + mulq %rax + addq %rbx, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[3] * A[3] + movq 88(%rsp), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %rbx, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, 64(%rsp) + movq %r8, 72(%rsp) + movq %r9, 80(%rsp) + movq %r10, 88(%rsp) + # Add + movq 96(%rsp), %rcx + movq 104(%rsp), %r8 + addq (%rsp), %rcx + movq 112(%rsp), %r9 + adcq 8(%rsp), %r8 + movq 120(%rsp), %r10 + adcq 16(%rsp), %r9 + adcq 24(%rsp), %r10 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r10 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + movq %rcx, 96(%rsp) + movq %r8, 104(%rsp) + movq %r9, 112(%rsp) + movq %r10, 120(%rsp) + # Multiply by 9 + movq $9, %rax + mulq 32(%rsp) + xorq %r9, %r9 + movq %rax, %rcx + movq %rdx, %r8 + movq $9, %rax + mulq 40(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + movq $9, %rax + mulq 48(%rsp) + xorq %r12, %r12 + addq %rax, %r9 + adcq %rdx, %r10 + movq $9, %rax + mulq 56(%rsp) + movq $0x7fffffffffffffff, %r11 + addq %rax, %r10 + adcq %rdx, %r12 + shldq $0x01, %r10, %r12 + andq %r11, %r10 + movq $19, %rax + mulq %r12 + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + movq %rcx, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r8 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r9, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r10 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[3] + movq 120(%rsp), 
%rax + mulq 128(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * B[0] + movq 96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, (%rsp) + movq %r8, 8(%rsp) + movq %r9, 16(%rsp) + movq %r10, 24(%rsp) + decq %rbp + cmpq $3, %rbp + jge L_curve25519_base_x64_bits + negq %r15 + # Conditional Swap + movq (%rdi), %rcx + movq 8(%rdi), %r8 + movq 16(%rdi), %r9 + movq 24(%rdi), %r10 + xorq 64(%rsp), %rcx + xorq 72(%rsp), %r8 + xorq 80(%rsp), %r9 + xorq 88(%rsp), %r10 + andq %r15, %rcx + andq %r15, %r8 + andq %r15, %r9 + andq %r15, %r10 + xorq %rcx, (%rdi) + xorq %r8, 8(%rdi) + xorq %r9, 16(%rdi) + xorq %r10, 24(%rdi) + xorq %rcx, 64(%rsp) + xorq %r8, 72(%rsp) + xorq %r9, 80(%rsp) + xorq %r10, 88(%rsp) + # Conditional Swap + movq (%rsp), %rcx + movq 8(%rsp), %r8 + movq 16(%rsp), %r9 + movq 24(%rsp), %r10 + xorq 32(%rsp), %rcx + xorq 40(%rsp), %r8 + xorq 48(%rsp), %r9 + xorq 56(%rsp), %r10 + andq %r15, %rcx + andq %r15, %r8 + andq %r15, %r9 + andq %r15, %r10 + xorq %rcx, (%rsp) + xorq %r8, 8(%rsp) + xorq %r9, 16(%rsp) + xorq %r10, 24(%rsp) + xorq %rcx, 32(%rsp) + xorq %r8, 40(%rsp) + xorq %r9, 48(%rsp) + xorq %r10, 56(%rsp) +L_curve25519_base_x64_3: + # Add-Sub + # Add + movq (%rdi), %rcx + movq 8(%rdi), %r8 + movq 16(%rdi), %r9 + movq 24(%rdi), %r10 + movq %rcx, %r11 + addq (%rsp), %rcx + movq %r8, %r12 + adcq 8(%rsp), %r8 + movq %r9, %r13 + adcq 16(%rsp), %r9 + movq %r10, %r14 + adcq 24(%rsp), %r10 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r10 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Sub + subq (%rsp), %r11 + sbbq 8(%rsp), %r12 + sbbq 16(%rsp), %r13 + sbbq 24(%rsp), %r14 + sbbq %rax, %rax + shldq $0x01, %r14, %rax + imulq $-19, %rax + andq %rdx, %r14 + # Add modulus (if underflow) + subq %rax, %r11 + sbbq $0x00, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 128(%rsp) + movq %r12, 
136(%rsp) + movq %r13, 144(%rsp) + movq %r14, 152(%rsp) + # Square + # A[0] * A[1] + movq 128(%rsp), %rax + mulq 136(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq 128(%rsp), %rax + mulq 144(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq 128(%rsp), %rax + mulq 152(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 136(%rsp), %rax + mulq 144(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 136(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 144(%rsp), %rax + mulq 152(%rsp) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq 128(%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbx + # A[1] * A[1] + movq 136(%rsp), %rax + mulq %rax + addq %rbx, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[2] * A[2] + movq 144(%rsp), %rax + mulq %rax + addq %rbx, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[3] * A[3] + movq 152(%rsp), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %rbx, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, 96(%rsp) + movq %r8, 104(%rsp) + movq %r9, 112(%rsp) + movq %r10, 120(%rsp) + # Square + # A[0] * A[1] + movq (%rdi), %rax + mulq 8(%rdi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq (%rdi), %rax + mulq 16(%rdi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq (%rdi), %rax + mulq 24(%rdi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 8(%rdi), %rax + mulq 16(%rdi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 8(%rdi), %rax + mulq 24(%rdi) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 16(%rdi), %rax + mulq 24(%rdi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq (%rdi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbx + # A[1] * A[1] + movq 8(%rdi), %rax + mulq %rax + addq %rbx, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[2] * A[2] + movq 16(%rdi), %rax + mulq %rax + addq %rbx, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[3] * A[3] + movq 24(%rdi), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %rbx, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 
+ mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, 128(%rsp) + movq %r8, 136(%rsp) + movq %r9, 144(%rsp) + movq %r10, 152(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r8 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r9, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r10 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[3] + movq 120(%rsp), %rax + mulq 128(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * B[0] + movq 96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + # Sub + movq 128(%rsp), %rcx + movq 136(%rsp), %r8 + movq 144(%rsp), %r9 + movq 152(%rsp), %r10 + subq 96(%rsp), %rcx + sbbq 104(%rsp), %r8 + sbbq 112(%rsp), %r9 + sbbq 120(%rsp), %r10 + sbbq %rax, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $-19, %rax + andq %rdx, %r10 + # Add modulus (if underflow) + subq %rax, %rcx + sbbq $0x00, %r8 + sbbq $0x00, %r9 + sbbq $0x00, %r10 + movq %rcx, 128(%rsp) + movq %r8, 136(%rsp) + movq %r9, 144(%rsp) + movq %r10, 152(%rsp) + # Multiply by 121666 + movq $0x1db42, %rax + mulq 128(%rsp) + xorq %r9, %r9 + movq %rax, %rcx + movq %rdx, %r8 + movq $0x1db42, %rax + mulq 136(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + movq $0x1db42, %rax + mulq 144(%rsp) + xorq %r12, %r12 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x1db42, %rax + mulq 152(%rsp) + movq $0x7fffffffffffffff, %r11 + addq %rax, %r10 + adcq %rdx, %r12 + shldq $0x01, 
%r10, %r12 + andq %r11, %r10 + movq $19, %rax + mulq %r12 + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + movq %rcx, (%rsp) + movq %r8, 8(%rsp) + movq %r9, 16(%rsp) + movq %r10, 24(%rsp) + # Add + movq 96(%rsp), %rcx + movq 104(%rsp), %r8 + addq (%rsp), %rcx + movq 112(%rsp), %r9 + adcq 8(%rsp), %r8 + movq 120(%rsp), %r10 + adcq 16(%rsp), %r9 + adcq 24(%rsp), %r10 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r10, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r10 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + movq %rcx, 96(%rsp) + movq %r8, 104(%rsp) + movq %r9, 112(%rsp) + movq %r10, 120(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r8 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r9, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r10 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[3] + movq 120(%rsp), %rax + mulq 128(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * B[0] + movq 96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + # Store + movq %rcx, (%rsp) + movq %r8, 8(%rsp) + movq %r9, 16(%rsp) + movq %r10, 24(%rsp) + decq %rbp + jge L_curve25519_base_x64_3 + # Invert + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 
+#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + movq %rsp, %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $19, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $0x63, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ 
+ callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq 160(%rsp), %rdi + # Multiply + # A[0] * B[0] + movq (%rsp), %rax + mulq (%rdi) + movq %rax, %rcx + movq %rdx, %r8 + # A[0] * B[1] + movq 8(%rsp), %rax + mulq (%rdi) + xorq %r9, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + # A[1] * B[0] + movq (%rsp), %rax + mulq 8(%rdi) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r10 + # A[0] * B[2] + movq 16(%rsp), %rax + mulq (%rdi) + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[1] + movq 8(%rsp), %rax + mulq 8(%rdi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[2] * B[0] + movq (%rsp), %rax + mulq 16(%rdi) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[3] + movq 24(%rsp), %rax + mulq (%rdi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[2] + movq 16(%rsp), %rax + mulq 8(%rdi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[1] + movq 8(%rsp), %rax + mulq 16(%rdi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * B[0] + movq (%rsp), %rax + mulq 24(%rdi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * B[3] + movq 24(%rsp), %rax + mulq 8(%rdi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[2] + movq 16(%rsp), %rax + mulq 16(%rdi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[1] + movq 8(%rsp), %rax + mulq 24(%rdi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[3] + movq 24(%rsp), %rax + mulq 16(%rdi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[2] + movq 16(%rsp), %rax + mulq 24(%rdi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[3] + movq 24(%rsp), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %rbx, %r10 + movq %rdx, %rbx + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %rbx, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + movq $0x7fffffffffffffff, %rbx + movq %r10, %rax + sarq $63, %rax + andq $19, %rax + andq %rbx, %r10 + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + movq $0x7fffffffffffffff, %rax + movq %rcx, %rdx + addq $19, %rdx + movq %r8, %rdx + adcq $0x00, %rdx + movq %r9, %rdx + adcq $0x00, %rdx + movq %r10, %rdx + adcq $0x00, %rdx + sarq $63, %rdx + andq $19, %rdx + andq %rax, %r10 + addq %rdx, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Store + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + xorq %rax, %rax + 
addq $0xa8, %rsp + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size curve25519_base_x64,.-curve25519_base_x64 +#endif /* __APPLE__ */ +#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ #ifndef __APPLE__ .text .globl curve25519_x64 @@ -3617,7 +6043,658 @@ L_curve25519_x64_bits: movq %r11, 24(%rsp) movq 160(%rsp), %r9 decq %r9 + cmpq $3, %r9 jge L_curve25519_x64_bits + movq %r9, 160(%rsp) + negq %rbx + # Conditional Swap + movq (%rdi), %rcx + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + xorq 64(%rsp), %rcx + xorq 72(%rsp), %r9 + xorq 80(%rsp), %r10 + xorq 88(%rsp), %r11 + andq %rbx, %rcx + andq %rbx, %r9 + andq %rbx, %r10 + andq %rbx, %r11 + xorq %rcx, (%rdi) + xorq %r9, 8(%rdi) + xorq %r10, 16(%rdi) + xorq %r11, 24(%rdi) + xorq %rcx, 64(%rsp) + xorq %r9, 72(%rsp) + xorq %r10, 80(%rsp) + xorq %r11, 88(%rsp) + # Conditional Swap + movq (%rsp), %rcx + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + xorq 32(%rsp), %rcx + xorq 40(%rsp), %r9 + xorq 48(%rsp), %r10 + xorq 56(%rsp), %r11 + andq %rbx, %rcx + andq %rbx, %r9 + andq %rbx, %r10 + andq %rbx, %r11 + xorq %rcx, (%rsp) + xorq %r9, 8(%rsp) + xorq %r10, 16(%rsp) + xorq %r11, 24(%rsp) + xorq %rcx, 32(%rsp) + xorq %r9, 40(%rsp) + xorq %r10, 48(%rsp) + xorq %r11, 56(%rsp) +L_curve25519_x64_3: + # Add-Sub + # Add + movq (%rdi), %rcx + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + movq %rcx, %r12 + addq (%rsp), %rcx + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, %r14 + adcq 16(%rsp), %r10 + movq %r11, %r15 + adcq 24(%rsp), %r11 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r11, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r11 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Sub + subq (%rsp), %r12 + sbbq 8(%rsp), %r13 + sbbq 16(%rsp), %r14 + sbbq 24(%rsp), %r15 + sbbq %rax, %rax + shldq $0x01, %r15, %rax + imulq $-19, %rax + andq %rdx, %r15 + # Add modulus (if underflow) + subq %rax, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + movq %rcx, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 128(%rsp) + movq %r13, 136(%rsp) + movq %r14, 144(%rsp) + movq %r15, 152(%rsp) + # Square + # A[0] * A[1] + movq 128(%rsp), %rax + mulq 136(%rsp) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq 128(%rsp), %rax + mulq 144(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq 128(%rsp), %rax + mulq 152(%rsp) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 136(%rsp), %rax + mulq 144(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 136(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 144(%rsp), %rax + mulq 152(%rsp) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq 128(%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbp + # A[1] * A[1] + movq 136(%rsp), %rax + mulq %rax + addq %rbp, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[2] * A[2] + movq 144(%rsp), %rax + mulq %rax + addq %rbp, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[3] * A[3] + movq 152(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rbp, 
%r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbp + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx + andq %rbp, %r11 + movq %rdx, %rbp + movq $38, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $38, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + adcq %rdx, %r14 + addq %rbp, %rcx + adcq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + # Store + movq %rcx, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + # Square + # A[0] * A[1] + movq (%rdi), %rax + mulq 8(%rdi) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rdi), %rax + mulq 16(%rdi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq (%rdi), %rax + mulq 24(%rdi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rdi), %rax + mulq 16(%rdi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rdi), %rax + mulq 24(%rdi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rdi), %rax + mulq 24(%rdi) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rdi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbp + # A[1] * A[1] + movq 8(%rdi), %rax + mulq %rax + addq %rbp, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[2] * A[2] + movq 16(%rdi), %rax + mulq %rax + addq %rbp, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[3] * A[3] + movq 24(%rdi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rbp, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbp + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx + andq %rbp, %r11 + movq %rdx, %rbp + movq $38, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $38, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + adcq %rdx, %r14 + addq %rbp, %rcx + adcq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + # Store + movq %rcx, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 120(%rsp), %rax + mulq 128(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq 
96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbp + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx + andq %rbp, %r11 + movq %rdx, %rbp + movq $38, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $38, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + adcq %rdx, %r14 + addq %rbp, %rcx + adcq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + # Store + movq %rcx, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + # Sub + movq 128(%rsp), %rcx + movq 136(%rsp), %r9 + movq 144(%rsp), %r10 + movq 152(%rsp), %r11 + subq 96(%rsp), %rcx + sbbq 104(%rsp), %r9 + sbbq 112(%rsp), %r10 + sbbq 120(%rsp), %r11 + sbbq %rax, %rax + shldq $0x01, %r11, %rax + movq $0x7fffffffffffffff, %rdx + imulq $-19, %rax + andq %rdx, %r11 + # Add modulus (if underflow) + subq %rax, %rcx + sbbq $0x00, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 + movq %rcx, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + # Multiply by 121666 + movq $0x1db42, %rax + mulq 128(%rsp) + xorq %r10, %r10 + movq %rax, %rcx + movq %rdx, %r9 + movq $0x1db42, %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x1db42, %rax + mulq 144(%rsp) + xorq %r13, %r13 + addq %rax, %r10 + adcq %rdx, %r11 + movq $0x1db42, %rax + mulq 152(%rsp) + movq $0x7fffffffffffffff, %r12 + addq %rax, %r11 + adcq %rdx, %r13 + shldq $0x01, %r11, %r13 + andq %r12, %r11 + movq $19, %rax + mulq %r13 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + # Add + movq 96(%rsp), %rcx + movq 104(%rsp), %r9 + addq (%rsp), %rcx + movq 112(%rsp), %r10 + adcq 8(%rsp), %r9 + movq 120(%rsp), %r11 + adcq 16(%rsp), %r10 + adcq 24(%rsp), %r11 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r11, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r11 + # Sub modulus (if overflow) + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %rcx, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, 
%r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 120(%rsp), %rax + mulq 128(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq 96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %rbp + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx + andq %rbp, %r11 + movq %rdx, %rbp + movq $38, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $38, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + adcq %rdx, %r14 + addq %rbp, %rcx + adcq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + # Store + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + movq 160(%rsp), %r9 + decq %r9 + movq %r9, 160(%rsp) + jge L_curve25519_x64_3 # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi @@ -4025,149 +7102,6 @@ L_curve25519_x64_bits: #ifndef __APPLE__ .size curve25519_x64,.-curve25519_x64 #endif /* __APPLE__ */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl fe_sq2_x64 -.type fe_sq2_x64,@function -.align 16 -fe_sq2_x64: -#else -.section __TEXT,__text -.globl _fe_sq2_x64 -.p2align 4 -_fe_sq2_x64: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - # Square * 2 - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r13, %r13 - addq %rax, %r12 - adcq %rdx, %r13 - # Double - xorq %r14, %r14 - addq %r8, %r8 - adcq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq $0x00, %r14 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %r15 - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %r15, %r8 - adcq %rax, %r9 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %r15, %r10 - adcq %rax, %r11 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[3] * A[3] - movq 24(%rsi), %rax - 
mulq %rax - addq %rax, %r13 - adcq %rdx, %r14 - addq %r15, %r12 - adcq $0x00, %r13 - adcq $0x00, %r14 - movq $38, %rax - mulq %r14 - addq %rax, %r10 - adcq $0x00, %rdx - movq $0x7fffffffffffffff, %r15 - shldq $0x01, %r10, %rdx - imulq $19, %rdx, %rdx - andq %r15, %r10 - movq %rdx, %r15 - movq $38, %rax - mulq %r11 - xorq %r11, %r11 - addq %rax, %rcx - movq $38, %rax - adcq %rdx, %r11 - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $38, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - adcq %rdx, %r13 - addq %r15, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - adcq %r13, %r10 - movq %r10, %rax - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shldq $0x01, %rcx, %r8 - shlq $0x01, %rcx - movq $0x7fffffffffffffff, %r15 - shrq $62, %rax - andq %r15, %r10 - imulq $19, %rax, %rax - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - # Store - movq %rcx, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_sq2_x64,.-fe_sq2_x64 -#endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_pow22523_x64 @@ -8625,6 +11559,149 @@ _ge_sub_x64: #ifndef __APPLE__ .size ge_sub_x64,.-ge_sub_x64 #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl fe_sq2_x64 +.type fe_sq2_x64,@function +.align 16 +fe_sq2_x64: +#else +.section __TEXT,__text +.globl _fe_sq2_x64 +.p2align 4 +_fe_sq2_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + # Square * 2 + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %r15 + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %r15, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %r15, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %r15, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r15 + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %r15, %r10 + movq %rdx, %r15 + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %r15, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + movq %r10, %rax + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shldq $0x01, %rcx, %r8 + shlq $0x01, %rcx + movq $0x7fffffffffffffff, %r15 + shrq $62, %rax + andq %r15, %r10 + imulq $19, %rax, %rax + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq 
$0x00, %r10 + # Store + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_sq2_x64,.-fe_sq2_x64 +#endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl sc_reduce_x64 @@ -9100,6 +12177,177 @@ _sc_muladd_x64: #ifdef HAVE_INTEL_AVX2 #ifndef __APPLE__ .text +.globl fe_cmov_table_avx2 +.type fe_cmov_table_avx2,@function +.align 16 +fe_cmov_table_avx2: +#else +.section __TEXT,__text +.globl _fe_cmov_table_avx2 +.p2align 4 +_fe_cmov_table_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rcx + xor %rbx, %rbx + movsbq %cl, %rax + cdq + xorb %dl, %al + subb %dl, %al + movb %al, %bl + movd %ebx, %xmm7 + movq $0x01, %rbx + movd %rbx, %xmm9 + vmovdqa %ymm9, %ymm3 + vmovdqa %ymm9, %ymm4 + vpxor %ymm8, %ymm8, %ymm8 + vpermd %ymm7, %ymm8, %ymm7 + vpermd %ymm9, %ymm8, %ymm9 + vpxor %ymm0, %ymm0, %ymm0 + vpxor %ymm1, %ymm1, %ymm1 + vpxor %ymm2, %ymm2, %ymm2 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpxor %ymm5, %ymm5, %ymm5 + vpand %ymm6, %ymm3, %ymm3 + vpand %ymm6, %ymm4, %ymm4 + vmovdqa %ymm9, %ymm8 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd (%rsi), %ymm0 + vmovupd 32(%rsi), %ymm1 + vmovupd 64(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd 96(%rsi), %ymm0 + vmovupd 128(%rsi), %ymm1 + vmovupd 160(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd 192(%rsi), %ymm0 + vmovupd 224(%rsi), %ymm1 + vmovupd 256(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd 288(%rsi), %ymm0 + vmovupd 320(%rsi), %ymm1 + vmovupd 352(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd 384(%rsi), %ymm0 + vmovupd 416(%rsi), %ymm1 + vmovupd 448(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd 480(%rsi), %ymm0 + vmovupd 512(%rsi), %ymm1 + vmovupd 544(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd 576(%rsi), %ymm0 + vmovupd 608(%rsi), %ymm1 + vmovupd 640(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd 672(%rsi), %ymm0 + vmovupd 704(%rsi), %ymm1 + vmovupd 736(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, 
%ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + movsbq %cl, %rax + sarq $63, %rax + vmovd %eax, %xmm6 + vpxor %ymm8, %ymm8, %ymm8 + vpermd %ymm6, %ymm8, %ymm6 + vpxor %ymm4, %ymm3, %ymm8 + vpand %ymm6, %ymm8, %ymm8 + vpxor %ymm8, %ymm3, %ymm3 + vpxor %ymm8, %ymm4, %ymm4 + vmovupd %ymm3, (%rdi) + vmovupd %ymm4, 32(%rdi) + vmovupd %ymm5, 64(%rdi) + movq 64(%rdi), %r8 + movq 72(%rdi), %r9 + movq 80(%rdi), %r10 + movq 88(%rdi), %r11 + movq $-19, %r12 + movq $-1, %r13 + movq $-1, %r14 + movq $0x7fffffffffffffff, %r15 + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + cmpb $0x00, %cl + cmovlq %r12, %r8 + cmovlq %r13, %r9 + cmovlq %r14, %r10 + cmovlq %r15, %r11 + movq %r8, 64(%rdi) + movq %r9, 72(%rdi) + movq %r10, 80(%rdi) + movq %r11, 88(%rdi) + vzeroupper + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_cmov_table_avx2,.-fe_cmov_table_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text .globl fe_mul_avx2 .type fe_mul_avx2,@function .align 16 @@ -9760,6 +13008,1984 @@ _fe_invert_avx2: movq 128(%rsp), %rdi addq $0x90, %rsp repz retq +#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_curve25519_base_avx2_x2: +.quad 0x5cae469cdd684efb, 0x8f3f5ced1e350b5c +.quad 0xd9750c687d157114, 0x20d342d51873f1b7 +#ifndef __APPLE__ +.text +.globl curve25519_base_avx2 +.type curve25519_base_avx2,@function +.align 16 +curve25519_base_avx2: +#else +.section __TEXT,__text +.globl _curve25519_base_avx2 +.p2align 4 +_curve25519_base_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + subq $0xb0, %rsp + movq $0x00, 168(%rsp) + movq %rdi, 160(%rsp) + # Set base point x + movq $9, (%rdi) + movq $0x00, 8(%rdi) + movq $0x00, 16(%rdi) + movq $0x00, 24(%rdi) + # Set one + movq $0x01, (%rsp) + movq $0x00, 8(%rsp) + movq $0x00, 16(%rsp) + movq $0x00, 24(%rsp) + movq 0+L_curve25519_base_avx2_x2(%rip), %r8 + movq 8+L_curve25519_base_avx2_x2(%rip), %r9 + movq 16+L_curve25519_base_avx2_x2(%rip), %r10 + movq 24+L_curve25519_base_avx2_x2(%rip), %r11 + # Set one + movq $0x01, 32(%rsp) + movq $0x00, 40(%rsp) + movq $0x00, 48(%rsp) + movq $0x00, 56(%rsp) + movq %r8, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + movq $0xfd, %rbp +L_curve25519_base_avx2_bits: + movq 168(%rsp), %rax + movq %rbp, %rbx + movq %rbp, %rcx + shrq $6, %rbx + andq $63, %rcx + movq (%rsi,%rbx,8), %rbx + shrq %cl, %rbx + andq $0x01, %rbx + xorq %rbx, %rax + negq %rax + # Conditional Swap + movq (%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + xorq 64(%rsp), %r8 + xorq 72(%rsp), %r9 + xorq 80(%rsp), %r10 + xorq 88(%rsp), %r11 + andq %rax, %r8 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + xorq %r8, (%rdi) + xorq %r9, 8(%rdi) + xorq %r10, 16(%rdi) + xorq %r11, 24(%rdi) + xorq %r8, 64(%rsp) + xorq %r9, 72(%rsp) + xorq %r10, 80(%rsp) + xorq %r11, 88(%rsp) + # Conditional Swap + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + xorq 32(%rsp), %r8 + xorq 40(%rsp), %r9 + xorq 48(%rsp), %r10 + xorq 56(%rsp), %r11 + andq %rax, %r8 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + xorq %r8, (%rsp) + xorq %r9, 8(%rsp) + xorq %r10, 16(%rsp) + xorq %r11, 24(%rsp) + xorq %r8, 32(%rsp) + xorq %r9, 40(%rsp) + xorq %r10, 48(%rsp) + xorq %r11, 56(%rsp) + 
movq %rbx, 168(%rsp) + # Add-Sub + # Add + movq (%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + movq %r8, %r12 + addq (%rsp), %r8 + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, %r14 + adcq 16(%rsp), %r10 + movq %r11, %r15 + adcq 24(%rsp), %r11 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r11 + # Sub modulus (if overflow) + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Sub + subq (%rsp), %r12 + sbbq 8(%rsp), %r13 + sbbq 16(%rsp), %r14 + sbbq 24(%rsp), %r15 + sbbq %rcx, %rcx + shldq $0x01, %r15, %rcx + imulq $-19, %rcx + andq %rbx, %r15 + # Add modulus (if underflow) + subq %rcx, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 128(%rsp) + movq %r13, 136(%rsp) + movq %r14, 144(%rsp) + movq %r15, 152(%rsp) + # Add-Sub + # Add + movq 64(%rsp), %r8 + movq 72(%rsp), %r9 + movq 80(%rsp), %r10 + movq 88(%rsp), %r11 + movq %r8, %r12 + addq 32(%rsp), %r8 + movq %r9, %r13 + adcq 40(%rsp), %r9 + movq %r10, %r14 + adcq 48(%rsp), %r10 + movq %r11, %r15 + adcq 56(%rsp), %r11 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r11 + # Sub modulus (if overflow) + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Sub + subq 32(%rsp), %r12 + sbbq 40(%rsp), %r13 + sbbq 48(%rsp), %r14 + sbbq 56(%rsp), %r15 + sbbq %rcx, %rcx + shldq $0x01, %r15, %rcx + imulq $-19, %rcx + andq %rbx, %r15 + # Add modulus (if underflow) + subq %rcx, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) + movq %r12, 96(%rsp) + movq %r13, 104(%rsp) + movq %r14, 112(%rsp) + movq %r15, 120(%rsp) + # Multiply + # A[0] * B[0] + movq 128(%rsp), %rdx + mulxq 32(%rsp), %r8, %r9 + # A[2] * B[0] + mulxq 48(%rsp), %r10, %r11 + # A[1] * B[0] + mulxq 40(%rsp), %rcx, %rbx + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[3] * B[1] + movq 136(%rsp), %rdx + mulxq 56(%rsp), %r12, %r13 + adcxq %rbx, %r10 + # A[0] * B[1] + mulxq 32(%rsp), %rcx, %rbx + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 48(%rsp), %rcx, %r14 + adoxq %rbx, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 144(%rsp), %rdx + mulxq 40(%rsp), %rcx, %rbx + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rbx, %r12 + # A[0] * B[2] + mulxq 32(%rsp), %rcx, %rbx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 136(%rsp), %rdx + mulxq 40(%rsp), %rdx, %rcx + adcxq %rbx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 152(%rsp), %rdx + adoxq %rcx, %r11 + mulxq 40(%rsp), %rcx, %rbx + adcxq %rcx, %r12 + # A[2] * B[2] + movq 144(%rsp), %rdx + mulxq 48(%rsp), %rdx, %rcx + adcxq %rbx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 152(%rsp), %rdx + adoxq %rcx, %r13 + mulxq 56(%rsp), %rcx, %rbx + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq 32(%rsp), %rdx, %rcx + adcxq %rbx, %r15 + xorq %rbx, %rbx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 56(%rsp), %rdx + adcxq %rcx, %r12 + mulxq 128(%rsp), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[3] * B[2] + movq 56(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rdx, %r13 + # A[2] * B[3] + movq 152(%rsp), %rdx + adcxq %rcx, %r14 + mulxq 48(%rsp), %rcx, %rdx + adcxq %rbx, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq 
%r15, %r15, %rcx + addq %r15, %r11 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r11, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r11 + xorq %rbx, %rbx + adoxq %rcx, %r8 + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + adcxq %rbx, %r11 + # Store + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) + # Multiply + # A[0] * B[0] + movq (%rdi), %rdx + mulxq 96(%rsp), %r8, %r9 + # A[2] * B[0] + mulxq 112(%rsp), %r10, %r11 + # A[1] * B[0] + mulxq 104(%rsp), %rcx, %rbx + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[3] * B[1] + movq 8(%rdi), %rdx + mulxq 120(%rsp), %r12, %r13 + adcxq %rbx, %r10 + # A[0] * B[1] + mulxq 96(%rsp), %rcx, %rbx + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 112(%rsp), %rcx, %r14 + adoxq %rbx, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rdi), %rdx + mulxq 104(%rsp), %rcx, %rbx + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rbx, %r12 + # A[0] * B[2] + mulxq 96(%rsp), %rcx, %rbx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rdi), %rdx + mulxq 104(%rsp), %rdx, %rcx + adcxq %rbx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r11 + mulxq 104(%rsp), %rcx, %rbx + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rdi), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rbx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r13 + mulxq 120(%rsp), %rcx, %rbx + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq 96(%rsp), %rdx, %rcx + adcxq %rbx, %r15 + xorq %rbx, %rbx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 120(%rsp), %rdx + adcxq %rcx, %r12 + mulxq (%rdi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[3] * B[2] + movq 120(%rsp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rdx, %r13 + # A[2] * B[3] + movq 24(%rdi), %rdx + adcxq %rcx, %r14 + mulxq 112(%rsp), %rcx, %rdx + adcxq %rbx, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rcx + addq %r15, %r11 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r11, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r11 + xorq %rbx, %rbx + adoxq %rcx, %r8 + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + adcxq %rbx, %r11 + # Store + movq %r8, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + # Square + movq 128(%rsp), %rdx + movq 136(%rsp), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 152(%rsp), %r11, %r12 + # A[2] * A[1] + movq 144(%rsp), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 152(%rsp), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 152(%rsp), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 144(%rsp), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 
152(%rsp), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq %r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r11 + xorq %rcx, %rcx + adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + # Store + movq %r8, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + # Square + movq (%rdi), %rdx + movq 8(%rdi), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 24(%rdi), %r11, %r12 + # A[2] * A[1] + movq 16(%rdi), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rdi), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 24(%rdi), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 16(%rdi), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq %r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r11 + xorq %rcx, %rcx + adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + # Store + movq %r8, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + # Add-Sub + # Add + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq %r8, %r12 + addq 32(%rsp), %r8 + movq %r9, %r13 + adcq 40(%rsp), %r9 + movq %r10, %r14 + adcq 48(%rsp), %r10 + movq %r11, %r15 + adcq 56(%rsp), %r11 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r11 + # Sub modulus (if overflow) + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Sub + subq 32(%rsp), %r12 + sbbq 40(%rsp), %r13 + sbbq 48(%rsp), %r14 + sbbq 56(%rsp), %r15 + sbbq %rcx, %rcx + shldq $0x01, %r15, %rcx + imulq $-19, %rcx + andq %rbx, %r15 + # Add modulus (if underflow) + subq %rcx, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + movq %r8, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + movq %r12, 32(%rsp) + movq %r13, 40(%rsp) + movq %r14, 48(%rsp) + movq %r15, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rdx + mulxq 128(%rsp), %r8, %r9 + # A[2] * B[0] + mulxq 144(%rsp), %r10, %r11 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r12, %r13 + adcxq %rbx, %r10 + # A[0] * B[1] + mulxq 128(%rsp), %rcx, %rbx + adoxq 
%rcx, %r9 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r14 + adoxq %rbx, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rbx, %r12 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r11 + mulxq 136(%rsp), %rcx, %rbx + adcxq %rcx, %r12 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r13 + mulxq 152(%rsp), %rcx, %rbx + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %r15 + xorq %rbx, %rbx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 152(%rsp), %rdx + adcxq %rcx, %r12 + mulxq 96(%rsp), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r13 + # A[2] * B[3] + movq 120(%rsp), %rdx + adcxq %rcx, %r14 + mulxq 144(%rsp), %rcx, %rdx + adcxq %rbx, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rcx + addq %r15, %r11 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r11, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r11 + xorq %rbx, %rbx + adoxq %rcx, %r8 + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + adcxq %rbx, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + # Sub + movq 128(%rsp), %r8 + movq 136(%rsp), %r9 + movq 144(%rsp), %r10 + movq 152(%rsp), %r11 + subq 96(%rsp), %r8 + sbbq 104(%rsp), %r9 + sbbq 112(%rsp), %r10 + sbbq 120(%rsp), %r11 + sbbq %rcx, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $-19, %rcx + andq %rbx, %r11 + # Add modulus (if underflow) + subq %rcx, %r8 + sbbq $0x00, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 + movq %r8, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + # Square + movq 32(%rsp), %rdx + movq 40(%rsp), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 56(%rsp), %r11, %r12 + # A[2] * A[1] + movq 48(%rsp), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 56(%rsp), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 56(%rsp), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 48(%rsp), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 56(%rsp), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq %r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r11 + xorq %rcx, %rcx + 
adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + # Store + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) + movq $0x1db42, %rdx + mulxq 128(%rsp), %r8, %r15 + mulxq 136(%rsp), %r9, %r14 + mulxq 144(%rsp), %r10, %r13 + mulxq 152(%rsp), %r11, %r12 + addq %r15, %r9 + adcq %r14, %r10 + adcq %r13, %r11 + adcq $0x00, %r12 + movq $0x7fffffffffffffff, %r15 + shldq $0x01, %r11, %r12 + andq %r15, %r11 + imulq $19, %r12, %r12 + addq %r12, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %r8, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + # Square + movq 64(%rsp), %rdx + movq 72(%rsp), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 88(%rsp), %r11, %r12 + # A[2] * A[1] + movq 80(%rsp), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 88(%rsp), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 88(%rsp), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 80(%rsp), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 88(%rsp), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq %r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r11 + xorq %rcx, %rcx + adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + # Store + movq %r8, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + # Add + movq 96(%rsp), %r8 + movq 104(%rsp), %r9 + addq (%rsp), %r8 + movq 112(%rsp), %r10 + adcq 8(%rsp), %r9 + movq 120(%rsp), %r11 + adcq 16(%rsp), %r10 + adcq 24(%rsp), %r11 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r11 + # Sub modulus (if overflow) + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %r8, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + movq $9, %rdx + mulxq 32(%rsp), %r8, %r15 + mulxq 40(%rsp), %r9, %r14 + mulxq 48(%rsp), %r10, %r13 + mulxq 56(%rsp), %r11, %r12 + addq %r15, %r9 + adcq %r14, %r10 + adcq %r13, %r11 + adcq $0x00, %r12 + movq $0x7fffffffffffffff, %r15 + shldq $0x01, %r11, %r12 + andq %r15, %r11 + imulq $19, %r12, %r12 + addq %r12, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rdx + mulxq 128(%rsp), %r8, %r9 + # A[2] * B[0] + mulxq 144(%rsp), %r10, %r11 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %r15, %r15 + adcxq %rcx, %r9 
+ # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r12, %r13 + adcxq %rbx, %r10 + # A[0] * B[1] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r14 + adoxq %rbx, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rbx, %r12 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r11 + mulxq 136(%rsp), %rcx, %rbx + adcxq %rcx, %r12 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r13 + mulxq 152(%rsp), %rcx, %rbx + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %r15 + xorq %rbx, %rbx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 152(%rsp), %rdx + adcxq %rcx, %r12 + mulxq 96(%rsp), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r13 + # A[2] * B[3] + movq 120(%rsp), %rdx + adcxq %rcx, %r14 + mulxq 144(%rsp), %rcx, %rdx + adcxq %rbx, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rcx + addq %r15, %r11 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r11, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r11 + xorq %rbx, %rbx + adoxq %rcx, %r8 + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + adcxq %rbx, %r11 + # Store + movq %r8, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + decq %rbp + cmpq $3, %rbp + jge L_curve25519_base_avx2_bits + movq 168(%rsp), %rax + negq %rax + # Conditional Swap + movq (%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + xorq 64(%rsp), %r8 + xorq 72(%rsp), %r9 + xorq 80(%rsp), %r10 + xorq 88(%rsp), %r11 + andq %rax, %r8 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + xorq %r8, (%rdi) + xorq %r9, 8(%rdi) + xorq %r10, 16(%rdi) + xorq %r11, 24(%rdi) + xorq %r8, 64(%rsp) + xorq %r9, 72(%rsp) + xorq %r10, 80(%rsp) + xorq %r11, 88(%rsp) + # Conditional Swap + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + xorq 32(%rsp), %r8 + xorq 40(%rsp), %r9 + xorq 48(%rsp), %r10 + xorq 56(%rsp), %r11 + andq %rax, %r8 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + xorq %r8, (%rsp) + xorq %r9, 8(%rsp) + xorq %r10, 16(%rsp) + xorq %r11, 24(%rsp) + xorq %r8, 32(%rsp) + xorq %r9, 40(%rsp) + xorq %r10, 48(%rsp) + xorq %r11, 56(%rsp) +L_curve25519_base_avx2_last_3: + # Add-Sub + # Add + movq (%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + movq %r8, %r12 + addq (%rsp), %r8 + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, %r14 + adcq 16(%rsp), %r10 + movq %r11, %r15 + adcq 24(%rsp), %r11 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r11 + # Sub modulus (if overflow) + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Sub + subq (%rsp), %r12 + sbbq 8(%rsp), %r13 + sbbq 16(%rsp), %r14 + sbbq 24(%rsp), %r15 + sbbq %rcx, %rcx + shldq $0x01, %r15, %rcx + imulq $-19, 
%rcx + andq %rbx, %r15 + # Add modulus (if underflow) + subq %rcx, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 128(%rsp) + movq %r13, 136(%rsp) + movq %r14, 144(%rsp) + movq %r15, 152(%rsp) + # Square + movq 128(%rsp), %rdx + movq 136(%rsp), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 152(%rsp), %r11, %r12 + # A[2] * A[1] + movq 144(%rsp), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 152(%rsp), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 152(%rsp), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 144(%rsp), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 152(%rsp), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq %r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r11 + xorq %rcx, %rcx + adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + # Store + movq %r8, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + # Square + movq (%rdi), %rdx + movq 8(%rdi), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 24(%rdi), %r11, %r12 + # A[2] * A[1] + movq 16(%rdi), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rdi), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 24(%rdi), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 16(%rdi), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq %r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r11 + xorq %rcx, %rcx + adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + # Store + movq %r8, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), 
%rdx + mulxq 128(%rsp), %r8, %r9 + # A[2] * B[0] + mulxq 144(%rsp), %r10, %r11 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r12, %r13 + adcxq %rbx, %r10 + # A[0] * B[1] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r14 + adoxq %rbx, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rbx, %r12 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r11 + mulxq 136(%rsp), %rcx, %rbx + adcxq %rcx, %r12 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r13 + mulxq 152(%rsp), %rcx, %rbx + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %r15 + xorq %rbx, %rbx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 152(%rsp), %rdx + adcxq %rcx, %r12 + mulxq 96(%rsp), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r13 + # A[2] * B[3] + movq 120(%rsp), %rdx + adcxq %rcx, %r14 + mulxq 144(%rsp), %rcx, %rdx + adcxq %rbx, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rcx + addq %r15, %r11 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r11, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r11 + xorq %rbx, %rbx + adoxq %rcx, %r8 + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + adcxq %rbx, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + # Sub + movq 128(%rsp), %r8 + movq 136(%rsp), %r9 + movq 144(%rsp), %r10 + movq 152(%rsp), %r11 + subq 96(%rsp), %r8 + sbbq 104(%rsp), %r9 + sbbq 112(%rsp), %r10 + sbbq 120(%rsp), %r11 + sbbq %rcx, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $-19, %rcx + andq %rbx, %r11 + # Add modulus (if underflow) + subq %rcx, %r8 + sbbq $0x00, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 + movq %r8, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + movq $0x1db42, %rdx + mulxq 128(%rsp), %r8, %r15 + mulxq 136(%rsp), %r9, %r14 + mulxq 144(%rsp), %r10, %r13 + mulxq 152(%rsp), %r11, %r12 + addq %r15, %r9 + adcq %r14, %r10 + adcq %r13, %r11 + adcq $0x00, %r12 + movq $0x7fffffffffffffff, %r15 + shldq $0x01, %r11, %r12 + andq %r15, %r11 + imulq $19, %r12, %r12 + addq %r12, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %r8, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + # Add + movq 96(%rsp), %r8 + movq 104(%rsp), %r9 + addq (%rsp), %r8 + movq 112(%rsp), %r10 + adcq 8(%rsp), %r9 + movq 120(%rsp), %r11 + adcq 16(%rsp), %r10 + adcq 24(%rsp), %r11 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r11, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r11 + # Sub modulus (if overflow) + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %r8, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + 
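# Ladder note: the multiply below computes the z2 update E * (BB + a24*E), with E = AA - BB held at 128..152(%rsp) from the subtraction above, and a24 = 0x1db42 = 121666 = (486662 + 2)/4. + 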
# Multiply + # A[0] * B[0] + movq 96(%rsp), %rdx + mulxq 128(%rsp), %r8, %r9 + # A[2] * B[0] + mulxq 144(%rsp), %r10, %r11 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r12, %r13 + adcxq %rbx, %r10 + # A[0] * B[1] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r14 + adoxq %rbx, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rbx, %r12 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r11 + mulxq 136(%rsp), %rcx, %rbx + adcxq %rcx, %r12 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r13 + mulxq 152(%rsp), %rcx, %rbx + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %r15 + xorq %rbx, %rbx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 152(%rsp), %rdx + adcxq %rcx, %r12 + mulxq 96(%rsp), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r13 + # A[2] * B[3] + movq 120(%rsp), %rdx + adcxq %rcx, %r14 + mulxq 144(%rsp), %rcx, %rdx + adcxq %rbx, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rcx + addq %r15, %r11 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r11, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r11 + xorq %rbx, %rbx + adoxq %rcx, %r8 + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + adcxq %rbx, %r11 + # Store + movq %r8, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + decq %rbp + jge L_curve25519_base_avx2_last_3 + # Invert + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + movq %rsp, %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 
64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $19, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $0x63, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + movq 160(%rsp), %rdi + # Multiply + # A[0] * B[0] + movq (%rsp), %rdx + mulxq (%rdi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rdi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rdi), %rcx, %rbx + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[3] * B[1] + movq 
8(%rsp), %rdx + mulxq 24(%rdi), %r12, %r13 + adcxq %rbx, %r10 + # A[0] * B[1] + mulxq (%rdi), %rcx, %rbx + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rdi), %rcx, %r14 + adoxq %rbx, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rsp), %rdx + mulxq 8(%rdi), %rcx, %rbx + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rbx, %r12 + # A[0] * B[2] + mulxq (%rdi), %rcx, %rbx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rsp), %rdx + mulxq 8(%rdi), %rdx, %rcx + adcxq %rbx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 24(%rsp), %rdx + adoxq %rcx, %r11 + mulxq 8(%rdi), %rcx, %rbx + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rsp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rbx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rsp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rdi), %rcx, %rbx + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rdi), %rdx, %rcx + adcxq %rbx, %r15 + xorq %rbx, %rbx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 24(%rdi), %rdx + adcxq %rcx, %r12 + mulxq (%rsp), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[3] * B[2] + movq 24(%rdi), %rdx + mulxq 16(%rsp), %rdx, %rcx + adcxq %rdx, %r13 + # A[2] * B[3] + movq 24(%rsp), %rdx + adcxq %rcx, %r14 + mulxq 16(%rdi), %rcx, %rdx + adcxq %rbx, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rcx + addq %r15, %r11 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r11, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r11 + xorq %rbx, %rbx + adoxq %rcx, %r8 + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + adcxq %rbx, %r11 + movq $0x7fffffffffffffff, %rbx + movq %r11, %rdx + sarq $63, %rdx + andq $19, %rdx + andq %rbx, %r11 + addq %rdx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq $0x7fffffffffffffff, %rcx + movq %r8, %rdx + addq $19, %rdx + movq %r9, %rdx + adcq $0x00, %rdx + movq %r10, %rdx + adcq $0x00, %rdx + movq %r11, %rdx + adcq $0x00, %rdx + sarq $63, %rdx + andq $19, %rdx + andq %rcx, %r11 + addq %rdx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + xorq %rax, %rax + addq $0xb0, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size curve25519_base_avx2,.-curve25519_base_avx2 +#endif /* __APPLE__ */ +#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ #ifndef __APPLE__ .text .globl curve25519_avx2 @@ -10902,7 +16128,537 @@ L_curve25519_avx2_bits: movq %r12, 24(%rsp) movq 160(%rsp), %rbx decq %rbx + cmpq $3, %rbx jge L_curve25519_avx2_bits + movq %rbx, 160(%rsp) + movq 176(%rsp), %rax + negq %rax + # Conditional Swap + movq (%rdi), %r9 + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + movq 24(%rdi), %r12 + xorq 64(%rsp), %r9 + xorq 72(%rsp), %r10 + xorq 80(%rsp), %r11 + xorq 88(%rsp), %r12 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + andq %rax, %r12 + xorq %r9, (%rdi) + xorq %r10, 8(%rdi) + xorq %r11, 16(%rdi) + xorq %r12, 24(%rdi) + xorq %r9, 64(%rsp) + xorq %r10, 72(%rsp) + xorq %r11, 80(%rsp) + xorq %r12, 88(%rsp) + # Conditional Swap + movq (%rsp), %r9 + movq 8(%rsp), %r10 + movq 16(%rsp), %r11 + movq 24(%rsp), %r12 + xorq 32(%rsp), %r9 + xorq 40(%rsp), %r10 + xorq 48(%rsp), %r11 + xorq 56(%rsp), %r12 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + 
andq %rax, %r12 + xorq %r9, (%rsp) + xorq %r10, 8(%rsp) + xorq %r11, 16(%rsp) + xorq %r12, 24(%rsp) + xorq %r9, 32(%rsp) + xorq %r10, 40(%rsp) + xorq %r11, 48(%rsp) + xorq %r12, 56(%rsp) +L_curve25519_avx2_last_3: + # Add-Sub + # Add + movq (%rdi), %r9 + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + movq 24(%rdi), %r12 + movq %r9, %r13 + addq (%rsp), %r9 + movq %r10, %r14 + adcq 8(%rsp), %r10 + movq %r11, %r15 + adcq 16(%rsp), %r11 + movq %r12, %rbp + adcq 24(%rsp), %r12 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r12, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r12 + # Sub modulus (if overflow) + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Sub + subq (%rsp), %r13 + sbbq 8(%rsp), %r14 + sbbq 16(%rsp), %r15 + sbbq 24(%rsp), %rbp + sbbq %rcx, %rcx + shldq $0x01, %rbp, %rcx + imulq $-19, %rcx + andq %rbx, %rbp + # Add modulus (if underflow) + subq %rcx, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbp + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + movq %r13, 128(%rsp) + movq %r14, 136(%rsp) + movq %r15, 144(%rsp) + movq %rbp, 152(%rsp) + # Square + movq 128(%rsp), %rdx + movq 136(%rsp), %rax + # A[0] * A[1] + movq %rdx, %rbp + mulxq %rax, %r10, %r11 + # A[0] * A[3] + mulxq 152(%rsp), %r12, %r13 + # A[2] * A[1] + movq 144(%rsp), %rdx + mulxq %rax, %rcx, %rbx + xorq %r9, %r9 + adoxq %rcx, %r12 + # A[2] * A[3] + mulxq 152(%rsp), %r14, %r15 + adoxq %rbx, %r13 + # A[2] * A[0] + mulxq %rbp, %rcx, %rbx + adoxq %r9, %r14 + adcxq %rcx, %r11 + adoxq %r9, %r15 + # A[1] * A[3] + movq %rax, %rdx + mulxq 152(%rsp), %rcx, %rdx + adcxq %rbx, %r12 + adcxq %rcx, %r13 + adcxq %rdx, %r14 + adcxq %r9, %r15 + # A[0] * A[0] + movq %rbp, %rdx + mulxq %rdx, %r9, %rcx + xorq %rbp, %rbp + adcxq %r10, %r10 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r10 + mulxq %rdx, %rcx, %rbx + adcxq %r11, %r11 + adoxq %rcx, %r11 + adcxq %r12, %r12 + # A[2] * A[2] + movq 144(%rsp), %rdx + adoxq %rbx, %r12 + mulxq %rdx, %rbx, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r13 + adcxq %r14, %r14 + # A[3] * A[3] + movq 152(%rsp), %rdx + adoxq %rcx, %r14 + mulxq %rdx, %rcx, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r15 + adcxq %rbp, %rbp + adoxq %rbx, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %rbx + addq %rbp, %r12 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r12, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r12 + xorq %rcx, %rcx + adoxq %rbx, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rbx, %r15 + adcxq %rbx, %r11 + adoxq %r15, %r12 + adcxq %rcx, %r12 + # Store + movq %r9, 96(%rsp) + movq %r10, 104(%rsp) + movq %r11, 112(%rsp) + movq %r12, 120(%rsp) + # Square + movq (%rdi), %rdx + movq 8(%rdi), %rax + # A[0] * A[1] + movq %rdx, %rbp + mulxq %rax, %r10, %r11 + # A[0] * A[3] + mulxq 24(%rdi), %r12, %r13 + # A[2] * A[1] + movq 16(%rdi), %rdx + mulxq %rax, %rcx, %rbx + xorq %r9, %r9 + adoxq %rcx, %r12 + # A[2] * A[3] + mulxq 24(%rdi), %r14, %r15 + adoxq %rbx, %r13 + # A[2] * A[0] + mulxq %rbp, %rcx, %rbx + adoxq %r9, %r14 + adcxq %rcx, %r11 + adoxq %r9, %r15 + # A[1] * A[3] + movq %rax, %rdx + mulxq 24(%rdi), %rcx, %rdx + adcxq %rbx, %r12 + adcxq %rcx, %r13 + adcxq %rdx, %r14 + adcxq %r9, %r15 + # A[0] * A[0] + movq %rbp, %rdx + mulxq %rdx, %r9, %rcx + xorq %rbp, %rbp + adcxq %r10, %r10 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r10 + mulxq %rdx, %rcx, %rbx + adcxq %r11, %r11 + adoxq %rcx, %r11 + adcxq 
%r12, %r12 + # A[2] * A[2] + movq 16(%rdi), %rdx + adoxq %rbx, %r12 + mulxq %rdx, %rbx, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r13 + adcxq %r14, %r14 + # A[3] * A[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r14 + mulxq %rdx, %rcx, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r15 + adcxq %rbp, %rbp + adoxq %rbx, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %rbx + addq %rbp, %r12 + adcq $0x00, %rbx + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r12, %rbx + imulq $19, %rbx, %rbx + andq %rcx, %r12 + xorq %rcx, %rcx + adoxq %rbx, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rbx, %r15 + adcxq %rbx, %r11 + adoxq %r15, %r12 + adcxq %rcx, %r12 + # Store + movq %r9, 128(%rsp) + movq %r10, 136(%rsp) + movq %r11, 144(%rsp) + movq %r12, 152(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rdx + mulxq 128(%rsp), %r9, %r10 + # A[2] * B[0] + mulxq 144(%rsp), %r11, %r12 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r12 + mulxq 136(%rsp), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r14 + mulxq 152(%rsp), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq 152(%rsp), %rdx + adcxq %rcx, %r13 + mulxq 96(%rsp), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 120(%rsp), %rdx + adcxq %rcx, %r15 + mulxq 144(%rsp), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r12 + xorq %rbx, %rbx + adoxq %rcx, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %rbx, %r12 + # Store + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + # Sub + movq 128(%rsp), %r9 + movq 136(%rsp), %r10 + movq 144(%rsp), %r11 + movq 152(%rsp), %r12 + subq 96(%rsp), %r9 + sbbq 104(%rsp), %r10 + sbbq 112(%rsp), %r11 + sbbq 120(%rsp), %r12 + sbbq %rcx, %rcx + shldq $0x01, %r12, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $-19, %rcx + andq %rbx, %r12 + # Add modulus (if underflow) + subq %rcx, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 + sbbq $0x00, %r12 + movq %r9, 128(%rsp) + movq %r10, 136(%rsp) + movq %r11, 144(%rsp) + movq %r12, 152(%rsp) + movq $0x1db42, %rdx + mulxq 128(%rsp), %r9, %rbp + mulxq 136(%rsp), %r10, %r15 + 
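# (reduction note: the shld/and/imul-by-19 sequence following these partial products folds bits at and above 2^255 back in, since 2^255 mod p = 19 for p = 2^255 - 19) + 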
mulxq 144(%rsp), %r11, %r14 + mulxq 152(%rsp), %r12, %r13 + addq %rbp, %r10 + adcq %r15, %r11 + adcq %r14, %r12 + adcq $0x00, %r13 + movq $0x7fffffffffffffff, %rbp + shldq $0x01, %r12, %r13 + andq %rbp, %r12 + imulq $19, %r13, %r13 + addq %r13, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + movq %r9, (%rsp) + movq %r10, 8(%rsp) + movq %r11, 16(%rsp) + movq %r12, 24(%rsp) + # Add + movq 96(%rsp), %r9 + movq 104(%rsp), %r10 + addq (%rsp), %r9 + movq 112(%rsp), %r11 + adcq 8(%rsp), %r10 + movq 120(%rsp), %r12 + adcq 16(%rsp), %r11 + adcq 24(%rsp), %r12 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r12, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rcx + andq %rbx, %r12 + # Sub modulus (if overflow) + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + movq %r9, 96(%rsp) + movq %r10, 104(%rsp) + movq %r11, 112(%rsp) + movq %r12, 120(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rdx + mulxq 128(%rsp), %r9, %r10 + # A[2] * B[0] + mulxq 144(%rsp), %r11, %r12 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r12 + mulxq 136(%rsp), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r14 + mulxq 152(%rsp), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq 152(%rsp), %rdx + adcxq %rcx, %r13 + mulxq 96(%rsp), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 120(%rsp), %rdx + adcxq %rcx, %r15 + mulxq 144(%rsp), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %rbx + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %rbx, %r12 + xorq %rbx, %rbx + adoxq %rcx, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %rbx, %r12 + # Store + movq %r9, (%rsp) + movq %r10, 8(%rsp) + movq %r11, 16(%rsp) + movq %r12, 24(%rsp) + movq 160(%rsp), %rbx + decq %rbx + movq %rbx, 160(%rsp) + jge L_curve25519_avx2_last_3 # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi @@ -11280,126 +17036,6 @@ L_curve25519_avx2_bits: #ifndef __APPLE__ .size curve25519_avx2,.-curve25519_avx2 #endif /* __APPLE__ */ -#ifdef HAVE_ED25519 -#ifndef __APPLE__ -.text -.globl fe_sq2_avx2 -.type fe_sq2_avx2,@function -.align 16 -fe_sq2_avx2: -#else -.section __TEXT,__text -.globl _fe_sq2_avx2 -.p2align 4 -_fe_sq2_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - 
pushq %r13 - pushq %r14 - pushq %r15 - # Square * 2 - movq (%rsi), %rdx - movq 8(%rsi), %rax - # A[0] * A[1] - movq %rdx, %r15 - mulxq %rax, %r9, %r10 - # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] - movq 16(%rsi), %rdx - mulxq %rax, %rcx, %rbx - xorq %r8, %r8 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - adoxq %rbx, %r12 - # A[2] * A[0] - mulxq %r15, %rcx, %rbx - adoxq %r8, %r13 - adcxq %rcx, %r10 - adoxq %r8, %r14 - # A[1] * A[3] - movq %rax, %rdx - mulxq 24(%rsi), %rcx, %rdx - adcxq %rbx, %r11 - adcxq %rcx, %r12 - adcxq %rdx, %r13 - adcxq %r8, %r14 - # A[0] * A[0] - movq %r15, %rdx - mulxq %rdx, %r8, %rcx - xorq %r15, %r15 - adcxq %r9, %r9 - # A[1] * A[1] - movq %rax, %rdx - adoxq %rcx, %r9 - mulxq %rdx, %rcx, %rbx - adcxq %r10, %r10 - adoxq %rcx, %r10 - adcxq %r11, %r11 - # A[2] * A[2] - movq 16(%rsi), %rdx - adoxq %rbx, %r11 - mulxq %rdx, %rbx, %rcx - adcxq %r12, %r12 - adoxq %rbx, %r12 - adcxq %r13, %r13 - # A[3] * A[3] - movq 24(%rsi), %rdx - adoxq %rcx, %r13 - mulxq %rdx, %rcx, %rbx - adcxq %r14, %r14 - adoxq %rcx, %r14 - adcxq %r15, %r15 - adoxq %rbx, %r15 - movq $38, %rdx - mulxq %r15, %r15, %rax - addq %r15, %r11 - adcq $0x00, %rax - movq $0x7fffffffffffffff, %rcx - shldq $0x01, %r11, %rax - imulq $19, %rax, %rax - andq %rcx, %r11 - xorq %rcx, %rcx - adoxq %rax, %r8 - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 - adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - adcxq %rcx, %r11 - movq %r11, %rax - shldq $0x01, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shlq $0x01, %r8 - movq $0x7fffffffffffffff, %rcx - shrq $62, %rax - andq %rcx, %r11 - imulq $19, %rax, %rax - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_sq2_avx2,.-fe_sq2_avx2 -#endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_pow22523_avx2 @@ -15158,6 +20794,126 @@ _ge_sub_avx2: #ifndef __APPLE__ .size ge_sub_avx2,.-ge_sub_avx2 #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl fe_sq2_avx2 +.type fe_sq2_avx2,@function +.align 16 +fe_sq2_avx2: +#else +.section __TEXT,__text +.globl _fe_sq2_avx2 +.p2align 4 +_fe_sq2_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + # Square * 2 + movq (%rsi), %rdx + movq 8(%rsi), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 24(%rsi), %r11, %r12 + # A[2] * A[1] + movq 16(%rsi), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rsi), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 24(%rsi), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 16(%rsi), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 24(%rsi), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq 
%r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rax + addq %r15, %r11 + adcq $0x00, %rax + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rax + imulq $19, %rax, %rax + andq %rcx, %r11 + xorq %rcx, %rcx + adoxq %rax, %r8 + mulxq %r12, %rax, %r12 + adcxq %rax, %r8 + adoxq %r12, %r9 + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + movq %r11, %rax + shldq $0x01, %r10, %r11 + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shlq $0x01, %r8 + movq $0x7fffffffffffffff, %rcx + shrq $62, %rax + andq %rcx, %r11 + imulq $19, %rax, %rax + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_sq2_avx2,.-fe_sq2_avx2 +#endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl sc_reduce_avx2 diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c index c6120d4d6..c3264e94f 100644 --- a/wolfcrypt/src/ge_operations.c +++ b/wolfcrypt/src/ge_operations.c @@ -24,10 +24,11 @@ #include -#ifdef HAVE_ED25519 +#include + +#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519) #ifndef ED25519_SMALL /* run when not defined to use small memory math */ -#include #include #ifdef NO_INLINE #include @@ -52,7 +53,8 @@ static void ge_p2_0(ge_p2 *h); #ifndef CURVED25519_ASM -#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) || \ + defined(WOLFSSL_CURVE25519_USE_ED25519) static void ge_precomp_0(ge_precomp *h); #endif static void ge_p3_to_p2(ge_p2 *r,const ge_p3 *p); @@ -968,7 +970,8 @@ static unsigned char equal(unsigned char b,unsigned char c) return (unsigned char)y; } -#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) || \ + defined(WOLFSSL_CURVE25519_USE_ED25519) static unsigned char negative(signed char b) { return ((unsigned char)b) >> 7; @@ -986,7 +989,8 @@ static WC_INLINE void cmov(ge_precomp *t,const ge_precomp *u,unsigned char b, } #endif -#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) || \ + defined(WOLFSSL_CURVE25519_USE_ED25519) #ifdef CURVED25519_ASM_64BIT static const ge_precomp base[64][8] = { { @@ -9066,7 +9070,6 @@ static const ge_precomp base[32][8] = { } ; #endif - static void ge_select(ge_precomp *t,int pos,signed char b) { #ifndef CURVED25519_ASM @@ -9165,7 +9168,8 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a) } #endif } -#endif /* HAVE_ED25519_SIGN || HAVE_ED25519_MAKE_KEY */ +#endif /* HAVE_ED25519_SIGN || HAVE_ED25519_MAKE_KEY || + * WOLFSSL_CURVE25519_USE_ED25519 */ #define SLIDE_SIZE 256 @@ -9769,7 +9773,8 @@ void ge_p3_tobytes(unsigned char *s,const ge_p3 *h) #ifndef CURVED25519_ASM -#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) || \ + defined(WOLFSSL_CURVE25519_USE_ED25519) /* ge_precomp_0 */ static void ge_precomp_0(ge_precomp *h) { diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S index cc9de006f..165372ebe 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -345,7 +345,7 @@ fe_add: bl 
fe_add_op pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_add,.-fe_add -#ifdef HAVE_ED25519 +#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519) .text .align 4 .globl fe_frombytes @@ -590,7 +590,7 @@ fe_isnegative: eor r0, r0, r1 pop {r4, r5, pc} .size fe_isnegative,.-fe_isnegative -#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) +#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) || defined(WOLFSSL_CURVE25519_USE_ED25519) #ifndef WC_NO_CACHE_RESISTANT .text .align 4 @@ -2235,8 +2235,8 @@ fe_cmov_table: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_cmov_table,.-fe_cmov_table #endif /* WC_NO_CACHE_RESISTANT */ -#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ -#endif /* HAVE_ED25519 */ +#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN || WOLFSSL_CURVE25519_USE_ED25519 */ +#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */ #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) .text .align 4 @@ -3981,7 +3981,7 @@ L_curve25519_inv_8: .size curve25519,.-curve25519 #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_CURVE25519 */ -#ifdef HAVE_ED25519 +#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519) .text .align 4 .globl fe_invert @@ -5217,6 +5217,8 @@ ge_sub: add sp, sp, #44 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size ge_sub,.-ge_sub +#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */ +#ifdef HAVE_ED25519 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) .text .align 4 diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 04b9eb441..4c4af4dd1 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -459,7 +459,7 @@ WC_OMIT_FRAME_POINTER void fe_add(fe r, const fe a, const fe b) ); } -#ifdef HAVE_ED25519 +#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519) #ifndef WOLFSSL_NO_VAR_ASSIGN_REG WC_OMIT_FRAME_POINTER void fe_frombytes(fe out_p, const unsigned char* in_p) #else @@ -823,7 +823,8 @@ WC_OMIT_FRAME_POINTER int fe_isnegative(const fe a) return (word32)(size_t)a; } -#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) +#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) || \ + defined(WOLFSSL_CURVE25519_USE_ED25519) #ifndef WC_NO_CACHE_RESISTANT #ifndef WOLFSSL_NO_VAR_ASSIGN_REG WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) @@ -2502,8 +2503,9 @@ WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r, fe* base, signed char b) } #endif /* WC_NO_CACHE_RESISTANT */ -#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ -#endif /* HAVE_ED25519 */ +#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN || + * WOLFSSL_CURVE25519_USE_ED25519 */ +#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */ #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) void fe_mul_op(void); #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -4422,7 +4424,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a) #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_CURVE25519 */ -#ifdef HAVE_ED25519 +#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519) #ifndef WOLFSSL_NO_VAR_ASSIGN_REG WC_OMIT_FRAME_POINTER void fe_invert(fe r_p, const fe a_p) #else @@ -5860,6 +5862,8 @@ WC_OMIT_FRAME_POINTER void ge_sub(ge_p1p1 * r, const ge_p3 * p, ); } +#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */ +#ifdef HAVE_ED25519 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) #ifndef 
WOLFSSL_NO_VAR_ASSIGN_REG WC_OMIT_FRAME_POINTER void sc_reduce(byte* s_p) diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index 888e2bdc7..ffaf2d6b5 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -47,7 +47,6 @@ _fe_init: #ifndef __APPLE__ .size fe_init,.-fe_init #endif /* __APPLE__ */ -#ifdef HAVE_ED25519 #ifndef __APPLE__ .text .globl fe_frombytes @@ -554,7 +553,6 @@ _fe_cmov_table: #ifndef __APPLE__ .size fe_cmov_table,.-fe_cmov_table #endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ #ifndef __APPLE__ .text .globl fe_mul @@ -1693,6 +1691,2557 @@ L_fe_invert8: #ifndef __APPLE__ .size fe_invert,.-fe_invert #endif /* __APPLE__ */ +#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#ifndef __APPLE__ + .text + .type L_curve25519_base_x2, %object + .section .rodata + .size L_curve25519_base_x2, 32 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 5 +#else + .p2align 5 +#endif /* __APPLE__ */ +L_curve25519_base_x2: +.xword 0x5cae469cdd684efb, 0x8f3f5ced1e350b5c +.xword 0xd9750c687d157114, 0x20d342d51873f1b7 +#ifndef __APPLE__ +.text +.globl curve25519_base +.type curve25519_base,@function +.align 2 +curve25519_base: +#else +.section __TEXT,__text +.globl _curve25519_base +.p2align 2 +_curve25519_base: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-272]! + add x29, sp, #0 + stp x17, x19, [x29, #184] + stp x20, x21, [x29, #200] + stp x22, x23, [x29, #216] + stp x24, x25, [x29, #232] + stp x26, x27, [x29, #248] + str x28, [x29, #264] +#ifndef __APPLE__ + adrp x2, L_curve25519_base_x2 + add x2, x2, :lo12:L_curve25519_base_x2 +#else + adrp x2, L_curve25519_base_x2@PAGE + add x2, x2, :lo12:L_curve25519_base_x2@PAGEOFF +#endif /* __APPLE__ */ + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + mov x10, #1 + mov x11, xzr + mov x12, xzr + mov x13, xzr + # Set base point x-ordinate + mov x24, #9 + stp x24, xzr, [x0] + stp xzr, xzr, [x0, #16] + # Set one + mov x24, #1 + stp x24, xzr, [x29, #16] + stp xzr, xzr, [x29, #32] + mov x2, xzr + mov x23, x0 + mov x24, #0xfd +L_curve25519_base_bits: + lsr x3, x24, #6 + and x4, x24, #63 + ldr x5, [x1, x3, LSL 3] + lsr x5, x5, x4 + eor x2, x2, x5 + # Conditional Swap + subs xzr, xzr, x2, lsl 63 + ldp x25, x26, [x29, #16] + ldp x27, x28, [x29, #32] + csel x19, x25, x10, ne + csel x25, x10, x25, ne + csel x20, x26, x11, ne + csel x26, x11, x26, ne + csel x21, x27, x12, ne + csel x27, x12, x27, ne + csel x22, x28, x13, ne + csel x28, x13, x28, ne + # Conditional Swap + subs xzr, xzr, x2, lsl 63 + ldp x10, x11, [x0] + ldp x12, x13, [x0, #16] + csel x14, x10, x6, ne + csel x10, x6, x10, ne + csel x15, x11, x7, ne + csel x11, x7, x11, ne + csel x16, x12, x8, ne + csel x12, x8, x12, ne + csel x17, x13, x9, ne + csel x13, x9, x13, ne + mov x2, x5 + # Add + adds x6, x10, x25 + adcs x7, x11, x26 + adcs x8, x12, x27 + adcs x9, x13, x28 + cset x5, cs + mov x3, #19 + extr x5, x5, x9, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x6, x6, x3 + adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff + adcs x8, x8, xzr + adc x9, x9, xzr + # Sub + subs x25, x10, x25 + sbcs x26, x11, x26 + sbcs x27, x12, x27 + sbcs x28, x13, x28 + csetm x5, cc + mov x3, #-19 + extr x5, x5, x28, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x25, x25, x3 + sbcs x26, x26, xzr + and x28, x28, #0x7fffffffffffffff + sbcs x27, x27, xzr + sbc x28, x28, xzr + stp x25, x26, [x29, #80] + stp x27, x28, [x29, #96] + # Add + adds x10, x14, 
x19 + adcs x11, x15, x20 + adcs x12, x16, x21 + adcs x13, x17, x22 + cset x5, cs + mov x3, #19 + extr x5, x5, x13, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x10, x10, x3 + adcs x11, x11, xzr + and x13, x13, #0x7fffffffffffffff + adcs x12, x12, xzr + adc x13, x13, xzr + # Sub + subs x14, x14, x19 + sbcs x15, x15, x20 + sbcs x16, x16, x21 + sbcs x17, x17, x22 + csetm x5, cc + mov x3, #-19 + extr x5, x5, x17, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x14, x14, x3 + sbcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff + sbcs x16, x16, xzr + sbc x17, x17, xzr + # Multiply + # A[0] * B[0] + umulh x20, x14, x6 + mul x19, x14, x6 + # A[2] * B[0] + umulh x22, x16, x6 + mul x21, x16, x6 + # A[1] * B[0] + mul x3, x15, x6 + adds x20, x20, x3 + umulh x4, x15, x6 + adcs x21, x21, x4 + # A[1] * B[3] + umulh x26, x15, x9 + adc x22, x22, xzr + mul x25, x15, x9 + # A[0] * B[1] + mul x3, x14, x7 + adds x20, x20, x3 + umulh x4, x14, x7 + adcs x21, x21, x4 + # A[2] * B[1] + mul x3, x16, x7 + adcs x22, x22, x3 + umulh x4, x16, x7 + adcs x25, x25, x4 + adc x26, x26, xzr + # A[1] * B[2] + mul x3, x15, x8 + adds x22, x22, x3 + umulh x4, x15, x8 + adcs x25, x25, x4 + adcs x26, x26, xzr + adc x27, xzr, xzr + # A[0] * B[2] + mul x3, x14, x8 + adds x21, x21, x3 + umulh x4, x14, x8 + adcs x22, x22, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr + adc x27, x27, xzr + # A[1] * B[1] + mul x3, x15, x7 + adds x21, x21, x3 + umulh x4, x15, x7 + adcs x22, x22, x4 + # A[3] * B[1] + mul x3, x17, x7 + adcs x25, x25, x3 + umulh x4, x17, x7 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[2] + mul x3, x16, x8 + adds x25, x25, x3 + umulh x4, x16, x8 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x9 + adcs x27, x27, x3 + umulh x28, x17, x9 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x9 + adds x22, x22, x3 + umulh x4, x14, x9 + adcs x25, x25, x4 + # A[2] * B[3] + mul x3, x16, x9 + adcs x26, x26, x3 + umulh x4, x16, x9 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x6 + adds x22, x22, x3 + umulh x4, x17, x6 + adcs x25, x25, x4 + # A[3] * B[2] + mul x3, x17, x8 + adcs x26, x26, x3 + umulh x4, x17, x8 + adcs x27, x27, x4 + adc x28, x28, xzr + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x22, x22, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x19, x19, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x20, x20, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x21, x21, x4 + umulh x27, x3, x27 + adc x22, x22, xzr + # Add high product results in + adds x19, x19, x5 + adcs x20, x20, x25 + adcs x21, x21, x26 + adc x22, x22, x27 + # Store + stp x19, x20, [x29, #48] + stp x21, x22, [x29, #64] + # Multiply + ldp x25, x26, [x29, #80] + ldp x27, x28, [x29, #96] + # A[0] * B[0] + umulh x20, x10, x25 + mul x19, x10, x25 + # A[2] * B[0] + umulh x22, x12, x25 + mul x21, x12, x25 + # A[1] * B[0] + mul x3, x11, x25 + adds x20, x20, x3 + umulh x4, x11, x25 + adcs x21, x21, x4 + # A[1] * B[3] + umulh x15, x11, x28 + adc x22, x22, xzr + mul x14, x11, x28 + # A[0] * B[1] + mul x3, x10, x26 + adds x20, x20, x3 + umulh x4, x10, x26 + adcs x21, x21, x4 + # A[2] * B[1] + mul x3, x12, x26 + adcs x22, x22, x3 + umulh x4, x12, x26 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[1] * B[2] + mul x3, x11, x27 + adds x22, x22, x3 + umulh x4, x11, x27 + adcs x14, x14, x4 + adcs x15, x15, xzr + adc x16, xzr, xzr + # A[0] * B[2] + mul x3, x10, x27 + adds x21, x21, x3 + umulh 
x4, x10, x27 + adcs x22, x22, x4 + adcs x14, x14, xzr + adcs x15, x15, xzr + adc x16, x16, xzr + # A[1] * B[1] + mul x3, x11, x26 + adds x21, x21, x3 + umulh x4, x11, x26 + adcs x22, x22, x4 + # A[3] * B[1] + mul x3, x13, x26 + adcs x14, x14, x3 + umulh x4, x13, x26 + adcs x15, x15, x4 + adc x16, x16, xzr + # A[2] * B[2] + mul x3, x12, x27 + adds x14, x14, x3 + umulh x4, x12, x27 + adcs x15, x15, x4 + # A[3] * B[3] + mul x3, x13, x28 + adcs x16, x16, x3 + umulh x17, x13, x28 + adc x17, x17, xzr + # A[0] * B[3] + mul x3, x10, x28 + adds x22, x22, x3 + umulh x4, x10, x28 + adcs x14, x14, x4 + # A[2] * B[3] + mul x3, x12, x28 + adcs x15, x15, x3 + umulh x4, x12, x28 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[3] * B[0] + mul x3, x13, x25 + adds x22, x22, x3 + umulh x4, x13, x25 + adcs x14, x14, x4 + # A[3] * B[2] + mul x3, x13, x27 + adcs x15, x15, x3 + umulh x4, x13, x27 + adcs x16, x16, x4 + adc x17, x17, xzr + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x22, x22, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x19, x19, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x20, x20, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x21, x21, x4 + umulh x16, x3, x16 + adc x22, x22, xzr + # Add high product results in + adds x19, x19, x5 + adcs x20, x20, x14 + adcs x21, x21, x15 + adc x22, x22, x16 + # Square + # A[0] * A[1] + umulh x12, x25, x26 + mul x11, x25, x26 + # A[0] * A[3] + umulh x14, x25, x28 + mul x13, x25, x28 + # A[0] * A[2] + mul x3, x25, x27 + adds x12, x12, x3 + umulh x4, x25, x27 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x26, x28 + adcs x14, x14, x3 + umulh x15, x26, x28 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x26, x27 + adds x13, x13, x3 + umulh x4, x26, x27 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x27, x28 + adcs x15, x15, x3 + umulh x16, x27, x28 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x25, x25 + mul x10, x25, x25 + # A[1] * A[1] + mul x3, x26, x26 + adds x11, x11, x4 + umulh x4, x26, x26 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x27, x27 + adcs x13, x13, x4 + umulh x4, x27, x27 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x28, x28 + adcs x15, x15, x4 + umulh x4, x28, x28 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x10, x10, x5 + adcs x11, x11, x14 + adcs x12, x12, x15 + adc x13, x13, x16 + # Square + # A[0] * A[1] + umulh x16, x6, x7 + mul x15, x6, x7 + # A[0] * A[3] + umulh x25, x6, x9 + mul x17, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x16, x16, x3 + umulh x4, x6, x8 + adcs x17, x17, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x25, x25, x3 + umulh x26, x7, x9 + adc x26, x26, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x17, x17, x3 + umulh x4, x7, x8 + adcs x25, x25, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x26, x26, x3 + umulh x27, x8, x9 + adc x27, x27, xzr + # Double + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 
+ adcs x25, x25, x25 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x14, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x15, x15, x4 + umulh x4, x7, x7 + adcs x16, x16, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x17, x17, x4 + umulh x4, x8, x8 + adcs x25, x25, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x26, x26, x4 + umulh x4, x9, x9 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x17, x17, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x14, x14, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x15, x15, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x16, x16, x4 + umulh x27, x3, x27 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, x27 + # Multiply + # A[0] * B[0] + umulh x7, x14, x10 + mul x6, x14, x10 + # A[2] * B[0] + umulh x9, x16, x10 + mul x8, x16, x10 + # A[1] * B[0] + mul x3, x15, x10 + adds x7, x7, x3 + umulh x4, x15, x10 + adcs x8, x8, x4 + # A[1] * B[3] + umulh x26, x15, x13 + adc x9, x9, xzr + mul x25, x15, x13 + # A[0] * B[1] + mul x3, x14, x11 + adds x7, x7, x3 + umulh x4, x14, x11 + adcs x8, x8, x4 + # A[2] * B[1] + mul x3, x16, x11 + adcs x9, x9, x3 + umulh x4, x16, x11 + adcs x25, x25, x4 + adc x26, x26, xzr + # A[1] * B[2] + mul x3, x15, x12 + adds x9, x9, x3 + umulh x4, x15, x12 + adcs x25, x25, x4 + adcs x26, x26, xzr + adc x27, xzr, xzr + # A[0] * B[2] + mul x3, x14, x12 + adds x8, x8, x3 + umulh x4, x14, x12 + adcs x9, x9, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr + adc x27, x27, xzr + # A[1] * B[1] + mul x3, x15, x11 + adds x8, x8, x3 + umulh x4, x15, x11 + adcs x9, x9, x4 + # A[3] * B[1] + mul x3, x17, x11 + adcs x25, x25, x3 + umulh x4, x17, x11 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[2] + mul x3, x16, x12 + adds x25, x25, x3 + umulh x4, x16, x12 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x13 + adcs x27, x27, x3 + umulh x28, x17, x13 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x13 + adds x9, x9, x3 + umulh x4, x14, x13 + adcs x25, x25, x4 + # A[2] * B[3] + mul x3, x16, x13 + adcs x26, x26, x3 + umulh x4, x16, x13 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x10 + adds x9, x9, x3 + umulh x4, x17, x10 + adcs x25, x25, x4 + # A[3] * B[2] + mul x3, x17, x12 + adcs x26, x26, x3 + umulh x4, x17, x12 + adcs x27, x27, x4 + adc x28, x28, xzr + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x9, x9, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x6, x6, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x7, x7, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x8, x8, x4 + umulh x27, x3, x27 + adc x9, x9, xzr + # Add high product results in + adds x6, x6, x5 + adcs x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, x27 + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + # Sub + subs x14, x14, x10 + sbcs x15, x15, x11 + sbcs x16, x16, x12 + sbcs x17, x17, x13 + csetm x5, cc + mov x3, #-19 + # Mask the modulus + extr x5, x5, x17, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x14, x14, x3 + sbcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff + sbcs x16, x16, xzr + sbc x17, x17, xzr + # Multiply by 121666 + mov x5, #0xdb42 + movk x5, #1, lsl 16 + mul x6, x14, x5 + umulh x7, x14, x5 + mul x3, x15, 
x5 + umulh x8, x15, x5 + adds x7, x7, x3 + adc x8, x8, xzr + mul x3, x16, x5 + umulh x9, x16, x5 + adds x8, x8, x3 + adc x9, x9, xzr + mul x3, x17, x5 + umulh x4, x17, x5 + adds x9, x9, x3 + adc x4, x4, xzr + mov x5, #19 + extr x4, x4, x9, #63 + mul x4, x4, x5 + adds x6, x6, x4 + adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff + adcs x8, x8, xzr + adc x9, x9, xzr + # Add + adds x10, x10, x6 + adcs x11, x11, x7 + adcs x12, x12, x8 + adcs x13, x13, x9 + cset x5, cs + mov x3, #19 + # Mask the modulus + extr x5, x5, x13, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x10, x10, x3 + adcs x11, x11, xzr + and x13, x13, #0x7fffffffffffffff + adcs x12, x12, xzr + adc x13, x13, xzr + # Multiply + # A[0] * B[0] + umulh x7, x14, x10 + mul x6, x14, x10 + # A[2] * B[0] + umulh x9, x16, x10 + mul x8, x16, x10 + # A[1] * B[0] + mul x3, x15, x10 + adds x7, x7, x3 + umulh x4, x15, x10 + adcs x8, x8, x4 + # A[1] * B[3] + umulh x26, x15, x13 + adc x9, x9, xzr + mul x25, x15, x13 + # A[0] * B[1] + mul x3, x14, x11 + adds x7, x7, x3 + umulh x4, x14, x11 + adcs x8, x8, x4 + # A[2] * B[1] + mul x3, x16, x11 + adcs x9, x9, x3 + umulh x4, x16, x11 + adcs x25, x25, x4 + adc x26, x26, xzr + # A[1] * B[2] + mul x3, x15, x12 + adds x9, x9, x3 + umulh x4, x15, x12 + adcs x25, x25, x4 + adcs x26, x26, xzr + adc x27, xzr, xzr + # A[0] * B[2] + mul x3, x14, x12 + adds x8, x8, x3 + umulh x4, x14, x12 + adcs x9, x9, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr + adc x27, x27, xzr + # A[1] * B[1] + mul x3, x15, x11 + adds x8, x8, x3 + umulh x4, x15, x11 + adcs x9, x9, x4 + # A[3] * B[1] + mul x3, x17, x11 + adcs x25, x25, x3 + umulh x4, x17, x11 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[2] + mul x3, x16, x12 + adds x25, x25, x3 + umulh x4, x16, x12 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x13 + adcs x27, x27, x3 + umulh x28, x17, x13 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x13 + adds x9, x9, x3 + umulh x4, x14, x13 + adcs x25, x25, x4 + # A[2] * B[3] + mul x3, x16, x13 + adcs x26, x26, x3 + umulh x4, x16, x13 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x10 + adds x9, x9, x3 + umulh x4, x17, x10 + adcs x25, x25, x4 + # A[3] * B[2] + mul x3, x17, x12 + adcs x26, x26, x3 + umulh x4, x17, x12 + adcs x27, x27, x4 + adc x28, x28, xzr + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x9, x9, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x6, x6, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x7, x7, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x8, x8, x4 + umulh x27, x3, x27 + adc x9, x9, xzr + # Add high product results in + adds x6, x6, x5 + adcs x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, x27 + # Store + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + # Add + ldp x25, x26, [x29, #48] + ldp x27, x28, [x29, #64] + adds x10, x25, x19 + adcs x11, x26, x20 + adcs x12, x27, x21 + adcs x13, x28, x22 + cset x5, cs + mov x3, #19 + extr x5, x5, x13, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x10, x10, x3 + adcs x11, x11, xzr + and x13, x13, #0x7fffffffffffffff + adcs x12, x12, xzr + adc x13, x13, xzr + # Sub + subs x19, x25, x19 + sbcs x20, x26, x20 + sbcs x21, x27, x21 + sbcs x22, x28, x22 + csetm x5, cc + mov x3, #-19 + extr x5, x5, x22, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x19, x19, x3 + sbcs x20, x20, xzr + and x22, x22, #0x7fffffffffffffff + sbcs x21, x21, xzr + sbc x22, x22, xzr + # Square + # A[0] 
* A[1] + umulh x8, x10, x11 + mul x7, x10, x11 + # A[0] * A[3] + umulh x25, x10, x13 + mul x9, x10, x13 + # A[0] * A[2] + mul x3, x10, x12 + adds x8, x8, x3 + umulh x4, x10, x12 + adcs x9, x9, x4 + # A[1] * A[3] + mul x3, x11, x13 + adcs x25, x25, x3 + umulh x26, x11, x13 + adc x26, x26, xzr + # A[1] * A[2] + mul x3, x11, x12 + adds x9, x9, x3 + umulh x4, x11, x12 + adcs x25, x25, x4 + # A[2] * A[3] + mul x3, x12, x13 + adcs x26, x26, x3 + umulh x27, x12, x13 + adc x27, x27, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x25, x25, x25 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + umulh x4, x10, x10 + mul x6, x10, x10 + # A[1] * A[1] + mul x3, x11, x11 + adds x7, x7, x4 + umulh x4, x11, x11 + adcs x8, x8, x3 + # A[2] * A[2] + mul x3, x12, x12 + adcs x9, x9, x4 + umulh x4, x12, x12 + adcs x25, x25, x3 + # A[3] * A[3] + mul x3, x13, x13 + adcs x26, x26, x4 + umulh x4, x13, x13 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x9, x9, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x6, x6, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x7, x7, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x8, x8, x4 + umulh x27, x3, x27 + adc x9, x9, xzr + # Add high product results in + adds x6, x6, x5 + adcs x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, x27 + # Square + # A[0] * A[1] + umulh x16, x19, x20 + mul x15, x19, x20 + # A[0] * A[3] + umulh x25, x19, x22 + mul x17, x19, x22 + # A[0] * A[2] + mul x3, x19, x21 + adds x16, x16, x3 + umulh x4, x19, x21 + adcs x17, x17, x4 + # A[1] * A[3] + mul x3, x20, x22 + adcs x25, x25, x3 + umulh x26, x20, x22 + adc x26, x26, xzr + # A[1] * A[2] + mul x3, x20, x21 + adds x17, x17, x3 + umulh x4, x20, x21 + adcs x25, x25, x4 + # A[2] * A[3] + mul x3, x21, x22 + adcs x26, x26, x3 + umulh x27, x21, x22 + adc x27, x27, xzr + # Double + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x25, x25, x25 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + umulh x4, x19, x19 + mul x14, x19, x19 + # A[1] * A[1] + mul x3, x20, x20 + adds x15, x15, x4 + umulh x4, x20, x20 + adcs x16, x16, x3 + # A[2] * A[2] + mul x3, x21, x21 + adcs x17, x17, x4 + umulh x4, x21, x21 + adcs x25, x25, x3 + # A[3] * A[3] + mul x3, x22, x22 + adcs x26, x26, x4 + umulh x4, x22, x22 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x17, x17, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x14, x14, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x15, x15, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x16, x16, x4 + umulh x27, x3, x27 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, x27 + # Multiply by 9 + mov x5, #9 + mul x10, x14, x5 + umulh x11, x14, x5 + mul x3, x15, x5 + umulh x12, x15, x5 + adds x11, x11, x3 + adc x12, x12, xzr + mul x3, x16, x5 + umulh x13, x16, x5 + adds x12, x12, x3 + adc x13, x13, xzr + mul x3, x17, x5 + umulh x4, x17, x5 + adds x13, x13, x3 + adc x4, x4, xzr + mov x5, #19 + extr x4, x4, x13, #63 + mul x4, x4, x5 + adds x10, x10, x4 + adcs x11, x11, xzr + and x13, x13, #0x7fffffffffffffff + adcs x12, x12, xzr + adc x13, x13, xzr + subs x24, x24, #1 + cmp x24, #3 + bge 
L_curve25519_base_bits + # Conditional Swap + subs xzr, xzr, x2, lsl 63 + ldp x25, x26, [x29, #16] + ldp x27, x28, [x29, #32] + csel x19, x25, x10, ne + csel x25, x10, x25, ne + csel x20, x26, x11, ne + csel x26, x11, x26, ne + csel x21, x27, x12, ne + csel x27, x12, x27, ne + csel x22, x28, x13, ne + csel x28, x13, x28, ne + # Conditional Swap + subs xzr, xzr, x2, lsl 63 + ldp x10, x11, [x0] + ldp x12, x13, [x0, #16] + csel x14, x10, x6, ne + csel x10, x6, x10, ne + csel x15, x11, x7, ne + csel x11, x7, x11, ne + csel x16, x12, x8, ne + csel x12, x8, x12, ne + csel x17, x13, x9, ne + csel x13, x9, x13, ne +L_curve25519_base_3: + # Add + adds x6, x10, x25 + adcs x7, x11, x26 + adcs x8, x12, x27 + adcs x9, x13, x28 + cset x5, cs + mov x3, #19 + extr x5, x5, x9, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x6, x6, x3 + adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff + adcs x8, x8, xzr + adc x9, x9, xzr + # Sub + subs x25, x10, x25 + sbcs x26, x11, x26 + sbcs x27, x12, x27 + sbcs x28, x13, x28 + csetm x5, cc + mov x3, #-19 + extr x5, x5, x28, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x25, x25, x3 + sbcs x26, x26, xzr + and x28, x28, #0x7fffffffffffffff + sbcs x27, x27, xzr + sbc x28, x28, xzr + # Square + # A[0] * A[1] + umulh x21, x25, x26 + mul x20, x25, x26 + # A[0] * A[3] + umulh x14, x25, x28 + mul x22, x25, x28 + # A[0] * A[2] + mul x3, x25, x27 + adds x21, x21, x3 + umulh x4, x25, x27 + adcs x22, x22, x4 + # A[1] * A[3] + mul x3, x26, x28 + adcs x14, x14, x3 + umulh x15, x26, x28 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x26, x27 + adds x22, x22, x3 + umulh x4, x26, x27 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x27, x28 + adcs x15, x15, x3 + umulh x16, x27, x28 + adc x16, x16, xzr + # Double + adds x20, x20, x20 + adcs x21, x21, x21 + adcs x22, x22, x22 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x25, x25 + mul x19, x25, x25 + # A[1] * A[1] + mul x3, x26, x26 + adds x20, x20, x4 + umulh x4, x26, x26 + adcs x21, x21, x3 + # A[2] * A[2] + mul x3, x27, x27 + adcs x22, x22, x4 + umulh x4, x27, x27 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x28, x28 + adcs x15, x15, x4 + umulh x4, x28, x28 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x22, x22, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x19, x19, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x20, x20, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x21, x21, x4 + umulh x16, x3, x16 + adc x22, x22, xzr + # Add high product results in + adds x19, x19, x5 + adcs x20, x20, x14 + adcs x21, x21, x15 + adc x22, x22, x16 + # Square + # A[0] * A[1] + umulh x16, x6, x7 + mul x15, x6, x7 + # A[0] * A[3] + umulh x25, x6, x9 + mul x17, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x16, x16, x3 + umulh x4, x6, x8 + adcs x17, x17, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x25, x25, x3 + umulh x26, x7, x9 + adc x26, x26, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x17, x17, x3 + umulh x4, x7, x8 + adcs x25, x25, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x26, x26, x3 + umulh x27, x8, x9 + adc x27, x27, xzr + # Double + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x25, x25, x25 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x14, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x15, x15, x4 + umulh 
x4, x7, x7 + adcs x16, x16, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x17, x17, x4 + umulh x4, x8, x8 + adcs x25, x25, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x26, x26, x4 + umulh x4, x9, x9 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x17, x17, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x14, x14, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x15, x15, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x16, x16, x4 + umulh x27, x3, x27 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, x27 + # Multiply + # A[0] * B[0] + umulh x11, x14, x19 + mul x10, x14, x19 + # A[2] * B[0] + umulh x13, x16, x19 + mul x12, x16, x19 + # A[1] * B[0] + mul x3, x15, x19 + adds x11, x11, x3 + umulh x4, x15, x19 + adcs x12, x12, x4 + # A[1] * B[3] + umulh x26, x15, x22 + adc x13, x13, xzr + mul x25, x15, x22 + # A[0] * B[1] + mul x3, x14, x20 + adds x11, x11, x3 + umulh x4, x14, x20 + adcs x12, x12, x4 + # A[2] * B[1] + mul x3, x16, x20 + adcs x13, x13, x3 + umulh x4, x16, x20 + adcs x25, x25, x4 + adc x26, x26, xzr + # A[1] * B[2] + mul x3, x15, x21 + adds x13, x13, x3 + umulh x4, x15, x21 + adcs x25, x25, x4 + adcs x26, x26, xzr + adc x27, xzr, xzr + # A[0] * B[2] + mul x3, x14, x21 + adds x12, x12, x3 + umulh x4, x14, x21 + adcs x13, x13, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr + adc x27, x27, xzr + # A[1] * B[1] + mul x3, x15, x20 + adds x12, x12, x3 + umulh x4, x15, x20 + adcs x13, x13, x4 + # A[3] * B[1] + mul x3, x17, x20 + adcs x25, x25, x3 + umulh x4, x17, x20 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[2] + mul x3, x16, x21 + adds x25, x25, x3 + umulh x4, x16, x21 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x22 + adcs x27, x27, x3 + umulh x28, x17, x22 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x22 + adds x13, x13, x3 + umulh x4, x14, x22 + adcs x25, x25, x4 + # A[2] * B[3] + mul x3, x16, x22 + adcs x26, x26, x3 + umulh x4, x16, x22 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x19 + adds x13, x13, x3 + umulh x4, x17, x19 + adcs x25, x25, x4 + # A[3] * B[2] + mul x3, x17, x21 + adcs x26, x26, x3 + umulh x4, x17, x21 + adcs x27, x27, x4 + adc x28, x28, xzr + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x13, x13, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x10, x10, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x11, x11, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x12, x12, x4 + umulh x27, x3, x27 + adc x13, x13, xzr + # Add high product results in + adds x10, x10, x5 + adcs x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, x27 + # Store + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + # Sub + subs x14, x14, x19 + sbcs x15, x15, x20 + sbcs x16, x16, x21 + sbcs x17, x17, x22 + csetm x5, cc + mov x3, #-19 + # Mask the modulus + extr x5, x5, x17, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x14, x14, x3 + sbcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff + sbcs x16, x16, xzr + sbc x17, x17, xzr + # Multiply by 121666 + mov x5, #0xdb42 + movk x5, #1, lsl 16 + mul x6, x14, x5 + umulh x7, x14, x5 + mul x3, x15, x5 + umulh x8, x15, x5 + adds x7, x7, x3 + adc x8, x8, xzr + mul x3, x16, x5 + umulh x9, x16, x5 + adds x8, x8, x3 + adc x9, x9, xzr + mul 
x3, x17, x5 + umulh x4, x17, x5 + adds x9, x9, x3 + adc x4, x4, xzr + mov x5, #19 + extr x4, x4, x9, #63 + mul x4, x4, x5 + adds x6, x6, x4 + adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff + adcs x8, x8, xzr + adc x9, x9, xzr + # Add + adds x19, x19, x6 + adcs x20, x20, x7 + adcs x21, x21, x8 + adcs x22, x22, x9 + cset x5, cs + mov x3, #19 + # Mask the modulus + extr x5, x5, x22, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x19, x19, x3 + adcs x20, x20, xzr + and x22, x22, #0x7fffffffffffffff + adcs x21, x21, xzr + adc x22, x22, xzr + # Multiply + # A[0] * B[0] + umulh x26, x14, x19 + mul x25, x14, x19 + # A[2] * B[0] + umulh x28, x16, x19 + mul x27, x16, x19 + # A[1] * B[0] + mul x3, x15, x19 + adds x26, x26, x3 + umulh x4, x15, x19 + adcs x27, x27, x4 + # A[1] * B[3] + umulh x7, x15, x22 + adc x28, x28, xzr + mul x6, x15, x22 + # A[0] * B[1] + mul x3, x14, x20 + adds x26, x26, x3 + umulh x4, x14, x20 + adcs x27, x27, x4 + # A[2] * B[1] + mul x3, x16, x20 + adcs x28, x28, x3 + umulh x4, x16, x20 + adcs x6, x6, x4 + adc x7, x7, xzr + # A[1] * B[2] + mul x3, x15, x21 + adds x28, x28, x3 + umulh x4, x15, x21 + adcs x6, x6, x4 + adcs x7, x7, xzr + adc x8, xzr, xzr + # A[0] * B[2] + mul x3, x14, x21 + adds x27, x27, x3 + umulh x4, x14, x21 + adcs x28, x28, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr + # A[1] * B[1] + mul x3, x15, x20 + adds x27, x27, x3 + umulh x4, x15, x20 + adcs x28, x28, x4 + # A[3] * B[1] + mul x3, x17, x20 + adcs x6, x6, x3 + umulh x4, x17, x20 + adcs x7, x7, x4 + adc x8, x8, xzr + # A[2] * B[2] + mul x3, x16, x21 + adds x6, x6, x3 + umulh x4, x16, x21 + adcs x7, x7, x4 + # A[3] * B[3] + mul x3, x17, x22 + adcs x8, x8, x3 + umulh x9, x17, x22 + adc x9, x9, xzr + # A[0] * B[3] + mul x3, x14, x22 + adds x28, x28, x3 + umulh x4, x14, x22 + adcs x6, x6, x4 + # A[2] * B[3] + mul x3, x16, x22 + adcs x7, x7, x3 + umulh x4, x16, x22 + adcs x8, x8, x4 + adc x9, x9, xzr + # A[3] * B[0] + mul x3, x17, x19 + adds x28, x28, x3 + umulh x4, x17, x19 + adcs x6, x6, x4 + # A[3] * B[2] + mul x3, x17, x21 + adcs x7, x7, x3 + umulh x4, x17, x21 + adcs x8, x8, x4 + adc x9, x9, xzr + # Reduce + mov x3, #38 + mul x4, x3, x9 + adds x28, x28, x4 + umulh x5, x3, x9 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x28, #63 + mul x5, x5, x3 + and x28, x28, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x6 + adds x25, x25, x4 + umulh x6, x3, x6 + mul x4, x3, x7 + adcs x26, x26, x4 + umulh x7, x3, x7 + mul x4, x3, x8 + adcs x27, x27, x4 + umulh x8, x3, x8 + adc x28, x28, xzr + # Add high product results in + adds x25, x25, x5 + adcs x26, x26, x6 + adcs x27, x27, x7 + adc x28, x28, x8 + # Store + stp x25, x26, [x29, #16] + stp x27, x28, [x29, #32] + subs x24, x24, #1 + bge L_curve25519_base_3 + # Invert + add x0, x29, #48 + add x1, x29, #16 +#ifndef __APPLE__ + bl fe_sq +#else + bl _fe_sq +#endif /* __APPLE__ */ + add x0, x29, #0x50 + add x1, x29, #48 +#ifndef __APPLE__ + bl fe_sq +#else + bl _fe_sq +#endif /* __APPLE__ */ +#ifndef NDEBUG + add x0, x29, #0x50 +#endif /* !NDEBUG */ + add x1, x29, #0x50 +#ifndef __APPLE__ + bl fe_sq +#else + bl _fe_sq +#endif /* __APPLE__ */ +#ifndef NDEBUG + add x0, x29, #0x50 +#endif /* !NDEBUG */ + add x1, x29, #16 + add x2, x29, #0x50 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + add x0, x29, #48 + add x1, x29, #48 + add x2, x29, #0x50 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + add x0, x29, #0x70 +#ifndef NDEBUG + add x1, x29, #48 +#endif /* !NDEBUG */ +#ifndef __APPLE__ + 
bl fe_sq +#else + bl _fe_sq +#endif /* __APPLE__ */ + add x0, x29, #0x50 + add x1, x29, #0x50 + add x2, x29, #0x70 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 5 times + mov x24, #5 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] +L_curve25519_base_inv_1: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_1 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] +#ifndef NDEBUG + add x0, x29, #0x50 +#endif /* !NDEBUG */ + add x1, x29, #0x70 + add x2, x29, #0x50 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 10 times + mov x24, #10 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] +L_curve25519_base_inv_2: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, 
x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_2 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] + add x0, x29, #0x70 +#ifndef NDEBUG + add x1, x29, #0x70 +#endif /* !NDEBUG */ + add x2, x29, #0x50 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 20 times + mov x24, #20 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] +L_curve25519_base_inv_3: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_3 + # Store + stp x6, x7, [x29, #144] + stp x8, x9, [x29, #160] +#ifndef NDEBUG + add x0, x29, #0x70 +#endif /* !NDEBUG */ + add x1, x29, #0x90 + add x2, x29, #0x70 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 10 times + mov x24, #10 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] +L_curve25519_base_inv_4: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, 
x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_4 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] + add x0, x29, #0x50 + add x1, x29, #0x70 + add x2, x29, #0x50 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 50 times + mov x24, #50 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] +L_curve25519_base_inv_5: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_5 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] + add x0, x29, #0x70 +#ifndef NDEBUG + add x1, x29, #0x70 +#endif /* !NDEBUG */ + add x2, x29, #0x50 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 100 times + mov x24, #0x64 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] +L_curve25519_base_inv_6: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + 
adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_6 + # Store + stp x6, x7, [x29, #144] + stp x8, x9, [x29, #160] +#ifndef NDEBUG + add x0, x29, #0x70 +#endif /* !NDEBUG */ + add x1, x29, #0x90 + add x2, x29, #0x70 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 50 times + mov x24, #50 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] +L_curve25519_base_inv_7: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_7 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] + add x0, x29, #0x50 + add x1, x29, #0x70 + add x2, x29, #0x50 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 5 times + mov x24, #5 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] +L_curve25519_base_inv_8: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + 
# A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_base_inv_8 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + add x0, x29, #16 + add x1, x29, #0x50 + add x2, x29, #48 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + mov x0, x23 + # Multiply + ldp x6, x7, [x0] + ldp x8, x9, [x0, #16] + ldp x10, x11, [x29, #16] + ldp x12, x13, [x29, #32] + # A[0] * B[0] + umulh x15, x6, x10 + mul x14, x6, x10 + # A[2] * B[0] + umulh x17, x8, x10 + mul x16, x8, x10 + # A[1] * B[0] + mul x3, x7, x10 + adds x15, x15, x3 + umulh x4, x7, x10 + adcs x16, x16, x4 + # A[1] * B[3] + umulh x20, x7, x13 + adc x17, x17, xzr + mul x19, x7, x13 + # A[0] * B[1] + mul x3, x6, x11 + adds x15, x15, x3 + umulh x4, x6, x11 + adcs x16, x16, x4 + # A[2] * B[1] + mul x3, x8, x11 + adcs x17, x17, x3 + umulh x4, x8, x11 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[2] + mul x3, x7, x12 + adds x17, x17, x3 + umulh x4, x7, x12 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x6, x12 + adds x16, x16, x3 + umulh x4, x6, x12 + adcs x17, x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # A[1] * B[1] + mul x3, x7, x11 + adds x16, x16, x3 + umulh x4, x7, x11 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x9, x11 + adcs x19, x19, x3 + umulh x4, x9, x11 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x8, x12 + adds x19, x19, x3 + umulh x4, x8, x12 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x9, x13 + adcs x21, x21, x3 + umulh x22, x9, x13 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x6, x13 + adds x17, x17, x3 + umulh x4, x6, x13 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x8, x13 + adcs x20, x20, x3 + umulh x4, x8, x13 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x9, x10 + adds x17, x17, x3 + umulh x4, x9, x10 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x9, x12 + adcs x20, x20, x3 + umulh x4, x9, x12 + adcs x21, x21, x4 + adc x22, x22, xzr + # Reduce + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 + # Reduce if top bit set + mov x3, #19 + and x4, x3, x17, asr 63 + adds x14, x14, x4 + adcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff + adcs x16, x16, xzr + adc x17, x17, xzr + adds x4, x14, x3 + adcs x4, 
x15, xzr + adcs x4, x16, xzr + adc x4, x17, xzr + and x4, x3, x4, asr 63 + adds x14, x14, x4 + adcs x15, x15, xzr + mov x4, #0x7fffffffffffffff + adcs x16, x16, xzr + adc x17, x17, xzr + and x17, x17, x4 + # Store + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + mov x0, xzr + ldp x17, x19, [x29, #184] + ldp x20, x21, [x29, #200] + ldp x22, x23, [x29, #216] + ldp x24, x25, [x29, #232] + ldp x26, x27, [x29, #248] + ldr x28, [x29, #264] + ldp x29, x30, [sp], #0x110 + ret +#ifndef __APPLE__ + .size curve25519_base,.-curve25519_base +#endif /* __APPLE__ */ +#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ #ifndef __APPLE__ .text .globl curve25519 @@ -1715,7 +4264,6 @@ _curve25519: str x28, [x29, #280] mov x23, xzr str x0, [x29, #176] - str x2, [x29, #184] ldp x6, x7, [x2] ldp x8, x9, [x2, #16] mov x10, #1 @@ -2799,7 +5347,510 @@ L_curve25519_bits: adcs x12, x12, x26 adc x13, x13, x27 subs x24, x24, #1 + cmp x24, #3 bge L_curve25519_bits + # Conditional Swap + subs xzr, xzr, x23, lsl 63 + ldp x25, x26, [x29, #16] + ldp x27, x28, [x29, #32] + csel x19, x25, x10, ne + csel x25, x10, x25, ne + csel x20, x26, x11, ne + csel x26, x11, x26, ne + csel x21, x27, x12, ne + csel x27, x12, x27, ne + csel x22, x28, x13, ne + csel x28, x13, x28, ne + # Conditional Swap + subs xzr, xzr, x23, lsl 63 + ldp x10, x11, [x0] + ldp x12, x13, [x0, #16] + csel x14, x10, x6, ne + csel x10, x6, x10, ne + csel x15, x11, x7, ne + csel x11, x7, x11, ne + csel x16, x12, x8, ne + csel x12, x8, x12, ne + csel x17, x13, x9, ne + csel x13, x9, x13, ne +L_curve25519_3: + # Add + adds x6, x10, x25 + adcs x7, x11, x26 + adcs x8, x12, x27 + adcs x9, x13, x28 + cset x5, cs + mov x3, #19 + extr x5, x5, x9, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x6, x6, x3 + adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff + adcs x8, x8, xzr + adc x9, x9, xzr + # Sub + subs x25, x10, x25 + sbcs x26, x11, x26 + sbcs x27, x12, x27 + sbcs x28, x13, x28 + csetm x5, cc + mov x3, #-19 + extr x5, x5, x28, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x25, x25, x3 + sbcs x26, x26, xzr + and x28, x28, #0x7fffffffffffffff + sbcs x27, x27, xzr + sbc x28, x28, xzr + # Square + # A[0] * A[1] + umulh x21, x25, x26 + mul x20, x25, x26 + # A[0] * A[3] + umulh x14, x25, x28 + mul x22, x25, x28 + # A[0] * A[2] + mul x3, x25, x27 + adds x21, x21, x3 + umulh x4, x25, x27 + adcs x22, x22, x4 + # A[1] * A[3] + mul x3, x26, x28 + adcs x14, x14, x3 + umulh x15, x26, x28 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x26, x27 + adds x22, x22, x3 + umulh x4, x26, x27 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x27, x28 + adcs x15, x15, x3 + umulh x16, x27, x28 + adc x16, x16, xzr + # Double + adds x20, x20, x20 + adcs x21, x21, x21 + adcs x22, x22, x22 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x25, x25 + mul x19, x25, x25 + # A[1] * A[1] + mul x3, x26, x26 + adds x20, x20, x4 + umulh x4, x26, x26 + adcs x21, x21, x3 + # A[2] * A[2] + mul x3, x27, x27 + adcs x22, x22, x4 + umulh x4, x27, x27 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x28, x28 + adcs x15, x15, x4 + umulh x4, x28, x28 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x22, x22, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x19, x19, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x20, x20, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + 
adcs x21, x21, x4 + umulh x16, x3, x16 + adc x22, x22, xzr + # Add high product results in + adds x19, x19, x5 + adcs x20, x20, x14 + adcs x21, x21, x15 + adc x22, x22, x16 + # Square + # A[0] * A[1] + umulh x16, x6, x7 + mul x15, x6, x7 + # A[0] * A[3] + umulh x25, x6, x9 + mul x17, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x16, x16, x3 + umulh x4, x6, x8 + adcs x17, x17, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x25, x25, x3 + umulh x26, x7, x9 + adc x26, x26, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x17, x17, x3 + umulh x4, x7, x8 + adcs x25, x25, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x26, x26, x3 + umulh x27, x8, x9 + adc x27, x27, xzr + # Double + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x25, x25, x25 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x14, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x15, x15, x4 + umulh x4, x7, x7 + adcs x16, x16, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x17, x17, x4 + umulh x4, x8, x8 + adcs x25, x25, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x26, x26, x4 + umulh x4, x9, x9 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x17, x17, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x14, x14, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x15, x15, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x16, x16, x4 + umulh x27, x3, x27 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, x27 + # Multiply + # A[0] * B[0] + umulh x11, x14, x19 + mul x10, x14, x19 + # A[2] * B[0] + umulh x13, x16, x19 + mul x12, x16, x19 + # A[1] * B[0] + mul x3, x15, x19 + adds x11, x11, x3 + umulh x4, x15, x19 + adcs x12, x12, x4 + # A[1] * B[3] + umulh x26, x15, x22 + adc x13, x13, xzr + mul x25, x15, x22 + # A[0] * B[1] + mul x3, x14, x20 + adds x11, x11, x3 + umulh x4, x14, x20 + adcs x12, x12, x4 + # A[2] * B[1] + mul x3, x16, x20 + adcs x13, x13, x3 + umulh x4, x16, x20 + adcs x25, x25, x4 + adc x26, x26, xzr + # A[1] * B[2] + mul x3, x15, x21 + adds x13, x13, x3 + umulh x4, x15, x21 + adcs x25, x25, x4 + adcs x26, x26, xzr + adc x27, xzr, xzr + # A[0] * B[2] + mul x3, x14, x21 + adds x12, x12, x3 + umulh x4, x14, x21 + adcs x13, x13, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr + adc x27, x27, xzr + # A[1] * B[1] + mul x3, x15, x20 + adds x12, x12, x3 + umulh x4, x15, x20 + adcs x13, x13, x4 + # A[3] * B[1] + mul x3, x17, x20 + adcs x25, x25, x3 + umulh x4, x17, x20 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[2] + mul x3, x16, x21 + adds x25, x25, x3 + umulh x4, x16, x21 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x22 + adcs x27, x27, x3 + umulh x28, x17, x22 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x22 + adds x13, x13, x3 + umulh x4, x14, x22 + adcs x25, x25, x4 + # A[2] * B[3] + mul x3, x16, x22 + adcs x26, x26, x3 + umulh x4, x16, x22 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x19 + adds x13, x13, x3 + umulh x4, x17, x19 + adcs x25, x25, x4 + # A[3] * B[2] + mul x3, x17, x21 + adcs x26, x26, x3 + umulh x4, x17, x21 + adcs x27, x27, x4 + adc x28, x28, xzr + # Reduce + mov x3, #38 + mul x4, x3, x28 + adds x13, x13, x4 + umulh x5, x3, x28 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds 
x10, x10, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x11, x11, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x12, x12, x4 + umulh x27, x3, x27 + adc x13, x13, xzr + # Add high product results in + adds x10, x10, x5 + adcs x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, x27 + # Store + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + # Sub + subs x14, x14, x19 + sbcs x15, x15, x20 + sbcs x16, x16, x21 + sbcs x17, x17, x22 + csetm x5, cc + mov x3, #-19 + # Mask the modulus + extr x5, x5, x17, #63 + mul x3, x5, x3 + # Add modulus (if underflow) + subs x14, x14, x3 + sbcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff + sbcs x16, x16, xzr + sbc x17, x17, xzr + # Multiply by 121666 + mov x5, #0xdb42 + movk x5, #1, lsl 16 + mul x6, x14, x5 + umulh x7, x14, x5 + mul x3, x15, x5 + umulh x8, x15, x5 + adds x7, x7, x3 + adc x8, x8, xzr + mul x3, x16, x5 + umulh x9, x16, x5 + adds x8, x8, x3 + adc x9, x9, xzr + mul x3, x17, x5 + umulh x4, x17, x5 + adds x9, x9, x3 + adc x4, x4, xzr + mov x5, #19 + extr x4, x4, x9, #63 + mul x4, x4, x5 + adds x6, x6, x4 + adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff + adcs x8, x8, xzr + adc x9, x9, xzr + # Add + adds x19, x19, x6 + adcs x20, x20, x7 + adcs x21, x21, x8 + adcs x22, x22, x9 + cset x5, cs + mov x3, #19 + # Mask the modulus + extr x5, x5, x22, #63 + mul x3, x5, x3 + # Sub modulus (if overflow) + adds x19, x19, x3 + adcs x20, x20, xzr + and x22, x22, #0x7fffffffffffffff + adcs x21, x21, xzr + adc x22, x22, xzr + # Multiply + # A[0] * B[0] + umulh x26, x14, x19 + mul x25, x14, x19 + # A[2] * B[0] + umulh x28, x16, x19 + mul x27, x16, x19 + # A[1] * B[0] + mul x3, x15, x19 + adds x26, x26, x3 + umulh x4, x15, x19 + adcs x27, x27, x4 + # A[1] * B[3] + umulh x7, x15, x22 + adc x28, x28, xzr + mul x6, x15, x22 + # A[0] * B[1] + mul x3, x14, x20 + adds x26, x26, x3 + umulh x4, x14, x20 + adcs x27, x27, x4 + # A[2] * B[1] + mul x3, x16, x20 + adcs x28, x28, x3 + umulh x4, x16, x20 + adcs x6, x6, x4 + adc x7, x7, xzr + # A[1] * B[2] + mul x3, x15, x21 + adds x28, x28, x3 + umulh x4, x15, x21 + adcs x6, x6, x4 + adcs x7, x7, xzr + adc x8, xzr, xzr + # A[0] * B[2] + mul x3, x14, x21 + adds x27, x27, x3 + umulh x4, x14, x21 + adcs x28, x28, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr + # A[1] * B[1] + mul x3, x15, x20 + adds x27, x27, x3 + umulh x4, x15, x20 + adcs x28, x28, x4 + # A[3] * B[1] + mul x3, x17, x20 + adcs x6, x6, x3 + umulh x4, x17, x20 + adcs x7, x7, x4 + adc x8, x8, xzr + # A[2] * B[2] + mul x3, x16, x21 + adds x6, x6, x3 + umulh x4, x16, x21 + adcs x7, x7, x4 + # A[3] * B[3] + mul x3, x17, x22 + adcs x8, x8, x3 + umulh x9, x17, x22 + adc x9, x9, xzr + # A[0] * B[3] + mul x3, x14, x22 + adds x28, x28, x3 + umulh x4, x14, x22 + adcs x6, x6, x4 + # A[2] * B[3] + mul x3, x16, x22 + adcs x7, x7, x3 + umulh x4, x16, x22 + adcs x8, x8, x4 + adc x9, x9, xzr + # A[3] * B[0] + mul x3, x17, x19 + adds x28, x28, x3 + umulh x4, x17, x19 + adcs x6, x6, x4 + # A[3] * B[2] + mul x3, x17, x21 + adcs x7, x7, x3 + umulh x4, x17, x21 + adcs x8, x8, x4 + adc x9, x9, xzr + # Reduce + mov x3, #38 + mul x4, x3, x9 + adds x28, x28, x4 + umulh x5, x3, x9 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x28, #63 + mul x5, x5, x3 + and x28, x28, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x6 + adds x25, x25, x4 + umulh x6, x3, x6 + mul x4, x3, x7 + adcs x26, x26, x4 + umulh x7, x3, x7 + mul x4, x3, x8 + adcs x27, x27, x4 + umulh x8, x3, x8 + adc x28, x28, xzr + # Add high product results in + adds x25, x25, x5 + adcs x26, x26, x6 + adcs x27, 
x27, x7 + adc x28, x28, x8 + # Store + stp x25, x26, [x29, #16] + stp x27, x28, [x29, #32] + subs x24, x24, #1 + bge L_curve25519_3 # Invert add x0, x29, #48 add x1, x29, #16 @@ -3803,7 +6854,6 @@ L_curve25519_inv_8: #ifndef __APPLE__ .size curve25519,.-curve25519 #endif /* __APPLE__ */ -#ifdef HAVE_ED25519 #ifndef __APPLE__ .text .globl fe_pow22523 @@ -8307,6 +11357,7 @@ _ge_sub: #ifndef __APPLE__ .size ge_sub,.-ge_sub #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text .globl sc_reduce diff --git a/wolfcrypt/src/port/arm/armv8-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-curve25519_c.c index 323932163..c91e32e1a 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519_c.c @@ -48,7 +48,6 @@ void fe_init() ); } -#ifdef HAVE_ED25519 void fe_frombytes(fe out, const unsigned char* in) { __asm__ __volatile__ ( @@ -472,7 +471,6 @@ void fe_cmov_table(fe* r, fe* base, signed char b) ); } -#endif /* HAVE_ED25519 */ void fe_mul(fe r, const fe a, const fe b) { __asm__ __volatile__ ( @@ -1591,6 +1589,2534 @@ void fe_invert(fe r, const fe a) ); } +#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +static const word64 L_curve25519_base_x2[] = { + 0x5cae469cdd684efb, 0x8f3f5ced1e350b5c, + 0xd9750c687d157114, 0x20d342d51873f1b7, +}; + +int curve25519_base(byte* r, const byte* n) +{ + const word64* x2 = L_curve25519_base_x2; + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-176]!\n\t" + "add x29, sp, #0\n\t" + "ldp x6, x7, [%[x2]]\n\t" + "ldp x8, x9, [%[x2], #16]\n\t" + "mov x10, #1\n\t" + "mov x11, xzr\n\t" + "mov x12, xzr\n\t" + "mov x13, xzr\n\t" + /* Set base point x-ordinate */ + "mov x24, #9\n\t" + "stp x24, xzr, [%x[r]]\n\t" + "stp xzr, xzr, [%x[r], #16]\n\t" + /* Set one */ + "mov x24, #1\n\t" + "stp x24, xzr, [x29, #16]\n\t" + "stp xzr, xzr, [x29, #32]\n\t" + "mov %[x2], xzr\n\t" + "mov x23, %x[r]\n\t" + "mov x24, #0xfd\n\t" + "\n" + "L_curve25519_base_bits_%=: \n\t" + "lsr x3, x24, #6\n\t" + "and x4, x24, #63\n\t" + "ldr x5, [%x[n], x3, LSL 3]\n\t" + "lsr x5, x5, x4\n\t" + "eor %[x2], %[x2], x5\n\t" + /* Conditional Swap */ + "subs xzr, xzr, %[x2], lsl 63\n\t" + "ldp x25, x26, [x29, #16]\n\t" + "ldp x27, x28, [x29, #32]\n\t" + "csel x19, x25, x10, ne\n\t" + "csel x25, x10, x25, ne\n\t" + "csel x20, x26, x11, ne\n\t" + "csel x26, x11, x26, ne\n\t" + "csel x21, x27, x12, ne\n\t" + "csel x27, x12, x27, ne\n\t" + "csel x22, x28, x13, ne\n\t" + "csel x28, x13, x28, ne\n\t" + /* Conditional Swap */ + "subs xzr, xzr, %[x2], lsl 63\n\t" + "ldp x10, x11, [%x[r]]\n\t" + "ldp x12, x13, [%x[r], #16]\n\t" + "csel x14, x10, x6, ne\n\t" + "csel x10, x6, x10, ne\n\t" + "csel x15, x11, x7, ne\n\t" + "csel x11, x7, x11, ne\n\t" + "csel x16, x12, x8, ne\n\t" + "csel x12, x8, x12, ne\n\t" + "csel x17, x13, x9, ne\n\t" + "csel x13, x9, x13, ne\n\t" + "mov %[x2], x5\n\t" + /* Add */ + "adds x6, x10, x25\n\t" + "adcs x7, x11, x26\n\t" + "adcs x8, x12, x27\n\t" + "adcs x9, x13, x28\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x6, x6, x3\n\t" + "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Sub */ + "subs x25, x10, x25\n\t" + "sbcs x26, x11, x26\n\t" + "sbcs x27, x12, x27\n\t" + "sbcs x28, x13, x28\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + "extr x5, x5, x28, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x25, x25, x3\n\t" + "sbcs x26, x26, 
xzr\n\t" + "and x28, x28, #0x7fffffffffffffff\n\t" + "sbcs x27, x27, xzr\n\t" + "sbc x28, x28, xzr\n\t" + "stp x25, x26, [x29, #80]\n\t" + "stp x27, x28, [x29, #96]\n\t" + /* Add */ + "adds x10, x14, x19\n\t" + "adcs x11, x15, x20\n\t" + "adcs x12, x16, x21\n\t" + "adcs x13, x17, x22\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x10, x10, x3\n\t" + "adcs x11, x11, xzr\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Sub */ + "subs x14, x14, x19\n\t" + "sbcs x15, x15, x20\n\t" + "sbcs x16, x16, x21\n\t" + "sbcs x17, x17, x22\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x14, x14, x3\n\t" + "sbcs x15, x15, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "sbcs x16, x16, xzr\n\t" + "sbc x17, x17, xzr\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh x20, x14, x6\n\t" + "mul x19, x14, x6\n\t" + /* A[2] * B[0] */ + "umulh x22, x16, x6\n\t" + "mul x21, x16, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x6\n\t" + "adds x20, x20, x3\n\t" + "umulh x4, x15, x6\n\t" + "adcs x21, x21, x4\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x9\n\t" + "adc x22, x22, xzr\n\t" + "mul x25, x15, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x7\n\t" + "adds x20, x20, x3\n\t" + "umulh x4, x14, x7\n\t" + "adcs x21, x21, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x7\n\t" + "adcs x22, x22, x3\n\t" + "umulh x4, x16, x7\n\t" + "adcs x25, x25, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x8\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x15, x8\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x8\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x14, x8\n\t" + "adcs x22, x22, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x7\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x15, x7\n\t" + "adcs x22, x22, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x7\n\t" + "adcs x25, x25, x3\n\t" + "umulh x4, x17, x7\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x8\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x8\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x9\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x9\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x9\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x14, x9\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x9\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x16, x9\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x6\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x17, x6\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x8\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x17, x8\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x22, x22, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x22, #63\n\t" + "mul x5, x5, x3\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x19, x19, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x20, x20, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x21, x21, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc 
x22, x22, xzr\n\t" + /* Add high product results in */ + "adds x19, x19, x5\n\t" + "adcs x20, x20, x25\n\t" + "adcs x21, x21, x26\n\t" + "adc x22, x22, x27\n\t" + /* Store */ + "stp x19, x20, [x29, #48]\n\t" + "stp x21, x22, [x29, #64]\n\t" + /* Multiply */ + "ldp x25, x26, [x29, #80]\n\t" + "ldp x27, x28, [x29, #96]\n\t" + /* A[0] * B[0] */ + "umulh x20, x10, x25\n\t" + "mul x19, x10, x25\n\t" + /* A[2] * B[0] */ + "umulh x22, x12, x25\n\t" + "mul x21, x12, x25\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x25\n\t" + "adds x20, x20, x3\n\t" + "umulh x4, x11, x25\n\t" + "adcs x21, x21, x4\n\t" + /* A[1] * B[3] */ + "umulh x15, x11, x28\n\t" + "adc x22, x22, xzr\n\t" + "mul x14, x11, x28\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x26\n\t" + "adds x20, x20, x3\n\t" + "umulh x4, x10, x26\n\t" + "adcs x21, x21, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x26\n\t" + "adcs x22, x22, x3\n\t" + "umulh x4, x12, x26\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x27\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x11, x27\n\t" + "adcs x14, x14, x4\n\t" + "adcs x15, x15, xzr\n\t" + "adc x16, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x27\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x10, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x14, x14, xzr\n\t" + "adcs x15, x15, xzr\n\t" + "adc x16, x16, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x26\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x11, x26\n\t" + "adcs x22, x22, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x26\n\t" + "adcs x14, x14, x3\n\t" + "umulh x4, x13, x26\n\t" + "adcs x15, x15, x4\n\t" + "adc x16, x16, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x27\n\t" + "adds x14, x14, x3\n\t" + "umulh x4, x12, x27\n\t" + "adcs x15, x15, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x28\n\t" + "adcs x16, x16, x3\n\t" + "umulh x17, x13, x28\n\t" + "adc x17, x17, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x28\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x10, x28\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x28\n\t" + "adcs x15, x15, x3\n\t" + "umulh x4, x12, x28\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x25\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x13, x25\n\t" + "adcs x14, x14, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x27\n\t" + "adcs x15, x15, x3\n\t" + "umulh x4, x13, x27\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x22, x22, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x22, #63\n\t" + "mul x5, x5, x3\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x19, x19, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x20, x20, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x21, x21, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x22, x22, xzr\n\t" + /* Add high product results in */ + "adds x19, x19, x5\n\t" + "adcs x20, x20, x14\n\t" + "adcs x21, x21, x15\n\t" + "adc x22, x22, x16\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x25, x26\n\t" + "mul x11, x25, x26\n\t" + /* A[0] * A[3] */ + "umulh x14, x25, x28\n\t" + "mul x13, x25, x28\n\t" + /* A[0] * A[2] */ + "mul x3, x25, x27\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x25, x27\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x26, x28\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x26, x28\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x26, x27\n\t" + 
"adds x13, x13, x3\n\t" + "umulh x4, x26, x27\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x27, x28\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x27, x28\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x25, x25\n\t" + "mul x10, x25, x25\n\t" + /* A[1] * A[1] */ + "mul x3, x26, x26\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x26, x26\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x27, x27\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x27, x27\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x28, x28\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x28, x28\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x10, x10, x5\n\t" + "adcs x11, x11, x14\n\t" + "adcs x12, x12, x15\n\t" + "adc x13, x13, x16\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x16, x6, x7\n\t" + "mul x15, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x25, x6, x9\n\t" + "mul x17, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x17, x17, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x7, x9\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x26, x26, x3\n\t" + "umulh x27, x8, x9\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x25, x25, x25\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x14, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x15, x15, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x16, x16, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x17, x17, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x25, x25, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x26, x26, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x14, x14, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x15, x15, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x16, x16, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, x27\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh 
x7, x14, x10\n\t" + "mul x6, x14, x10\n\t" + /* A[2] * B[0] */ + "umulh x9, x16, x10\n\t" + "mul x8, x16, x10\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x10\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x15, x10\n\t" + "adcs x8, x8, x4\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x13\n\t" + "adc x9, x9, xzr\n\t" + "mul x25, x15, x13\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x11\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x14, x11\n\t" + "adcs x8, x8, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x11\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x16, x11\n\t" + "adcs x25, x25, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x12\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x15, x12\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x12\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x12\n\t" + "adcs x9, x9, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x11\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x15, x11\n\t" + "adcs x9, x9, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x11\n\t" + "adcs x25, x25, x3\n\t" + "umulh x4, x17, x11\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x12\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x12\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x13\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x13\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x13\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x14, x13\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x13\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x16, x13\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x10\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x17, x10\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x12\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x17, x12\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x9, x9, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x6, x6, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x9, x9, xzr\n\t" + /* Add high product results in */ + "adds x6, x6, x5\n\t" + "adcs x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, x27\n\t" + /* Store */ + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" + /* Sub */ + "subs x14, x14, x10\n\t" + "sbcs x15, x15, x11\n\t" + "sbcs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + /* Mask the modulus */ + "extr x5, x5, x17, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x14, x14, x3\n\t" + "sbcs x15, x15, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "sbcs x16, x16, xzr\n\t" + "sbc x17, x17, xzr\n\t" + /* Multiply by 121666 */ + "mov x5, #0xdb42\n\t" + "movk x5, #1, lsl 16\n\t" + "mul x6, x14, x5\n\t" + "umulh x7, x14, x5\n\t" + "mul x3, x15, x5\n\t" + "umulh x8, x15, x5\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + "mul x3, x16, x5\n\t" + "umulh x9, x16, x5\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + "mul x3, x17, x5\n\t" + "umulh x4, x17, 
x5\n\t" + "adds x9, x9, x3\n\t" + "adc x4, x4, xzr\n\t" + "mov x5, #19\n\t" + "extr x4, x4, x9, #63\n\t" + "mul x4, x4, x5\n\t" + "adds x6, x6, x4\n\t" + "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Add */ + "adds x10, x10, x6\n\t" + "adcs x11, x11, x7\n\t" + "adcs x12, x12, x8\n\t" + "adcs x13, x13, x9\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + /* Mask the modulus */ + "extr x5, x5, x13, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x10, x10, x3\n\t" + "adcs x11, x11, xzr\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh x7, x14, x10\n\t" + "mul x6, x14, x10\n\t" + /* A[2] * B[0] */ + "umulh x9, x16, x10\n\t" + "mul x8, x16, x10\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x10\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x15, x10\n\t" + "adcs x8, x8, x4\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x13\n\t" + "adc x9, x9, xzr\n\t" + "mul x25, x15, x13\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x11\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x14, x11\n\t" + "adcs x8, x8, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x11\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x16, x11\n\t" + "adcs x25, x25, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x12\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x15, x12\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x12\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x12\n\t" + "adcs x9, x9, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x11\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x15, x11\n\t" + "adcs x9, x9, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x11\n\t" + "adcs x25, x25, x3\n\t" + "umulh x4, x17, x11\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x12\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x12\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x13\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x13\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x13\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x14, x13\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x13\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x16, x13\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x10\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x17, x10\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x12\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x17, x12\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x9, x9, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x6, x6, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x9, x9, xzr\n\t" + /* Add high product results in */ + "adds x6, x6, x5\n\t" + "adcs x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, x27\n\t" + /* Store */ + "stp x6, x7, [x29, #16]\n\t" + "stp x8, x9, [x29, #32]\n\t" + /* Add */ + "ldp x25, x26, [x29, #48]\n\t" + "ldp x27, x28, 
[x29, #64]\n\t" + "adds x10, x25, x19\n\t" + "adcs x11, x26, x20\n\t" + "adcs x12, x27, x21\n\t" + "adcs x13, x28, x22\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x10, x10, x3\n\t" + "adcs x11, x11, xzr\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Sub */ + "subs x19, x25, x19\n\t" + "sbcs x20, x26, x20\n\t" + "sbcs x21, x27, x21\n\t" + "sbcs x22, x28, x22\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + "extr x5, x5, x22, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x19, x19, x3\n\t" + "sbcs x20, x20, xzr\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "sbcs x21, x21, xzr\n\t" + "sbc x22, x22, xzr\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x8, x10, x11\n\t" + "mul x7, x10, x11\n\t" + /* A[0] * A[3] */ + "umulh x25, x10, x13\n\t" + "mul x9, x10, x13\n\t" + /* A[0] * A[2] */ + "mul x3, x10, x12\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x10, x12\n\t" + "adcs x9, x9, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x11, x13\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x11, x13\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x11, x12\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x11, x12\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x12, x13\n\t" + "adcs x26, x26, x3\n\t" + "umulh x27, x12, x13\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x25, x25, x25\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x10, x10\n\t" + "mul x6, x10, x10\n\t" + /* A[1] * A[1] */ + "mul x3, x11, x11\n\t" + "adds x7, x7, x4\n\t" + "umulh x4, x11, x11\n\t" + "adcs x8, x8, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x12, x12\n\t" + "adcs x9, x9, x4\n\t" + "umulh x4, x12, x12\n\t" + "adcs x25, x25, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x13, x13\n\t" + "adcs x26, x26, x4\n\t" + "umulh x4, x13, x13\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x9, x9, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x6, x6, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x9, x9, xzr\n\t" + /* Add high product results in */ + "adds x6, x6, x5\n\t" + "adcs x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, x27\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x16, x19, x20\n\t" + "mul x15, x19, x20\n\t" + /* A[0] * A[3] */ + "umulh x25, x19, x22\n\t" + "mul x17, x19, x22\n\t" + /* A[0] * A[2] */ + "mul x3, x19, x21\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x19, x21\n\t" + "adcs x17, x17, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x20, x22\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x20, x22\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x20, x21\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x20, x21\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x21, x22\n\t" + "adcs x26, x26, x3\n\t" + "umulh x27, x21, x22\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x25, x25, x25\n\t" + "adcs x26, 
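/*
 * The "Add" blocks above (adds/adcs/cset, then "Sub modulus (if overflow)")
 * keep values only partially reduced: the carry out of limb 3 and bit 255
 * are turned into a multiple of 19 and added back, since 2^255 == 19
 * (mod 2^255 - 19).  The "Sub" blocks mirror this with csetm and a multiple
 * of -19 when the subtraction borrows.  A portable model of the addition
 * path (fe_add_c is an illustrative name, not the wolfSSL fe_add):
 */
#include <stdint.h>

static void fe_add_c(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    unsigned __int128 acc = 0;
    uint64_t top;
    int i;

    for (i = 0; i < 4; i++) {
        acc += (unsigned __int128)a[i] + b[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;                      /* carry, 0 or 1 */
    }
    /* fold carry*2^256 + bit255*2^255 back in as (2*carry + bit255) * 19 */
    top   = ((uint64_t)acc << 1) | (r[3] >> 63);
    r[3] &= 0x7fffffffffffffffULL;
    acc   = (unsigned __int128)r[0] + top * 19;
    r[0]  = (uint64_t)acc;
    for (i = 1; i < 4; i++) {
        acc  = (unsigned __int128)r[i] + (uint64_t)(acc >> 64);
        r[i] = (uint64_t)acc;
    }
}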
x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x19, x19\n\t" + "mul x14, x19, x19\n\t" + /* A[1] * A[1] */ + "mul x3, x20, x20\n\t" + "adds x15, x15, x4\n\t" + "umulh x4, x20, x20\n\t" + "adcs x16, x16, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x21, x21\n\t" + "adcs x17, x17, x4\n\t" + "umulh x4, x21, x21\n\t" + "adcs x25, x25, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x22, x22\n\t" + "adcs x26, x26, x4\n\t" + "umulh x4, x22, x22\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x14, x14, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x15, x15, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x16, x16, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, x27\n\t" + /* Multiply by 9 */ + "mov x5, #9\n\t" + "mul x10, x14, x5\n\t" + "umulh x11, x14, x5\n\t" + "mul x3, x15, x5\n\t" + "umulh x12, x15, x5\n\t" + "adds x11, x11, x3\n\t" + "adc x12, x12, xzr\n\t" + "mul x3, x16, x5\n\t" + "umulh x13, x16, x5\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, xzr\n\t" + "mul x3, x17, x5\n\t" + "umulh x4, x17, x5\n\t" + "adds x13, x13, x3\n\t" + "adc x4, x4, xzr\n\t" + "mov x5, #19\n\t" + "extr x4, x4, x13, #63\n\t" + "mul x4, x4, x5\n\t" + "adds x10, x10, x4\n\t" + "adcs x11, x11, xzr\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + "subs x24, x24, #1\n\t" + "cmp x24, #3\n\t" + "b.ge L_curve25519_base_bits_%=\n\t" + /* Conditional Swap */ + "subs xzr, xzr, %[x2], lsl 63\n\t" + "ldp x25, x26, [x29, #16]\n\t" + "ldp x27, x28, [x29, #32]\n\t" + "csel x19, x25, x10, ne\n\t" + "csel x25, x10, x25, ne\n\t" + "csel x20, x26, x11, ne\n\t" + "csel x26, x11, x26, ne\n\t" + "csel x21, x27, x12, ne\n\t" + "csel x27, x12, x27, ne\n\t" + "csel x22, x28, x13, ne\n\t" + "csel x28, x13, x28, ne\n\t" + /* Conditional Swap */ + "subs xzr, xzr, %[x2], lsl 63\n\t" + "ldp x10, x11, [%x[r]]\n\t" + "ldp x12, x13, [%x[r], #16]\n\t" + "csel x14, x10, x6, ne\n\t" + "csel x10, x6, x10, ne\n\t" + "csel x15, x11, x7, ne\n\t" + "csel x11, x7, x11, ne\n\t" + "csel x16, x12, x8, ne\n\t" + "csel x12, x8, x12, ne\n\t" + "csel x17, x13, x9, ne\n\t" + "csel x13, x9, x13, ne\n\t" + "\n" + "L_curve25519_base_3_%=: \n\t" + /* Add */ + "adds x6, x10, x25\n\t" + "adcs x7, x11, x26\n\t" + "adcs x8, x12, x27\n\t" + "adcs x9, x13, x28\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x6, x6, x3\n\t" + "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Sub */ + "subs x25, x10, x25\n\t" + "sbcs x26, x11, x26\n\t" + "sbcs x27, x12, x27\n\t" + "sbcs x28, x13, x28\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + "extr x5, x5, x28, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x25, x25, x3\n\t" + "sbcs x26, x26, xzr\n\t" + "and x28, x28, #0x7fffffffffffffff\n\t" + "sbcs x27, x27, xzr\n\t" + "sbc x28, x28, xzr\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x21, x25, x26\n\t" + "mul x20, x25, 
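/*
 * "Conditional Swap": the subs ... lsl #63 / csel pairs above swap the two
 * Montgomery-ladder points depending on one secret scalar bit, without a
 * data-dependent branch or load.  The ladder only needs a swap when the
 * current bit differs from the previous one, which is why a single running
 * swap flag is tested, and the cmp x24, #3 before it appears to split off
 * the last three iterations (clamped X25519 scalars have the low three bits
 * zero, so no further swaps are needed there).  A portable constant-time
 * model using an XOR mask; cswap_c is an illustrative name:
 */
#include <stdint.h>

static void cswap_c(uint64_t a[4], uint64_t b[4], uint64_t swap)
{
    uint64_t mask = (uint64_t)0 - (swap & 1);   /* all-ones iff swap == 1 */
    int i;

    for (i = 0; i < 4; i++) {
        uint64_t t = mask & (a[i] ^ b[i]);
        a[i] ^= t;
        b[i] ^= t;
    }
}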
x26\n\t" + /* A[0] * A[3] */ + "umulh x14, x25, x28\n\t" + "mul x22, x25, x28\n\t" + /* A[0] * A[2] */ + "mul x3, x25, x27\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x25, x27\n\t" + "adcs x22, x22, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x26, x28\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x26, x28\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x26, x27\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x26, x27\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x27, x28\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x27, x28\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x20, x20, x20\n\t" + "adcs x21, x21, x21\n\t" + "adcs x22, x22, x22\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x25, x25\n\t" + "mul x19, x25, x25\n\t" + /* A[1] * A[1] */ + "mul x3, x26, x26\n\t" + "adds x20, x20, x4\n\t" + "umulh x4, x26, x26\n\t" + "adcs x21, x21, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x27, x27\n\t" + "adcs x22, x22, x4\n\t" + "umulh x4, x27, x27\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x28, x28\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x28, x28\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x22, x22, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x22, #63\n\t" + "mul x5, x5, x3\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x19, x19, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x20, x20, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x21, x21, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x22, x22, xzr\n\t" + /* Add high product results in */ + "adds x19, x19, x5\n\t" + "adcs x20, x20, x14\n\t" + "adcs x21, x21, x15\n\t" + "adc x22, x22, x16\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x16, x6, x7\n\t" + "mul x15, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x25, x6, x9\n\t" + "mul x17, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x17, x17, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x7, x9\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x26, x26, x3\n\t" + "umulh x27, x8, x9\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x25, x25, x25\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x14, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x15, x15, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x16, x16, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x17, x17, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x25, x25, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x26, x26, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x14, x14, x4\n\t" + "umulh x25, x3, 
x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x15, x15, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x16, x16, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, x27\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh x11, x14, x19\n\t" + "mul x10, x14, x19\n\t" + /* A[2] * B[0] */ + "umulh x13, x16, x19\n\t" + "mul x12, x16, x19\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x19\n\t" + "adds x11, x11, x3\n\t" + "umulh x4, x15, x19\n\t" + "adcs x12, x12, x4\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x22\n\t" + "adc x13, x13, xzr\n\t" + "mul x25, x15, x22\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x20\n\t" + "adds x11, x11, x3\n\t" + "umulh x4, x14, x20\n\t" + "adcs x12, x12, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x20\n\t" + "adcs x13, x13, x3\n\t" + "umulh x4, x16, x20\n\t" + "adcs x25, x25, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x21\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x15, x21\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x21\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x14, x21\n\t" + "adcs x13, x13, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x20\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x13, x13, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x20\n\t" + "adcs x25, x25, x3\n\t" + "umulh x4, x17, x20\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x21\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x21\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x22\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x22\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x22\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x14, x22\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x22\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x16, x22\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x19\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x17, x19\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x21\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x17, x21\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x10, x10, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x11, x11, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x12, x12, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x10, x10, x5\n\t" + "adcs x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, x27\n\t" + /* Store */ + "stp x10, x11, [%x[r]]\n\t" + "stp x12, x13, [%x[r], #16]\n\t" + /* Sub */ + "subs x14, x14, x19\n\t" + "sbcs x15, x15, x20\n\t" + "sbcs x16, x16, x21\n\t" + "sbcs x17, x17, x22\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + /* Mask the modulus */ + "extr x5, x5, x17, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x14, x14, x3\n\t" + "sbcs x15, x15, xzr\n\t" + "and x17, x17, 
#0x7fffffffffffffff\n\t" + "sbcs x16, x16, xzr\n\t" + "sbc x17, x17, xzr\n\t" + /* Multiply by 121666 */ + "mov x5, #0xdb42\n\t" + "movk x5, #1, lsl 16\n\t" + "mul x6, x14, x5\n\t" + "umulh x7, x14, x5\n\t" + "mul x3, x15, x5\n\t" + "umulh x8, x15, x5\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + "mul x3, x16, x5\n\t" + "umulh x9, x16, x5\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + "mul x3, x17, x5\n\t" + "umulh x4, x17, x5\n\t" + "adds x9, x9, x3\n\t" + "adc x4, x4, xzr\n\t" + "mov x5, #19\n\t" + "extr x4, x4, x9, #63\n\t" + "mul x4, x4, x5\n\t" + "adds x6, x6, x4\n\t" + "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Add */ + "adds x19, x19, x6\n\t" + "adcs x20, x20, x7\n\t" + "adcs x21, x21, x8\n\t" + "adcs x22, x22, x9\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + /* Mask the modulus */ + "extr x5, x5, x22, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x19, x19, x3\n\t" + "adcs x20, x20, xzr\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh x26, x14, x19\n\t" + "mul x25, x14, x19\n\t" + /* A[2] * B[0] */ + "umulh x28, x16, x19\n\t" + "mul x27, x16, x19\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x19\n\t" + "adds x26, x26, x3\n\t" + "umulh x4, x15, x19\n\t" + "adcs x27, x27, x4\n\t" + /* A[1] * B[3] */ + "umulh x7, x15, x22\n\t" + "adc x28, x28, xzr\n\t" + "mul x6, x15, x22\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x20\n\t" + "adds x26, x26, x3\n\t" + "umulh x4, x14, x20\n\t" + "adcs x27, x27, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x20\n\t" + "adcs x28, x28, x3\n\t" + "umulh x4, x16, x20\n\t" + "adcs x6, x6, x4\n\t" + "adc x7, x7, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x21\n\t" + "adds x28, x28, x3\n\t" + "umulh x4, x15, x21\n\t" + "adcs x6, x6, x4\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x21\n\t" + "adds x27, x27, x3\n\t" + "umulh x4, x14, x21\n\t" + "adcs x28, x28, x4\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x20\n\t" + "adds x27, x27, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x28, x28, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x20\n\t" + "adcs x6, x6, x3\n\t" + "umulh x4, x17, x20\n\t" + "adcs x7, x7, x4\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x21\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x16, x21\n\t" + "adcs x7, x7, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x22\n\t" + "adcs x8, x8, x3\n\t" + "umulh x9, x17, x22\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x22\n\t" + "adds x28, x28, x3\n\t" + "umulh x4, x14, x22\n\t" + "adcs x6, x6, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x22\n\t" + "adcs x7, x7, x3\n\t" + "umulh x4, x16, x22\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x19\n\t" + "adds x28, x28, x3\n\t" + "umulh x4, x17, x19\n\t" + "adcs x6, x6, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x21\n\t" + "adcs x7, x7, x3\n\t" + "umulh x4, x17, x21\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x9\n\t" + "adds x28, x28, x4\n\t" + "umulh x5, x3, x9\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x28, #63\n\t" + "mul x5, x5, x3\n\t" + "and x28, x28, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x6\n\t" + "adds x25, x25, x4\n\t" + "umulh x6, x3, x6\n\t" + "mul x4, x3, 
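/*
 * Taken together, the add/sub/square/multiply/mul-by-121666 sequence above
 * is one step of the X25519 Montgomery ladder from RFC 7748, operating on
 * the projective points (x2:z2) and (x3:z3).  In this base-point routine
 * the multiply by the input u-coordinate degenerates to the earlier
 * "Multiply by 9" block, since the Curve25519 base point has u = 9.  The
 * sketch below shows the textbook formulas using the helpers modelled in
 * the previous notes (fe4, fe_add_c, fe_sub_c, fe_mul_c, fe_sq_c and
 * fe_mul121666_c are illustrative, and fe_sub_c is assumed to mirror
 * fe_add_c with the borrow folded as -19); it is not a transcription of
 * the register allocation above.
 */
#include <stdint.h>

typedef uint64_t fe4[4];
void fe_add_c(fe4 r, const fe4 a, const fe4 b);        /* assumed helpers */
void fe_sub_c(fe4 r, const fe4 a, const fe4 b);
void fe_mul_c(fe4 r, const fe4 a, const fe4 b);
void fe_sq_c(fe4 r, const fe4 a);
void fe_mul121666_c(fe4 r, const fe4 a);

static void ladder_step(fe4 x2, fe4 z2, fe4 x3, fe4 z3, const fe4 x1)
{
    fe4 a, aa, b, bb, e, c, d, da, cb, t;

    fe_add_c(a, x2, z2);            /* A  = x2 + z2                  */
    fe_sub_c(b, x2, z2);            /* B  = x2 - z2                  */
    fe_add_c(c, x3, z3);            /* C  = x3 + z3                  */
    fe_sub_c(d, x3, z3);            /* D  = x3 - z3                  */
    fe_mul_c(da, d, a);             /* DA = D * A                    */
    fe_mul_c(cb, c, b);             /* CB = C * B                    */
    fe_sq_c(aa, a);                 /* AA = A^2                      */
    fe_sq_c(bb, b);                 /* BB = B^2                      */
    fe_sub_c(e, aa, bb);            /* E  = AA - BB                  */
    fe_mul_c(x2, aa, bb);           /* x2' = AA * BB                 */
    fe_mul121666_c(t, e);
    fe_add_c(t, aa, t);
    fe_mul_c(z2, e, t);             /* z2' = E * (AA + a24 * E)      */
    fe_add_c(t, da, cb);
    fe_sq_c(x3, t);                 /* x3' = (DA + CB)^2             */
    fe_sub_c(t, da, cb);
    fe_sq_c(t, t);
    fe_mul_c(z3, t, x1);            /* z3' = x1 * (DA - CB)^2        */
}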
x7\n\t" + "adcs x26, x26, x4\n\t" + "umulh x7, x3, x7\n\t" + "mul x4, x3, x8\n\t" + "adcs x27, x27, x4\n\t" + "umulh x8, x3, x8\n\t" + "adc x28, x28, xzr\n\t" + /* Add high product results in */ + "adds x25, x25, x5\n\t" + "adcs x26, x26, x6\n\t" + "adcs x27, x27, x7\n\t" + "adc x28, x28, x8\n\t" + /* Store */ + "stp x25, x26, [x29, #16]\n\t" + "stp x27, x28, [x29, #32]\n\t" + "subs x24, x24, #1\n\t" + "b.ge L_curve25519_base_3_%=\n\t" + /* Invert */ + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" +#ifndef __APPLE__ + "bl fe_sq\n\t" +#else + "bl _fe_sq\n\t" +#endif /* __APPLE__ */ + "add x0, x29, #0x50\n\t" + "add x1, x29, #48\n\t" +#ifndef __APPLE__ + "bl fe_sq\n\t" +#else + "bl _fe_sq\n\t" +#endif /* __APPLE__ */ +#ifndef NDEBUG + "add x0, x29, #0x50\n\t" +#endif /* !NDEBUG */ + "add x1, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_sq\n\t" +#else + "bl _fe_sq\n\t" +#endif /* __APPLE__ */ +#ifndef NDEBUG + "add x0, x29, #0x50\n\t" +#endif /* !NDEBUG */ + "add x1, x29, #16\n\t" + "add x2, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + "add x0, x29, #48\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + "add x0, x29, #0x70\n\t" +#ifndef NDEBUG + "add x1, x29, #48\n\t" +#endif /* !NDEBUG */ +#ifndef __APPLE__ + "bl fe_sq\n\t" +#else + "bl _fe_sq\n\t" +#endif /* __APPLE__ */ + "add x0, x29, #0x50\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #0x70\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 5 times */ + "mov x24, #5\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" + "\n" + "L_curve25519_base_inv_1_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, 
x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_1_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" +#ifndef NDEBUG + "add x0, x29, #0x50\n\t" +#endif /* !NDEBUG */ + "add x1, x29, #0x70\n\t" + "add x2, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 10 times */ + "mov x24, #10\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" + "\n" + "L_curve25519_base_inv_2_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_2_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + "add x0, x29, #0x70\n\t" +#ifndef NDEBUG + "add x1, x29, #0x70\n\t" +#endif /* !NDEBUG */ + "add x2, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 20 times */ + "mov x24, #20\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "\n" + "L_curve25519_base_inv_3_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + 
"mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_3_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #144]\n\t" + "stp x8, x9, [x29, #160]\n\t" +#ifndef NDEBUG + "add x0, x29, #0x70\n\t" +#endif /* !NDEBUG */ + "add x1, x29, #0x90\n\t" + "add x2, x29, #0x70\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 10 times */ + "mov x24, #10\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "\n" + "L_curve25519_base_inv_4_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and 
x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_4_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #0x70\n\t" + "add x2, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 50 times */ + "mov x24, #50\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" + "\n" + "L_curve25519_base_inv_5_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_5_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + "add x0, x29, #0x70\n\t" +#ifndef NDEBUG + "add x1, x29, #0x70\n\t" +#endif /* !NDEBUG */ + "add x2, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 100 times */ + "mov x24, #0x64\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "\n" + "L_curve25519_base_inv_6_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + 
"mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_6_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #144]\n\t" + "stp x8, x9, [x29, #160]\n\t" +#ifndef NDEBUG + "add x0, x29, #0x70\n\t" +#endif /* !NDEBUG */ + "add x1, x29, #0x90\n\t" + "add x2, x29, #0x70\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 50 times */ + "mov x24, #50\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "\n" + "L_curve25519_base_inv_7_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + 
"adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_7_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #0x70\n\t" + "add x2, x29, #0x50\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 5 times */ + "mov x24, #5\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" + "\n" + "L_curve25519_base_inv_8_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "b.ne L_curve25519_base_inv_8_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + "add x0, x29, #16\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #48\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + "mov %x[r], x23\n\t" + /* Multiply */ + "ldp x6, x7, [%x[r]]\n\t" + "ldp x8, x9, [%x[r], #16]\n\t" + "ldp x10, x11, [x29, #16]\n\t" + "ldp 
x12, x13, [x29, #32]\n\t" + /* A[0] * B[0] */ + "umulh x15, x6, x10\n\t" + "mul x14, x6, x10\n\t" + /* A[2] * B[0] */ + "umulh x17, x8, x10\n\t" + "mul x16, x8, x10\n\t" + /* A[1] * B[0] */ + "mul x3, x7, x10\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x7, x10\n\t" + "adcs x16, x16, x4\n\t" + /* A[1] * B[3] */ + "umulh x20, x7, x13\n\t" + "adc x17, x17, xzr\n\t" + "mul x19, x7, x13\n\t" + /* A[0] * B[1] */ + "mul x3, x6, x11\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x6, x11\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x8, x11\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x8, x11\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x7, x12\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x7, x12\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x6, x12\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x6, x12\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x7, x11\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x7, x11\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x9, x11\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x9, x11\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x8, x12\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x8, x12\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x9, x13\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x9, x13\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x6, x13\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x6, x13\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x8, x13\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x8, x13\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x9, x10\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x9, x10\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x9, x12\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x9, x12\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" + /* Reduce if top bit set */ + "mov x3, #19\n\t" + "and x4, x3, x17, asr 63\n\t" + "adds x14, x14, x4\n\t" + "adcs x15, x15, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + "adds x4, x14, x3\n\t" + "adcs x4, x15, xzr\n\t" + "adcs x4, x16, xzr\n\t" + "adc x4, x17, xzr\n\t" + "and x4, x3, x4, asr 63\n\t" + "adds x14, x14, x4\n\t" + "adcs x15, x15, xzr\n\t" + "mov x4, #0x7fffffffffffffff\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + "and x17, x17, x4\n\t" + /* Store */ + "stp x14, x15, [%x[r]]\n\t" + "stp x16, x17, [%x[r], #16]\n\t" + "mov x0, xzr\n\t" + "ldp x29, x30, [sp], #0xb0\n\t" + : [r] "+r" (r) + : [n] "r" (n), [x2] "r" (x2) + : "memory", "cc", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", + "x11", 
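/*
 * "Reduce if top bit set": before the result is stored, the working value
 * is only known to be below 2^256, so bit 255 is folded in once as 19, and
 * then 19 is added speculatively to detect whether the value is still at
 * least p = 2^255 - 19; if so, 19 is added for real and bit 255 cleared,
 * landing in [0, p).  Constant-time portable model of that freeze step
 * (fe_freeze_c is an illustrative name):
 */
#include <stdint.h>

static void fe_freeze_c(uint64_t r[4])
{
    unsigned __int128 acc;
    uint64_t c, t;
    int i;

    /* fold bit 255 back in as a multiple of 19 */
    t     = (r[3] >> 63) * 19;
    r[3] &= 0x7fffffffffffffffULL;
    acc   = (unsigned __int128)r[0] + t;
    r[0]  = (uint64_t)acc;
    for (i = 1; i < 4; i++) {
        acc  = (unsigned __int128)r[i] + (uint64_t)(acc >> 64);
        r[i] = (uint64_t)acc;
    }

    /* if r >= p then r + 19 >= 2^255: detect via bit 63 of the top limb */
    acc = (unsigned __int128)r[0] + 19;
    for (i = 1; i < 4; i++)
        acc = (unsigned __int128)r[i] + (uint64_t)(acc >> 64);
    t = ((uint64_t)acc >> 63) * 19;          /* 19 when r >= p, else 0 */

    /* conditionally subtract p by adding 19 and clearing bit 255 */
    acc  = (unsigned __int128)r[0] + t;
    r[0] = (uint64_t)acc;
    for (i = 1; i < 4; i++) {
        acc  = (unsigned __int128)r[i] + (uint64_t)(acc >> 64);
        r[i] = (uint64_t)acc;
    }
    r[3] &= 0x7fffffffffffffffULL;
}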
"x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", + "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + return (word32)(size_t)r; +} + +#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ int curve25519(byte* r, const byte* n, const byte* a) { __asm__ __volatile__ ( @@ -1598,7 +4124,6 @@ int curve25519(byte* r, const byte* n, const byte* a) "add x29, sp, #0\n\t" "mov x23, xzr\n\t" "str %x[r], [x29, #176]\n\t" - "str %x[a], [x29, #184]\n\t" "ldp x6, x7, [%x[a]]\n\t" "ldp x8, x9, [%x[a], #16]\n\t" "mov x10, #1\n\t" @@ -2683,7 +5208,511 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x12, x12, x26\n\t" "adc x13, x13, x27\n\t" "subs x24, x24, #1\n\t" + "cmp x24, #3\n\t" "b.ge L_curve25519_bits_%=\n\t" + /* Conditional Swap */ + "subs xzr, xzr, x23, lsl 63\n\t" + "ldp x25, x26, [x29, #16]\n\t" + "ldp x27, x28, [x29, #32]\n\t" + "csel x19, x25, x10, ne\n\t" + "csel x25, x10, x25, ne\n\t" + "csel x20, x26, x11, ne\n\t" + "csel x26, x11, x26, ne\n\t" + "csel x21, x27, x12, ne\n\t" + "csel x27, x12, x27, ne\n\t" + "csel x22, x28, x13, ne\n\t" + "csel x28, x13, x28, ne\n\t" + /* Conditional Swap */ + "subs xzr, xzr, x23, lsl 63\n\t" + "ldp x10, x11, [%x[r]]\n\t" + "ldp x12, x13, [%x[r], #16]\n\t" + "csel x14, x10, x6, ne\n\t" + "csel x10, x6, x10, ne\n\t" + "csel x15, x11, x7, ne\n\t" + "csel x11, x7, x11, ne\n\t" + "csel x16, x12, x8, ne\n\t" + "csel x12, x8, x12, ne\n\t" + "csel x17, x13, x9, ne\n\t" + "csel x13, x9, x13, ne\n\t" + "\n" + "L_curve25519_3_%=: \n\t" + /* Add */ + "adds x6, x10, x25\n\t" + "adcs x7, x11, x26\n\t" + "adcs x8, x12, x27\n\t" + "adcs x9, x13, x28\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x6, x6, x3\n\t" + "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Sub */ + "subs x25, x10, x25\n\t" + "sbcs x26, x11, x26\n\t" + "sbcs x27, x12, x27\n\t" + "sbcs x28, x13, x28\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + "extr x5, x5, x28, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x25, x25, x3\n\t" + "sbcs x26, x26, xzr\n\t" + "and x28, x28, #0x7fffffffffffffff\n\t" + "sbcs x27, x27, xzr\n\t" + "sbc x28, x28, xzr\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x21, x25, x26\n\t" + "mul x20, x25, x26\n\t" + /* A[0] * A[3] */ + "umulh x14, x25, x28\n\t" + "mul x22, x25, x28\n\t" + /* A[0] * A[2] */ + "mul x3, x25, x27\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x25, x27\n\t" + "adcs x22, x22, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x26, x28\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x26, x28\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x26, x27\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x26, x27\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x27, x28\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x27, x28\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x20, x20, x20\n\t" + "adcs x21, x21, x21\n\t" + "adcs x22, x22, x22\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x25, x25\n\t" + "mul x19, x25, x25\n\t" + /* A[1] * A[1] */ + "mul x3, x26, x26\n\t" + "adds x20, x20, x4\n\t" + "umulh x4, x26, x26\n\t" + "adcs x21, x21, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x27, x27\n\t" + "adcs x22, x22, x4\n\t" + "umulh x4, x27, x27\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x28, x28\n\t" + "adcs x15, x15, 
x4\n\t" + "umulh x4, x28, x28\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x22, x22, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x22, #63\n\t" + "mul x5, x5, x3\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x19, x19, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x20, x20, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x21, x21, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x22, x22, xzr\n\t" + /* Add high product results in */ + "adds x19, x19, x5\n\t" + "adcs x20, x20, x14\n\t" + "adcs x21, x21, x15\n\t" + "adc x22, x22, x16\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x16, x6, x7\n\t" + "mul x15, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x25, x6, x9\n\t" + "mul x17, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x17, x17, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x7, x9\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x26, x26, x3\n\t" + "umulh x27, x8, x9\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x25, x25, x25\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x14, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x15, x15, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x16, x16, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x17, x17, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x25, x25, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x26, x26, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x14, x14, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x15, x15, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x16, x16, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, x27\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh x11, x14, x19\n\t" + "mul x10, x14, x19\n\t" + /* A[2] * B[0] */ + "umulh x13, x16, x19\n\t" + "mul x12, x16, x19\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x19\n\t" + "adds x11, x11, x3\n\t" + "umulh x4, x15, x19\n\t" + "adcs x12, x12, x4\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x22\n\t" + "adc x13, x13, xzr\n\t" + "mul x25, x15, x22\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x20\n\t" + "adds x11, x11, x3\n\t" + "umulh x4, x14, x20\n\t" + "adcs x12, x12, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x20\n\t" + "adcs x13, x13, x3\n\t" + "umulh x4, x16, x20\n\t" + "adcs x25, x25, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x21\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x15, x21\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, xzr, 
xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x21\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x14, x21\n\t" + "adcs x13, x13, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x20\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x13, x13, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x20\n\t" + "adcs x25, x25, x3\n\t" + "umulh x4, x17, x20\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x21\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x21\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x22\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x22\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x22\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x14, x22\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x22\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x16, x22\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x19\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x17, x19\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x21\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x17, x21\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x10, x10, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x11, x11, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x12, x12, x4\n\t" + "umulh x27, x3, x27\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x10, x10, x5\n\t" + "adcs x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, x27\n\t" + /* Store */ + "stp x10, x11, [%x[r]]\n\t" + "stp x12, x13, [%x[r], #16]\n\t" + /* Sub */ + "subs x14, x14, x19\n\t" + "sbcs x15, x15, x20\n\t" + "sbcs x16, x16, x21\n\t" + "sbcs x17, x17, x22\n\t" + "csetm x5, cc\n\t" + "mov x3, #-19\n\t" + /* Mask the modulus */ + "extr x5, x5, x17, #63\n\t" + "mul x3, x5, x3\n\t" + /* Add modulus (if underflow) */ + "subs x14, x14, x3\n\t" + "sbcs x15, x15, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "sbcs x16, x16, xzr\n\t" + "sbc x17, x17, xzr\n\t" + /* Multiply by 121666 */ + "mov x5, #0xdb42\n\t" + "movk x5, #1, lsl 16\n\t" + "mul x6, x14, x5\n\t" + "umulh x7, x14, x5\n\t" + "mul x3, x15, x5\n\t" + "umulh x8, x15, x5\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + "mul x3, x16, x5\n\t" + "umulh x9, x16, x5\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + "mul x3, x17, x5\n\t" + "umulh x4, x17, x5\n\t" + "adds x9, x9, x3\n\t" + "adc x4, x4, xzr\n\t" + "mov x5, #19\n\t" + "extr x4, x4, x9, #63\n\t" + "mul x4, x4, x5\n\t" + "adds x6, x6, x4\n\t" + "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Add */ + "adds x19, x19, x6\n\t" + "adcs x20, x20, x7\n\t" + "adcs x21, x21, x8\n\t" + "adcs x22, x22, x9\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + /* Mask the modulus */ + "extr x5, x5, x22, #63\n\t" + "mul x3, x5, x3\n\t" + /* Sub modulus (if overflow) */ + "adds x19, x19, x3\n\t" + "adcs x20, x20, xzr\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh x26, 
x14, x19\n\t" + "mul x25, x14, x19\n\t" + /* A[2] * B[0] */ + "umulh x28, x16, x19\n\t" + "mul x27, x16, x19\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x19\n\t" + "adds x26, x26, x3\n\t" + "umulh x4, x15, x19\n\t" + "adcs x27, x27, x4\n\t" + /* A[1] * B[3] */ + "umulh x7, x15, x22\n\t" + "adc x28, x28, xzr\n\t" + "mul x6, x15, x22\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x20\n\t" + "adds x26, x26, x3\n\t" + "umulh x4, x14, x20\n\t" + "adcs x27, x27, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x20\n\t" + "adcs x28, x28, x3\n\t" + "umulh x4, x16, x20\n\t" + "adcs x6, x6, x4\n\t" + "adc x7, x7, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x21\n\t" + "adds x28, x28, x3\n\t" + "umulh x4, x15, x21\n\t" + "adcs x6, x6, x4\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x21\n\t" + "adds x27, x27, x3\n\t" + "umulh x4, x14, x21\n\t" + "adcs x28, x28, x4\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x20\n\t" + "adds x27, x27, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x28, x28, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x20\n\t" + "adcs x6, x6, x3\n\t" + "umulh x4, x17, x20\n\t" + "adcs x7, x7, x4\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x21\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x16, x21\n\t" + "adcs x7, x7, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x22\n\t" + "adcs x8, x8, x3\n\t" + "umulh x9, x17, x22\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x22\n\t" + "adds x28, x28, x3\n\t" + "umulh x4, x14, x22\n\t" + "adcs x6, x6, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x22\n\t" + "adcs x7, x7, x3\n\t" + "umulh x4, x16, x22\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x19\n\t" + "adds x28, x28, x3\n\t" + "umulh x4, x17, x19\n\t" + "adcs x6, x6, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x21\n\t" + "adcs x7, x7, x3\n\t" + "umulh x4, x17, x21\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x9\n\t" + "adds x28, x28, x4\n\t" + "umulh x5, x3, x9\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x28, #63\n\t" + "mul x5, x5, x3\n\t" + "and x28, x28, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x6\n\t" + "adds x25, x25, x4\n\t" + "umulh x6, x3, x6\n\t" + "mul x4, x3, x7\n\t" + "adcs x26, x26, x4\n\t" + "umulh x7, x3, x7\n\t" + "mul x4, x3, x8\n\t" + "adcs x27, x27, x4\n\t" + "umulh x8, x3, x8\n\t" + "adc x28, x28, xzr\n\t" + /* Add high product results in */ + "adds x25, x25, x5\n\t" + "adcs x26, x26, x6\n\t" + "adcs x27, x27, x7\n\t" + "adc x28, x28, x8\n\t" + /* Store */ + "stp x25, x26, [x29, #16]\n\t" + "stp x27, x28, [x29, #32]\n\t" + "subs x24, x24, #1\n\t" + "b.ge L_curve25519_3_%=\n\t" /* Invert */ "add x0, x29, #48\n\t" "add x1, x29, #16\n\t" @@ -3694,7 +6723,6 @@ int curve25519(byte* r, const byte* n, const byte* a) return (word32)(size_t)r; } -#ifdef HAVE_ED25519 void fe_pow22523(fe r, const fe a) { __asm__ __volatile__ ( @@ -8086,6 +11114,7 @@ void ge_sub(ge_p1p1* r, const ge_p3* p, const ge_cached* q) ); } +#ifdef HAVE_ED25519 void sc_reduce(byte* s) { __asm__ __volatile__ ( diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S b/wolfcrypt/src/port/arm/thumb2-curve25519.S index 65ec393cf..18cc6bd4d 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519.S +++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S @@ -227,7 +227,7 @@ fe_add: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} /* Cycle Count = 24 */ .size 
fe_add,.-fe_add
-#ifdef HAVE_ED25519
+#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519)
 .text
 .align 4
 .globl fe_frombytes
@@ -430,7 +430,7 @@ fe_isnegative:
 POP {r4, r5, pc}
 /* Cycle Count = 31 */
 .size fe_isnegative,.-fe_isnegative
-#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN)
+#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) || defined(WOLFSSL_CURVE25519_USE_ED25519)
 #ifndef WC_NO_CACHE_RESISTANT
 .text
 .align 4
@@ -1507,8 +1507,8 @@ fe_cmov_table:
 /* Cycle Count = 160 */
 .size fe_cmov_table,.-fe_cmov_table
 #endif /* WC_NO_CACHE_RESISTANT */
-#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
-#endif /* HAVE_ED25519 */
+#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN || WOLFSSL_CURVE25519_USE_ED25519 */
+#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */
 #ifdef WOLFSSL_ARM_ARCH_7M
 .text
 .align 4
@@ -3272,7 +3272,7 @@ L_curve25519_inv_8:
 .size curve25519,.-curve25519
 #endif /* WC_NO_CACHE_RESISTANT */
 #endif /* HAVE_CURVE25519 */
-#ifdef HAVE_ED25519
+#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519)
 .text
 .align 4
 .globl fe_invert
@@ -4533,6 +4533,8 @@ ge_sub:
 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 /* Cycle Count = 138 */
 .size ge_sub,.-ge_sub
+#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */
+#ifdef HAVE_ED25519
 #ifdef WOLFSSL_ARM_ARCH_7M
 .text
 .align 4
diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
index 9d0747d76..b20fcad35 100644
--- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
@@ -296,7 +296,7 @@ WC_OMIT_FRAME_POINTER void fe_add(fe r, const fe a, const fe b)
 );
 }
-#ifdef HAVE_ED25519
+#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519)
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void fe_frombytes(fe out_p, const unsigned char* in_p)
 #else
@@ -571,7 +571,7 @@ WC_OMIT_FRAME_POINTER int fe_isnegative(const fe a)
 return (word32)(size_t)a;
 }
-#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN)
+#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) || defined(WOLFSSL_CURVE25519_USE_ED25519)
 #ifndef WC_NO_CACHE_RESISTANT
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p)
@@ -1670,8 +1670,8 @@ WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r, fe* base, signed char b)
 }
 #endif /* WC_NO_CACHE_RESISTANT */
-#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
-#endif /* HAVE_ED25519 */
+#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN || WOLFSSL_CURVE25519_USE_ED25519 */
+#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */
 #ifdef WOLFSSL_ARM_ARCH_7M
 void fe_mul_op(void);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -3663,7 +3663,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
 #endif /* WC_NO_CACHE_RESISTANT */
 #endif /* HAVE_CURVE25519 */
-#ifdef HAVE_ED25519
+#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519)
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void fe_invert(fe r_p, const fe a_p)
 #else
@@ -5156,6 +5156,8 @@ WC_OMIT_FRAME_POINTER void ge_sub(ge_p1p1 * r, const ge_p3 * p,
 );
 }
+#endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */
+#ifdef HAVE_ED25519
 #ifdef WOLFSSL_ARM_ARCH_7M
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void sc_reduce(byte* s_p)
diff --git a/wolfssl/wolfcrypt/ed25519.h b/wolfssl/wolfcrypt/ed25519.h
index 06a22cdf7..299f3fd34 100644
--- a/wolfssl/wolfcrypt/ed25519.h
+++ b/wolfssl/wolfcrypt/ed25519.h
@@ -29,7 +29,7 @@
 #include
-#ifdef HAVE_ED25519
+#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519)
 #include
 #ifndef WOLFSSL_SHA512
diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h
index 1848ca652..4959cf2b8 100644
--- a/wolfssl/wolfcrypt/fe_operations.h
+++ b/wolfssl/wolfcrypt/fe_operations.h
@@ -48,6 +48,12 @@
 #define CURVED25519_ASM
 #endif
+#if (defined(CURVED25519_ASM_64BIT) || defined(HAVE_ED25519)) && \
+ !defined(WOLFSSL_CURVE25519_BLINDING)
+ #undef WOLFSSL_CURVE25519_USE_ED25519
+ #define WOLFSSL_CURVE25519_USE_ED25519
+#endif
+
 /* fe means field element.
 Here the field is \Z/(2^255-19).
@@ -75,6 +81,7 @@ Bounds on each t[i] vary depending on context.
 #if !defined(FREESCALE_LTC_ECC)
 WOLFSSL_LOCAL void fe_init(void);
+WOLFSSL_LOCAL int curve25519_base(byte * q, const byte * n);
 WOLFSSL_LOCAL int curve25519(byte * q, const byte * n, const byte * p);
 #ifdef WOLFSSL_CURVE25519_BLINDING
 WOLFSSL_LOCAL int curve25519_blind(byte * q, const byte * n, const byte* mask,
diff --git a/wolfssl/wolfcrypt/ge_operations.h b/wolfssl/wolfcrypt/ge_operations.h
index a9cc47584..6b3d24405 100644
--- a/wolfssl/wolfcrypt/ge_operations.h
+++ b/wolfssl/wolfcrypt/ge_operations.h
@@ -27,10 +27,10 @@
 #include
-#ifdef HAVE_ED25519
- #include
+#if defined(HAVE_ED25519) || defined(WOLFSSL_CURVE25519_USE_ED25519)
+
 /* ge means group element.
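/*
 * The gating above is what lets curve25519_base() reuse the Ed25519 code:
 * with WOLFSSL_CURVE25519_USE_ED25519 the base-point multiply can be done
 * with the table-driven Ed25519 routine and the result mapped to the
 * Montgomery curve through the birational equivalence u = (1 + y)/(1 - y)
 * (RFC 7748, section 4.1).  A sketch of that shape, assuming the usual
 * ref10-style ge_scalarmult_base/fe_* signatures that wolfSSL uses; the
 * curve25519_base_sketch name is illustrative, and the real curve25519_base
 * declared in fe_operations.h may differ in detail (for instance it can
 * save an inversion by computing u = (Z + Y)/(Z - Y) directly).
 */
#include <wolfssl/wolfcrypt/ge_operations.h>

static int curve25519_base_sketch(byte* r, const byte* n)
{
    ge_p3 A;                       /* A = n * B on the Ed25519 curve      */
    fe    y, one, num, den;

    ge_scalarmult_base(&A, n);     /* precomputed-table base multiply     */

    fe_invert(den, A.Z);           /* y = Y / Z (affine y-coordinate)     */
    fe_mul(y, A.Y, den);

    fe_1(one);
    fe_add(num, one, y);           /* 1 + y                               */
    fe_sub(den, one, y);           /* 1 - y; a clamped scalar never gives */
    fe_invert(den, den);           /* the identity, so this is invertible */
    fe_mul(num, num, den);         /* u = (1 + y) / (1 - y)               */

    fe_tobytes(r, num);            /* little-endian u-coordinate output   */
    return 0;
}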