diff --git a/src/include.am b/src/include.am
index e49442210..50353c02b 100644
--- a/src/include.am
+++ b/src/include.am
@@ -371,9 +371,14 @@ if BUILD_FEMATH
if BUILD_CURVE25519_SMALL
src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_low_mem.c
else
-src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
+else
+if BUILD_ARMASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
+else
+src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
+endif
endif
endif
endif
@@ -384,9 +389,14 @@ src_libwolfssl_la_SOURCES += wolfcrypt/src/ge_low_mem.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/ge_operations.c
if !BUILD_FEMATH
-src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
+else
+if BUILD_ARMASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
+else
+src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
+endif
endif
endif
endif
diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c
index f181d43b8..537227017 100644
--- a/wolfcrypt/src/ge_operations.c
+++ b/wolfcrypt/src/ge_operations.c
@@ -42,8 +42,13 @@
#include
#endif
+#if defined(CURVED25519_X64) || defined(WOLFSSL_ARMASM)
+ #define CURVED25519_ASM_64BIT
+#endif
+
+
static void ge_p2_0(ge_p2 *);
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
static void ge_precomp_0(ge_precomp *);
#endif
static void ge_p3_to_p2(ge_p2 *,const ge_p3 *);
@@ -927,7 +932,7 @@ r = p + q
*/
static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge t0;
fe_add(r->X,p->Y,p->X);
fe_sub(r->Y,p->Y,p->X);
@@ -947,7 +952,7 @@ static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
}
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
/* ge_scalar mult base */
static unsigned char equal(signed char b,signed char c)
{
@@ -977,7 +982,7 @@ static WC_INLINE void cmov(ge_precomp *t,const ge_precomp *u,unsigned char b,
}
#endif
-#ifdef CURVED25519_X64
+#ifdef CURVED25519_ASM_64BIT
static const ge_precomp base[64][8] = {
{
{
@@ -6368,7 +6373,7 @@ static const ge_precomp base[32][8] = {
static void ge_select(ge_precomp *t,int pos,signed char b)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge_precomp minust;
unsigned char bnegative = negative(b);
unsigned char babs = b - (((-bnegative) & b) << 1);
@@ -6390,7 +6395,6 @@ static void ge_select(ge_precomp *t,int pos,signed char b)
#endif
}
-
/*
h = a * B
where a = a[0]+256*a[1]+...+256^31 a[31]
@@ -6404,7 +6408,7 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a)
signed char e[64];
signed char carry;
ge_p1p1 r;
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge_p2 s;
#endif
ge_precomp t;
@@ -6427,7 +6431,7 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a)
e[63] += carry;
/* each e[i] is between -8 and 8 */
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge_select(&t,0,e[1]);
fe_sub(h->X, t.yplusx, t.yminusx);
fe_add(h->Y, t.yplusx, t.yminusx);
@@ -6498,7 +6502,7 @@ static void slide(signed char *r,const unsigned char *a)
}
}
-#ifdef CURVED25519_X64
+#ifdef CURVED25519_ASM_64BIT
static const ge_precomp Bi[8] = {
{
{ 0x2fbc93c6f58c3b85, -0x306cd2390473f1e7, 0x270b4898643d42c2, 0x07cf9d3a33d4ba65, },
@@ -6691,7 +6695,7 @@ int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a,
return 0;
}
-#ifdef CURVED25519_X64
+#ifdef CURVED25519_ASM_64BIT
static const ge d = {
0x75eb4dca135978a3, 0x00700a4d4141d8ab, -0x7338bf8688861768, 0x52036cee2b6ffe73,
};
@@ -6708,7 +6712,7 @@ static const ge d = {
#endif
-#ifdef CURVED25519_X64
+#ifdef CURVED25519_ASM_64BIT
static const ge sqrtm1 = {
-0x3b11e4d8b5f15f50, 0x2f431806ad2fe478, 0x2b4d00993dfbd7a7, 0x2b8324804fc1df0b,
};
@@ -6775,7 +6779,7 @@ r = p + q
static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge t0;
fe_add(r->X,p->Y,p->X);
fe_sub(r->Y,p->Y,p->X);
@@ -6802,7 +6806,7 @@ r = p - q
static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge t0;
fe_add(r->X,p->Y,p->X);
fe_sub(r->Y,p->Y,p->X);
@@ -6828,7 +6832,7 @@ r = p
static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
fe_mul(r->X,p->X,p->T);
fe_mul(r->Y,p->Y,p->Z);
fe_mul(r->Z,p->Z,p->T);
@@ -6846,7 +6850,7 @@ r = p
static WC_INLINE void ge_p1p1_to_p3(ge_p3 *r,const ge_p1p1 *p)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
fe_mul(r->X,p->X,p->T);
fe_mul(r->Y,p->Y,p->Z);
fe_mul(r->Z,p->Z,p->T);
@@ -6875,7 +6879,7 @@ r = 2 * p
static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge t0;
fe_sq(r->X,p->X);
fe_sq(r->Z,p->Y);
@@ -6912,7 +6916,7 @@ static void ge_p3_dbl(ge_p1p1 *r,const ge_p3 *p)
r = p
*/
-#ifdef CURVED25519_X64
+#ifdef CURVED25519_ASM_64BIT
static const ge d2 = {
-0x1429646bd94d0ea7, 0x00e0149a8283b156, 0x198e80f2eef3d130, 0x2406d9dc56dffce7,
};
@@ -6966,7 +6970,7 @@ void ge_p3_tobytes(unsigned char *s,const ge_p3 *h)
}
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
/* ge_precomp_0 */
static void ge_precomp_0(ge_precomp *h)
{
@@ -6984,7 +6988,7 @@ r = p - q
static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
{
-#ifndef CURVED25519_X64
+#ifndef CURVED25519_ASM_64BIT
ge t0;
fe_add(r->X,p->Y,p->X);
fe_sub(r->Y,p->Y,p->X);
diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am
index 21e4ab315..fe5e27027 100644
--- a/wolfcrypt/src/include.am
+++ b/wolfcrypt/src/include.am
@@ -50,6 +50,8 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
wolfcrypt/src/port/nrf51.c \
wolfcrypt/src/port/arm/armv8-aes.c \
wolfcrypt/src/port/arm/armv8-sha256.c \
+ wolfcrypt/src/port/arm/armv8-curve25519.c \
+ wolfcrypt/src/port/arm/armv8-curve25519.S \
wolfcrypt/src/port/nxp/ksdk_port.c \
wolfcrypt/src/port/atmel/README.md \
wolfcrypt/src/port/xilinx/xil-sha3.c \
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S
new file mode 100644
index 000000000..6d384dde0
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-curve25519.S
@@ -0,0 +1,7231 @@
+/* armv8-curve25519
+ *
+ * Copyright (C) 2006-2019 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +.text +.globl fe_init +.type fe_init,@function +.align 4 +fe_init: + ret +.size fe_init,.-fe_init +.text +.globl fe_frombytes +.type fe_frombytes,@function +.align 4 +fe_frombytes: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret +.size fe_frombytes,.-fe_frombytes +.text +.globl fe_tobytes +.type fe_tobytes,@function +.align 4 +fe_tobytes: + mov x7, #19 + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + adds x6, x2, x7 + adcs x6, x3, xzr + adcs x6, x4, xzr + adc x6, x5, xzr + lsr x6, x6, #63 + mul x6, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + adc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret +.size fe_tobytes,.-fe_tobytes +.text +.globl fe_1 +.type fe_1,@function +.align 4 +fe_1: + # Set one + mov x1, #1 + stp x1, xzr, [x0] + stp xzr, xzr, [x0, #16] + ret +.size fe_1,.-fe_1 +.text +.globl fe_0 +.type fe_0,@function +.align 4 +fe_0: + # Set zero + stp xzr, xzr, [x0] + stp xzr, xzr, [x0, #16] + ret +.size fe_0,.-fe_0 +.text +.globl fe_copy +.type fe_copy,@function +.align 4 +fe_copy: + # Copy + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret +.size fe_copy,.-fe_copy +.text +.globl fe_cswap +.type fe_cswap,@function +.align 4 +fe_cswap: + # Conditional Swap + cmp x2, #1 + ldp x3, x4, [x0] + ldp x5, x6, [x0, #16] + ldp x7, x8, [x1] + ldp x9, x10, [x1, #16] + csel x11, x3, x7, eq + csel x3, x7, x3, eq + csel x12, x4, x8, eq + csel x4, x8, x4, eq + csel x13, x5, x9, eq + csel x5, x9, x5, eq + csel x14, x6, x10, eq + csel x6, x10, x6, eq + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + stp x11, x12, [x1] + stp x13, x14, [x1, #16] + ret +.size fe_cswap,.-fe_cswap +.text +.globl fe_sub +.type fe_sub,@function +.align 4 +fe_sub: + # Sub + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + subs x3, x3, x7 + sbcs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + mov x12, #-19 + csetm x11, cc + # Mask the modulus + and x12, x11, x12 + and x13, x11, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x3, x3, x12 + adcs x4, x4, x11 + adcs x5, x5, x11 + adc x6, x6, x13 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret +.size fe_sub,.-fe_sub +.text +.globl fe_add +.type fe_add,@function +.align 4 +fe_add: + # Add + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + adds x3, x3, x7 + adcs x4, x4, x8 + adcs x5, x5, x9 + adc x6, x6, x10 + mov x12, #-19 + asr x11, x6, #63 + # Mask the modulus + and x12, x11, x12 + and x13, x11, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x3, x3, x12 + sbcs x4, x4, x11 + sbcs x5, x5, x11 + sbc x6, x6, x13 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret +.size fe_add,.-fe_add +.text +.globl fe_neg +.type fe_neg,@function +.align 4 +fe_neg: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x6, #-19 + mov x7, #-1 + mov x8, #-1 + mov x9, #0x7fffffffffffffff + subs x6, x6, x2 + sbcs x7, x7, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ret +.size fe_neg,.-fe_neg +.text +.globl fe_cmov +.type fe_cmov,@function +.align 4 +fe_cmov: + ldp x4, x5, [x0] + ldp x6, x7, [x0, #16] + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + cmp x2, #1 + csel x4, x4, x8, eq + csel 
x5, x5, x9, eq + csel x6, x6, x10, eq + csel x7, x7, x11, eq + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ret +.size fe_cmov,.-fe_cmov +.text +.globl fe_isnonzero +.type fe_isnonzero,@function +.align 4 +fe_isnonzero: + mov x6, #19 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + adds x5, x1, x6 + adcs x5, x2, xzr + adcs x5, x3, xzr + adc x5, x4, xzr + lsr x5, x5, #63 + mul x5, x5, x6 + adds x1, x1, x5 + adcs x2, x2, xzr + adcs x3, x3, xzr + adc x4, x4, xzr + and x4, x4, #0x7fffffffffffffff + orr x0, x1, x2 + orr x3, x3, x4 + orr x0, x0, x3 + ret +.size fe_isnonzero,.-fe_isnonzero +.text +.globl fe_isnegative +.type fe_isnegative,@function +.align 4 +fe_isnegative: + mov x6, #19 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + adds x5, x1, x6 + adcs x5, x2, xzr + adcs x5, x3, xzr + adc x5, x4, xzr + lsr x5, x5, #63 + mul x5, x5, x6 + ldr x1, [x0] + adds x1, x1, x5 + and x0, x1, #1 + ret +.size fe_isnegative,.-fe_isnegative +.text +.globl fe_cmov_table +.type fe_cmov_table,@function +.align 4 +fe_cmov_table: + stp x29, x30, [sp, #-112]! + add x29, sp, #0 + str x17, [x29, #16] + str x18, [x29, #24] + str x19, [x29, #32] + str x20, [x29, #40] + str x21, [x29, #48] + str x22, [x29, #56] + str x23, [x29, #64] + str x24, [x29, #72] + str x25, [x29, #80] + str x26, [x29, #88] + str x27, [x29, #96] + str x28, [x29, #104] + sxtb x2, w2 + sbfx x15, x2, #7, #1 + sxtb x16, w2 + eor x16, x16, x15 + sub x16, x16, x15 + mov x3, #1 + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, #1 + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + cmp x16, #1 + ldp x17, x18, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x1, #32] + ldp x23, x24, [x1, #48] + ldp x25, x26, [x1, #64] + ldp x27, x28, [x1, #80] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + cmp x16, #2 + ldp x17, x18, [x1, #96] + ldp x19, x20, [x1, #112] + ldp x21, x22, [x1, #128] + ldp x23, x24, [x1, #144] + ldp x25, x26, [x1, #160] + ldp x27, x28, [x1, #176] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + cmp x16, #3 + ldp x17, x18, [x1, #192] + ldp x19, x20, [x1, #208] + ldp x21, x22, [x1, #224] + ldp x23, x24, [x1, #240] + ldp x25, x26, [x1, #256] + ldp x27, x28, [x1, #272] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + cmp x16, #4 + ldp x17, x18, [x1, #288] + ldp x19, x20, [x1, #304] + ldp x21, x22, [x1, #320] + ldp x23, x24, [x1, #336] + ldp x25, x26, [x1, #352] + ldp x27, x28, [x1, #368] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + add x1, x1, #0x180 + cmp x16, #5 + ldp x17, x18, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x1, #32] + ldp x23, x24, [x1, 
#48] + ldp x25, x26, [x1, #64] + ldp x27, x28, [x1, #80] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + cmp x16, #6 + ldp x17, x18, [x1, #96] + ldp x19, x20, [x1, #112] + ldp x21, x22, [x1, #128] + ldp x23, x24, [x1, #144] + ldp x25, x26, [x1, #160] + ldp x27, x28, [x1, #176] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + cmp x16, #7 + ldp x17, x18, [x1, #192] + ldp x19, x20, [x1, #208] + ldp x21, x22, [x1, #224] + ldp x23, x24, [x1, #240] + ldp x25, x26, [x1, #256] + ldp x27, x28, [x1, #272] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + cmp x16, #8 + ldp x17, x18, [x1, #288] + ldp x19, x20, [x1, #304] + ldp x21, x22, [x1, #320] + ldp x23, x24, [x1, #336] + ldp x25, x26, [x1, #352] + ldp x27, x28, [x1, #368] + csel x3, x17, x3, eq + csel x4, x18, x4, eq + csel x5, x19, x5, eq + csel x6, x20, x6, eq + csel x7, x21, x7, eq + csel x8, x22, x8, eq + csel x9, x23, x9, eq + csel x10, x24, x10, eq + csel x11, x25, x11, eq + csel x12, x26, x12, eq + csel x13, x27, x13, eq + csel x14, x28, x14, eq + add x1, x1, #0x180 + sub x1, x1, #0x180 + mov x17, #-19 + mov x18, #-1 + mov x19, #-1 + mov x20, #0x7fffffffffffffff + subs x17, x17, x11 + sbcs x18, x18, x12 + sbcs x19, x19, x13 + sbc x20, x20, x14 + cmp x2, #0 + mov x15, x3 + csel x3, x7, x3, lt + csel x7, x15, x7, lt + mov x15, x4 + csel x4, x8, x4, lt + csel x8, x15, x8, lt + mov x15, x5 + csel x5, x9, x5, lt + csel x9, x15, x9, lt + mov x15, x6 + csel x6, x10, x6, lt + csel x10, x15, x10, lt + csel x11, x17, x11, lt + csel x12, x18, x12, lt + csel x13, x19, x13, lt + csel x14, x20, x14, lt + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + stp x7, x8, [x0, #32] + stp x9, x10, [x0, #48] + stp x11, x12, [x0, #64] + stp x13, x14, [x0, #80] + ldr x17, [x29, #16] + ldr x18, [x29, #24] + ldr x19, [x29, #32] + ldr x20, [x29, #40] + ldr x21, [x29, #48] + ldr x22, [x29, #56] + ldr x23, [x29, #64] + ldr x24, [x29, #72] + ldr x25, [x29, #80] + ldr x26, [x29, #88] + ldr x27, [x29, #96] + ldr x28, [x29, #104] + ldp x29, x30, [sp], #0x70 + ret +.size fe_cmov_table,.-fe_cmov_table +.text +.globl fe_mul +.type fe_mul,@function +.align 4 +fe_mul: + stp x29, x30, [sp, #-64]! 
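# fe_mul: r = a * b mod p, p = 2^255 - 19, with four 64-bit limbs per element.
# Schoolbook 4x4 limb multiply into an 8-limb product, then the bits at and
# above 2^255 are folded back in multiplied by 19 (2^255 = 19 mod p) in the
# "Reduce" steps below.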
+ add x29, sp, #0 + str x17, [x29, #16] + str x18, [x29, #24] + str x19, [x29, #32] + str x20, [x29, #40] + str x21, [x29, #48] + str x22, [x29, #56] + # Multiply + ldp x15, x16, [x1] + ldp x17, x18, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x6, x15, x19 + umulh x7, x15, x19 + # A[0] * B[1] + mul x3, x15, x20 + umulh x8, x15, x20 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x16, x19 + umulh x4, x16, x19 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x15, x21 + umulh x4, x15, x21 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x16, x20 + umulh x4, x16, x20 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x17, x19 + umulh x4, x17, x19 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x15, x22 + umulh x4, x15, x22 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x16, x21 + umulh x4, x16, x21 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x17, x20 + umulh x4, x17, x20 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x18, x19 + umulh x4, x18, x19 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x16, x22 + umulh x4, x16, x22 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x17, x21 + umulh x4, x17, x21 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x18, x20 + umulh x4, x18, x20 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x17, x22 + umulh x4, x17, x22 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x18, x21 + umulh x4, x18, x21 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x18, x22 + umulh x4, x18, x22 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ldr x17, [x29, #16] + ldr x18, [x29, #24] + ldr x19, [x29, #32] + ldr x20, [x29, #40] + ldr x21, [x29, #48] + ldr x22, [x29, #56] + ldp x29, x30, [sp], #0x40 + ret +.size fe_mul,.-fe_mul +.text +.globl fe_sq +.type fe_sq,@function +.align 4 +fe_sq: + stp x29, x30, [sp, #-32]! 
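# fe_sq: r = a^2 mod 2^255 - 19. The off-diagonal limb products are computed
# once and doubled before the diagonal squares are added; the reduction is
# the same fold-by-19 used in fe_mul.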
+ add x29, sp, #0 + str x17, [x29, #24] + # Square + ldp x14, x15, [x1] + ldp x16, x17, [x1, #16] + # A[0] * A[1] + mul x3, x14, x15 + umulh x4, x14, x15 + # A[0] * A[2] + mul x11, x14, x16 + umulh x5, x14, x16 + adds x4, x4, x11 + adc x5, x5, xzr + # A[0] * A[3] + mul x11, x14, x17 + umulh x6, x14, x17 + adds x5, x5, x11 + adc x6, x6, xzr + # A[1] * A[2] + mul x11, x15, x16 + umulh x12, x15, x16 + adds x5, x5, x11 + adcs x6, x6, x12 + adc x7, xzr, xzr + # A[1] * A[3] + mul x11, x15, x17 + umulh x12, x15, x17 + adds x6, x6, x11 + adc x7, x7, x12 + # A[2] * A[3] + mul x11, x16, x17 + umulh x8, x16, x17 + adds x7, x7, x11 + adc x8, x8, xzr + # Double + adds x3, x3, x3 + adcs x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adc x9, xzr, xzr + # A[0] * A[0] + mul x2, x14, x14 + umulh x10, x14, x14 + # A[1] * A[1] + mul x11, x15, x15 + umulh x12, x15, x15 + adds x3, x3, x10 + adcs x4, x4, x11 + adc x10, x12, xzr + # A[2] * A[2] + mul x11, x16, x16 + umulh x12, x16, x16 + adds x5, x5, x10 + adcs x6, x6, x11 + adc x10, x12, xzr + # A[3] * A[3] + mul x11, x17, x17 + umulh x12, x17, x17 + adds x7, x7, x10 + adcs x8, x8, x11 + adc x9, x9, x12 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + extr x6, x6, x5, #63 + and x5, x5, #0x7fffffffffffffff + # Multiply top half by 19 + mov x11, #19 + mul x12, x11, x6 + umulh x6, x11, x6 + adds x2, x2, x12 + mul x12, x11, x7 + umulh x7, x11, x7 + adcs x3, x3, x12 + mul x12, x11, x8 + umulh x8, x11, x8 + adcs x4, x4, x12 + mul x12, x11, x9 + umulh x13, x11, x9 + adcs x5, x5, x12 + adc x13, x13, xzr + # Add remaining product results in + adds x3, x3, x6 + adcs x4, x4, x7 + adcs x5, x5, x8 + adc x13, x13, xzr + # Overflow + extr x13, x13, x5, #63 + mul x13, x13, x11 + and x5, x5, #0x7fffffffffffffff + adds x2, x2, x13 + adcs x3, x3, xzr + adcs x4, x4, xzr + adc x5, x5, xzr + # Reduce if top bit set + lsr x13, x5, #63 + mul x13, x13, x11 + and x5, x5, #0x7fffffffffffffff + adds x2, x2, x13 + adcs x3, x3, xzr + adcs x4, x4, xzr + adc x5, x5, xzr + # Store + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ldr x17, [x29, #24] + ldp x29, x30, [sp], #32 + ret +.size fe_sq,.-fe_sq +.text +.globl fe_mul121666 +.type fe_mul121666,@function +.align 4 +fe_mul121666: + # Multiply by 121666 + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x13, #0xdb42 + movk x13, #1, lsl 16 + mul x6, x2, x13 + umulh x7, x2, x13 + mul x11, x3, x13 + umulh x12, x3, x13 + adds x7, x7, x11 + adc x8, xzr, x12 + mul x11, x4, x13 + umulh x12, x4, x13 + adds x8, x8, x11 + adc x9, xzr, x12 + mul x11, x5, x13 + umulh x12, x5, x13 + adds x9, x9, x11 + adc x12, xzr, x12 + mov x13, #19 + extr x12, x12, x9, #63 + mul x12, x12, x13 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x12 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ret +.size fe_mul121666,.-fe_mul121666 +.text +.globl fe_sq2 +.type fe_sq2,@function +.align 4 +fe_sq2: + stp x29, x30, [sp, #-32]! 
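# fe_sq2: r = 2 * a^2 mod 2^255 - 19. Computed like fe_sq, but the doubling
# is folded into the reduction ("Double and Reduce"); the 0x169 constant
# below is 19*19, applied to the topmost bits.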
+ add x29, sp, #0 + str x17, [x29, #16] + str x18, [x29, #24] + # Square * 2 + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + # A[0] * A[1] + mul x7, x2, x3 + umulh x8, x2, x3 + # A[0] * A[2] + mul x11, x2, x4 + umulh x9, x2, x4 + adds x8, x8, x11 + adc x9, x9, xzr + # A[0] * A[3] + mul x11, x2, x5 + umulh x10, x2, x5 + adds x9, x9, x11 + adc x10, x10, xzr + # A[1] * A[2] + mul x11, x3, x4 + umulh x12, x3, x4 + adds x9, x9, x11 + adcs x10, x10, x12 + adc x14, xzr, xzr + # A[1] * A[3] + mul x11, x3, x5 + umulh x12, x3, x5 + adds x10, x10, x11 + adc x14, x14, x12 + # A[2] * A[3] + mul x11, x4, x5 + umulh x15, x4, x5 + adds x14, x14, x11 + adc x15, x15, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x14, x14, x14 + adcs x15, x15, x15 + adc x16, xzr, xzr + # A[0] * A[0] + mul x6, x2, x2 + umulh x17, x2, x2 + # A[1] * A[1] + mul x11, x3, x3 + umulh x12, x3, x3 + adds x7, x7, x17 + adcs x8, x8, x11 + adc x17, x12, xzr + # A[2] * A[2] + mul x11, x4, x4 + umulh x12, x4, x4 + adds x9, x9, x17 + adcs x10, x10, x11 + adc x17, x12, xzr + # A[3] * A[3] + mul x11, x5, x5 + umulh x12, x5, x5 + adds x14, x14, x17 + adcs x15, x15, x11 + adc x16, x16, x12 + # Double and Reduce + mov x11, #0x169 + # Move top half into t4-t7 and remove top bit from t3 + lsr x17, x16, #61 + extr x16, x16, x15, #62 + extr x15, x15, x14, #62 + extr x14, x14, x10, #62 + extr x10, x10, x9, #62 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + lsl x6, x6, #1 + and x9, x9, #0x7fffffffffffffff + # Two left, only one right + and x16, x16, #0x7fffffffffffffff + # Multiply top bits by 19*19 + mul x17, x17, x11 + # Multiply top half by 19 + mov x11, #19 + mul x12, x11, x10 + umulh x10, x11, x10 + adds x6, x6, x12 + mul x12, x11, x14 + umulh x14, x11, x14 + adcs x7, x7, x12 + mul x12, x11, x15 + umulh x15, x11, x15 + adcs x8, x8, x12 + mul x12, x11, x16 + umulh x13, x11, x16 + adcs x9, x9, x12 + adc x13, x13, xzr + # Add remaining product results in + adds x6, x6, x17 + adcs x7, x7, x10 + adcs x8, x8, x14 + adcs x9, x9, x15 + adc x13, x13, xzr + # Overflow + extr x13, x13, x9, #63 + mul x13, x13, x11 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x13 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x13, x9, #63 + mul x13, x13, x11 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x13 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ldr x17, [x29, #16] + ldr x18, [x29, #24] + ldp x29, x30, [sp], #32 + ret +.size fe_sq2,.-fe_sq2 +.text +.globl fe_invert +.type fe_invert,@function +.align 4 +fe_invert: + stp x29, x30, [sp, #-176]! 
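# fe_invert: r = a^(p-2) mod p (Fermat inversion), built from fe_sq and
# fe_mul with the usual Curve25519 addition chain; the loop counters below
# (4, 9, 19, 10, 49, 99, 50, 5) are the runs of repeated squarings.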
+ add x29, sp, #0 + str x20, [x29, #168] + # Invert + str x0, [x29, #144] + str x1, [x29, #152] + add x0, x29, #16 + bl fe_sq + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x1, x29, #48 + bl fe_sq + ldr x1, [x29, #152] + add x2, x29, #48 + bl fe_mul + add x0, x29, #16 + add x1, x29, #16 + add x2, x29, #48 + bl fe_mul + add x0, x29, #80 + bl fe_sq + add x0, x29, #48 + add x1, x29, #48 + add x2, x29, #80 + bl fe_mul + add x0, x29, #80 + bl fe_sq + mov x20, #4 + add x1, x29, #80 +L_fe_invert1: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert1 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + add x0, x29, #80 + add x1, x29, #48 + bl fe_sq + mov x20, #9 + add x1, x29, #80 +L_fe_invert2: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert2 + add x2, x29, #48 + bl fe_mul + add x0, x29, #112 + bl fe_sq + mov x20, #19 + add x1, x29, #112 +L_fe_invert3: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert3 + add x0, x29, #80 + add x2, x29, #80 + bl fe_mul + mov x20, #10 + add x1, x29, #80 +L_fe_invert4: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert4 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + add x0, x29, #80 + add x1, x29, #48 + bl fe_sq + mov x20, #49 + add x1, x29, #80 +L_fe_invert5: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert5 + add x2, x29, #48 + bl fe_mul + add x0, x29, #112 + bl fe_sq + mov x20, #0x63 + add x1, x29, #112 +L_fe_invert6: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert6 + add x0, x29, #80 + add x2, x29, #80 + bl fe_mul + mov x20, #50 + add x1, x29, #80 +L_fe_invert7: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert7 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x20, #5 + add x1, x29, #48 +L_fe_invert8: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert8 + ldr x0, [x29, #144] + add x2, x29, #16 + bl fe_mul + ldr x1, [x29, #152] + ldr x0, [x29, #144] + ldr x20, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret +.size fe_invert,.-fe_invert +.text +.globl curve25519 +.type curve25519,@function +.align 4 +curve25519: + stp x29, x30, [sp, #-272]! 
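# curve25519: X25519 scalar multiplication using the Montgomery ladder.
# Each pass through L_curve25519_bits does a conditional swap driven by the
# current scalar bit, then the ladder step's field add/sub, multiply, square
# and multiply-by-121666, working down from bit 254 to bit 0.  The trailing
# "# Invert" block computes Z^-1 with the same addition chain as fe_invert,
# and the final multiply produces the affine result before returning 0.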
+ add x29, sp, #0 + str x17, [x29, #200] + str x18, [x29, #208] + str x19, [x29, #216] + str x20, [x29, #224] + str x21, [x29, #232] + str x22, [x29, #240] + str x23, [x29, #248] + str x24, [x29, #256] + str x25, [x29, #264] + mov x22, xzr + str x0, [x29, #176] + # Set one + mov x23, #1 + stp x23, xzr, [x0] + stp xzr, xzr, [x0, #16] + # Set zero + stp xzr, xzr, [x29, #16] + stp xzr, xzr, [x29, #32] + # Set one + mov x23, #1 + stp x23, xzr, [x29, #48] + stp xzr, xzr, [x29, #64] + # Copy + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + mov x25, #62 + mov x24, #24 +L_curve25519_words: +L_curve25519_bits: + ldr x23, [x1, x24] + lsr x23, x23, x25 + and x23, x23, #1 + eor x22, x22, x23 + # Conditional Swap + cmp x22, #1 + ldp x6, x7, [x0] + ldp x8, x9, [x0, #16] + ldp x10, x11, [x29, #80] + ldp x12, x13, [x29, #96] + csel x14, x6, x10, eq + csel x6, x10, x6, eq + csel x15, x7, x11, eq + csel x7, x11, x7, eq + csel x16, x8, x12, eq + csel x8, x12, x8, eq + csel x17, x9, x13, eq + csel x9, x13, x9, eq + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + stp x14, x15, [x29, #80] + stp x16, x17, [x29, #96] + # Conditional Swap + cmp x22, #1 + ldp x6, x7, [x29, #16] + ldp x8, x9, [x29, #32] + ldp x10, x11, [x29, #48] + ldp x12, x13, [x29, #64] + csel x14, x6, x10, eq + csel x6, x10, x6, eq + csel x15, x7, x11, eq + csel x7, x11, x7, eq + csel x16, x8, x12, eq + csel x8, x12, x8, eq + csel x17, x9, x13, eq + csel x9, x13, x9, eq + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + stp x14, x15, [x29, #48] + stp x16, x17, [x29, #64] + mov x22, x23 + # Add + ldp x6, x7, [x0] + ldp x8, x9, [x0, #16] + ldp x10, x11, [x29, #16] + ldp x12, x13, [x29, #32] + adds x14, x6, x10 + adcs x15, x7, x11 + adcs x16, x8, x12 + adc x17, x9, x13 + mov x3, #-19 + asr x23, x17, #63 + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x14, x14, x3 + sbcs x15, x15, x23 + sbcs x16, x16, x23 + sbc x17, x17, x4 + # Sub + subs x6, x6, x10 + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + mov x3, #-19 + csetm x23, cc + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x6, x6, x3 + adcs x7, x7, x23 + adcs x8, x8, x23 + adc x9, x9, x4 + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + stp x6, x7, [x29, #144] + stp x8, x9, [x29, #160] + # Add + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] + ldp x10, x11, [x29, #48] + ldp x12, x13, [x29, #64] + adds x14, x6, x10 + adcs x15, x7, x11 + adcs x16, x8, x12 + adc x17, x9, x13 + mov x3, #-19 + asr x23, x17, #63 + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x14, x14, x3 + sbcs x15, x15, x23 + sbcs x16, x16, x23 + sbc x17, x17, x4 + # Sub + subs x6, x6, x10 + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + mov x3, #-19 + csetm x23, cc + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x6, x6, x3 + adcs x7, x7, x23 + adcs x8, x8, x23 + adc x9, x9, x4 + stp x14, x15, [x29, #16] + stp x16, x17, [x29, #32] + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] + # Multiply + ldp x18, x19, [x29, #112] + ldp x20, x21, [x29, #128] + ldp x14, x15, [x0] + ldp x16, x17, [x0, #16] + # A[0] * B[0] + mul x6, x18, x14 + umulh x7, x18, x14 + # A[0] * B[1] + mul x3, x18, x15 + umulh x8, x18, x15 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x19, x14 + umulh x4, x19, x14 + adds x7, x7, x3 + adcs x8, 
x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x18, x16 + umulh x4, x18, x16 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x19, x15 + umulh x4, x19, x15 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x20, x14 + umulh x4, x20, x14 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x18, x17 + umulh x4, x18, x17 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x19, x16 + umulh x4, x19, x16 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x20, x15 + umulh x4, x20, x15 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x21, x14 + umulh x4, x21, x14 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x19, x17 + umulh x4, x19, x17 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x20, x16 + umulh x4, x20, x16 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x21, x15 + umulh x4, x21, x15 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x20, x17 + umulh x4, x20, x17 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x21, x16 + umulh x4, x21, x16 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x21, x17 + umulh x4, x21, x17 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] + # Multiply + ldp x18, x19, [x29, #16] + ldp x20, x21, [x29, #32] + ldp x14, x15, [x29, #144] + ldp x16, x17, [x29, #160] + # A[0] * B[0] + mul x6, x18, x14 + umulh x7, x18, x14 + # A[0] * B[1] + mul x3, x18, x15 + umulh x8, x18, x15 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x19, x14 + umulh x4, x19, x14 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x18, x16 + umulh x4, x18, x16 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x19, x15 + umulh x4, x19, x15 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x20, x14 + umulh x4, x20, x14 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x18, x17 + umulh x4, x18, x17 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x19, x16 + umulh x4, x19, x16 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x20, x15 + umulh x4, x20, x15 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x21, x14 + umulh x4, 
x21, x14 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x19, x17 + umulh x4, x19, x17 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x20, x16 + umulh x4, x20, x16 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x21, x15 + umulh x4, x21, x15 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x20, x17 + umulh x4, x20, x17 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x21, x16 + umulh x4, x21, x16 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x21, x17 + umulh x4, x21, x17 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + # Square + ldp x18, x19, [x29, #144] + ldp x20, x21, [x29, #160] + # A[0] * A[1] + mul x7, x18, x19 + umulh x8, x18, x19 + # A[0] * A[2] + mul x3, x18, x20 + umulh x9, x18, x20 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x18, x21 + umulh x10, x18, x21 + adds x9, x9, x3 + adc x10, x10, xzr + # A[1] * A[2] + mul x3, x19, x20 + umulh x4, x19, x20 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * A[3] + mul x3, x19, x21 + umulh x4, x19, x21 + adds x10, x10, x3 + adc x11, x11, x4 + # A[2] * A[3] + mul x3, x20, x21 + umulh x12, x20, x21 + adds x11, x11, x3 + adc x12, x12, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + # A[0] * A[0] + mul x6, x18, x18 + umulh x23, x18, x18 + # A[1] * A[1] + mul x3, x19, x19 + umulh x4, x19, x19 + adds x7, x7, x23 + adcs x8, x8, x3 + adc x23, x4, xzr + # A[2] * A[2] + mul x3, x20, x20 + umulh x4, x20, x20 + adds x9, x9, x23 + adcs x10, x10, x3 + adc x23, x4, xzr + # A[3] * A[3] + mul x3, x21, x21 + umulh x4, x21, x21 + adds x11, x11, x23 + adcs x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 
+ and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] + # Square + ldp x18, x19, [x0] + ldp x20, x21, [x0, #16] + # A[0] * A[1] + mul x7, x18, x19 + umulh x8, x18, x19 + # A[0] * A[2] + mul x3, x18, x20 + umulh x9, x18, x20 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x18, x21 + umulh x10, x18, x21 + adds x9, x9, x3 + adc x10, x10, xzr + # A[1] * A[2] + mul x3, x19, x20 + umulh x4, x19, x20 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * A[3] + mul x3, x19, x21 + umulh x4, x19, x21 + adds x10, x10, x3 + adc x11, x11, x4 + # A[2] * A[3] + mul x3, x20, x21 + umulh x12, x20, x21 + adds x11, x11, x3 + adc x12, x12, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + # A[0] * A[0] + mul x6, x18, x18 + umulh x23, x18, x18 + # A[1] * A[1] + mul x3, x19, x19 + umulh x4, x19, x19 + adds x7, x7, x23 + adcs x8, x8, x3 + adc x23, x4, xzr + # A[2] * A[2] + mul x3, x20, x20 + umulh x4, x20, x20 + adds x9, x9, x23 + adcs x10, x10, x3 + adc x23, x4, xzr + # A[3] * A[3] + mul x3, x21, x21 + umulh x4, x21, x21 + adds x11, x11, x23 + adcs x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #144] + stp x8, x9, [x29, #160] + # Add + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] + ldp x10, x11, [x29, #16] + ldp x12, x13, [x29, #32] + adds x14, x6, x10 + adcs x15, x7, x11 + adcs x16, x8, x12 + adc x17, x9, x13 + mov x3, #-19 + asr x23, x17, #63 + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x14, x14, x3 + sbcs x15, x15, x23 + sbcs x16, x16, x23 + sbc x17, x17, x4 + # Sub + subs x6, x6, x10 + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + mov x3, #-19 + csetm x23, cc + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x6, x6, x3 + adcs x7, x7, x23 + adcs x8, x8, x23 + adc x9, x9, x4 + stp x14, x15, [x29, #80] + stp x16, x17, [x29, #96] + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + # Multiply + ldp x18, x19, [x29, #144] + ldp x20, x21, [x29, #160] + ldp x14, x15, [x29, #112] + ldp x16, x17, [x29, #128] + # A[0] * B[0] + mul x6, x18, x14 + umulh x7, x18, x14 + # A[0] * B[1] + mul x3, x18, x15 + umulh x8, x18, x15 + adds x7, x7, x3 + 
adc x8, x8, xzr + # A[1] * B[0] + mul x3, x19, x14 + umulh x4, x19, x14 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x18, x16 + umulh x4, x18, x16 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x19, x15 + umulh x4, x19, x15 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x20, x14 + umulh x4, x20, x14 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x18, x17 + umulh x4, x18, x17 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x19, x16 + umulh x4, x19, x16 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x20, x15 + umulh x4, x20, x15 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x21, x14 + umulh x4, x21, x14 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x19, x17 + umulh x4, x19, x17 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x20, x16 + umulh x4, x20, x16 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x21, x15 + umulh x4, x21, x15 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x20, x17 + umulh x4, x20, x17 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x21, x16 + umulh x4, x21, x16 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x21, x17 + umulh x4, x21, x17 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + # Sub + ldp x6, x7, [x29, #144] + ldp x8, x9, [x29, #160] + ldp x10, x11, [x29, #112] + ldp x12, x13, [x29, #128] + subs x6, x6, x10 + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + mov x3, #-19 + csetm x23, cc + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x6, x6, x3 + adcs x7, x7, x23 + adcs x8, x8, x23 + adc x9, x9, x4 + stp x6, x7, [x29, #144] + stp x8, x9, [x29, #160] + # Square + ldp x18, x19, [x29, #16] + ldp x20, x21, [x29, #32] + # A[0] * A[1] + mul x7, x18, x19 + umulh x8, x18, x19 + # A[0] * A[2] + mul x3, x18, x20 + umulh x9, x18, x20 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x18, x21 + umulh x10, x18, x21 + adds x9, x9, x3 + adc x10, x10, xzr + # A[1] * A[2] + mul x3, x19, x20 + umulh x4, x19, x20 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * A[3] + mul x3, x19, x21 + umulh x4, x19, x21 + adds x10, x10, x3 + adc x11, x11, x4 + # A[2] * A[3] + mul x3, 
x20, x21 + umulh x12, x20, x21 + adds x11, x11, x3 + adc x12, x12, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + # A[0] * A[0] + mul x6, x18, x18 + umulh x23, x18, x18 + # A[1] * A[1] + mul x3, x19, x19 + umulh x4, x19, x19 + adds x7, x7, x23 + adcs x8, x8, x3 + adc x23, x4, xzr + # A[2] * A[2] + mul x3, x20, x20 + umulh x4, x20, x20 + adds x9, x9, x23 + adcs x10, x10, x3 + adc x23, x4, xzr + # A[3] * A[3] + mul x3, x21, x21 + umulh x4, x21, x21 + adds x11, x11, x23 + adcs x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + # Multiply by 121666 + ldp x18, x19, [x29, #144] + ldp x20, x21, [x29, #160] + mov x5, #0xdb42 + movk x5, #1, lsl 16 + mul x6, x18, x5 + umulh x7, x18, x5 + mul x3, x19, x5 + umulh x4, x19, x5 + adds x7, x7, x3 + adc x8, xzr, x4 + mul x3, x20, x5 + umulh x4, x20, x5 + adds x8, x8, x3 + adc x9, xzr, x4 + mul x3, x21, x5 + umulh x4, x21, x5 + adds x9, x9, x3 + adc x4, xzr, x4 + mov x5, #19 + extr x4, x4, x9, #63 + mul x4, x4, x5 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x4 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] + # Square + ldp x18, x19, [x29, #80] + ldp x20, x21, [x29, #96] + # A[0] * A[1] + mul x7, x18, x19 + umulh x8, x18, x19 + # A[0] * A[2] + mul x3, x18, x20 + umulh x9, x18, x20 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x18, x21 + umulh x10, x18, x21 + adds x9, x9, x3 + adc x10, x10, xzr + # A[1] * A[2] + mul x3, x19, x20 + umulh x4, x19, x20 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * A[3] + mul x3, x19, x21 + umulh x4, x19, x21 + adds x10, x10, x3 + adc x11, x11, x4 + # A[2] * A[3] + mul x3, x20, x21 + umulh x12, x20, x21 + adds x11, x11, x3 + adc x12, x12, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + # A[0] * A[0] + mul x6, x18, x18 + umulh x23, x18, x18 + # A[1] * A[1] + mul x3, x19, x19 + umulh x4, x19, x19 + adds x7, x7, x23 + adcs x8, x8, x3 + adc x23, x4, xzr + # A[2] * A[2] + mul x3, x20, x20 + umulh x4, x20, x20 + adds x9, x9, x23 + adcs x10, x10, x3 + adc x23, x4, xzr + # A[3] * A[3] + mul x3, x21, x21 + umulh x4, x21, x21 + adds x11, x11, x23 + adcs x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, 
#0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + # Add + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] + ldp x10, x11, [x29, #48] + ldp x12, x13, [x29, #64] + adds x6, x6, x10 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + mov x3, #-19 + asr x23, x9, #63 + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x6, x6, x3 + sbcs x7, x7, x23 + sbcs x8, x8, x23 + sbc x9, x9, x4 + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] + # Multiply + ldp x18, x19, [x2] + ldp x20, x21, [x2, #16] + ldp x14, x15, [x29, #16] + ldp x16, x17, [x29, #32] + # A[0] * B[0] + mul x6, x18, x14 + umulh x7, x18, x14 + # A[0] * B[1] + mul x3, x18, x15 + umulh x8, x18, x15 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x19, x14 + umulh x4, x19, x14 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x18, x16 + umulh x4, x18, x16 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x19, x15 + umulh x4, x19, x15 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x20, x14 + umulh x4, x20, x14 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x18, x17 + umulh x4, x18, x17 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x19, x16 + umulh x4, x19, x16 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x20, x15 + umulh x4, x20, x15 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x21, x14 + umulh x4, x21, x14 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x19, x17 + umulh x4, x19, x17 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x20, x16 + umulh x4, x20, x16 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x21, x15 + umulh x4, x21, x15 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x20, x17 + umulh x4, x20, x17 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x21, x16 + umulh x4, x21, x16 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x21, x17 + umulh x4, x21, x17 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + 
adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] + # Multiply + ldp x18, x19, [x29, #144] + ldp x20, x21, [x29, #160] + ldp x14, x15, [x29, #112] + ldp x16, x17, [x29, #128] + # A[0] * B[0] + mul x6, x18, x14 + umulh x7, x18, x14 + # A[0] * B[1] + mul x3, x18, x15 + umulh x8, x18, x15 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x19, x14 + umulh x4, x19, x14 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x18, x16 + umulh x4, x18, x16 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x19, x15 + umulh x4, x19, x15 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x20, x14 + umulh x4, x20, x14 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x18, x17 + umulh x4, x18, x17 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x19, x16 + umulh x4, x19, x16 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x20, x15 + umulh x4, x20, x15 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x21, x14 + umulh x4, x21, x14 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x19, x17 + umulh x4, x19, x17 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x20, x16 + umulh x4, x20, x16 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x21, x15 + umulh x4, x21, x15 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x20, x17 + umulh x4, x20, x17 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x21, x16 + umulh x4, x21, x16 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x21, x17 + umulh x4, x21, x17 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + sub x25, x25, #1 + cmp x25, #0 + bge L_curve25519_bits + mov x25, #63 + sub x24, x24, #8 + cmp x24, #0 + bge L_curve25519_words + # Invert + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x0, x29, #80 + 
add x1, x29, #48 + bl fe_sq + add x1, x29, #80 + bl fe_sq + add x1, x29, #16 + add x2, x29, #80 + bl fe_mul + add x0, x29, #48 + add x1, x29, #48 + add x2, x29, #80 + bl fe_mul + add x0, x29, #112 + bl fe_sq + add x0, x29, #80 + add x1, x29, #80 + add x2, x29, #112 + bl fe_mul + add x0, x29, #112 + bl fe_sq + mov x24, #4 + add x1, x29, #112 +L_curve25519_inv_1: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_1 + add x0, x29, #80 + add x2, x29, #80 + bl fe_mul + add x0, x29, #112 + add x1, x29, #80 + bl fe_sq + mov x24, #9 + add x1, x29, #112 +L_curve25519_inv_2: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_2 + add x2, x29, #80 + bl fe_mul + add x0, x29, #144 + bl fe_sq + mov x24, #19 + add x1, x29, #144 +L_curve25519_inv_3: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_3 + add x0, x29, #112 + add x2, x29, #112 + bl fe_mul + mov x24, #10 + add x1, x29, #112 +L_curve25519_inv_4: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_4 + add x0, x29, #80 + add x2, x29, #80 + bl fe_mul + add x0, x29, #112 + add x1, x29, #80 + bl fe_sq + mov x24, #49 + add x1, x29, #112 +L_curve25519_inv_5: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_5 + add x2, x29, #80 + bl fe_mul + add x0, x29, #144 + bl fe_sq + mov x24, #0x63 + add x1, x29, #144 +L_curve25519_inv_6: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_6 + add x0, x29, #112 + add x2, x29, #112 + bl fe_mul + mov x24, #50 + add x1, x29, #112 +L_curve25519_inv_7: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_7 + add x0, x29, #80 + add x2, x29, #80 + bl fe_mul + mov x24, #5 + add x1, x29, #80 +L_curve25519_inv_8: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_8 + add x0, x29, #16 + add x2, x29, #48 + bl fe_mul + ldr x0, [x29, #176] + # Multiply + ldp x18, x19, [x0] + ldp x20, x21, [x0, #16] + ldp x14, x15, [x29, #16] + ldp x16, x17, [x29, #32] + # A[0] * B[0] + mul x6, x18, x14 + umulh x7, x18, x14 + # A[0] * B[1] + mul x3, x18, x15 + umulh x8, x18, x15 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x19, x14 + umulh x4, x19, x14 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x18, x16 + umulh x4, x18, x16 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x19, x15 + umulh x4, x19, x15 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x20, x14 + umulh x4, x20, x14 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x18, x17 + umulh x4, x18, x17 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x19, x16 + umulh x4, x19, x16 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x20, x15 + umulh x4, x20, x15 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x21, x14 + umulh x4, x21, x14 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x19, x17 + umulh x4, x19, x17 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x20, x16 + umulh x4, x20, x16 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x21, x15 + umulh x4, x21, x15 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x20, x17 + umulh x4, x20, x17 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x21, x16 + umulh x4, x21, x16 + adds x11, x11, x3 + adcs x12, 
x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x21, x17 + umulh x4, x21, x17 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + lsr x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + mov x0, xzr + ldr x17, [x29, #200] + ldr x18, [x29, #208] + ldr x19, [x29, #216] + ldr x20, [x29, #224] + ldr x21, [x29, #232] + ldr x22, [x29, #240] + ldr x23, [x29, #248] + ldr x24, [x29, #256] + ldr x25, [x29, #264] + ldp x29, x30, [sp], #0x110 + ret +.size curve25519,.-curve25519 +.text +.globl fe_pow22523 +.type fe_pow22523,@function +.align 4 +fe_pow22523: + stp x29, x30, [sp, #-144]! + add x29, sp, #0 + str x21, [x29, #136] + # pow22523 + str x0, [x29, #112] + str x1, [x29, #120] + add x0, x29, #16 + bl fe_sq + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x1, x29, #48 + bl fe_sq + ldr x1, [x29, #120] + add x2, x29, #48 + bl fe_mul + add x0, x29, #16 + add x1, x29, #16 + add x2, x29, #48 + bl fe_mul + bl fe_sq + add x1, x29, #48 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #4 + add x1, x29, #48 +L_fe_pow22523_1: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_1 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #9 + add x1, x29, #48 +L_fe_pow22523_2: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_2 + add x2, x29, #16 + bl fe_mul + add x0, x29, #80 + bl fe_sq + mov x21, #19 + add x1, x29, #80 +L_fe_pow22523_3: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_3 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x21, #10 + add x1, x29, #48 +L_fe_pow22523_4: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_4 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #49 + add x1, x29, #48 +L_fe_pow22523_5: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_5 + add x2, x29, #16 + bl fe_mul + add x0, x29, #80 + bl fe_sq + mov x21, #0x63 + add x1, x29, #80 +L_fe_pow22523_6: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_6 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x21, #50 + add x1, x29, #48 +L_fe_pow22523_7: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_7 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + mov x21, #2 + add x1, x29, #16 +L_fe_pow22523_8: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_8 + ldr x0, [x29, #112] + ldr x2, [x29, #120] + bl fe_mul + ldr x1, [x29, #120] + ldr x0, [x29, #112] + ldr x21, [x29, #136] + ldp x29, x30, [sp], #0x90 + ret +.size fe_pow22523,.-fe_pow22523 +.text 
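The routines above and below all lean on one building block: a 4x4 schoolbook product of 64-bit limbs (the "# Multiply" / "# Square" blocks built from mul/umulh), followed by a "# Reduce" sequence that folds the upper 256 bits of the product back into the lower half using 2^255 == 19 (mod 2^255 - 19). The "# Invert" chain in curve25519 and fe_pow22523 are then fixed square-and-multiply ladders over fe_sq/fe_mul for the exponents p - 2 (giving 1/z) and (p - 5)/8 = 2^252 - 3. The C sketch below is only an illustrative model of that multiply-and-reduce pattern: fe_mul_c, mul_wide and fe_reduce are hypothetical names rather than wolfSSL functions, it assumes a compiler with unsigned __int128, and it assumes both inputs are already below 2^255, as the assembly does.

    #include <stdint.h>

    typedef unsigned __int128 u128;   /* assumption: GCC/Clang-style 128-bit type */

    /* 4x4 operand-scanning multiply: t[0..7] = a * b as a 512-bit product,
     * the C equivalent of the mul/umulh + adds/adcs ladders above. */
    static void mul_wide(uint64_t t[8], const uint64_t a[4], const uint64_t b[4])
    {
        int i, j;
        for (i = 0; i < 8; i++)
            t[i] = 0;
        for (i = 0; i < 4; i++) {
            uint64_t carry = 0;
            for (j = 0; j < 4; j++) {
                u128 m = (u128)a[i] * b[j] + t[i + j] + carry; /* mul + umulh */
                t[i + j] = (uint64_t)m;
                carry    = (uint64_t)(m >> 64);
            }
            t[i + 4] = carry;
        }
    }

    /* Fold a 512-bit product below 2^255 using 2^255 == 19 (mod 2^255 - 19).
     * Mirrors the "# Reduce" blocks: split at bit 255, add 19 * high part,
     * then fold the small leftover twice more. Inputs < 2^255 keep the
     * product below 2^510, so t[7] never has its top bit set. */
    static void fe_reduce(uint64_t r[4], const uint64_t t[8])
    {
        uint64_t lo[4], hi[4], s[4], top;
        u128 c;
        int i;

        /* Split at bit 255 ("extr ..., #63" / "and ..., #0x7fff...") */
        lo[0] = t[0]; lo[1] = t[1]; lo[2] = t[2];
        lo[3] = t[3] & 0x7fffffffffffffffULL;
        hi[0] = (t[3] >> 63) | (t[4] << 1);
        hi[1] = (t[4] >> 63) | (t[5] << 1);
        hi[2] = (t[5] >> 63) | (t[6] << 1);
        hi[3] = (t[6] >> 63) | (t[7] << 1);

        /* First fold ("Multiply top half by 19"): s + c*2^256 = lo + 19*hi */
        c = 0;
        for (i = 0; i < 4; i++) {
            c += (u128)hi[i] * 19 + lo[i];
            s[i] = (uint64_t)c;
            c >>= 64;
        }

        /* Second fold ("Overflow"): bring bits >= 255 back in; the result
         * now fits in 256 bits, so no carry survives this loop. */
        top = ((uint64_t)c << 1) | (s[3] >> 63);
        s[3] &= 0x7fffffffffffffffULL;
        c = (u128)top * 19;
        for (i = 0; i < 4; i++) {
            c += s[i];
            s[i] = (uint64_t)c;
            c >>= 64;
        }

        /* Third fold ("Reduce if top bit set"): at most one extra bit is
         * left, and when it is set the low limbs are tiny, so adding 19
         * cannot carry. The result is < 2^255 but not necessarily < p. */
        top = s[3] >> 63;
        s[3] &= 0x7fffffffffffffffULL;
        s[0] += 19 * top;

        r[0] = s[0]; r[1] = s[1]; r[2] = s[2]; r[3] = s[3];
    }

    /* r = a * b mod 2^255 - 19, with a, b < 2^255 (limbs little-endian). */
    static void fe_mul_c(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
    {
        uint64_t t[8];
        mul_wide(t, a, b);
        fe_reduce(r, t);
    }

The "# Square" variants below follow the same shape, but compute only the upper-triangle cross products and double them before adding the diagonal A[i] * A[i] terms, and the "Square * 2" block in fe_ge_dbl doubles the whole product before reducing, hence the 0x169 (= 19 * 19) factor its own comments apply to the very top bits.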
+.globl fe_ge_to_p2 +.type fe_ge_to_p2,@function +.align 4 +fe_ge_to_p2: + stp x29, x30, [sp, #-112]! + add x29, sp, #0 + str x17, [x29, #64] + str x18, [x29, #72] + str x19, [x29, #80] + str x20, [x29, #88] + str x21, [x29, #96] + str x22, [x29, #104] + str x1, [x29, #16] + str x2, [x29, #24] + str x3, [x29, #32] + str x4, [x29, #40] + str x5, [x29, #48] + str x6, [x29, #56] + ldr x1, [x29, #32] + ldr x2, [x29, #56] + # Multiply + ldp x11, x16, [x1] + ldp x17, x18, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x3, x11, x19 + umulh x4, x11, x19 + # A[0] * B[1] + mul x12, x11, x20 + umulh x5, x11, x20 + adds x4, x4, x12 + adc x5, x5, xzr + # A[1] * B[0] + mul x12, x16, x19 + umulh x13, x16, x19 + adds x4, x4, x12 + adcs x5, x5, x13 + adc x6, xzr, xzr + # A[0] * B[2] + mul x12, x11, x21 + umulh x13, x11, x21 + adds x5, x5, x12 + adc x6, x6, x13 + # A[1] * B[1] + mul x12, x16, x20 + umulh x13, x16, x20 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[2] * B[0] + mul x12, x17, x19 + umulh x13, x17, x19 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, x7, xzr + # A[0] * B[3] + mul x12, x11, x22 + umulh x13, x11, x22 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[1] * B[2] + mul x12, x16, x21 + umulh x13, x16, x21 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[2] * B[1] + mul x12, x17, x20 + umulh x13, x17, x20 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[3] * B[0] + mul x12, x18, x19 + umulh x13, x18, x19 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[1] * B[3] + mul x12, x16, x22 + umulh x13, x16, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[2] * B[2] + mul x12, x17, x21 + umulh x13, x17, x21 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[1] + mul x12, x18, x20 + umulh x13, x18, x20 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[3] + mul x12, x17, x22 + umulh x13, x17, x22 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[3] * B[2] + mul x12, x18, x21 + umulh x13, x18, x21 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[3] + mul x12, x18, x22 + umulh x13, x18, x22 + adds x9, x9, x12 + adc x10, x10, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x7 + umulh x7, x12, x7 + adds x3, x3, x13 + mul x13, x12, x8 + umulh x8, x12, x8 + adcs x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x14, x12, x10 + adcs x6, x6, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x14, x14, xzr + # Overflow + extr x14, x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + lsr x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #16] + ldr x1, [x29, #40] + ldr x2, [x29, #48] + # Multiply + ldp x11, x16, [x1] + ldp x17, x18, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x3, x11, x19 + umulh x4, x11, x19 + # A[0] * B[1] + mul x12, x11, x20 + umulh x5, x11, x20 + adds x4, x4, x12 + adc 
x5, x5, xzr + # A[1] * B[0] + mul x12, x16, x19 + umulh x13, x16, x19 + adds x4, x4, x12 + adcs x5, x5, x13 + adc x6, xzr, xzr + # A[0] * B[2] + mul x12, x11, x21 + umulh x13, x11, x21 + adds x5, x5, x12 + adc x6, x6, x13 + # A[1] * B[1] + mul x12, x16, x20 + umulh x13, x16, x20 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[2] * B[0] + mul x12, x17, x19 + umulh x13, x17, x19 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, x7, xzr + # A[0] * B[3] + mul x12, x11, x22 + umulh x13, x11, x22 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[1] * B[2] + mul x12, x16, x21 + umulh x13, x16, x21 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[2] * B[1] + mul x12, x17, x20 + umulh x13, x17, x20 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[3] * B[0] + mul x12, x18, x19 + umulh x13, x18, x19 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[1] * B[3] + mul x12, x16, x22 + umulh x13, x16, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[2] * B[2] + mul x12, x17, x21 + umulh x13, x17, x21 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[1] + mul x12, x18, x20 + umulh x13, x18, x20 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[3] + mul x12, x17, x22 + umulh x13, x17, x22 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[3] * B[2] + mul x12, x18, x21 + umulh x13, x18, x21 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[3] + mul x12, x18, x22 + umulh x13, x18, x22 + adds x9, x9, x12 + adc x10, x10, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x7 + umulh x7, x12, x7 + adds x3, x3, x13 + mul x13, x12, x8 + umulh x8, x12, x8 + adcs x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x14, x12, x10 + adcs x6, x6, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x14, x14, xzr + # Overflow + extr x14, x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + lsr x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #56] + # Multiply + ldp x11, x16, [x2] + ldp x17, x18, [x2, #16] + ldp x19, x20, [x1] + ldp x21, x22, [x1, #16] + # A[0] * B[0] + mul x3, x11, x19 + umulh x4, x11, x19 + # A[0] * B[1] + mul x12, x11, x20 + umulh x5, x11, x20 + adds x4, x4, x12 + adc x5, x5, xzr + # A[1] * B[0] + mul x12, x16, x19 + umulh x13, x16, x19 + adds x4, x4, x12 + adcs x5, x5, x13 + adc x6, xzr, xzr + # A[0] * B[2] + mul x12, x11, x21 + umulh x13, x11, x21 + adds x5, x5, x12 + adc x6, x6, x13 + # A[1] * B[1] + mul x12, x16, x20 + umulh x13, x16, x20 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[2] * B[0] + mul x12, x17, x19 + umulh x13, x17, x19 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, x7, xzr + # A[0] * B[3] + mul x12, x11, x22 + umulh x13, x11, x22 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[1] * B[2] + mul x12, x16, x21 + umulh x13, x16, x21 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[2] 
* B[1] + mul x12, x17, x20 + umulh x13, x17, x20 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[3] * B[0] + mul x12, x18, x19 + umulh x13, x18, x19 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[1] * B[3] + mul x12, x16, x22 + umulh x13, x16, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[2] * B[2] + mul x12, x17, x21 + umulh x13, x17, x21 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[1] + mul x12, x18, x20 + umulh x13, x18, x20 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[3] + mul x12, x17, x22 + umulh x13, x17, x22 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[3] * B[2] + mul x12, x18, x21 + umulh x13, x18, x21 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[3] + mul x12, x18, x22 + umulh x13, x18, x22 + adds x9, x9, x12 + adc x10, x10, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x7 + umulh x7, x12, x7 + adds x3, x3, x13 + mul x13, x12, x8 + umulh x8, x12, x8 + adcs x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x14, x12, x10 + adcs x6, x6, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x14, x14, xzr + # Overflow + extr x14, x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + lsr x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x17, [x29, #64] + ldr x18, [x29, #72] + ldr x19, [x29, #80] + ldr x20, [x29, #88] + ldr x21, [x29, #96] + ldr x22, [x29, #104] + ldp x29, x30, [sp], #0x70 + ret +.size fe_ge_to_p2,.-fe_ge_to_p2 +.text +.globl fe_ge_to_p3 +.type fe_ge_to_p3,@function +.align 4 +fe_ge_to_p3: + stp x29, x30, [sp, #-128]! 
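+    # Set the frame pointer, then save x17-x22 and the seven pointer
+    # arguments in x1-x7 to the frame so they survive the field
+    # multiplications below (x0 is consumed straight away as an output).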
+ add x29, sp, #0 + str x17, [x29, #80] + str x18, [x29, #88] + str x19, [x29, #96] + str x20, [x29, #104] + str x21, [x29, #112] + str x22, [x29, #120] + str x1, [x29, #16] + str x2, [x29, #24] + str x3, [x29, #32] + str x4, [x29, #40] + str x5, [x29, #48] + str x6, [x29, #56] + str x7, [x29, #64] + ldr x1, [x29, #40] + ldr x2, [x29, #64] + # Multiply + ldp x11, x16, [x1] + ldp x17, x18, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x3, x11, x19 + umulh x4, x11, x19 + # A[0] * B[1] + mul x12, x11, x20 + umulh x5, x11, x20 + adds x4, x4, x12 + adc x5, x5, xzr + # A[1] * B[0] + mul x12, x16, x19 + umulh x13, x16, x19 + adds x4, x4, x12 + adcs x5, x5, x13 + adc x6, xzr, xzr + # A[0] * B[2] + mul x12, x11, x21 + umulh x13, x11, x21 + adds x5, x5, x12 + adc x6, x6, x13 + # A[1] * B[1] + mul x12, x16, x20 + umulh x13, x16, x20 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[2] * B[0] + mul x12, x17, x19 + umulh x13, x17, x19 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, x7, xzr + # A[0] * B[3] + mul x12, x11, x22 + umulh x13, x11, x22 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[1] * B[2] + mul x12, x16, x21 + umulh x13, x16, x21 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[2] * B[1] + mul x12, x17, x20 + umulh x13, x17, x20 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[3] * B[0] + mul x12, x18, x19 + umulh x13, x18, x19 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[1] * B[3] + mul x12, x16, x22 + umulh x13, x16, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[2] * B[2] + mul x12, x17, x21 + umulh x13, x17, x21 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[1] + mul x12, x18, x20 + umulh x13, x18, x20 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[3] + mul x12, x17, x22 + umulh x13, x17, x22 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[3] * B[2] + mul x12, x18, x21 + umulh x13, x18, x21 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[3] + mul x12, x18, x22 + umulh x13, x18, x22 + adds x9, x9, x12 + adc x10, x10, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x7 + umulh x7, x12, x7 + adds x3, x3, x13 + mul x13, x12, x8 + umulh x8, x12, x8 + adcs x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x14, x12, x10 + adcs x6, x6, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x14, x14, xzr + # Overflow + extr x14, x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + lsr x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #16] + ldr x1, [x29, #48] + ldr x2, [x29, #56] + # Multiply + ldp x11, x16, [x1] + ldp x17, x18, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x3, x11, x19 + umulh x4, x11, x19 + # A[0] * B[1] + mul x12, x11, x20 + umulh x5, x11, x20 + adds x4, x4, x12 + adc x5, x5, xzr + # A[1] * B[0] + mul x12, x16, x19 + umulh x13, x16, x19 + adds x4, 
x4, x12 + adcs x5, x5, x13 + adc x6, xzr, xzr + # A[0] * B[2] + mul x12, x11, x21 + umulh x13, x11, x21 + adds x5, x5, x12 + adc x6, x6, x13 + # A[1] * B[1] + mul x12, x16, x20 + umulh x13, x16, x20 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[2] * B[0] + mul x12, x17, x19 + umulh x13, x17, x19 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, x7, xzr + # A[0] * B[3] + mul x12, x11, x22 + umulh x13, x11, x22 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[1] * B[2] + mul x12, x16, x21 + umulh x13, x16, x21 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[2] * B[1] + mul x12, x17, x20 + umulh x13, x17, x20 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[3] * B[0] + mul x12, x18, x19 + umulh x13, x18, x19 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[1] * B[3] + mul x12, x16, x22 + umulh x13, x16, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[2] * B[2] + mul x12, x17, x21 + umulh x13, x17, x21 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[1] + mul x12, x18, x20 + umulh x13, x18, x20 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[3] + mul x12, x17, x22 + umulh x13, x17, x22 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[3] * B[2] + mul x12, x18, x21 + umulh x13, x18, x21 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[3] + mul x12, x18, x22 + umulh x13, x18, x22 + adds x9, x9, x12 + adc x10, x10, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x7 + umulh x7, x12, x7 + adds x3, x3, x13 + mul x13, x12, x8 + umulh x8, x12, x8 + adcs x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x14, x12, x10 + adcs x6, x6, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x14, x14, xzr + # Overflow + extr x14, x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + lsr x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #64] + # Multiply + ldp x11, x16, [x2] + ldp x17, x18, [x2, #16] + ldp x19, x20, [x1] + ldp x21, x22, [x1, #16] + # A[0] * B[0] + mul x3, x11, x19 + umulh x4, x11, x19 + # A[0] * B[1] + mul x12, x11, x20 + umulh x5, x11, x20 + adds x4, x4, x12 + adc x5, x5, xzr + # A[1] * B[0] + mul x12, x16, x19 + umulh x13, x16, x19 + adds x4, x4, x12 + adcs x5, x5, x13 + adc x6, xzr, xzr + # A[0] * B[2] + mul x12, x11, x21 + umulh x13, x11, x21 + adds x5, x5, x12 + adc x6, x6, x13 + # A[1] * B[1] + mul x12, x16, x20 + umulh x13, x16, x20 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[2] * B[0] + mul x12, x17, x19 + umulh x13, x17, x19 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, x7, xzr + # A[0] * B[3] + mul x12, x11, x22 + umulh x13, x11, x22 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[1] * B[2] + mul x12, x16, x21 + umulh x13, x16, x21 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[2] * B[1] + mul x12, x17, x20 + umulh x13, x17, x20 + adds x6, x6, x12 + adcs x7, 
x7, x13 + adc x8, x8, xzr + # A[3] * B[0] + mul x12, x18, x19 + umulh x13, x18, x19 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[1] * B[3] + mul x12, x16, x22 + umulh x13, x16, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[2] * B[2] + mul x12, x17, x21 + umulh x13, x17, x21 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[1] + mul x12, x18, x20 + umulh x13, x18, x20 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[3] + mul x12, x17, x22 + umulh x13, x17, x22 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[3] * B[2] + mul x12, x18, x21 + umulh x13, x18, x21 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[3] + mul x12, x18, x22 + umulh x13, x18, x22 + adds x9, x9, x12 + adc x10, x10, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x7 + umulh x7, x12, x7 + adds x3, x3, x13 + mul x13, x12, x8 + umulh x8, x12, x8 + adcs x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x14, x12, x10 + adcs x6, x6, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x14, x14, xzr + # Overflow + extr x14, x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + lsr x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #32] + ldr x1, [x29, #40] + ldr x2, [x29, #48] + # Multiply + ldp x11, x16, [x1] + ldp x17, x18, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x3, x11, x19 + umulh x4, x11, x19 + # A[0] * B[1] + mul x12, x11, x20 + umulh x5, x11, x20 + adds x4, x4, x12 + adc x5, x5, xzr + # A[1] * B[0] + mul x12, x16, x19 + umulh x13, x16, x19 + adds x4, x4, x12 + adcs x5, x5, x13 + adc x6, xzr, xzr + # A[0] * B[2] + mul x12, x11, x21 + umulh x13, x11, x21 + adds x5, x5, x12 + adc x6, x6, x13 + # A[1] * B[1] + mul x12, x16, x20 + umulh x13, x16, x20 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[2] * B[0] + mul x12, x17, x19 + umulh x13, x17, x19 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, x7, xzr + # A[0] * B[3] + mul x12, x11, x22 + umulh x13, x11, x22 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[1] * B[2] + mul x12, x16, x21 + umulh x13, x16, x21 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[2] * B[1] + mul x12, x17, x20 + umulh x13, x17, x20 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[3] * B[0] + mul x12, x18, x19 + umulh x13, x18, x19 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[1] * B[3] + mul x12, x16, x22 + umulh x13, x16, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[2] * B[2] + mul x12, x17, x21 + umulh x13, x17, x21 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[1] + mul x12, x18, x20 + umulh x13, x18, x20 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[3] + mul x12, x17, x22 + umulh x13, x17, x22 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[3] * B[2] + mul x12, x18, x21 + umulh x13, x18, x21 + adds x8, 
x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[3] + mul x12, x18, x22 + umulh x13, x18, x22 + adds x9, x9, x12 + adc x10, x10, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x7 + umulh x7, x12, x7 + adds x3, x3, x13 + mul x13, x12, x8 + umulh x8, x12, x8 + adcs x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x14, x12, x10 + adcs x6, x6, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x14, x14, xzr + # Overflow + extr x14, x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + lsr x14, x6, #63 + mul x14, x14, x12 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x14 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x17, [x29, #80] + ldr x18, [x29, #88] + ldr x19, [x29, #96] + ldr x20, [x29, #104] + ldr x21, [x29, #112] + ldr x22, [x29, #120] + ldp x29, x30, [sp], #0x80 + ret +.size fe_ge_to_p3,.-fe_ge_to_p3 +.text +.globl fe_ge_dbl +.type fe_ge_dbl,@function +.align 4 +fe_ge_dbl: + stp x29, x30, [sp, #-144]! + add x29, sp, #0 + str x17, [x29, #88] + str x18, [x29, #96] + str x19, [x29, #104] + str x20, [x29, #112] + str x21, [x29, #120] + str x22, [x29, #128] + str x23, [x29, #136] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + ldr x1, [x29, #48] + # Square + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + # A[0] * A[1] + mul x5, x20, x21 + umulh x6, x20, x21 + # A[0] * A[2] + mul x12, x20, x22 + umulh x7, x20, x22 + adds x6, x6, x12 + adc x7, x7, xzr + # A[0] * A[3] + mul x12, x20, x23 + umulh x8, x20, x23 + adds x7, x7, x12 + adc x8, x8, xzr + # A[1] * A[2] + mul x12, x21, x22 + umulh x13, x21, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * A[3] + mul x12, x21, x23 + umulh x13, x21, x23 + adds x8, x8, x12 + adc x9, x9, x13 + # A[2] * A[3] + mul x12, x22, x23 + umulh x10, x22, x23 + adds x9, x9, x12 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x20, x20 + umulh x15, x20, x20 + # A[1] * A[1] + mul x12, x21, x21 + umulh x13, x21, x21 + adds x5, x5, x15 + adcs x6, x6, x12 + adc x15, x13, xzr + # A[2] * A[2] + mul x12, x22, x22 + umulh x13, x22, x22 + adds x7, x7, x15 + adcs x8, x8, x12 + adc x15, x13, xzr + # A[3] * A[3] + mul x12, x23, x23 + umulh x13, x23, x23 + adds x9, x9, x15 + adcs x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, 
x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #32] + ldr x2, [x29, #56] + # Square + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] + # A[0] * A[1] + mul x5, x20, x21 + umulh x6, x20, x21 + # A[0] * A[2] + mul x12, x20, x22 + umulh x7, x20, x22 + adds x6, x6, x12 + adc x7, x7, xzr + # A[0] * A[3] + mul x12, x20, x23 + umulh x8, x20, x23 + adds x7, x7, x12 + adc x8, x8, xzr + # A[1] * A[2] + mul x12, x21, x22 + umulh x13, x21, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * A[3] + mul x12, x21, x23 + umulh x13, x21, x23 + adds x8, x8, x12 + adc x9, x9, x13 + # A[2] * A[3] + mul x12, x22, x23 + umulh x10, x22, x23 + adds x9, x9, x12 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x20, x20 + umulh x15, x20, x20 + # A[1] * A[1] + mul x12, x21, x21 + umulh x13, x21, x21 + adds x5, x5, x15 + adcs x6, x6, x12 + adc x15, x13, xzr + # A[2] * A[2] + mul x12, x22, x22 + umulh x13, x22, x22 + adds x7, x7, x15 + adcs x8, x8, x12 + adc x15, x13, xzr + # A[3] * A[3] + mul x12, x23, x23 + umulh x13, x23, x23 + adds x9, x9, x15 + adcs x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #24] + # Add + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adc x7, x7, x11 + mov x12, #-19 + asr x15, x7, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x12 + sbcs x5, x5, x15 + sbcs x6, x6, x15 + sbc x7, x7, x13 + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x1, [x29, #40] + # Square + ldp x20, x21, [x0] + ldp x22, x23, [x0, #16] + # A[0] * A[1] + mul x5, x20, x21 + umulh x6, x20, x21 + # A[0] * A[2] + mul x12, x20, x22 + umulh x7, x20, x22 + adds x6, x6, x12 + adc x7, x7, xzr + # A[0] * A[3] + mul x12, x20, x23 + umulh x8, x20, x23 + adds x7, x7, x12 + adc x8, x8, xzr + # A[1] * A[2] + mul x12, x21, x22 + umulh x13, x21, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * A[3] + mul x12, x21, x23 + umulh x13, x21, x23 
+ adds x8, x8, x12 + adc x9, x9, x13 + # A[2] * A[3] + mul x12, x22, x23 + umulh x10, x22, x23 + adds x9, x9, x12 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x20, x20 + umulh x15, x20, x20 + # A[1] * A[1] + mul x12, x21, x21 + umulh x13, x21, x21 + adds x5, x5, x15 + adcs x6, x6, x12 + adc x15, x13, xzr + # A[2] * A[2] + mul x12, x22, x22 + umulh x13, x22, x22 + adds x7, x7, x15 + adcs x8, x8, x12 + adc x15, x13, xzr + # A[3] * A[3] + mul x12, x23, x23 + umulh x13, x23, x23 + adds x9, x9, x15 + adcs x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x1, [x29, #32] + ldr x2, [x29, #16] + # Add + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x1, [x29, #40] + # Sub + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + ldp x8, x9, [x0] + ldp x10, x11, [x0, #16] + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + ldr x0, [x29, #64] + # Square * 2 + ldp x20, x21, [x0] + ldp x22, x23, [x0, #16] + # A[0] * A[1] + mul x5, x20, x21 + umulh x6, x20, x21 + # A[0] * A[2] + mul x12, x20, x22 + umulh x7, x20, x22 + adds x6, x6, x12 + adc x7, x7, xzr + # A[0] * A[3] + mul x12, x20, x23 + umulh x8, x20, x23 + adds x7, x7, x12 + adc x8, x8, xzr + # A[1] * A[2] + mul x12, x21, x22 + umulh x13, x21, x22 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * A[3] + mul x12, x21, x23 + umulh x13, x21, x23 + adds x8, x8, x12 + adc x9, x9, x13 + # A[2] * A[3] + mul x12, x22, x23 + umulh x10, 
x22, x23 + adds x9, x9, x12 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x20, x20 + umulh x15, x20, x20 + # A[1] * A[1] + mul x12, x21, x21 + umulh x13, x21, x21 + adds x5, x5, x15 + adcs x6, x6, x12 + adc x15, x13, xzr + # A[2] * A[2] + mul x12, x22, x22 + umulh x13, x22, x22 + adds x7, x7, x15 + adcs x8, x8, x12 + adc x15, x13, xzr + # A[3] * A[3] + mul x12, x23, x23 + umulh x13, x23, x23 + adds x9, x9, x15 + adcs x10, x10, x12 + adc x11, x11, x13 + # Double and Reduce + mov x12, #0x169 + # Move top half into t4-t7 and remove top bit from t3 + lsr x15, x11, #61 + extr x11, x11, x10, #62 + extr x10, x10, x9, #62 + extr x9, x9, x8, #62 + extr x8, x8, x7, #62 + extr x7, x7, x6, #63 + extr x6, x6, x5, #63 + extr x5, x5, x4, #63 + lsl x4, x4, #1 + and x7, x7, #0x7fffffffffffffff + # Two left, only one right + and x11, x11, #0x7fffffffffffffff + # Multiply top bits by 19*19 + mul x15, x15, x12 + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x4, x4, x15 + adcs x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x0, [x29, #32] + # Sub + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + ldp x8, x9, [x0] + ldp x10, x11, [x0, #16] + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x17, [x29, #88] + ldr x18, [x29, #96] + ldr x19, [x29, #104] + ldr x20, [x29, #112] + ldr x21, [x29, #120] + ldr x22, [x29, #128] + ldr x23, [x29, #136] + ldp x29, x30, [sp], #0x90 + ret +.size fe_ge_dbl,.-fe_ge_dbl +.text +.globl fe_ge_madd +.type fe_ge_madd,@function +.align 4 +fe_ge_madd: + stp x29, x30, [sp, #-176]! 
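+    # Set the frame pointer and save x17-x27 plus the eight pointer
+    # arguments in x0-x7; the remaining arguments are loaded later from
+    # the caller's stack at [x29, #176] and above.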
+ add x29, sp, #0 + str x17, [x29, #88] + str x18, [x29, #96] + str x19, [x29, #104] + str x20, [x29, #112] + str x21, [x29, #120] + str x22, [x29, #128] + str x23, [x29, #136] + str x24, [x29, #144] + str x25, [x29, #152] + str x26, [x29, #160] + str x27, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x1, [x29, #24] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x3] + ldp x10, x11, [x3, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x2, [x29, #32] + ldr x3, [x29, #184] + # Multiply + ldp x20, x21, [x0] + ldp x22, x23, [x0, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, 
x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + ldr x0, [x29, #192] + # Multiply + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x0] + ldp x26, x27, [x0, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #176] + ldr x3, [x29, #72] + # Multiply 
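+    #   (4x4 limb schoolbook product of [x1] and [x3] via mul/umulh,
+    #    accumulated in x4-x11 and reduced mod 2^255-19 below)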
+ ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x0] + ldp x10, x11, [x0, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc 
x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x0, [x29, #64] + # Double + ldp x4, x5, [x0] + ldp x6, x7, [x0, #16] + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x12, #-19 + asr x15, x7, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x12 + sbcs x5, x5, x15 + sbcs x6, x6, x15 + sbc x7, x7, x13 + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + ldr x0, [x29, #40] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x0] + ldp x10, x11, [x0, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x2] + stp x18, x19, [x2, #16] + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x17, [x29, #88] + ldr x18, [x29, #96] + ldr x19, [x29, #104] + ldr x20, [x29, #112] + ldr x21, [x29, #120] + ldr x22, [x29, #128] + ldr x23, [x29, #136] + ldr x24, [x29, #144] + ldr x25, [x29, #152] + ldr x26, [x29, #160] + ldr x27, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret +.size fe_ge_madd,.-fe_ge_madd +.text +.globl fe_ge_msub +.type fe_ge_msub,@function +.align 4 +fe_ge_msub: + stp x29, x30, [sp, #-176]! + add x29, sp, #0 + str x17, [x29, #88] + str x18, [x29, #96] + str x19, [x29, #104] + str x20, [x29, #112] + str x21, [x29, #120] + str x22, [x29, #128] + str x23, [x29, #136] + str x24, [x29, #144] + str x25, [x29, #152] + str x26, [x29, #160] + str x27, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x1, [x29, #24] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x3] + ldp x10, x11, [x3, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x2, [x29, #32] + ldr x3, [x29, #192] + # Multiply + ldp x20, x21, [x0] + ldp x22, x23, [x0, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul 
x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + ldr x0, [x29, #184] + # Multiply + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x0] + ldp x26, x27, [x0, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, 
x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #176] + ldr x3, [x29, #72] + # Multiply + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, 
x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x1, [x29, #24] + ldr x3, [x29, #16] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x1] + stp x18, x19, [x1, #16] + stp x4, x5, [x3] + stp x6, x7, [x3, #16] + ldr x1, [x29, #64] + # Double + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x12, #-19 + asr x15, x7, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x12 + sbcs x5, x5, x15 + sbcs x6, x6, x15 + sbc x7, x7, x13 + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x0] + ldp x10, x11, [x0, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + ldr x17, [x29, #88] + ldr x18, [x29, #96] + ldr x19, [x29, #104] + ldr x20, [x29, #112] + ldr x21, [x29, #120] + ldr x22, [x29, #128] + ldr x23, [x29, #136] + ldr x24, [x29, #144] + ldr x25, [x29, #152] + ldr x26, [x29, #160] + ldr x27, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret +.size fe_ge_msub,.-fe_ge_msub +.text +.globl fe_ge_add +.type fe_ge_add,@function +.align 4 +fe_ge_add: + stp x29, x30, [sp, #-208]! 
+ add x29, sp, #0 + str x17, [x29, #120] + str x18, [x29, #128] + str x19, [x29, #136] + str x20, [x29, #144] + str x21, [x29, #152] + str x22, [x29, #160] + str x23, [x29, #168] + str x24, [x29, #176] + str x25, [x29, #184] + str x26, [x29, #192] + str x27, [x29, #200] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x1, [x29, #24] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x3] + ldp x10, x11, [x3, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x2, [x29, #32] + ldr x3, [x29, #224] + # Multiply + ldp x20, x21, [x0] + ldp x22, x23, [x0, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh 
x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + ldr x2, [x29, #232] + # Multiply + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x2] + ldp x26, x27, [x2, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x1, [x29, #40] + ldr x2, [x29, #216] + ldr x3, [x29, #72] + # 
Multiply + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x1, [x29, #64] + ldr x2, [x29, #208] + # Multiply + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x2] + ldp x26, x27, [x2, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, 
x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + add x1, x29, #80 + # Double + ldp x4, x5, [x0] + ldp x6, x7, [x0, #16] + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x12, #-19 + asr x15, x7, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x12 + sbcs x5, x5, x15 + sbcs x6, x6, x15 + sbc x7, x7, x13 + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x2, [x29, #24] + ldr x3, [x29, #32] + # Add + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x2] + stp x18, x19, [x2, #16] + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #40] + # Add + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + ldp x8, x9, [x0] + ldp x10, x11, [x0, #16] + adds x16, x4, x8 + adcs x17, x5, x9 
+ adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x3] + stp x18, x19, [x3, #16] + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x17, [x29, #120] + ldr x18, [x29, #128] + ldr x19, [x29, #136] + ldr x20, [x29, #144] + ldr x21, [x29, #152] + ldr x22, [x29, #160] + ldr x23, [x29, #168] + ldr x24, [x29, #176] + ldr x25, [x29, #184] + ldr x26, [x29, #192] + ldr x27, [x29, #200] + ldp x29, x30, [sp], #0xd0 + ret +.size fe_ge_add,.-fe_ge_add +.text +.globl fe_ge_sub +.type fe_ge_sub,@function +.align 4 +fe_ge_sub: + stp x29, x30, [sp, #-208]! + add x29, sp, #0 + str x17, [x29, #120] + str x18, [x29, #128] + str x19, [x29, #136] + str x20, [x29, #144] + str x21, [x29, #152] + str x22, [x29, #160] + str x23, [x29, #168] + str x24, [x29, #176] + str x25, [x29, #184] + str x26, [x29, #192] + str x27, [x29, #200] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x1, [x29, #24] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x8, x9, [x3] + ldp x10, x11, [x3, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x2, [x29, #32] + ldr x3, [x29, #232] + # Multiply + ldp x20, x21, [x0] + ldp x22, x23, [x0, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr 
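(The mul/umulh runs in this and the neighbouring hunks are 4x4 schoolbook multiplications of field elements held as four 64-bit limbs, and each "# Reduce" step that follows folds everything at or above bit 255 back into the low half using 2^255 = 19 (mod p) for p = 2^255 - 19. Below is a rough C sketch of one multiply-and-reduce step, assuming a compiler that provides unsigned __int128 as a stand-in for each mul/umulh pair; fe_mul_sketch is a hypothetical name for illustration only, not part of the patch.

#include <stdint.h>

/* Multiply two 4-limb field elements and reduce modulo p = 2^255 - 19. */
static void fe_mul_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t t[8];
    unsigned __int128 acc, carry = 0;

    /* Schoolbook product: column k collects a[i]*b[k-i]; the assembly's
     * mul/umulh give the two 64-bit halves that __int128 holds here. */
    for (int k = 0; k < 7; k++) {
        acc = carry;
        carry = 0;
        for (int i = (k < 4 ? 0 : k - 3); i <= (k < 4 ? k : 3); i++) {
            unsigned __int128 p = (unsigned __int128)a[i] * b[k - i];
            acc   += (uint64_t)p;          /* low 64 bits  (mul)   */
            carry += (uint64_t)(p >> 64);  /* high 64 bits (umulh) */
        }
        carry += acc >> 64;
        t[k] = (uint64_t)acc;
    }
    t[7] = (uint64_t)carry;

    /* Reduce: split the 510-bit product at bit 255, then low + 19*high. */
    uint64_t hi[4];
    hi[0] = (t[4] << 1) | (t[3] >> 63);
    hi[1] = (t[5] << 1) | (t[4] >> 63);
    hi[2] = (t[6] << 1) | (t[5] >> 63);
    hi[3] = (t[7] << 1) | (t[6] >> 63);
    t[3] &= 0x7fffffffffffffffULL;

    unsigned __int128 c = 0;
    for (int i = 0; i < 4; i++) {
        c += (unsigned __int128)19 * hi[i] + t[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
    /* Fold anything at or above bit 255 back in twice, matching the
     * "# Overflow" and "# Reduce if top bit set" steps. */
    for (int j = 0; j < 2; j++) {
        uint64_t top = ((uint64_t)c << 1) | (r[3] >> 63);
        r[3] &= 0x7fffffffffffffffULL;
        c = (unsigned __int128)19 * top;
        for (int i = 0; i < 4; i++) {
            c += r[i];
            r[i] = (uint64_t)c;
            c >>= 64;
        }
    }
}

Like the assembly, this leaves the result only partially reduced, below 2^255 plus a small excess; the canonical reduction is done once at the end, in fe_tobytes.)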
+ # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x2] + stp x6, x7, [x2, #16] + ldr x2, [x29, #224] + # Multiply + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x2] + ldp x26, x27, [x2, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # 
Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x1, [x29, #40] + ldr x2, [x29, #216] + ldr x3, [x29, #72] + # Multiply + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] + ldp x24, x25, [x3] + ldp x26, x27, [x3, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + 
mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x1, [x29, #64] + ldr x2, [x29, #208] + # Multiply + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] + ldp x24, x25, [x2] + ldp x26, x27, [x2, #16] + # A[0] * B[0] + mul x4, x20, x24 + umulh x5, x20, x24 + # A[0] * B[1] + mul x12, x20, x25 + umulh x6, x20, x25 + adds x5, x5, x12 + adc x6, x6, xzr + # A[1] * B[0] + mul x12, x21, x24 + umulh x13, x21, x24 + adds x5, x5, x12 + adcs x6, x6, x13 + adc x7, xzr, xzr + # A[0] * B[2] + mul x12, x20, x26 + umulh x13, x20, x26 + adds x6, x6, x12 + adc x7, x7, x13 + # A[1] * B[1] + mul x12, x21, x25 + umulh x13, x21, x25 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, xzr, xzr + # A[2] * B[0] + mul x12, x22, x24 + umulh x13, x22, x24 + adds x6, x6, x12 + adcs x7, x7, x13 + adc x8, x8, xzr + # A[0] * B[3] + mul x12, x20, x27 + umulh x13, x20, x27 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, xzr, xzr + # A[1] * B[2] + mul x12, x21, x26 + umulh x13, x21, x26 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[2] * B[1] + mul x12, x22, x25 + umulh x13, x22, x25 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[3] * B[0] + mul x12, x23, x24 + umulh x13, x23, x24 + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + # A[1] * B[3] + mul x12, x21, x27 + umulh x13, x21, x27 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, xzr, xzr + # A[2] * B[2] + mul x12, x22, x26 + umulh x13, x22, x26 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[3] * B[1] + mul x12, x23, x25 + umulh x13, x23, x25 + adds x8, x8, x12 + adcs x9, x9, x13 + adc x10, x10, xzr + # A[2] * B[3] + mul x12, x22, x27 + umulh x13, x22, x27 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, xzr, xzr + # A[3] * B[2] + mul x12, x23, x26 + umulh x13, x23, x26 + adds x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, xzr + # A[3] * B[3] + mul x12, x23, x27 + umulh x13, x23, x27 + adds x10, x10, x12 + adc x11, x11, x13 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x12, #19 + mul x13, x12, x8 + umulh x8, x12, x8 + adds x4, x4, x13 + mul x13, x12, x9 + umulh x9, x12, x9 + adcs x5, x5, x13 + mul x13, x12, x10 + umulh x10, x12, x10 + adcs x6, x6, x13 + mul x13, x12, x11 + umulh x14, x12, x11 + adcs x7, x7, x13 + adc x14, x14, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x14, x14, xzr + # Overflow + extr x14, x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + lsr x14, x7, #63 + mul x14, x14, x12 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x14 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + add x1, x29, #80 + # Double + ldp x4, x5, [x0] + ldp x6, x7, [x0, #16] + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x12, #-19 + asr x15, x7, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x12 + sbcs x5, 
x5, x15 + sbcs x6, x6, x15 + sbc x7, x7, x13 + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x2, [x29, #24] + ldr x3, [x29, #32] + # Add + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x2] + stp x18, x19, [x2, #16] + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #40] + # Add + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + ldp x8, x9, [x0] + ldp x10, x11, [x0, #16] + adds x16, x4, x8 + adcs x17, x5, x9 + adcs x18, x6, x10 + adc x19, x7, x11 + mov x12, #-19 + asr x15, x19, #63 + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x16, x16, x12 + sbcs x17, x17, x15 + sbcs x18, x18, x15 + sbc x19, x19, x13 + # Sub + subs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + sbcs x7, x7, x11 + mov x12, #-19 + csetm x15, cc + # Mask the modulus + and x12, x15, x12 + and x13, x15, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x12 + adcs x5, x5, x15 + adcs x6, x6, x15 + adc x7, x7, x13 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + stp x4, x5, [x3] + stp x6, x7, [x3, #16] + ldr x17, [x29, #120] + ldr x18, [x29, #128] + ldr x19, [x29, #136] + ldr x20, [x29, #144] + ldr x21, [x29, #152] + ldr x22, [x29, #160] + ldr x23, [x29, #168] + ldr x24, [x29, #176] + ldr x25, [x29, #184] + ldr x26, [x29, #192] + ldr x27, [x29, #200] + ldp x29, x30, [sp], #0xd0 + ret +.size fe_ge_sub,.-fe_ge_sub diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c new file mode 100644 index 000000000..6ac00546e --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-curve25519.c @@ -0,0 +1,7191 @@ +/* armv8-curve25519 + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include +#include +#include + +void fe_init() +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "\n\t" + "ldp x29, x30, [sp], #16\n\t" + : + : + : "memory" + ); +} + +void fe_frombytes(fe out, const unsigned char* in) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "ldp x2, x3, [x1]\n\t" + "ldp x4, x5, [x1, #16]\n\t" + "and x5, x5, #0x7fffffffffffffff\n\t" + "stp x2, x3, [x0]\n\t" + "stp x4, x5, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [out] "+r" (out), [in] "+r" (in) + : + : "memory", "x2", "x3", "x4", "x5", "x6" + ); +} + +void fe_tobytes(unsigned char* out, const fe n) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "mov x7, #19\n\t" + "ldp x2, x3, [x1]\n\t" + "ldp x4, x5, [x1, #16]\n\t" + "adds x6, x2, x7\n\t" + "adcs x6, x3, xzr\n\t" + "adcs x6, x4, xzr\n\t" + "adc x6, x5, xzr\n\t" + "lsr x6, x6, #63\n\t" + "mul x6, x6, x7\n\t" + "adds x2, x2, x6\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adc x5, x5, xzr\n\t" + "and x5, x5, #0x7fffffffffffffff\n\t" + "stp x2, x3, [x0]\n\t" + "stp x4, x5, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [out] "+r" (out), [n] "+r" (n) + : + : "memory", "x2", "x3", "x4", "x5", "x6", "x7" + ); +} + +void fe_1(fe n) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Set one */ + "mov x1, #1\n\t" + "stp x1, xzr, [x0]\n\t" + "stp xzr, xzr, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [n] "+r" (n) + : + : "memory", "x1" + ); +} + +void fe_0(fe n) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Set zero */ + "stp xzr, xzr, [x0]\n\t" + "stp xzr, xzr, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [n] "+r" (n) + : + : "memory" + ); +} + +void fe_copy(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Copy */ + "ldp x2, x3, [x1]\n\t" + "ldp x4, x5, [x1, #16]\n\t" + "stp x2, x3, [x0]\n\t" + "stp x4, x5, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x2", "x3", "x4", "x5" + ); +} + +void fe_cswap(fe a, fe b, int c) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Conditional Swap */ + "cmp %[c], #1\n\t" + "ldp x3, x4, [x0]\n\t" + "ldp x5, x6, [x0, #16]\n\t" + "ldp x7, x8, [x1]\n\t" + "ldp x9, x10, [x1, #16]\n\t" + "csel x11, x3, x7, eq\n\t" + "csel x3, x7, x3, eq\n\t" + "csel x12, x4, x8, eq\n\t" + "csel x4, x8, x4, eq\n\t" + "csel x13, x5, x9, eq\n\t" + "csel x5, x9, x5, eq\n\t" + "csel x14, x6, x10, eq\n\t" + "csel x6, x10, x6, eq\n\t" + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "stp x11, x12, [x1]\n\t" + "stp x13, x14, [x1, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14" + ); +} + +void fe_sub(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Sub */ + "ldp x3, x4, [x1]\n\t" + "ldp x5, x6, [x1, #16]\n\t" + "ldp x7, x8, [x2]\n\t" + "ldp x9, x10, [x2, #16]\n\t" + "subs x3, x3, x7\n\t" + "sbcs x4, x4, 
x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "mov x12, #-19\n\t" + "csetm x11, cc\n\t" + /* Mask the modulus */ + "and x12, x11, x12\n\t" + "and x13, x11, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x3, x3, x12\n\t" + "adcs x4, x4, x11\n\t" + "adcs x5, x5, x11\n\t" + "adc x6, x6, x13\n\t" + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13" + ); +} + +void fe_add(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Add */ + "ldp x3, x4, [x1]\n\t" + "ldp x5, x6, [x1, #16]\n\t" + "ldp x7, x8, [x2]\n\t" + "ldp x9, x10, [x2, #16]\n\t" + "adds x3, x3, x7\n\t" + "adcs x4, x4, x8\n\t" + "adcs x5, x5, x9\n\t" + "adc x6, x6, x10\n\t" + "mov x12, #-19\n\t" + "asr x11, x6, #63\n\t" + /* Mask the modulus */ + "and x12, x11, x12\n\t" + "and x13, x11, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x3, x3, x12\n\t" + "sbcs x4, x4, x11\n\t" + "sbcs x5, x5, x11\n\t" + "sbc x6, x6, x13\n\t" + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13" + ); +} + +void fe_neg(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "ldp x2, x3, [x1]\n\t" + "ldp x4, x5, [x1, #16]\n\t" + "mov x6, #-19\n\t" + "mov x7, #-1\n\t" + "mov x8, #-1\n\t" + "mov x9, #0x7fffffffffffffff\n\t" + "subs x6, x6, x2\n\t" + "sbcs x7, x7, x3\n\t" + "sbcs x8, x8, x4\n\t" + "sbc x9, x9, x5\n\t" + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + ); +} + +void fe_cmov(fe a, const fe b, int c) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "ldp x4, x5, [x0]\n\t" + "ldp x6, x7, [x0, #16]\n\t" + "ldp x8, x9, [x1]\n\t" + "ldp x10, x11, [x1, #16]\n\t" + "cmp %[c], #1\n\t" + "csel x4, x4, x8, eq\n\t" + "csel x5, x5, x9, eq\n\t" + "csel x6, x6, x10, eq\n\t" + "csel x7, x7, x11, eq\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" + ); +} + +int fe_isnonzero(const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "mov x6, #19\n\t" + "ldp x1, x2, [x0]\n\t" + "ldp x3, x4, [x0, #16]\n\t" + "adds x5, x1, x6\n\t" + "adcs x5, x2, xzr\n\t" + "adcs x5, x3, xzr\n\t" + "adc x5, x4, xzr\n\t" + "lsr x5, x5, #63\n\t" + "mul x5, x5, x6\n\t" + "adds x1, x1, x5\n\t" + "adcs x2, x2, xzr\n\t" + "adcs x3, x3, xzr\n\t" + "adc x4, x4, xzr\n\t" + "and x4, x4, #0x7fffffffffffffff\n\t" + "orr %[a], x1, x2\n\t" + "orr x3, x3, x4\n\t" + "orr %[a], %[a], x3\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [a] "+r" (a) + : + : "memory", "x1", "x2", "x3", "x4", "x5", "x6" + ); + return (uint32_t)(size_t)a; +} + +int fe_isnegative(const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "mov x6, #19\n\t" + "ldp x1, x2, [x0]\n\t" + "ldp x3, x4, [x0, #16]\n\t" + "adds x5, x1, x6\n\t" + "adcs x5, x2, xzr\n\t" + "adcs x5, x3, xzr\n\t" + "adc x5, x4, xzr\n\t" + "lsr x5, x5, 
#63\n\t" + "mul x5, x5, x6\n\t" + "ldr x1, [x0]\n\t" + "adds x1, x1, x5\n\t" + "and %[a], x1, #1\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [a] "+r" (a) + : + : "memory", "x1", "x2", "x3", "x4", "x5", "x6" + ); + return (uint32_t)(size_t)a; +} + +void fe_cmov_table(fe* r, fe* base, signed char b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "sxtb %[b], w2\n\t" + "sbfx x15, %[b], #7, #1\n\t" + "sxtb x16, w2\n\t" + "eor x16, x16, x15\n\t" + "sub x16, x16, x15\n\t" + "mov x3, #1\n\t" + "mov x4, xzr\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, #1\n\t" + "mov x8, xzr\n\t" + "mov x9, xzr\n\t" + "mov x10, xzr\n\t" + "mov x11, xzr\n\t" + "mov x12, xzr\n\t" + "mov x13, xzr\n\t" + "mov x14, xzr\n\t" + "cmp x16, #1\n\t" + "ldp x17, x18, [x1]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x1, #32]\n\t" + "ldp x23, x24, [x1, #48]\n\t" + "ldp x25, x26, [x1, #64]\n\t" + "ldp x27, x28, [x1, #80]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + "csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "cmp x16, #2\n\t" + "ldp x17, x18, [x1, #96]\n\t" + "ldp x19, x20, [x1, #112]\n\t" + "ldp x21, x22, [x1, #128]\n\t" + "ldp x23, x24, [x1, #144]\n\t" + "ldp x25, x26, [x1, #160]\n\t" + "ldp x27, x28, [x1, #176]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + "csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "cmp x16, #3\n\t" + "ldp x17, x18, [x1, #192]\n\t" + "ldp x19, x20, [x1, #208]\n\t" + "ldp x21, x22, [x1, #224]\n\t" + "ldp x23, x24, [x1, #240]\n\t" + "ldp x25, x26, [x1, #256]\n\t" + "ldp x27, x28, [x1, #272]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + "csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "cmp x16, #4\n\t" + "ldp x17, x18, [x1, #288]\n\t" + "ldp x19, x20, [x1, #304]\n\t" + "ldp x21, x22, [x1, #320]\n\t" + "ldp x23, x24, [x1, #336]\n\t" + "ldp x25, x26, [x1, #352]\n\t" + "ldp x27, x28, [x1, #368]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + "csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "add %[base], %[base], #0x180\n\t" + "cmp x16, #5\n\t" + "ldp x17, x18, [x1]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x1, #32]\n\t" + "ldp x23, x24, [x1, #48]\n\t" + "ldp x25, x26, [x1, #64]\n\t" + "ldp x27, x28, [x1, #80]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + 
"csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "cmp x16, #6\n\t" + "ldp x17, x18, [x1, #96]\n\t" + "ldp x19, x20, [x1, #112]\n\t" + "ldp x21, x22, [x1, #128]\n\t" + "ldp x23, x24, [x1, #144]\n\t" + "ldp x25, x26, [x1, #160]\n\t" + "ldp x27, x28, [x1, #176]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + "csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "cmp x16, #7\n\t" + "ldp x17, x18, [x1, #192]\n\t" + "ldp x19, x20, [x1, #208]\n\t" + "ldp x21, x22, [x1, #224]\n\t" + "ldp x23, x24, [x1, #240]\n\t" + "ldp x25, x26, [x1, #256]\n\t" + "ldp x27, x28, [x1, #272]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + "csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "cmp x16, #8\n\t" + "ldp x17, x18, [x1, #288]\n\t" + "ldp x19, x20, [x1, #304]\n\t" + "ldp x21, x22, [x1, #320]\n\t" + "ldp x23, x24, [x1, #336]\n\t" + "ldp x25, x26, [x1, #352]\n\t" + "ldp x27, x28, [x1, #368]\n\t" + "csel x3, x17, x3, eq\n\t" + "csel x4, x18, x4, eq\n\t" + "csel x5, x19, x5, eq\n\t" + "csel x6, x20, x6, eq\n\t" + "csel x7, x21, x7, eq\n\t" + "csel x8, x22, x8, eq\n\t" + "csel x9, x23, x9, eq\n\t" + "csel x10, x24, x10, eq\n\t" + "csel x11, x25, x11, eq\n\t" + "csel x12, x26, x12, eq\n\t" + "csel x13, x27, x13, eq\n\t" + "csel x14, x28, x14, eq\n\t" + "add %[base], %[base], #0x180\n\t" + "sub %[base], %[base], #0x180\n\t" + "mov x17, #-19\n\t" + "mov x18, #-1\n\t" + "mov x19, #-1\n\t" + "mov x20, #0x7fffffffffffffff\n\t" + "subs x17, x17, x11\n\t" + "sbcs x18, x18, x12\n\t" + "sbcs x19, x19, x13\n\t" + "sbc x20, x20, x14\n\t" + "cmp %[b], #0\n\t" + "mov x15, x3\n\t" + "csel x3, x7, x3, lt\n\t" + "csel x7, x15, x7, lt\n\t" + "mov x15, x4\n\t" + "csel x4, x8, x4, lt\n\t" + "csel x8, x15, x8, lt\n\t" + "mov x15, x5\n\t" + "csel x5, x9, x5, lt\n\t" + "csel x9, x15, x9, lt\n\t" + "mov x15, x6\n\t" + "csel x6, x10, x6, lt\n\t" + "csel x10, x15, x10, lt\n\t" + "csel x11, x17, x11, lt\n\t" + "csel x12, x18, x12, lt\n\t" + "csel x13, x19, x13, lt\n\t" + "csel x14, x20, x14, lt\n\t" + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "stp x7, x8, [x0, #32]\n\t" + "stp x9, x10, [x0, #48]\n\t" + "stp x11, x12, [x0, #64]\n\t" + "stp x13, x14, [x0, #80]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +void fe_mul(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Multiply */ + "ldp x15, x16, [x1]\n\t" + "ldp x17, x18, [x1, #16]\n\t" + "ldp x19, x20, [x2]\n\t" + "ldp x21, x22, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x6, x15, x19\n\t" + "umulh x7, x15, x19\n\t" + /* A[0] * B[1] */ + "mul x3, x15, x20\n\t" + "umulh x8, x15, x20\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x16, x19\n\t" + "umulh x4, x16, x19\n\t" + 
"adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x15, x21\n\t" + "umulh x4, x15, x21\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x16, x20\n\t" + "umulh x4, x16, x20\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x17, x19\n\t" + "umulh x4, x17, x19\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x15, x22\n\t" + "umulh x4, x15, x22\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x16, x21\n\t" + "umulh x4, x16, x21\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x17, x20\n\t" + "umulh x4, x17, x20\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x18, x19\n\t" + "umulh x4, x18, x19\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x16, x22\n\t" + "umulh x4, x16, x22\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x17, x21\n\t" + "umulh x4, x17, x21\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x18, x20\n\t" + "umulh x4, x18, x20\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x17, x22\n\t" + "umulh x4, x17, x22\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x18, x21\n\t" + "umulh x4, x18, x21\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x18, x22\n\t" + "umulh x4, x18, x22\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + ); +} + +void fe_sq(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* 
Square */ + "ldp x14, x15, [x1]\n\t" + "ldp x16, x17, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x3, x14, x15\n\t" + "umulh x4, x14, x15\n\t" + /* A[0] * A[2] */ + "mul x11, x14, x16\n\t" + "umulh x5, x14, x16\n\t" + "adds x4, x4, x11\n\t" + "adc x5, x5, xzr\n\t" + /* A[0] * A[3] */ + "mul x11, x14, x17\n\t" + "umulh x6, x14, x17\n\t" + "adds x5, x5, x11\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * A[2] */ + "mul x11, x15, x16\n\t" + "umulh x12, x15, x16\n\t" + "adds x5, x5, x11\n\t" + "adcs x6, x6, x12\n\t" + "adc x7, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x11, x15, x17\n\t" + "umulh x12, x15, x17\n\t" + "adds x6, x6, x11\n\t" + "adc x7, x7, x12\n\t" + /* A[2] * A[3] */ + "mul x11, x16, x17\n\t" + "umulh x8, x16, x17\n\t" + "adds x7, x7, x11\n\t" + "adc x8, x8, xzr\n\t" + /* Double */ + "adds x3, x3, x3\n\t" + "adcs x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x2, x14, x14\n\t" + "umulh x10, x14, x14\n\t" + /* A[1] * A[1] */ + "mul x11, x15, x15\n\t" + "umulh x12, x15, x15\n\t" + "adds x3, x3, x10\n\t" + "adcs x4, x4, x11\n\t" + "adc x10, x12, xzr\n\t" + /* A[2] * A[2] */ + "mul x11, x16, x16\n\t" + "umulh x12, x16, x16\n\t" + "adds x5, x5, x10\n\t" + "adcs x6, x6, x11\n\t" + "adc x10, x12, xzr\n\t" + /* A[3] * A[3] */ + "mul x11, x17, x17\n\t" + "umulh x12, x17, x17\n\t" + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adc x9, x9, x12\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "extr x6, x6, x5, #63\n\t" + "and x5, x5, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x11, #19\n\t" + "mul x12, x11, x6\n\t" + "umulh x6, x11, x6\n\t" + "adds x2, x2, x12\n\t" + "mul x12, x11, x7\n\t" + "umulh x7, x11, x7\n\t" + "adcs x3, x3, x12\n\t" + "mul x12, x11, x8\n\t" + "umulh x8, x11, x8\n\t" + "adcs x4, x4, x12\n\t" + "mul x12, x11, x9\n\t" + "umulh x13, x11, x9\n\t" + "adcs x5, x5, x12\n\t" + "adc x13, x13, xzr\n\t" + /* Add remaining product results in */ + "adds x3, x3, x6\n\t" + "adcs x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adc x13, x13, xzr\n\t" + /* Overflow */ + "extr x13, x13, x5, #63\n\t" + "mul x13, x13, x11\n\t" + "and x5, x5, #0x7fffffffffffffff\n\t" + "adds x2, x2, x13\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adc x5, x5, xzr\n\t" + /* Reduce if top bit set */ + "lsr x13, x5, #63\n\t" + "mul x13, x13, x11\n\t" + "and x5, x5, #0x7fffffffffffffff\n\t" + "adds x2, x2, x13\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adc x5, x5, xzr\n\t" + /* Store */ + "stp x2, x3, [x0]\n\t" + "stp x4, x5, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x11", "x12", "x13", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17" + ); +} + +void fe_mul121666(fe r, fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Multiply by 121666 */ + "ldp x2, x3, [x1]\n\t" + "ldp x4, x5, [x1, #16]\n\t" + "mov x13, #0xdb42\n\t" + "movk x13, #1, lsl 16\n\t" + "mul x6, x2, x13\n\t" + "umulh x7, x2, x13\n\t" + "mul x11, x3, x13\n\t" + "umulh x12, x3, x13\n\t" + "adds x7, x7, x11\n\t" + "adc x8, xzr, x12\n\t" + "mul x11, x4, x13\n\t" + "umulh x12, x4, x13\n\t" + "adds x8, x8, x11\n\t" + "adc x9, xzr, x12\n\t" + "mul x11, x5, x13\n\t" + "umulh x12, x5, x13\n\t" + "adds x9, x9, x11\n\t" + "adc x12, xzr, x12\n\t" + "mov 
x13, #19\n\t" + "extr x12, x12, x9, #63\n\t" + "mul x12, x12, x13\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x11", "x12", "x13", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); +} + +void fe_sq2(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Square * 2 */ + "ldp x2, x3, [x1]\n\t" + "ldp x4, x5, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x7, x2, x3\n\t" + "umulh x8, x2, x3\n\t" + /* A[0] * A[2] */ + "mul x11, x2, x4\n\t" + "umulh x9, x2, x4\n\t" + "adds x8, x8, x11\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x11, x2, x5\n\t" + "umulh x10, x2, x5\n\t" + "adds x9, x9, x11\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * A[2] */ + "mul x11, x3, x4\n\t" + "umulh x12, x3, x4\n\t" + "adds x9, x9, x11\n\t" + "adcs x10, x10, x12\n\t" + "adc x14, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x11, x3, x5\n\t" + "umulh x12, x3, x5\n\t" + "adds x10, x10, x11\n\t" + "adc x14, x14, x12\n\t" + /* A[2] * A[3] */ + "mul x11, x4, x5\n\t" + "umulh x15, x4, x5\n\t" + "adds x14, x14, x11\n\t" + "adc x15, x15, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adc x16, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x2, x2\n\t" + "umulh x17, x2, x2\n\t" + /* A[1] * A[1] */ + "mul x11, x3, x3\n\t" + "umulh x12, x3, x3\n\t" + "adds x7, x7, x17\n\t" + "adcs x8, x8, x11\n\t" + "adc x17, x12, xzr\n\t" + /* A[2] * A[2] */ + "mul x11, x4, x4\n\t" + "umulh x12, x4, x4\n\t" + "adds x9, x9, x17\n\t" + "adcs x10, x10, x11\n\t" + "adc x17, x12, xzr\n\t" + /* A[3] * A[3] */ + "mul x11, x5, x5\n\t" + "umulh x12, x5, x5\n\t" + "adds x14, x14, x17\n\t" + "adcs x15, x15, x11\n\t" + "adc x16, x16, x12\n\t" + /* Double and Reduce */ + "mov x11, #0x169\n\t" + /* Move top half into t4-t7 and remove top bit from t3 */ + "lsr x17, x16, #61\n\t" + "extr x16, x16, x15, #62\n\t" + "extr x15, x15, x14, #62\n\t" + "extr x14, x14, x10, #62\n\t" + "extr x10, x10, x9, #62\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "lsl x6, x6, #1\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Two left, only one right */ + "and x16, x16, #0x7fffffffffffffff\n\t" + /* Multiply top bits by 19*19 */ + "mul x17, x17, x11\n\t" + /* Multiply top half by 19 */ + "mov x11, #19\n\t" + "mul x12, x11, x10\n\t" + "umulh x10, x11, x10\n\t" + "adds x6, x6, x12\n\t" + "mul x12, x11, x14\n\t" + "umulh x14, x11, x14\n\t" + "adcs x7, x7, x12\n\t" + "mul x12, x11, x15\n\t" + "umulh x15, x11, x15\n\t" + "adcs x8, x8, x12\n\t" + "mul x12, x11, x16\n\t" + "umulh x13, x11, x16\n\t" + "adcs x9, x9, x12\n\t" + "adc x13, x13, xzr\n\t" + /* Add remaining product results in */ + "adds x6, x6, x17\n\t" + "adcs x7, x7, x10\n\t" + "adcs x8, x8, x14\n\t" + "adcs x9, x9, x15\n\t" + "adc x13, x13, xzr\n\t" + /* Overflow */ + "extr x13, x13, x9, #63\n\t" + "mul x13, x13, x11\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x13\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x13, x9, #63\n\t" + "mul x13, x13, x11\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x13\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + 
"adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x11", "x12", "x13", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x18" + ); +} + +void fe_invert(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-160]!\n\t" + "add x29, sp, #0\n\t" + /* Invert */ + "str %[r], [x29, #144]\n\t" + "str %[a], [x29, #152]\n\t" + "add x0, x29, #16\n\t" + "bl fe_sq\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "ldr x1, [x29, #152]\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #16\n\t" + "add x1, x29, #16\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #80\n\t" + "bl fe_sq\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "add x0, x29, #80\n\t" + "bl fe_sq\n\t" + "mov x20, #4\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_fe_invert1:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert1\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #80\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "mov x20, #9\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_fe_invert2:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert2\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #112\n\t" + "bl fe_sq\n\t" + "mov x20, #19\n\t" + "add x1, x29, #112\n\t" + "\n" + "L_fe_invert3:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert3\n\t" + "add x0, x29, #80\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "mov x20, #10\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_fe_invert4:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert4\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #80\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "mov x20, #49\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_fe_invert5:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert5\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #112\n\t" + "bl fe_sq\n\t" + "mov x20, #0x63\n\t" + "add x1, x29, #112\n\t" + "\n" + "L_fe_invert6:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert6\n\t" + "add x0, x29, #80\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "mov x20, #50\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_fe_invert7:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert7\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "mov x20, #5\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_invert8:\n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert8\n\t" + "ldr x0, [x29, #144]\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "ldr %[a], [x29, #152]\n\t" + "ldr %[r], [x29, #144]\n\t" + "ldp x29, x30, [sp], #0xa0\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x20" + ); +} + +int curve25519(byte* r, byte* n, byte* a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-192]!\n\t" + "add x29, sp, #0\n\t" + "mov x22, xzr\n\t" + "str %[r], [x29, #176]\n\t" + /* Set one */ + "mov x23, #1\n\t" + "stp x23, xzr, [x0]\n\t" + "stp xzr, xzr, [x0, #16]\n\t" + /* Set zero */ + "stp xzr, xzr, [x29, #16]\n\t" + "stp xzr, xzr, [x29, #32]\n\t" + /* Set one */ + "mov x23, #1\n\t" + "stp 
x23, xzr, [x29, #48]\n\t" + "stp xzr, xzr, [x29, #64]\n\t" + /* Copy */ + "ldp x6, x7, [x2]\n\t" + "ldp x8, x9, [x2, #16]\n\t" + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + "mov x25, #62\n\t" + "mov x24, #24\n\t" + "\n" + "L_curve25519_words:\n\t" + "\n" + "L_curve25519_bits:\n\t" + "ldr x23, [x1, x24]\n\t" + "lsr x23, x23, x25\n\t" + "and x23, x23, #1\n\t" + "eor x22, x22, x23\n\t" + /* Conditional Swap */ + "cmp x22, #1\n\t" + "ldp x6, x7, [x0]\n\t" + "ldp x8, x9, [x0, #16]\n\t" + "ldp x10, x11, [x29, #80]\n\t" + "ldp x12, x13, [x29, #96]\n\t" + "csel x14, x6, x10, eq\n\t" + "csel x6, x10, x6, eq\n\t" + "csel x15, x7, x11, eq\n\t" + "csel x7, x11, x7, eq\n\t" + "csel x16, x8, x12, eq\n\t" + "csel x8, x12, x8, eq\n\t" + "csel x17, x9, x13, eq\n\t" + "csel x9, x13, x9, eq\n\t" + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + "stp x14, x15, [x29, #80]\n\t" + "stp x16, x17, [x29, #96]\n\t" + /* Conditional Swap */ + "cmp x22, #1\n\t" + "ldp x6, x7, [x29, #16]\n\t" + "ldp x8, x9, [x29, #32]\n\t" + "ldp x10, x11, [x29, #48]\n\t" + "ldp x12, x13, [x29, #64]\n\t" + "csel x14, x6, x10, eq\n\t" + "csel x6, x10, x6, eq\n\t" + "csel x15, x7, x11, eq\n\t" + "csel x7, x11, x7, eq\n\t" + "csel x16, x8, x12, eq\n\t" + "csel x8, x12, x8, eq\n\t" + "csel x17, x9, x13, eq\n\t" + "csel x9, x13, x9, eq\n\t" + "stp x6, x7, [x29, #16]\n\t" + "stp x8, x9, [x29, #32]\n\t" + "stp x14, x15, [x29, #48]\n\t" + "stp x16, x17, [x29, #64]\n\t" + "mov x22, x23\n\t" + /* Add */ + "ldp x6, x7, [x0]\n\t" + "ldp x8, x9, [x0, #16]\n\t" + "ldp x10, x11, [x29, #16]\n\t" + "ldp x12, x13, [x29, #32]\n\t" + "adds x14, x6, x10\n\t" + "adcs x15, x7, x11\n\t" + "adcs x16, x8, x12\n\t" + "adc x17, x9, x13\n\t" + "mov x3, #-19\n\t" + "asr x23, x17, #63\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x14, x14, x3\n\t" + "sbcs x15, x15, x23\n\t" + "sbcs x16, x16, x23\n\t" + "sbc x17, x17, x4\n\t" + /* Sub */ + "subs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "sbcs x8, x8, x12\n\t" + "sbcs x9, x9, x13\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x6, x6, x3\n\t" + "adcs x7, x7, x23\n\t" + "adcs x8, x8, x23\n\t" + "adc x9, x9, x4\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "stp x6, x7, [x29, #144]\n\t" + "stp x8, x9, [x29, #160]\n\t" + /* Add */ + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" + "ldp x10, x11, [x29, #48]\n\t" + "ldp x12, x13, [x29, #64]\n\t" + "adds x14, x6, x10\n\t" + "adcs x15, x7, x11\n\t" + "adcs x16, x8, x12\n\t" + "adc x17, x9, x13\n\t" + "mov x3, #-19\n\t" + "asr x23, x17, #63\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x14, x14, x3\n\t" + "sbcs x15, x15, x23\n\t" + "sbcs x16, x16, x23\n\t" + "sbc x17, x17, x4\n\t" + /* Sub */ + "subs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "sbcs x8, x8, x12\n\t" + "sbcs x9, x9, x13\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x6, x6, x3\n\t" + "adcs x7, x7, x23\n\t" + "adcs x8, x8, x23\n\t" + "adc x9, x9, x4\n\t" + "stp x14, x15, [x29, #16]\n\t" + "stp x16, x17, [x29, #32]\n\t" + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + /* Multiply */ + "ldp x18, 
x19, [x29, #112]\n\t" + "ldp x20, x21, [x29, #128]\n\t" + "ldp x14, x15, [x0]\n\t" + "ldp x16, x17, [x0, #16]\n\t" + /* A[0] * B[0] */ + "mul x6, x18, x14\n\t" + "umulh x7, x18, x14\n\t" + /* A[0] * B[1] */ + "mul x3, x18, x15\n\t" + "umulh x8, x18, x15\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x19, x14\n\t" + "umulh x4, x19, x14\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x18, x16\n\t" + "umulh x4, x18, x16\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x19, x15\n\t" + "umulh x4, x19, x15\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x20, x14\n\t" + "umulh x4, x20, x14\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x18, x17\n\t" + "umulh x4, x18, x17\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x19, x16\n\t" + "umulh x4, x19, x16\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x20, x15\n\t" + "umulh x4, x20, x15\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x21, x14\n\t" + "umulh x4, x21, x14\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x19, x17\n\t" + "umulh x4, x19, x17\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x20, x16\n\t" + "umulh x4, x20, x16\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x21, x15\n\t" + "umulh x4, x21, x15\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x20, x17\n\t" + "umulh x4, x20, x17\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x21, x16\n\t" + "umulh x4, x21, x16\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x21, x17\n\t" + "umulh x4, x21, x17\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #48]\n\t" + 
"stp x8, x9, [x29, #64]\n\t" + /* Multiply */ + "ldp x18, x19, [x29, #16]\n\t" + "ldp x20, x21, [x29, #32]\n\t" + "ldp x14, x15, [x29, #144]\n\t" + "ldp x16, x17, [x29, #160]\n\t" + /* A[0] * B[0] */ + "mul x6, x18, x14\n\t" + "umulh x7, x18, x14\n\t" + /* A[0] * B[1] */ + "mul x3, x18, x15\n\t" + "umulh x8, x18, x15\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x19, x14\n\t" + "umulh x4, x19, x14\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x18, x16\n\t" + "umulh x4, x18, x16\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x19, x15\n\t" + "umulh x4, x19, x15\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x20, x14\n\t" + "umulh x4, x20, x14\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x18, x17\n\t" + "umulh x4, x18, x17\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x19, x16\n\t" + "umulh x4, x19, x16\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x20, x15\n\t" + "umulh x4, x20, x15\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x21, x14\n\t" + "umulh x4, x21, x14\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x19, x17\n\t" + "umulh x4, x19, x17\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x20, x16\n\t" + "umulh x4, x20, x16\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x21, x15\n\t" + "umulh x4, x21, x15\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x20, x17\n\t" + "umulh x4, x20, x17\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x21, x16\n\t" + "umulh x4, x21, x16\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x21, x17\n\t" + "umulh x4, x21, x17\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc 
x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #16]\n\t" + "stp x8, x9, [x29, #32]\n\t" + /* Square */ + "ldp x18, x19, [x29, #144]\n\t" + "ldp x20, x21, [x29, #160]\n\t" + /* A[0] * A[1] */ + "mul x7, x18, x19\n\t" + "umulh x8, x18, x19\n\t" + /* A[0] * A[2] */ + "mul x3, x18, x20\n\t" + "umulh x9, x18, x20\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x18, x21\n\t" + "umulh x10, x18, x21\n\t" + "adds x9, x9, x3\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x19, x20\n\t" + "umulh x4, x19, x20\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x19, x21\n\t" + "umulh x4, x19, x21\n\t" + "adds x10, x10, x3\n\t" + "adc x11, x11, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x20, x21\n\t" + "umulh x12, x20, x21\n\t" + "adds x11, x11, x3\n\t" + "adc x12, x12, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adc x13, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x18, x18\n\t" + "umulh x23, x18, x18\n\t" + /* A[1] * A[1] */ + "mul x3, x19, x19\n\t" + "umulh x4, x19, x19\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x20, x20\n\t" + "umulh x4, x20, x20\n\t" + "adds x9, x9, x23\n\t" + "adcs x10, x10, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x21, x21\n\t" + "umulh x4, x21, x21\n\t" + "adds x11, x11, x23\n\t" + "adcs x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + /* Square */ + "ldp x18, x19, [x0]\n\t" + "ldp x20, x21, [x0, #16]\n\t" + /* A[0] * A[1] */ + "mul x7, x18, x19\n\t" + "umulh x8, x18, x19\n\t" + /* A[0] * A[2] */ + "mul x3, x18, x20\n\t" + "umulh x9, x18, x20\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x18, x21\n\t" + "umulh x10, x18, x21\n\t" + "adds x9, x9, x3\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x19, x20\n\t" + "umulh x4, x19, x20\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x19, x21\n\t" + "umulh x4, x19, x21\n\t" + "adds x10, x10, x3\n\t" + "adc x11, x11, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x20, 
x21\n\t" + "umulh x12, x20, x21\n\t" + "adds x11, x11, x3\n\t" + "adc x12, x12, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adc x13, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x18, x18\n\t" + "umulh x23, x18, x18\n\t" + /* A[1] * A[1] */ + "mul x3, x19, x19\n\t" + "umulh x4, x19, x19\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x20, x20\n\t" + "umulh x4, x20, x20\n\t" + "adds x9, x9, x23\n\t" + "adcs x10, x10, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x21, x21\n\t" + "umulh x4, x21, x21\n\t" + "adds x11, x11, x23\n\t" + "adcs x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #144]\n\t" + "stp x8, x9, [x29, #160]\n\t" + /* Add */ + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" + "ldp x10, x11, [x29, #16]\n\t" + "ldp x12, x13, [x29, #32]\n\t" + "adds x14, x6, x10\n\t" + "adcs x15, x7, x11\n\t" + "adcs x16, x8, x12\n\t" + "adc x17, x9, x13\n\t" + "mov x3, #-19\n\t" + "asr x23, x17, #63\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x14, x14, x3\n\t" + "sbcs x15, x15, x23\n\t" + "sbcs x16, x16, x23\n\t" + "sbc x17, x17, x4\n\t" + /* Sub */ + "subs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "sbcs x8, x8, x12\n\t" + "sbcs x9, x9, x13\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x6, x6, x3\n\t" + "adcs x7, x7, x23\n\t" + "adcs x8, x8, x23\n\t" + "adc x9, x9, x4\n\t" + "stp x14, x15, [x29, #80]\n\t" + "stp x16, x17, [x29, #96]\n\t" + "stp x6, x7, [x29, #16]\n\t" + "stp x8, x9, [x29, #32]\n\t" + /* Multiply */ + "ldp x18, x19, [x29, #144]\n\t" + "ldp x20, x21, [x29, #160]\n\t" + "ldp x14, x15, [x29, #112]\n\t" + "ldp x16, x17, [x29, #128]\n\t" + /* A[0] * B[0] */ + "mul x6, x18, x14\n\t" + "umulh x7, x18, x14\n\t" + /* A[0] * B[1] */ + "mul x3, x18, x15\n\t" + "umulh x8, x18, x15\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x19, x14\n\t" + "umulh x4, x19, x14\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc 
x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x18, x16\n\t" + "umulh x4, x18, x16\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x19, x15\n\t" + "umulh x4, x19, x15\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x20, x14\n\t" + "umulh x4, x20, x14\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x18, x17\n\t" + "umulh x4, x18, x17\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x19, x16\n\t" + "umulh x4, x19, x16\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x20, x15\n\t" + "umulh x4, x20, x15\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x21, x14\n\t" + "umulh x4, x21, x14\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x19, x17\n\t" + "umulh x4, x19, x17\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x20, x16\n\t" + "umulh x4, x20, x16\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x21, x15\n\t" + "umulh x4, x21, x15\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x20, x17\n\t" + "umulh x4, x20, x17\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x21, x16\n\t" + "umulh x4, x21, x16\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x21, x17\n\t" + "umulh x4, x21, x17\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + /* Sub */ + "ldp x6, x7, [x29, #144]\n\t" + "ldp x8, x9, [x29, #160]\n\t" + "ldp x10, x11, [x29, #112]\n\t" + "ldp x12, x13, [x29, #128]\n\t" + "subs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "sbcs x8, x8, x12\n\t" + "sbcs x9, x9, x13\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ 
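+ /* x23 is all-ones exactly when the subtraction above borrowed (csetm on carry-clear), so the masked limbs (-19, -1, -1, 0x7fffffffffffffff) add the modulus p = 2^255 - 19 back in only on underflow, leaving the difference reduced mod p. */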
+ "adds x6, x6, x3\n\t" + "adcs x7, x7, x23\n\t" + "adcs x8, x8, x23\n\t" + "adc x9, x9, x4\n\t" + "stp x6, x7, [x29, #144]\n\t" + "stp x8, x9, [x29, #160]\n\t" + /* Square */ + "ldp x18, x19, [x29, #16]\n\t" + "ldp x20, x21, [x29, #32]\n\t" + /* A[0] * A[1] */ + "mul x7, x18, x19\n\t" + "umulh x8, x18, x19\n\t" + /* A[0] * A[2] */ + "mul x3, x18, x20\n\t" + "umulh x9, x18, x20\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x18, x21\n\t" + "umulh x10, x18, x21\n\t" + "adds x9, x9, x3\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x19, x20\n\t" + "umulh x4, x19, x20\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x19, x21\n\t" + "umulh x4, x19, x21\n\t" + "adds x10, x10, x3\n\t" + "adc x11, x11, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x20, x21\n\t" + "umulh x12, x20, x21\n\t" + "adds x11, x11, x3\n\t" + "adc x12, x12, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adc x13, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x18, x18\n\t" + "umulh x23, x18, x18\n\t" + /* A[1] * A[1] */ + "mul x3, x19, x19\n\t" + "umulh x4, x19, x19\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x20, x20\n\t" + "umulh x4, x20, x20\n\t" + "adds x9, x9, x23\n\t" + "adcs x10, x10, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x21, x21\n\t" + "umulh x4, x21, x21\n\t" + "adds x11, x11, x23\n\t" + "adcs x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #16]\n\t" + "stp x8, x9, [x29, #32]\n\t" + /* Multiply by 121666 */ + "ldp x18, x19, [x29, #144]\n\t" + "ldp x20, x21, [x29, #160]\n\t" + "mov x5, #0xdb42\n\t" + "movk x5, #1, lsl 16\n\t" + "mul x6, x18, x5\n\t" + "umulh x7, x18, x5\n\t" + "mul x3, x19, x5\n\t" + "umulh x4, x19, x5\n\t" + "adds x7, x7, x3\n\t" + "adc x8, xzr, x4\n\t" + "mul x3, x20, x5\n\t" + "umulh x4, x20, x5\n\t" + "adds x8, x8, x3\n\t" + "adc x9, xzr, x4\n\t" + "mul x3, x21, x5\n\t" + "umulh x4, x21, x5\n\t" + "adds x9, x9, x3\n\t" + "adc x4, xzr, x4\n\t" + "mov x5, #19\n\t" + "extr x4, x4, x9, #63\n\t" + "mul x4, x4, x5\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x4\n\t" + "adcs 
x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" + /* Square */ + "ldp x18, x19, [x29, #80]\n\t" + "ldp x20, x21, [x29, #96]\n\t" + /* A[0] * A[1] */ + "mul x7, x18, x19\n\t" + "umulh x8, x18, x19\n\t" + /* A[0] * A[2] */ + "mul x3, x18, x20\n\t" + "umulh x9, x18, x20\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x18, x21\n\t" + "umulh x10, x18, x21\n\t" + "adds x9, x9, x3\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x19, x20\n\t" + "umulh x4, x19, x20\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x19, x21\n\t" + "umulh x4, x19, x21\n\t" + "adds x10, x10, x3\n\t" + "adc x11, x11, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x20, x21\n\t" + "umulh x12, x20, x21\n\t" + "adds x11, x11, x3\n\t" + "adc x12, x12, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adc x13, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x18, x18\n\t" + "umulh x23, x18, x18\n\t" + /* A[1] * A[1] */ + "mul x3, x19, x19\n\t" + "umulh x4, x19, x19\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x20, x20\n\t" + "umulh x4, x20, x20\n\t" + "adds x9, x9, x23\n\t" + "adcs x10, x10, x3\n\t" + "adc x23, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x21, x21\n\t" + "umulh x4, x21, x21\n\t" + "adds x11, x11, x23\n\t" + "adcs x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + /* Add */ + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "ldp x10, x11, [x29, #48]\n\t" + "ldp x12, x13, [x29, #64]\n\t" + "adds x6, x6, x10\n\t" + "adcs x7, x7, x11\n\t" + "adcs x8, x8, x12\n\t" + "adc x9, x9, x13\n\t" + "mov x3, #-19\n\t" + "asr x23, x9, #63\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x6, x6, x3\n\t" + "sbcs x7, x7, x23\n\t" + "sbcs x8, x8, x23\n\t" + "sbc x9, x9, x4\n\t" + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + /* Multiply */ + "ldp x18, x19, [x2]\n\t" + "ldp x20, x21, [x2, #16]\n\t" + "ldp x14, x15, 
[x29, #16]\n\t" + "ldp x16, x17, [x29, #32]\n\t" + /* A[0] * B[0] */ + "mul x6, x18, x14\n\t" + "umulh x7, x18, x14\n\t" + /* A[0] * B[1] */ + "mul x3, x18, x15\n\t" + "umulh x8, x18, x15\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x19, x14\n\t" + "umulh x4, x19, x14\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x18, x16\n\t" + "umulh x4, x18, x16\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x19, x15\n\t" + "umulh x4, x19, x15\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x20, x14\n\t" + "umulh x4, x20, x14\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x18, x17\n\t" + "umulh x4, x18, x17\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x19, x16\n\t" + "umulh x4, x19, x16\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x20, x15\n\t" + "umulh x4, x20, x15\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x21, x14\n\t" + "umulh x4, x21, x14\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x19, x17\n\t" + "umulh x4, x19, x17\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x20, x16\n\t" + "umulh x4, x20, x16\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x21, x15\n\t" + "umulh x4, x21, x15\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x20, x17\n\t" + "umulh x4, x20, x17\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x21, x16\n\t" + "umulh x4, x21, x16\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x21, x17\n\t" + "umulh x4, x21, x17\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" + /* Multiply */ + "ldp x18, x19, 
[x29, #144]\n\t" + "ldp x20, x21, [x29, #160]\n\t" + "ldp x14, x15, [x29, #112]\n\t" + "ldp x16, x17, [x29, #128]\n\t" + /* A[0] * B[0] */ + "mul x6, x18, x14\n\t" + "umulh x7, x18, x14\n\t" + /* A[0] * B[1] */ + "mul x3, x18, x15\n\t" + "umulh x8, x18, x15\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x19, x14\n\t" + "umulh x4, x19, x14\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x18, x16\n\t" + "umulh x4, x18, x16\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x19, x15\n\t" + "umulh x4, x19, x15\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x20, x14\n\t" + "umulh x4, x20, x14\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x18, x17\n\t" + "umulh x4, x18, x17\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x19, x16\n\t" + "umulh x4, x19, x16\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x20, x15\n\t" + "umulh x4, x20, x15\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x21, x14\n\t" + "umulh x4, x21, x14\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x19, x17\n\t" + "umulh x4, x19, x17\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x20, x16\n\t" + "umulh x4, x20, x16\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x21, x15\n\t" + "umulh x4, x21, x15\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x20, x17\n\t" + "umulh x4, x20, x17\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x21, x16\n\t" + "umulh x4, x21, x16\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x21, x17\n\t" + "umulh x4, x21, x17\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, 
#16]\n\t" + "stp x8, x9, [x29, #32]\n\t" + "sub x25, x25, #1\n\t" + "cmp x25, #0\n\t" + "bge L_curve25519_bits\n\t" + "mov x25, #63\n\t" + "sub x24, x24, #8\n\t" + "cmp x24, #0\n\t" + "bge L_curve25519_words\n\t" + /* Invert */ + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "add x0, x29, #80\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "add x1, x29, #80\n\t" + "bl fe_sq\n\t" + "add x1, x29, #16\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "add x0, x29, #112\n\t" + "bl fe_sq\n\t" + "add x0, x29, #80\n\t" + "add x1, x29, #80\n\t" + "add x2, x29, #112\n\t" + "bl fe_mul\n\t" + "add x0, x29, #112\n\t" + "bl fe_sq\n\t" + "mov x24, #4\n\t" + "add x1, x29, #112\n\t" + "\n" + "L_curve25519_inv_1:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_1\n\t" + "add x0, x29, #80\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "add x0, x29, #112\n\t" + "add x1, x29, #80\n\t" + "bl fe_sq\n\t" + "mov x24, #9\n\t" + "add x1, x29, #112\n\t" + "\n" + "L_curve25519_inv_2:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_2\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "add x0, x29, #144\n\t" + "bl fe_sq\n\t" + "mov x24, #19\n\t" + "add x1, x29, #144\n\t" + "\n" + "L_curve25519_inv_3:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_3\n\t" + "add x0, x29, #112\n\t" + "add x2, x29, #112\n\t" + "bl fe_mul\n\t" + "mov x24, #10\n\t" + "add x1, x29, #112\n\t" + "\n" + "L_curve25519_inv_4:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_4\n\t" + "add x0, x29, #80\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "add x0, x29, #112\n\t" + "add x1, x29, #80\n\t" + "bl fe_sq\n\t" + "mov x24, #49\n\t" + "add x1, x29, #112\n\t" + "\n" + "L_curve25519_inv_5:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_5\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "add x0, x29, #144\n\t" + "bl fe_sq\n\t" + "mov x24, #0x63\n\t" + "add x1, x29, #144\n\t" + "\n" + "L_curve25519_inv_6:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_6\n\t" + "add x0, x29, #112\n\t" + "add x2, x29, #112\n\t" + "bl fe_mul\n\t" + "mov x24, #50\n\t" + "add x1, x29, #112\n\t" + "\n" + "L_curve25519_inv_7:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_7\n\t" + "add x0, x29, #80\n\t" + "add x2, x29, #80\n\t" + "bl fe_mul\n\t" + "mov x24, #5\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_curve25519_inv_8:\n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_8\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "ldr %[r], [x29, #176]\n\t" + /* Multiply */ + "ldp x18, x19, [x0]\n\t" + "ldp x20, x21, [x0, #16]\n\t" + "ldp x14, x15, [x29, #16]\n\t" + "ldp x16, x17, [x29, #32]\n\t" + /* A[0] * B[0] */ + "mul x6, x18, x14\n\t" + "umulh x7, x18, x14\n\t" + /* A[0] * B[1] */ + "mul x3, x18, x15\n\t" + "umulh x8, x18, x15\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x19, x14\n\t" + "umulh x4, x19, x14\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x18, x16\n\t" + "umulh x4, x18, x16\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x19, x15\n\t" + "umulh x4, x19, 
x15\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x20, x14\n\t" + "umulh x4, x20, x14\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x18, x17\n\t" + "umulh x4, x18, x17\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x19, x16\n\t" + "umulh x4, x19, x16\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x20, x15\n\t" + "umulh x4, x20, x15\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x21, x14\n\t" + "umulh x4, x21, x14\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x19, x17\n\t" + "umulh x4, x19, x17\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x20, x16\n\t" + "umulh x4, x20, x16\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x21, x15\n\t" + "umulh x4, x21, x15\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x20, x17\n\t" + "umulh x4, x20, x17\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x21, x16\n\t" + "umulh x4, x21, x16\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x21, x17\n\t" + "umulh x4, x21, x17\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "lsr x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + "mov x0, xzr\n\t" + "ldp x29, x30, [sp], #0xc0\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25" + ); + return (uint32_t)(size_t)r; +} + +void fe_pow22523(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-128]!\n\t" + "add x29, sp, #0\n\t" + /* pow22523 */ + "str %[r], [x29, #112]\n\t" + "str %[a], [x29, #120]\n\t" + "add x0, x29, #16\n\t" + "bl fe_sq\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl 
fe_sq\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "ldr x1, [x29, #120]\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #16\n\t" + "add x1, x29, #16\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "bl fe_sq\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "mov x21, #4\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_1:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_1\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "mov x21, #9\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_2:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_2\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #80\n\t" + "bl fe_sq\n\t" + "mov x21, #19\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_fe_pow22523_3:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_3\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "mov x21, #10\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_4:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_4\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "mov x21, #49\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_5:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_5\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #80\n\t" + "bl fe_sq\n\t" + "mov x21, #0x63\n\t" + "add x1, x29, #80\n\t" + "\n" + "L_fe_pow22523_6:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_6\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "mov x21, #50\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_7:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_7\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "mov x21, #2\n\t" + "add x1, x29, #16\n\t" + "\n" + "L_fe_pow22523_8:\n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_8\n\t" + "ldr x0, [x29, #112]\n\t" + "ldr x2, [x29, #120]\n\t" + "bl fe_mul\n\t" + "ldr %[a], [x29, #120]\n\t" + "ldr %[r], [x29, #112]\n\t" + "ldp x29, x30, [sp], #0x80\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x21" + ); +} + +void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" + "str %[ry], [x29, #16]\n\t" + "str %[rz], [x29, #24]\n\t" + "str %[px], [x29, #32]\n\t" + "str %[py], [x29, #40]\n\t" + "str %[pz], [x29, #48]\n\t" + "str %[pt], [x29, #56]\n\t" + "ldr x1, [x29, #32]\n\t" + "ldr x2, [x29, #56]\n\t" + /* Multiply */ + "ldp x11, x16, [x1]\n\t" + "ldp x17, x18, [x1, #16]\n\t" + "ldp x19, x20, [x2]\n\t" + "ldp x21, x22, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x19\n\t" + "umulh x4, x11, x19\n\t" + /* A[0] * B[1] */ + "mul x12, x11, x20\n\t" + "umulh x5, x11, x20\n\t" + "adds x4, x4, x12\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x16, x19\n\t" + "umulh x13, x16, x19\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x11, x21\n\t" + "umulh x13, x11, x21\n\t" + 
"adds x5, x5, x12\n\t" + "adc x6, x6, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x16, x20\n\t" + "umulh x13, x16, x20\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x17, x19\n\t" + "umulh x13, x17, x19\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x11, x22\n\t" + "umulh x13, x11, x22\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x16, x21\n\t" + "umulh x13, x16, x21\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x17, x20\n\t" + "umulh x13, x17, x20\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x18, x19\n\t" + "umulh x13, x18, x19\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x16, x22\n\t" + "umulh x13, x16, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x17, x21\n\t" + "umulh x13, x17, x21\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x18, x20\n\t" + "umulh x13, x18, x20\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x17, x22\n\t" + "umulh x13, x17, x22\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x18, x21\n\t" + "umulh x13, x18, x21\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x18, x22\n\t" + "umulh x13, x18, x22\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x7\n\t" + "umulh x7, x12, x7\n\t" + "adds x3, x3, x13\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adcs x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x14, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + "ldr x1, [x29, #40]\n\t" + "ldr x2, [x29, #48]\n\t" + /* Multiply */ + "ldp x11, x16, [x1]\n\t" + "ldp x17, x18, [x1, #16]\n\t" + "ldp x19, x20, [x2]\n\t" + "ldp x21, x22, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x19\n\t" + "umulh x4, x11, x19\n\t" + /* A[0] * B[1] */ + "mul x12, x11, x20\n\t" + "umulh x5, x11, x20\n\t" + "adds x4, x4, x12\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x16, x19\n\t" + "umulh x13, x16, x19\n\t" + 
"adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x11, x21\n\t" + "umulh x13, x11, x21\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x16, x20\n\t" + "umulh x13, x16, x20\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x17, x19\n\t" + "umulh x13, x17, x19\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x11, x22\n\t" + "umulh x13, x11, x22\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x16, x21\n\t" + "umulh x13, x16, x21\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x17, x20\n\t" + "umulh x13, x17, x20\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x18, x19\n\t" + "umulh x13, x18, x19\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x16, x22\n\t" + "umulh x13, x16, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x17, x21\n\t" + "umulh x13, x17, x21\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x18, x20\n\t" + "umulh x13, x18, x20\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x17, x22\n\t" + "umulh x13, x17, x22\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x18, x21\n\t" + "umulh x13, x18, x21\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x18, x22\n\t" + "umulh x13, x18, x22\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x7\n\t" + "umulh x7, x12, x7\n\t" + "adds x3, x3, x13\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adcs x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x14, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #56]\n\t" + /* Multiply */ + "ldp x11, x16, [x2]\n\t" + "ldp x17, x18, [x2, #16]\n\t" + "ldp x19, x20, [x1]\n\t" + "ldp x21, x22, [x1, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x19\n\t" + "umulh x4, x11, x19\n\t" + /* A[0] * B[1] */ + "mul x12, x11, x20\n\t" + "umulh x5, x11, x20\n\t" + 
"adds x4, x4, x12\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x16, x19\n\t" + "umulh x13, x16, x19\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x11, x21\n\t" + "umulh x13, x11, x21\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x16, x20\n\t" + "umulh x13, x16, x20\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x17, x19\n\t" + "umulh x13, x17, x19\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x11, x22\n\t" + "umulh x13, x11, x22\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x16, x21\n\t" + "umulh x13, x16, x21\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x17, x20\n\t" + "umulh x13, x17, x20\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x18, x19\n\t" + "umulh x13, x18, x19\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x16, x22\n\t" + "umulh x13, x16, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x17, x21\n\t" + "umulh x13, x17, x21\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x18, x20\n\t" + "umulh x13, x18, x20\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x17, x22\n\t" + "umulh x13, x17, x22\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x18, x21\n\t" + "umulh x13, x18, x21\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x18, x22\n\t" + "umulh x13, x18, x22\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x7\n\t" + "umulh x7, x12, x7\n\t" + "adds x3, x3, x13\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adcs x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x14, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x12", "x13", "x14", "x15", "x7", "x8", "x9", 
"x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + ); +} + +void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %[ry], [x29, #16]\n\t" + "str %[rz], [x29, #24]\n\t" + "str %[rt], [x29, #32]\n\t" + "str %[px], [x29, #40]\n\t" + "str %[py], [x29, #48]\n\t" + "str %[pz], [x29, #56]\n\t" + "str %[pt], [x29, #64]\n\t" + "ldr x1, [x29, #40]\n\t" + "ldr x2, [x29, #64]\n\t" + /* Multiply */ + "ldp x11, x16, [x1]\n\t" + "ldp x17, x18, [x1, #16]\n\t" + "ldp x19, x20, [x2]\n\t" + "ldp x21, x22, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x19\n\t" + "umulh x4, x11, x19\n\t" + /* A[0] * B[1] */ + "mul x12, x11, x20\n\t" + "umulh x5, x11, x20\n\t" + "adds x4, x4, x12\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x16, x19\n\t" + "umulh x13, x16, x19\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x11, x21\n\t" + "umulh x13, x11, x21\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x16, x20\n\t" + "umulh x13, x16, x20\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x17, x19\n\t" + "umulh x13, x17, x19\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x11, x22\n\t" + "umulh x13, x11, x22\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x16, x21\n\t" + "umulh x13, x16, x21\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x17, x20\n\t" + "umulh x13, x17, x20\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x18, x19\n\t" + "umulh x13, x18, x19\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x16, x22\n\t" + "umulh x13, x16, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x17, x21\n\t" + "umulh x13, x17, x21\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x18, x20\n\t" + "umulh x13, x18, x20\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x17, x22\n\t" + "umulh x13, x17, x22\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x18, x21\n\t" + "umulh x13, x18, x21\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x18, x22\n\t" + "umulh x13, x18, x22\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x7\n\t" + "umulh x7, x12, x7\n\t" + "adds x3, x3, x13\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adcs x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x14, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, 
x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + "ldr x1, [x29, #48]\n\t" + "ldr x2, [x29, #56]\n\t" + /* Multiply */ + "ldp x11, x16, [x1]\n\t" + "ldp x17, x18, [x1, #16]\n\t" + "ldp x19, x20, [x2]\n\t" + "ldp x21, x22, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x19\n\t" + "umulh x4, x11, x19\n\t" + /* A[0] * B[1] */ + "mul x12, x11, x20\n\t" + "umulh x5, x11, x20\n\t" + "adds x4, x4, x12\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x16, x19\n\t" + "umulh x13, x16, x19\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x11, x21\n\t" + "umulh x13, x11, x21\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x16, x20\n\t" + "umulh x13, x16, x20\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x17, x19\n\t" + "umulh x13, x17, x19\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x11, x22\n\t" + "umulh x13, x11, x22\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x16, x21\n\t" + "umulh x13, x16, x21\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x17, x20\n\t" + "umulh x13, x17, x20\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x18, x19\n\t" + "umulh x13, x18, x19\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x16, x22\n\t" + "umulh x13, x16, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x17, x21\n\t" + "umulh x13, x17, x21\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x18, x20\n\t" + "umulh x13, x18, x20\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x17, x22\n\t" + "umulh x13, x17, x22\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x18, x21\n\t" + "umulh x13, x18, x21\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x18, x22\n\t" + "umulh x13, x18, x22\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x7\n\t" + "umulh x7, x12, x7\n\t" + "adds x3, x3, x13\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adcs x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul 
x13, x12, x10\n\t" + "umulh x14, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #64]\n\t" + /* Multiply */ + "ldp x11, x16, [x2]\n\t" + "ldp x17, x18, [x2, #16]\n\t" + "ldp x19, x20, [x1]\n\t" + "ldp x21, x22, [x1, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x19\n\t" + "umulh x4, x11, x19\n\t" + /* A[0] * B[1] */ + "mul x12, x11, x20\n\t" + "umulh x5, x11, x20\n\t" + "adds x4, x4, x12\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x16, x19\n\t" + "umulh x13, x16, x19\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x11, x21\n\t" + "umulh x13, x11, x21\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x16, x20\n\t" + "umulh x13, x16, x20\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x17, x19\n\t" + "umulh x13, x17, x19\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x11, x22\n\t" + "umulh x13, x11, x22\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x16, x21\n\t" + "umulh x13, x16, x21\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x17, x20\n\t" + "umulh x13, x17, x20\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x18, x19\n\t" + "umulh x13, x18, x19\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x16, x22\n\t" + "umulh x13, x16, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x17, x21\n\t" + "umulh x13, x17, x21\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x18, x20\n\t" + "umulh x13, x18, x20\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x17, x22\n\t" + "umulh x13, x17, x22\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x18, x21\n\t" + "umulh x13, x18, x21\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x18, x22\n\t" + "umulh x13, x18, x22\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x7\n\t" + "umulh x7, x12, x7\n\t" + "adds x3, x3, x13\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, 
x12, x8\n\t" + "adcs x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x14, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #40]\n\t" + "ldr x2, [x29, #48]\n\t" + /* Multiply */ + "ldp x11, x16, [x1]\n\t" + "ldp x17, x18, [x1, #16]\n\t" + "ldp x19, x20, [x2]\n\t" + "ldp x21, x22, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x19\n\t" + "umulh x4, x11, x19\n\t" + /* A[0] * B[1] */ + "mul x12, x11, x20\n\t" + "umulh x5, x11, x20\n\t" + "adds x4, x4, x12\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x16, x19\n\t" + "umulh x13, x16, x19\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x11, x21\n\t" + "umulh x13, x11, x21\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x16, x20\n\t" + "umulh x13, x16, x20\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x17, x19\n\t" + "umulh x13, x17, x19\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x11, x22\n\t" + "umulh x13, x11, x22\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x16, x21\n\t" + "umulh x13, x16, x21\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x17, x20\n\t" + "umulh x13, x17, x20\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x18, x19\n\t" + "umulh x13, x18, x19\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x16, x22\n\t" + "umulh x13, x16, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x17, x21\n\t" + "umulh x13, x17, x21\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x18, x20\n\t" + "umulh x13, x18, x20\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x17, x22\n\t" + "umulh x13, x17, x22\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x18, x21\n\t" + "umulh x13, x18, x21\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x18, x22\n\t" + "umulh x13, x18, x22\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top 
half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x7\n\t" + "umulh x7, x12, x7\n\t" + "adds x3, x3, x13\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adcs x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x14, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x6, #63\n\t" + "mul x14, x14, x12\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x14\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + ); +} + +void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %[rx], [x29, #16]\n\t" + "str %[ry], [x29, #24]\n\t" + "str %[rz], [x29, #32]\n\t" + "str %[rt], [x29, #40]\n\t" + "str %[px], [x29, #48]\n\t" + "str %[py], [x29, #56]\n\t" + "str %[pz], [x29, #64]\n\t" + "ldr x1, [x29, #48]\n\t" + /* Square */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x5, x20, x21\n\t" + "umulh x6, x20, x21\n\t" + /* A[0] * A[2] */ + "mul x12, x20, x22\n\t" + "umulh x7, x20, x22\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * A[3] */ + "mul x12, x20, x23\n\t" + "umulh x8, x20, x23\n\t" + "adds x7, x7, x12\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * A[2] */ + "mul x12, x21, x22\n\t" + "umulh x13, x21, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x12, x21, x23\n\t" + "umulh x13, x21, x23\n\t" + "adds x8, x8, x12\n\t" + "adc x9, x9, x13\n\t" + /* A[2] * A[3] */ + "mul x12, x22, x23\n\t" + "umulh x10, x22, x23\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, xzr\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x4, x20, x20\n\t" + "umulh x15, x20, x20\n\t" + /* A[1] * A[1] */ + "mul x12, x21, x21\n\t" + "umulh x13, x21, x21\n\t" + "adds x5, x5, x15\n\t" + "adcs x6, x6, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[2] * A[2] */ + "mul x12, x22, x22\n\t" + "umulh x13, x22, x22\n\t" + "adds x7, x7, x15\n\t" + "adcs x8, x8, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[3] * A[3] */ + "mul x12, x23, x23\n\t" + "umulh x13, x23, x23\n\t" + "adds x9, x9, x15\n\t" + "adcs x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + 
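+ /* Reduction: p = 2^255 - 19, so 2^255 = 19 (mod p); the words holding bits 255 and above are multiplied by 19 and folded back into the low words x4..x7. */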
"adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #56]\n\t" + /* Square */ + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" + /* A[0] * A[1] */ + "mul x5, x20, x21\n\t" + "umulh x6, x20, x21\n\t" + /* A[0] * A[2] */ + "mul x12, x20, x22\n\t" + "umulh x7, x20, x22\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * A[3] */ + "mul x12, x20, x23\n\t" + "umulh x8, x20, x23\n\t" + "adds x7, x7, x12\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * A[2] */ + "mul x12, x21, x22\n\t" + "umulh x13, x21, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x12, x21, x23\n\t" + "umulh x13, x21, x23\n\t" + "adds x8, x8, x12\n\t" + "adc x9, x9, x13\n\t" + /* A[2] * A[3] */ + "mul x12, x22, x23\n\t" + "umulh x10, x22, x23\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, xzr\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x4, x20, x20\n\t" + "umulh x15, x20, x20\n\t" + /* A[1] * A[1] */ + "mul x12, x21, x21\n\t" + "umulh x13, x21, x21\n\t" + "adds x5, x5, x15\n\t" + "adcs x6, x6, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[2] * A[2] */ + "mul x12, x22, x22\n\t" + "umulh x13, x22, x22\n\t" + "adds x7, x7, x15\n\t" + "adcs x8, x8, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[3] * A[3] */ + "mul x12, x23, x23\n\t" + "umulh x13, x23, x23\n\t" + "adds x9, x9, x15\n\t" + "adcs x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, 
#0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + /* Add */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "adds x4, x4, x8\n\t" + "adcs x5, x5, x9\n\t" + "adcs x6, x6, x10\n\t" + "adc x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x7, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x12\n\t" + "sbcs x5, x5, x15\n\t" + "sbcs x6, x6, x15\n\t" + "sbc x7, x7, x13\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x1, [x29, #40]\n\t" + /* Square */ + "ldp x20, x21, [x0]\n\t" + "ldp x22, x23, [x0, #16]\n\t" + /* A[0] * A[1] */ + "mul x5, x20, x21\n\t" + "umulh x6, x20, x21\n\t" + /* A[0] * A[2] */ + "mul x12, x20, x22\n\t" + "umulh x7, x20, x22\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * A[3] */ + "mul x12, x20, x23\n\t" + "umulh x8, x20, x23\n\t" + "adds x7, x7, x12\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * A[2] */ + "mul x12, x21, x22\n\t" + "umulh x13, x21, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x12, x21, x23\n\t" + "umulh x13, x21, x23\n\t" + "adds x8, x8, x12\n\t" + "adc x9, x9, x13\n\t" + /* A[2] * A[3] */ + "mul x12, x22, x23\n\t" + "umulh x10, x22, x23\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, xzr\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x4, x20, x20\n\t" + "umulh x15, x20, x20\n\t" + /* A[1] * A[1] */ + "mul x12, x21, x21\n\t" + "umulh x13, x21, x21\n\t" + "adds x5, x5, x15\n\t" + "adcs x6, x6, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[2] * A[2] */ + "mul x12, x22, x22\n\t" + "umulh x13, x22, x22\n\t" + "adds x7, x7, x15\n\t" + "adcs x8, x8, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[3] * A[3] */ + "mul x12, x23, x23\n\t" + "umulh x13, x23, x23\n\t" + "adds x9, x9, x15\n\t" + "adcs x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, 
#16]\n\t" + "ldr x1, [x29, #32]\n\t" + "ldr x2, [x29, #16]\n\t" + /* Add */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x1, [x29, #40]\n\t" + /* Sub */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" + "ldp x8, x9, [x0]\n\t" + "ldp x10, x11, [x0, #16]\n\t" + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + "ldr x0, [x29, #64]\n\t" + /* Square * 2 */ + "ldp x20, x21, [x0]\n\t" + "ldp x22, x23, [x0, #16]\n\t" + /* A[0] * A[1] */ + "mul x5, x20, x21\n\t" + "umulh x6, x20, x21\n\t" + /* A[0] * A[2] */ + "mul x12, x20, x22\n\t" + "umulh x7, x20, x22\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * A[3] */ + "mul x12, x20, x23\n\t" + "umulh x8, x20, x23\n\t" + "adds x7, x7, x12\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * A[2] */ + "mul x12, x21, x22\n\t" + "umulh x13, x21, x22\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x12, x21, x23\n\t" + "umulh x13, x21, x23\n\t" + "adds x8, x8, x12\n\t" + "adc x9, x9, x13\n\t" + /* A[2] * A[3] */ + "mul x12, x22, x23\n\t" + "umulh x10, x22, x23\n\t" + "adds x9, x9, x12\n\t" + "adc x10, x10, xzr\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x4, x20, x20\n\t" + "umulh x15, x20, x20\n\t" + /* A[1] * A[1] */ + "mul x12, x21, x21\n\t" + "umulh x13, x21, x21\n\t" + "adds x5, x5, x15\n\t" + "adcs x6, x6, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[2] * A[2] */ + "mul x12, x22, x22\n\t" + "umulh x13, x22, x22\n\t" + "adds x7, x7, x15\n\t" + "adcs x8, x8, x12\n\t" + "adc x15, x13, xzr\n\t" + /* A[3] * A[3] */ + "mul x12, x23, x23\n\t" + "umulh x13, x23, x23\n\t" + "adds x9, x9, x15\n\t" + "adcs x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Double and Reduce */ + "mov x12, #0x169\n\t" + /* Move top half into t4-t7 and remove top bit from t3 */ + "lsr x15, x11, #61\n\t" + "extr x11, x11, x10, #62\n\t" + "extr x10, x10, x9, #62\n\t" + "extr x9, x9, x8, #62\n\t" + "extr x8, x8, x7, #62\n\t" + "extr x7, x7, x6, #63\n\t" + "extr x6, x6, x5, #63\n\t" + "extr x5, x5, x4, #63\n\t" + "lsl x4, x4, #1\n\t" + "and x7, x7, 
#0x7fffffffffffffff\n\t" + /* Two left, only one right */ + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top bits by 19*19 */ + "mul x15, x15, x12\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x15\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x0, [x29, #32]\n\t" + /* Sub */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" + "ldp x8, x9, [x0]\n\t" + "ldp x10, x11, [x0, #16]\n\t" + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) + : + : "memory", "x12", "x13", "x14", "x15", "x7", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23" + ); +} + +void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %[rx], [x29, #16]\n\t" + "str %[ry], [x29, #24]\n\t" + "str %[rz], [x29, #32]\n\t" + "str %[rt], [x29, #40]\n\t" + "str %[px], [x29, #48]\n\t" + "str %[py], [x29, #56]\n\t" + "str %[pz], [x29, #64]\n\t" + "str %[pt], [x29, #72]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x3]\n\t" + "ldp x10, x11, [x3, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, 
[x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x2, [x29, #32]\n\t" + "ldr x3, [x29, #168]\n\t" + /* Multiply */ + "ldp x20, x21, [x0]\n\t" + "ldp x22, x23, [x0, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, 
#63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + "ldr x0, [x29, #176]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x0]\n\t" + "ldp x26, x27, [x0, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs 
x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #160]\n\t" + "ldr x3, [x29, #72]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc 
x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x0]\n\t" + "ldp x10, x11, [x0, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x0, [x29, #64]\n\t" + /* Double */ + "ldp x4, x5, [x0]\n\t" + "ldp x6, x7, [x0, #16]\n\t" + "adds x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "mov x12, #-19\n\t" + "asr x15, x7, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x12\n\t" + "sbcs x5, x5, x15\n\t" + "sbcs x6, x6, x15\n\t" + "sbc x7, x7, x13\n\t" + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + /* Add */ + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x0]\n\t" + "ldp x10, x11, [x0, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x2]\n\t" + "stp x18, x19, [x2, #16]\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, 
const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %[rx], [x29, #16]\n\t" + "str %[ry], [x29, #24]\n\t" + "str %[rz], [x29, #32]\n\t" + "str %[rt], [x29, #40]\n\t" + "str %[px], [x29, #48]\n\t" + "str %[py], [x29, #56]\n\t" + "str %[pz], [x29, #64]\n\t" + "str %[pt], [x29, #72]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x3]\n\t" + "ldp x10, x11, [x3, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x2, [x29, #32]\n\t" + "ldr x3, [x29, #176]\n\t" + /* Multiply */ + "ldp x20, x21, [x0]\n\t" + "ldp x22, x23, [x0, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, 
x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + "ldr x0, [x29, #168]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x0]\n\t" + "ldp x26, x27, [x0, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" 
+ "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #160]\n\t" + "ldr x3, [x29, #72]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh 
x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x3, [x29, #16]\n\t" + /* Add */ + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x1]\n\t" + "ldp x10, x11, [x1, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" + "stp x4, x5, [x3]\n\t" + "stp x6, x7, [x3, #16]\n\t" + "ldr x1, [x29, #64]\n\t" + /* Double */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" + "adds x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "mov x12, #-19\n\t" + "asr x15, x7, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x12\n\t" + "sbcs x5, x5, x15\n\t" + "sbcs x6, x6, x15\n\t" + "sbc x7, x7, x13\n\t" + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + /* Add */ + "ldp x4, x5, 
[x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x0]\n\t" + "ldp x10, x11, [x0, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-112]!\n\t" + "add x29, sp, #0\n\t" + "str %[rx], [x29, #16]\n\t" + "str %[ry], [x29, #24]\n\t" + "str %[rz], [x29, #32]\n\t" + "str %[rt], [x29, #40]\n\t" + "str %[px], [x29, #48]\n\t" + "str %[py], [x29, #56]\n\t" + "str %[pz], [x29, #64]\n\t" + "str %[pt], [x29, #72]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x3]\n\t" + "ldp x10, x11, [x3, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x2, [x29, #32]\n\t" + "ldr x3, [x29, #208]\n\t" + /* Multiply */ + "ldp x20, x21, [x0]\n\t" + "ldp x22, x23, [x0, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, 
x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + "ldr x2, [x29, #216]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x2]\n\t" + "ldp x26, x27, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + 
"adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x1, [x29, #40]\n\t" + "ldr x2, [x29, #200]\n\t" + "ldr x3, [x29, #72]\n\t" + /* Multiply */ + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds 
x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x1, [x29, #64]\n\t" + "ldr x2, [x29, #192]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x2]\n\t" + "ldp x26, x27, [x2, #16]\n\t" + /* A[0] * 
B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "add x1, x29, #80\n\t" + /* Double */ + "ldp x4, x5, 
[x0]\n\t" + "ldp x6, x7, [x0, #16]\n\t" + "adds x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "mov x12, #-19\n\t" + "asr x15, x7, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x12\n\t" + "sbcs x5, x5, x15\n\t" + "sbcs x6, x6, x15\n\t" + "sbc x7, x7, x13\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x2, [x29, #24]\n\t" + "ldr x3, [x29, #32]\n\t" + /* Add */ + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x2]\n\t" + "stp x18, x19, [x2, #16]\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + /* Add */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" + "ldp x8, x9, [x0]\n\t" + "ldp x10, x11, [x0, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x3]\n\t" + "stp x18, x19, [x3, #16]\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldp x29, x30, [sp], #0x70\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-112]!\n\t" + "add x29, sp, #0\n\t" + "str %[rx], [x29, #16]\n\t" + "str %[ry], [x29, #24]\n\t" + "str %[rz], [x29, #32]\n\t" + "str %[rt], [x29, #40]\n\t" + "str %[px], [x29, #48]\n\t" + "str %[py], [x29, #56]\n\t" + "str %[pz], [x29, #64]\n\t" + "str %[pt], [x29, #72]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x4, x5, [x2]\n\t" + "ldp 
x6, x7, [x2, #16]\n\t" + "ldp x8, x9, [x3]\n\t" + "ldp x10, x11, [x3, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x2, [x29, #32]\n\t" + "ldr x3, [x29, #216]\n\t" + /* Multiply */ + "ldp x20, x21, [x0]\n\t" + "ldp x22, x23, [x0, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + 
"extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x2]\n\t" + "stp x6, x7, [x2, #16]\n\t" + "ldr x2, [x29, #208]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x2]\n\t" + "ldp x26, x27, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove 
top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x1, [x29, #40]\n\t" + "ldr x2, [x29, #200]\n\t" + "ldr x3, [x29, #72]\n\t" + /* Multiply */ + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" + "ldp x24, x25, [x3]\n\t" + "ldp x26, x27, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] 
*/ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x1, [x29, #64]\n\t" + "ldr x2, [x29, #192]\n\t" + /* Multiply */ + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" + "ldp x24, x25, [x2]\n\t" + "ldp x26, x27, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x20, x24\n\t" + "umulh x5, x20, x24\n\t" + /* A[0] * B[1] */ + "mul x12, x20, x25\n\t" + "umulh x6, x20, x25\n\t" + "adds x5, x5, x12\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x12, x21, x24\n\t" + "umulh x13, x21, x24\n\t" + "adds x5, x5, x12\n\t" + "adcs x6, x6, x13\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x12, x20, x26\n\t" + "umulh x13, x20, x26\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* A[1] * B[1] */ + "mul x12, x21, x25\n\t" + "umulh x13, x21, x25\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x12, x22, x24\n\t" + "umulh x13, x22, x24\n\t" + "adds x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x12, x20, x27\n\t" + "umulh x13, x20, x27\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x12, x21, x26\n\t" + "umulh x13, x21, x26\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x12, x22, x25\n\t" + "umulh x13, x22, x25\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x12, x23, x24\n\t" + "umulh x13, x23, x24\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x12, x21, x27\n\t" + "umulh x13, x21, x27\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x12, x22, x26\n\t" + "umulh x13, x22, x26\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x12, x23, x25\n\t" + "umulh x13, x23, x25\n\t" + "adds x8, x8, x12\n\t" + "adcs x9, x9, x13\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x12, x22, x27\n\t" + "umulh x13, x22, x27\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul 
x12, x23, x26\n\t" + "umulh x13, x23, x26\n\t" + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x12, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x10, x10, x12\n\t" + "adc x11, x11, x13\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x12, #19\n\t" + "mul x13, x12, x8\n\t" + "umulh x8, x12, x8\n\t" + "adds x4, x4, x13\n\t" + "mul x13, x12, x9\n\t" + "umulh x9, x12, x9\n\t" + "adcs x5, x5, x13\n\t" + "mul x13, x12, x10\n\t" + "umulh x10, x12, x10\n\t" + "adcs x6, x6, x13\n\t" + "mul x13, x12, x11\n\t" + "umulh x14, x12, x11\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x14, x14, xzr\n\t" + /* Overflow */ + "extr x14, x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "lsr x14, x7, #63\n\t" + "mul x14, x14, x12\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x14\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "add x1, x29, #80\n\t" + /* Double */ + "ldp x4, x5, [x0]\n\t" + "ldp x6, x7, [x0, #16]\n\t" + "adds x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "mov x12, #-19\n\t" + "asr x15, x7, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x12\n\t" + "sbcs x5, x5, x15\n\t" + "sbcs x6, x6, x15\n\t" + "sbc x7, x7, x13\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldr x2, [x29, #24]\n\t" + "ldr x3, [x29, #32]\n\t" + /* Add */ + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x2]\n\t" + "stp x18, x19, [x2, #16]\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + /* Add */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" + "ldp x8, x9, [x0]\n\t" + "ldp x10, x11, [x0, #16]\n\t" + "adds x16, x4, x8\n\t" + "adcs x17, x5, x9\n\t" + "adcs x18, x6, x10\n\t" + "adc x19, x7, x11\n\t" + "mov x12, #-19\n\t" + "asr x15, x19, #63\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x16, x16, x12\n\t" + "sbcs 
x17, x17, x15\n\t" + "sbcs x18, x18, x15\n\t" + "sbc x19, x19, x13\n\t" + /* Sub */ + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "mov x12, #-19\n\t" + "csetm x15, cc\n\t" + /* Mask the modulus */ + "and x12, x15, x12\n\t" + "and x13, x15, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x12\n\t" + "adcs x5, x5, x15\n\t" + "adcs x6, x6, x15\n\t" + "adc x7, x7, x13\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "stp x4, x5, [x3]\n\t" + "stp x6, x7, [x3, #16]\n\t" + "ldp x29, x30, [sp], #0x70\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h index 4c5624180..e9cef000c 100644 --- a/wolfssl/wolfcrypt/fe_operations.h +++ b/wolfssl/wolfcrypt/fe_operations.h @@ -39,6 +39,10 @@ #define CURVED25519_128BIT #endif +#if defined(CURVED25519_X64) || defined(WOLFSSL_ARMASM) + #define CURVED25519_ASM_64BIT +#endif + /* fe means field element. Here the field is \Z/(2^255-19). @@ -72,7 +76,7 @@ WOLFSSL_LOCAL int curve25519(byte * q, byte * n, byte * p); /* default to be faster but take more memory */ #if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL) -#ifdef CURVED25519_X64 +#ifdef CURVED25519_ASM_64BIT typedef int64_t fe[4]; #elif defined(CURVED25519_128BIT) typedef int64_t fe[5]; @@ -108,7 +112,7 @@ WOLFSSL_LOCAL void fe_pow22523(fe,const fe); WOLFSSL_LOCAL uint64_t load_3(const unsigned char *in); WOLFSSL_LOCAL uint64_t load_4(const unsigned char *in); -#ifdef CURVED25519_X64 +#ifdef CURVED25519_ASM_64BIT WOLFSSL_LOCAL void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt); WOLFSSL_LOCAL void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, @@ -132,7 +136,7 @@ WOLFSSL_LOCAL void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe qt2d, const fe qyplusx, const fe qyminusx); WOLFSSL_LOCAL void fe_cmov_table(fe* r, fe* base, signed char b); -#endif /* CURVED25519_X64 */ +#endif /* CURVED25519_ASM_64BIT */ #endif /* !CURVE25519_SMALL || !ED25519_SMALL */ /* Use less memory and only 32bit types or less, but is slower diff --git a/wolfssl/wolfcrypt/ge_operations.h b/wolfssl/wolfcrypt/ge_operations.h index bb052dead..fe2ebdfca 100644 --- a/wolfssl/wolfcrypt/ge_operations.h +++ b/wolfssl/wolfcrypt/ge_operations.h @@ -47,7 +47,7 @@ Representations: #ifdef ED25519_SMALL typedef byte ge[F25519_SIZE]; -#elif defined(CURVED25519_X64) +#elif defined(CURVED25519_ASM_64BIT) typedef int64_t ge[4]; #elif defined(CURVED25519_128BIT) typedef int64_t ge[5];
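
For reference, every Add, Sub and Double sequence in the assembly above finishes with the same branch-free correction: derive an all-ones mask from the top bit of the result (or from the borrow flag), AND it with the limbs of p = 2^255 - 19, and conditionally subtract or add the modulus. The sketch below shows that pattern in portable C; it is not the wolfSSL implementation. It assumes the four-limb little-endian layout selected by the fe_operations.h hunk above for CURVED25519_ASM_64BIT (the header types fe as int64_t[4]; unsigned limbs and the GCC/Clang unsigned __int128 extension are used here only for clarity), and the helper names are illustrative.

    #include <stdint.h>

    /* add-with-carry / subtract-with-borrow on 64-bit limbs */
    static uint64_t adc64(uint64_t a, uint64_t b, uint64_t *carry)
    {
        unsigned __int128 t = (unsigned __int128)a + b + *carry;
        *carry = (uint64_t)(t >> 64);          /* 0 or 1 */
        return (uint64_t)t;
    }

    static uint64_t sbb64(uint64_t a, uint64_t b, uint64_t *borrow)
    {
        unsigned __int128 t = (unsigned __int128)a - b - *borrow;
        *borrow = (uint64_t)(t >> 64) & 1;     /* 1 on underflow */
        return (uint64_t)t;
    }

    /* r = a + b (mod p), partially reduced: subtract p once when bit 255
     * of the raw sum is set.  Mirrors the "asr #63 / Mask the modulus /
     * Sub modulus (if overflow)" idiom in the assembly above. */
    static void fe_add_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
    {
        uint64_t c = 0, bw = 0, m;
        int i;

        for (i = 0; i < 4; i++)
            r[i] = adc64(a[i], b[i], &c);
        m = (uint64_t)0 - (r[3] >> 63);        /* all-ones iff bit 255 of the sum is set */
        /* limbs of p = 2^255 - 19 are -19, -1, -1, 2^63 - 1 */
        r[0] = sbb64(r[0], m & (uint64_t)-19, &bw);
        r[1] = sbb64(r[1], m, &bw);
        r[2] = sbb64(r[2], m, &bw);
        r[3] = sbb64(r[3], m & 0x7fffffffffffffffULL, &bw);
    }

    /* r = a - b (mod p): add p back when the raw subtraction borrows.
     * Mirrors the "csetm cc / Add modulus (if underflow)" idiom above. */
    static void fe_sub_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
    {
        uint64_t c = 0, bw = 0, m;
        int i;

        for (i = 0; i < 4; i++)
            r[i] = sbb64(a[i], b[i], &bw);
        m = (uint64_t)0 - bw;                  /* all-ones iff the subtraction underflowed */
        r[0] = adc64(r[0], m & (uint64_t)-19, &c);
        r[1] = adc64(r[1], m, &c);
        r[2] = adc64(r[2], m, &c);
        r[3] = adc64(r[3], m & 0x7fffffffffffffffULL, &c);
    }

Doing the correction with masks rather than branches keeps these field operations constant-time, which is why the assembly uses asr/csetm followed by and instead of a conditional jump.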