Merge pull request #3342 from SparkiDev/arm64_clang_fix

SP ARM64: Fix assembly for clang
This commit is contained in:
toddouska
2020-09-28 09:57:19 -07:00
committed by GitHub

View File

@ -627,6 +627,8 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
}
/* Square a and put result in r. (r = a * a)
*
* All registers version.
*
* r A single precision integer.
* a A single precision integer.
@ -634,172 +636,172 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
{
__asm__ __volatile__ (
"ldp x22, x23, [%[a], 0]\n\t"
"ldp x24, x25, [%[a], 16]\n\t"
"ldp x26, x27, [%[a], 32]\n\t"
"ldp x28, x29, [%[a], 48]\n\t"
"ldp x21, x22, [%[a], 0]\n\t"
"ldp x23, x24, [%[a], 16]\n\t"
"ldp x25, x26, [%[a], 32]\n\t"
"ldp x27, x28, [%[a], 48]\n\t"
"# A[0] * A[1]\n\t"
"mul x3, x22, x23\n\t"
"umulh x7, x22, x23\n\t"
"mul x6, x21, x22\n\t"
"umulh x7, x21, x22\n\t"
"# A[0] * A[2]\n\t"
"mul x4, x22, x24\n\t"
"umulh x5, x22, x24\n\t"
"mul x4, x21, x23\n\t"
"umulh x5, x21, x23\n\t"
"adds x7, x7, x4\n\t"
"# A[0] * A[3]\n\t"
"mul x4, x22, x25\n\t"
"mul x4, x21, x24\n\t"
"adc x8, xzr, x5\n\t"
"umulh x5, x22, x25\n\t"
"umulh x5, x21, x24\n\t"
"adds x8, x8, x4\n\t"
"# A[1] * A[2]\n\t"
"mul x4, x23, x24\n\t"
"mul x4, x22, x23\n\t"
"adc x9, xzr, x5\n\t"
"umulh x5, x23, x24\n\t"
"umulh x5, x22, x23\n\t"
"adds x8, x8, x4\n\t"
"# A[0] * A[4]\n\t"
"mul x4, x22, x26\n\t"
"mul x4, x21, x25\n\t"
"adcs x9, x9, x5\n\t"
"umulh x5, x22, x26\n\t"
"umulh x5, x21, x25\n\t"
"adc x10, xzr, xzr\n\t"
"adds x9, x9, x4\n\t"
"# A[1] * A[3]\n\t"
"mul x4, x23, x25\n\t"
"mul x4, x22, x24\n\t"
"adc x10, x10, x5\n\t"
"umulh x5, x23, x25\n\t"
"umulh x5, x22, x24\n\t"
"adds x9, x9, x4\n\t"
"# A[0] * A[5]\n\t"
"mul x4, x22, x27\n\t"
"mul x4, x21, x26\n\t"
"adcs x10, x10, x5\n\t"
"umulh x5, x22, x27\n\t"
"umulh x5, x21, x26\n\t"
"adc x11, xzr, xzr\n\t"
"adds x10, x10, x4\n\t"
"# A[1] * A[4]\n\t"
"mul x4, x23, x26\n\t"
"mul x4, x22, x25\n\t"
"adc x11, x11, x5\n\t"
"umulh x5, x23, x26\n\t"
"umulh x5, x22, x25\n\t"
"adds x10, x10, x4\n\t"
"# A[2] * A[3]\n\t"
"mul x4, x24, x25\n\t"
"mul x4, x23, x24\n\t"
"adcs x11, x11, x5\n\t"
"umulh x5, x24, x25\n\t"
"umulh x5, x23, x24\n\t"
"adc x12, xzr, xzr\n\t"
"adds x10, x10, x4\n\t"
"# A[0] * A[6]\n\t"
"mul x4, x22, x28\n\t"
"mul x4, x21, x27\n\t"
"adcs x11, x11, x5\n\t"
"umulh x5, x22, x28\n\t"
"umulh x5, x21, x27\n\t"
"adc x12, x12, xzr\n\t"
"adds x11, x11, x4\n\t"
"# A[1] * A[5]\n\t"
"mul x4, x23, x27\n\t"
"mul x4, x22, x26\n\t"
"adcs x12, x12, x5\n\t"
"umulh x5, x23, x27\n\t"
"umulh x5, x22, x26\n\t"
"adc x13, xzr, xzr\n\t"
"adds x11, x11, x4\n\t"
"# A[2] * A[4]\n\t"
"mul x4, x24, x26\n\t"
"mul x4, x23, x25\n\t"
"adcs x12, x12, x5\n\t"
"umulh x5, x24, x26\n\t"
"umulh x5, x23, x25\n\t"
"adc x13, x13, xzr\n\t"
"adds x11, x11, x4\n\t"
"# A[0] * A[7]\n\t"
"mul x4, x22, x29\n\t"
"mul x4, x21, x28\n\t"
"adcs x12, x12, x5\n\t"
"umulh x5, x22, x29\n\t"
"umulh x5, x21, x28\n\t"
"adc x13, x13, xzr\n\t"
"adds x12, x12, x4\n\t"
"# A[1] * A[6]\n\t"
"mul x4, x23, x28\n\t"
"mul x4, x22, x27\n\t"
"adcs x13, x13, x5\n\t"
"umulh x5, x23, x28\n\t"
"umulh x5, x22, x27\n\t"
"adc x14, xzr, xzr\n\t"
"adds x12, x12, x4\n\t"
"# A[2] * A[5]\n\t"
"mul x4, x24, x27\n\t"
"mul x4, x23, x26\n\t"
"adcs x13, x13, x5\n\t"
"umulh x5, x24, x27\n\t"
"umulh x5, x23, x26\n\t"
"adc x14, x14, xzr\n\t"
"adds x12, x12, x4\n\t"
"# A[3] * A[4]\n\t"
"mul x4, x25, x26\n\t"
"mul x4, x24, x25\n\t"
"adcs x13, x13, x5\n\t"
"umulh x5, x25, x26\n\t"
"umulh x5, x24, x25\n\t"
"adc x14, x14, xzr\n\t"
"adds x12, x12, x4\n\t"
"# A[1] * A[7]\n\t"
"mul x4, x23, x29\n\t"
"mul x4, x22, x28\n\t"
"adcs x13, x13, x5\n\t"
"umulh x5, x23, x29\n\t"
"umulh x5, x22, x28\n\t"
"adc x14, x14, xzr\n\t"
"adds x13, x13, x4\n\t"
"# A[2] * A[6]\n\t"
"mul x4, x24, x28\n\t"
"mul x4, x23, x27\n\t"
"adcs x14, x14, x5\n\t"
"umulh x5, x24, x28\n\t"
"umulh x5, x23, x27\n\t"
"adc x15, xzr, xzr\n\t"
"adds x13, x13, x4\n\t"
"# A[3] * A[5]\n\t"
"mul x4, x25, x27\n\t"
"mul x4, x24, x26\n\t"
"adcs x14, x14, x5\n\t"
"umulh x5, x25, x27\n\t"
"umulh x5, x24, x26\n\t"
"adc x15, x15, xzr\n\t"
"adds x13, x13, x4\n\t"
"# A[2] * A[7]\n\t"
"mul x4, x24, x29\n\t"
"mul x4, x23, x28\n\t"
"adcs x14, x14, x5\n\t"
"umulh x5, x24, x29\n\t"
"umulh x5, x23, x28\n\t"
"adc x15, x15, xzr\n\t"
"adds x14, x14, x4\n\t"
"# A[3] * A[6]\n\t"
"mul x4, x25, x28\n\t"
"mul x4, x24, x27\n\t"
"adcs x15, x15, x5\n\t"
"umulh x5, x25, x28\n\t"
"umulh x5, x24, x27\n\t"
"adc x16, xzr, xzr\n\t"
"adds x14, x14, x4\n\t"
"# A[4] * A[5]\n\t"
"mul x4, x26, x27\n\t"
"mul x4, x25, x26\n\t"
"adcs x15, x15, x5\n\t"
"umulh x5, x26, x27\n\t"
"umulh x5, x25, x26\n\t"
"adc x16, x16, xzr\n\t"
"adds x14, x14, x4\n\t"
"# A[3] * A[7]\n\t"
"mul x4, x25, x29\n\t"
"mul x4, x24, x28\n\t"
"adcs x15, x15, x5\n\t"
"umulh x5, x25, x29\n\t"
"umulh x5, x24, x28\n\t"
"adc x16, x16, xzr\n\t"
"adds x15, x15, x4\n\t"
"# A[4] * A[6]\n\t"
"mul x4, x26, x28\n\t"
"mul x4, x25, x27\n\t"
"adcs x16, x16, x5\n\t"
"umulh x5, x26, x28\n\t"
"umulh x5, x25, x27\n\t"
"adc x17, xzr, xzr\n\t"
"adds x15, x15, x4\n\t"
"# A[4] * A[7]\n\t"
"mul x4, x26, x29\n\t"
"mul x4, x25, x28\n\t"
"adcs x16, x16, x5\n\t"
"umulh x5, x26, x29\n\t"
"umulh x5, x25, x28\n\t"
"adc x17, x17, xzr\n\t"
"adds x16, x16, x4\n\t"
"# A[5] * A[6]\n\t"
"mul x4, x27, x28\n\t"
"mul x4, x26, x27\n\t"
"adcs x17, x17, x5\n\t"
"umulh x5, x27, x28\n\t"
"umulh x5, x26, x27\n\t"
"adc x19, xzr, xzr\n\t"
"adds x16, x16, x4\n\t"
"# A[5] * A[7]\n\t"
"mul x4, x27, x29\n\t"
"mul x4, x26, x28\n\t"
"adcs x17, x17, x5\n\t"
"umulh x5, x27, x29\n\t"
"umulh x5, x26, x28\n\t"
"adc x19, x19, xzr\n\t"
"adds x17, x17, x4\n\t"
"# A[6] * A[7]\n\t"
"mul x4, x28, x29\n\t"
"mul x4, x27, x28\n\t"
"adcs x19, x19, x5\n\t"
"umulh x5, x28, x29\n\t"
"umulh x5, x27, x28\n\t"
"adc x20, xzr, xzr\n\t"
"adds x19, x19, x4\n\t"
"adc x20, x20, x5\n\t"
"# Double\n\t"
"adds x3, x3, x3\n\t"
"adds x6, x6, x6\n\t"
"adcs x7, x7, x7\n\t"
"adcs x8, x8, x8\n\t"
"adcs x9, x9, x9\n\t"
@ -813,47 +815,47 @@ static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
"adcs x17, x17, x17\n\t"
"adcs x19, x19, x19\n\t"
"# A[0] * A[0]\n\t"
"mul x2, x22, x22\n\t"
"mul x5, x21, x21\n\t"
"adcs x20, x20, x20\n\t"
"umulh x4, x22, x22\n\t"
"umulh x2, x21, x21\n\t"
"cset x21, cs\n\t"
"# A[1] * A[1]\n\t"
"mul x5, x23, x23\n\t"
"adds x3, x3, x4\n\t"
"umulh x6, x23, x23\n\t"
"adcs x7, x7, x5\n\t"
"mul x3, x22, x22\n\t"
"adds x6, x6, x2\n\t"
"umulh x4, x22, x22\n\t"
"adcs x7, x7, x3\n\t"
"# A[2] * A[2]\n\t"
"mul x4, x24, x24\n\t"
"adcs x8, x8, x6\n\t"
"umulh x5, x24, x24\n\t"
"adcs x9, x9, x4\n\t"
"mul x2, x23, x23\n\t"
"adcs x8, x8, x4\n\t"
"umulh x3, x23, x23\n\t"
"adcs x9, x9, x2\n\t"
"# A[3] * A[3]\n\t"
"mul x6, x25, x25\n\t"
"adcs x10, x10, x5\n\t"
"umulh x4, x25, x25\n\t"
"adcs x11, x11, x6\n\t"
"mul x4, x24, x24\n\t"
"adcs x10, x10, x3\n\t"
"umulh x2, x24, x24\n\t"
"adcs x11, x11, x4\n\t"
"# A[4] * A[4]\n\t"
"mul x5, x26, x26\n\t"
"adcs x12, x12, x4\n\t"
"umulh x6, x26, x26\n\t"
"adcs x13, x13, x5\n\t"
"mul x3, x25, x25\n\t"
"adcs x12, x12, x2\n\t"
"umulh x4, x25, x25\n\t"
"adcs x13, x13, x3\n\t"
"# A[5] * A[5]\n\t"
"mul x4, x27, x27\n\t"
"adcs x14, x14, x6\n\t"
"umulh x5, x27, x27\n\t"
"adcs x15, x15, x4\n\t"
"mul x2, x26, x26\n\t"
"adcs x14, x14, x4\n\t"
"umulh x3, x26, x26\n\t"
"adcs x15, x15, x2\n\t"
"# A[6] * A[6]\n\t"
"mul x6, x28, x28\n\t"
"adcs x16, x16, x5\n\t"
"umulh x4, x28, x28\n\t"
"adcs x17, x17, x6\n\t"
"mul x4, x27, x27\n\t"
"adcs x16, x16, x3\n\t"
"umulh x2, x27, x27\n\t"
"adcs x17, x17, x4\n\t"
"# A[7] * A[7]\n\t"
"mul x5, x29, x29\n\t"
"adcs x19, x19, x4\n\t"
"umulh x6, x29, x29\n\t"
"adcs x20, x20, x5\n\t"
"stp x2, x3, [%[r], 0]\n\t"
"adc x21, x21, x6\n\t"
"mul x3, x28, x28\n\t"
"adcs x19, x19, x2\n\t"
"umulh x4, x28, x28\n\t"
"adcs x20, x20, x3\n\t"
"stp x5, x6, [%[r], 0]\n\t"
"adc x21, x21, x4\n\t"
"stp x7, x8, [%[r], 16]\n\t"
"stp x9, x10, [%[r], 32]\n\t"
"stp x11, x12, [%[r], 48]\n\t"
@ -863,7 +865,7 @@ static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
"stp x20, x21, [%[r], 112]\n\t"
:
: [r] "r" (r), [a] "r" (a)
: "memory", "x4", "x5", "x6", "x2", "x3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29"
: "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
@ -3092,7 +3094,7 @@ static int64_t sp_2048_cmp_16(const sp_digit* a, const sp_digit* b)
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -4356,7 +4358,7 @@ static int64_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b)
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -4544,7 +4546,7 @@ static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a,
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -5444,6 +5446,7 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
#ifdef HAVE_FFDHE_2048
static void sp_2048_lshift_32(sp_digit* r, sp_digit* a, byte n)
{
word64 n64 = n;
__asm__ __volatile__ (
"mov x6, 63\n\t"
"sub x6, x6, %[n]\n\t"
@ -5640,7 +5643,7 @@ static void sp_2048_lshift_32(sp_digit* r, sp_digit* a, byte n)
"str x2, [%[r]]\n\t"
"str x3, [%[r], 8]\n\t"
:
: [r] "r" (r), [a] "r" (a), [n] "r" (n)
: [r] "r" (r), [a] "r" (a), [n] "r" (n64)
: "memory", "x2", "x3", "x4", "x5", "x6"
);
}
@ -10488,7 +10491,7 @@ static int64_t sp_3072_cmp_24(const sp_digit* a, const sp_digit* b)
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -12080,7 +12083,7 @@ static int64_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b)
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -12308,7 +12311,7 @@ static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a,
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -13236,6 +13239,7 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
#ifdef HAVE_FFDHE_3072
static void sp_3072_lshift_48(sp_digit* r, sp_digit* a, byte n)
{
word64 n64 = n;
__asm__ __volatile__ (
"mov x6, 63\n\t"
"sub x6, x6, %[n]\n\t"
@ -13528,7 +13532,7 @@ static void sp_3072_lshift_48(sp_digit* r, sp_digit* a, byte n)
"str x4, [%[r]]\n\t"
"str x2, [%[r], 8]\n\t"
:
: [r] "r" (r), [a] "r" (a), [n] "r" (n)
: [r] "r" (r), [a] "r" (a), [n] "r" (n64)
: "memory", "x2", "x3", "x4", "x5", "x6"
);
}
@ -17004,7 +17008,7 @@ static int64_t sp_4096_cmp_64(const sp_digit* a, const sp_digit* b)
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -17272,7 +17276,7 @@ static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a,
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -18228,6 +18232,7 @@ int sp_ModExp_4096(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
#ifdef HAVE_FFDHE_4096
static void sp_4096_lshift_64(sp_digit* r, sp_digit* a, byte n)
{
word64 n64 = n;
__asm__ __volatile__ (
"mov x6, 63\n\t"
"sub x6, x6, %[n]\n\t"
@ -18616,7 +18621,7 @@ static void sp_4096_lshift_64(sp_digit* r, sp_digit* a, byte n)
"str x3, [%[r]]\n\t"
"str x4, [%[r], 8]\n\t"
:
: [r] "r" (r), [a] "r" (a), [n] "r" (n)
: [r] "r" (r), [a] "r" (a), [n] "r" (n64)
: "memory", "x2", "x3", "x4", "x5", "x6"
);
}
@ -36663,7 +36668,7 @@ static void sp_256_mask_4(sp_digit* r, const sp_digit* a, sp_digit m)
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.
@ -39178,6 +39183,8 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a)
#else
/* Square a and put result in r. (r = a * a)
*
* All registers version.
*
* r A single precision integer.
* a A single precision integer.
@ -39185,93 +39192,93 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a)
static void sp_384_sqr_6(sp_digit* r, const sp_digit* a)
{
__asm__ __volatile__ (
"ldp x17, x19, [%[a], 0]\n\t"
"ldp x20, x21, [%[a], 16]\n\t"
"ldp x22, x23, [%[a], 32]\n\t"
"ldp x16, x17, [%[a], 0]\n\t"
"ldp x19, x20, [%[a], 16]\n\t"
"ldp x21, x22, [%[a], 32]\n\t"
"# A[0] * A[1]\n\t"
"mul x3, x17, x19\n\t"
"umulh x7, x17, x19\n\t"
"mul x6, x16, x17\n\t"
"umulh x7, x16, x17\n\t"
"# A[0] * A[2]\n\t"
"mul x4, x17, x20\n\t"
"umulh x5, x17, x20\n\t"
"mul x4, x16, x19\n\t"
"umulh x5, x16, x19\n\t"
"adds x7, x7, x4\n\t"
"# A[0] * A[3]\n\t"
"mul x4, x17, x21\n\t"
"mul x4, x16, x20\n\t"
"adc x8, xzr, x5\n\t"
"umulh x5, x17, x21\n\t"
"umulh x5, x16, x20\n\t"
"adds x8, x8, x4\n\t"
"# A[1] * A[2]\n\t"
"mul x4, x19, x20\n\t"
"mul x4, x17, x19\n\t"
"adc x9, xzr, x5\n\t"
"umulh x5, x19, x20\n\t"
"umulh x5, x17, x19\n\t"
"adds x8, x8, x4\n\t"
"# A[0] * A[4]\n\t"
"mul x4, x17, x22\n\t"
"mul x4, x16, x21\n\t"
"adcs x9, x9, x5\n\t"
"umulh x5, x17, x22\n\t"
"umulh x5, x16, x21\n\t"
"adc x10, xzr, xzr\n\t"
"adds x9, x9, x4\n\t"
"# A[1] * A[3]\n\t"
"mul x4, x19, x21\n\t"
"mul x4, x17, x20\n\t"
"adc x10, x10, x5\n\t"
"umulh x5, x19, x21\n\t"
"umulh x5, x17, x20\n\t"
"adds x9, x9, x4\n\t"
"# A[0] * A[5]\n\t"
"mul x4, x17, x23\n\t"
"mul x4, x16, x22\n\t"
"adcs x10, x10, x5\n\t"
"umulh x5, x17, x23\n\t"
"umulh x5, x16, x22\n\t"
"adc x11, xzr, xzr\n\t"
"adds x10, x10, x4\n\t"
"# A[1] * A[4]\n\t"
"mul x4, x19, x22\n\t"
"mul x4, x17, x21\n\t"
"adc x11, x11, x5\n\t"
"umulh x5, x19, x22\n\t"
"umulh x5, x17, x21\n\t"
"adds x10, x10, x4\n\t"
"# A[2] * A[3]\n\t"
"mul x4, x20, x21\n\t"
"mul x4, x19, x20\n\t"
"adcs x11, x11, x5\n\t"
"umulh x5, x20, x21\n\t"
"umulh x5, x19, x20\n\t"
"adc x12, xzr, xzr\n\t"
"adds x10, x10, x4\n\t"
"# A[1] * A[5]\n\t"
"mul x4, x19, x23\n\t"
"mul x4, x17, x22\n\t"
"adcs x11, x11, x5\n\t"
"umulh x5, x19, x23\n\t"
"umulh x5, x17, x22\n\t"
"adc x12, x12, xzr\n\t"
"adds x11, x11, x4\n\t"
"# A[2] * A[4]\n\t"
"mul x4, x20, x22\n\t"
"mul x4, x19, x21\n\t"
"adcs x12, x12, x5\n\t"
"umulh x5, x20, x22\n\t"
"umulh x5, x19, x21\n\t"
"adc x13, xzr, xzr\n\t"
"adds x11, x11, x4\n\t"
"# A[2] * A[5]\n\t"
"mul x4, x20, x23\n\t"
"mul x4, x19, x22\n\t"
"adcs x12, x12, x5\n\t"
"umulh x5, x20, x23\n\t"
"umulh x5, x19, x22\n\t"
"adc x13, x13, xzr\n\t"
"adds x12, x12, x4\n\t"
"# A[3] * A[4]\n\t"
"mul x4, x21, x22\n\t"
"mul x4, x20, x21\n\t"
"adcs x13, x13, x5\n\t"
"umulh x5, x21, x22\n\t"
"umulh x5, x20, x21\n\t"
"adc x14, xzr, xzr\n\t"
"adds x12, x12, x4\n\t"
"# A[3] * A[5]\n\t"
"mul x4, x21, x23\n\t"
"mul x4, x20, x22\n\t"
"adcs x13, x13, x5\n\t"
"umulh x5, x21, x23\n\t"
"umulh x5, x20, x22\n\t"
"adc x14, x14, xzr\n\t"
"adds x13, x13, x4\n\t"
"# A[4] * A[5]\n\t"
"mul x4, x22, x23\n\t"
"mul x4, x21, x22\n\t"
"adcs x14, x14, x5\n\t"
"umulh x5, x22, x23\n\t"
"umulh x5, x21, x22\n\t"
"adc x15, xzr, xzr\n\t"
"adds x14, x14, x4\n\t"
"adc x15, x15, x5\n\t"
"# Double\n\t"
"adds x3, x3, x3\n\t"
"adds x6, x6, x6\n\t"
"adcs x7, x7, x7\n\t"
"adcs x8, x8, x8\n\t"
"adcs x9, x9, x9\n\t"
@ -39281,37 +39288,37 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a)
"adcs x13, x13, x13\n\t"
"adcs x14, x14, x14\n\t"
"# A[0] * A[0]\n\t"
"mul x2, x17, x17\n\t"
"mul x5, x16, x16\n\t"
"adcs x15, x15, x15\n\t"
"umulh x4, x17, x17\n\t"
"umulh x2, x16, x16\n\t"
"cset x16, cs\n\t"
"# A[1] * A[1]\n\t"
"mul x5, x19, x19\n\t"
"adds x3, x3, x4\n\t"
"umulh x6, x19, x19\n\t"
"adcs x7, x7, x5\n\t"
"mul x3, x17, x17\n\t"
"adds x6, x6, x2\n\t"
"umulh x4, x17, x17\n\t"
"adcs x7, x7, x3\n\t"
"# A[2] * A[2]\n\t"
"mul x4, x20, x20\n\t"
"adcs x8, x8, x6\n\t"
"umulh x5, x20, x20\n\t"
"adcs x9, x9, x4\n\t"
"mul x2, x19, x19\n\t"
"adcs x8, x8, x4\n\t"
"umulh x3, x19, x19\n\t"
"adcs x9, x9, x2\n\t"
"# A[3] * A[3]\n\t"
"mul x6, x21, x21\n\t"
"adcs x10, x10, x5\n\t"
"umulh x4, x21, x21\n\t"
"adcs x11, x11, x6\n\t"
"mul x4, x20, x20\n\t"
"adcs x10, x10, x3\n\t"
"umulh x2, x20, x20\n\t"
"adcs x11, x11, x4\n\t"
"# A[4] * A[4]\n\t"
"mul x5, x22, x22\n\t"
"adcs x12, x12, x4\n\t"
"umulh x6, x22, x22\n\t"
"adcs x13, x13, x5\n\t"
"mul x3, x21, x21\n\t"
"adcs x12, x12, x2\n\t"
"umulh x4, x21, x21\n\t"
"adcs x13, x13, x3\n\t"
"# A[5] * A[5]\n\t"
"mul x4, x23, x23\n\t"
"adcs x14, x14, x6\n\t"
"umulh x5, x23, x23\n\t"
"adcs x15, x15, x4\n\t"
"stp x2, x3, [%[r], 0]\n\t"
"adc x16, x16, x5\n\t"
"mul x2, x22, x22\n\t"
"adcs x14, x14, x4\n\t"
"umulh x3, x22, x22\n\t"
"adcs x15, x15, x2\n\t"
"stp x5, x6, [%[r], 0]\n\t"
"adc x16, x16, x3\n\t"
"stp x7, x8, [%[r], 16]\n\t"
"stp x9, x10, [%[r], 32]\n\t"
"stp x11, x12, [%[r], 48]\n\t"
@ -39319,7 +39326,7 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a)
"stp x15, x16, [%[r], 80]\n\t"
:
: [r] "r" (r), [a] "r" (a)
: "memory", "x4", "x5", "x6", "x2", "x3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23"
: "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x16", "x17", "x19", "x20", "x21", "x22"
);
}
@ -43213,7 +43220,7 @@ static void sp_384_mask_6(sp_digit* r, const sp_digit* a, sp_digit m)
/* Divide a by d and put remainder into r (m*d + r = a)
* m is not calculated as it is not needed at this time.
*
* a Nmber to be divided.
* a Number to be divided.
* d Number to divide with.
* m Multiplier result.
* r Remainder from the division.