From b61b3e34dd6bbf6380a2ab7e46a26d9210a90938 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Mon, 28 Sep 2020 12:35:58 +1000 Subject: [PATCH] SP ARM64: Fix assembly for clang clang doesn't auto correct size of register (declared byte n but 64-bit usage) clang doesn't always handle use of x29 (FP or Frame Pointer) in inline assembly code correctly - reworked sp_2048_sqr_8 to not use x29. --- wolfcrypt/src/sp_arm64.c | 339 ++++++++++++++++++++------------------- 1 file changed, 173 insertions(+), 166 deletions(-) diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 6a012f071..2e2f12aec 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -627,6 +627,8 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) } /* Square a and put result in r. (r = a * a) + * + * All registers version. * * r A single precision integer. * a A single precision integer. @@ -634,172 +636,172 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( - "ldp x22, x23, [%[a], 0]\n\t" - "ldp x24, x25, [%[a], 16]\n\t" - "ldp x26, x27, [%[a], 32]\n\t" - "ldp x28, x29, [%[a], 48]\n\t" + "ldp x21, x22, [%[a], 0]\n\t" + "ldp x23, x24, [%[a], 16]\n\t" + "ldp x25, x26, [%[a], 32]\n\t" + "ldp x27, x28, [%[a], 48]\n\t" "# A[0] * A[1]\n\t" - "mul x3, x22, x23\n\t" - "umulh x7, x22, x23\n\t" + "mul x6, x21, x22\n\t" + "umulh x7, x21, x22\n\t" "# A[0] * A[2]\n\t" - "mul x4, x22, x24\n\t" - "umulh x5, x22, x24\n\t" + "mul x4, x21, x23\n\t" + "umulh x5, x21, x23\n\t" "adds x7, x7, x4\n\t" "# A[0] * A[3]\n\t" - "mul x4, x22, x25\n\t" + "mul x4, x21, x24\n\t" "adc x8, xzr, x5\n\t" - "umulh x5, x22, x25\n\t" + "umulh x5, x21, x24\n\t" "adds x8, x8, x4\n\t" "# A[1] * A[2]\n\t" - "mul x4, x23, x24\n\t" + "mul x4, x22, x23\n\t" "adc x9, xzr, x5\n\t" - "umulh x5, x23, x24\n\t" + "umulh x5, x22, x23\n\t" "adds x8, x8, x4\n\t" "# A[0] * A[4]\n\t" - "mul x4, x22, x26\n\t" + "mul x4, x21, x25\n\t" "adcs x9, x9, x5\n\t" - "umulh x5, x22, x26\n\t" + "umulh x5, x21, x25\n\t" "adc x10, xzr, xzr\n\t" "adds x9, x9, x4\n\t" "# A[1] * A[3]\n\t" - "mul x4, x23, x25\n\t" + "mul x4, x22, x24\n\t" "adc x10, x10, x5\n\t" - "umulh x5, x23, x25\n\t" + "umulh x5, x22, x24\n\t" "adds x9, x9, x4\n\t" "# A[0] * A[5]\n\t" - "mul x4, x22, x27\n\t" + "mul x4, x21, x26\n\t" "adcs x10, x10, x5\n\t" - "umulh x5, x22, x27\n\t" + "umulh x5, x21, x26\n\t" "adc x11, xzr, xzr\n\t" "adds x10, x10, x4\n\t" "# A[1] * A[4]\n\t" - "mul x4, x23, x26\n\t" + "mul x4, x22, x25\n\t" "adc x11, x11, x5\n\t" - "umulh x5, x23, x26\n\t" + "umulh x5, x22, x25\n\t" "adds x10, x10, x4\n\t" "# A[2] * A[3]\n\t" - "mul x4, x24, x25\n\t" + "mul x4, x23, x24\n\t" "adcs x11, x11, x5\n\t" - "umulh x5, x24, x25\n\t" + "umulh x5, x23, x24\n\t" "adc x12, xzr, xzr\n\t" "adds x10, x10, x4\n\t" "# A[0] * A[6]\n\t" - "mul x4, x22, x28\n\t" + "mul x4, x21, x27\n\t" "adcs x11, x11, x5\n\t" - "umulh x5, x22, x28\n\t" + "umulh x5, x21, x27\n\t" "adc x12, x12, xzr\n\t" "adds x11, x11, x4\n\t" "# A[1] * A[5]\n\t" - "mul x4, x23, x27\n\t" + "mul x4, x22, x26\n\t" "adcs x12, x12, x5\n\t" - "umulh x5, x23, x27\n\t" + "umulh x5, x22, x26\n\t" "adc x13, xzr, xzr\n\t" "adds x11, x11, x4\n\t" "# A[2] * A[4]\n\t" - "mul x4, x24, x26\n\t" + "mul x4, x23, x25\n\t" "adcs x12, x12, x5\n\t" - "umulh x5, x24, x26\n\t" + "umulh x5, x23, x25\n\t" "adc x13, x13, xzr\n\t" "adds x11, x11, x4\n\t" "# A[0] * A[7]\n\t" - "mul x4, x22, x29\n\t" + "mul x4, x21, 
x28\n\t" "adcs x12, x12, x5\n\t" - "umulh x5, x22, x29\n\t" + "umulh x5, x21, x28\n\t" "adc x13, x13, xzr\n\t" "adds x12, x12, x4\n\t" "# A[1] * A[6]\n\t" - "mul x4, x23, x28\n\t" + "mul x4, x22, x27\n\t" "adcs x13, x13, x5\n\t" - "umulh x5, x23, x28\n\t" + "umulh x5, x22, x27\n\t" "adc x14, xzr, xzr\n\t" "adds x12, x12, x4\n\t" "# A[2] * A[5]\n\t" - "mul x4, x24, x27\n\t" + "mul x4, x23, x26\n\t" "adcs x13, x13, x5\n\t" - "umulh x5, x24, x27\n\t" + "umulh x5, x23, x26\n\t" "adc x14, x14, xzr\n\t" "adds x12, x12, x4\n\t" "# A[3] * A[4]\n\t" - "mul x4, x25, x26\n\t" + "mul x4, x24, x25\n\t" "adcs x13, x13, x5\n\t" - "umulh x5, x25, x26\n\t" + "umulh x5, x24, x25\n\t" "adc x14, x14, xzr\n\t" "adds x12, x12, x4\n\t" "# A[1] * A[7]\n\t" - "mul x4, x23, x29\n\t" + "mul x4, x22, x28\n\t" "adcs x13, x13, x5\n\t" - "umulh x5, x23, x29\n\t" + "umulh x5, x22, x28\n\t" "adc x14, x14, xzr\n\t" "adds x13, x13, x4\n\t" "# A[2] * A[6]\n\t" - "mul x4, x24, x28\n\t" + "mul x4, x23, x27\n\t" "adcs x14, x14, x5\n\t" - "umulh x5, x24, x28\n\t" + "umulh x5, x23, x27\n\t" "adc x15, xzr, xzr\n\t" "adds x13, x13, x4\n\t" "# A[3] * A[5]\n\t" - "mul x4, x25, x27\n\t" + "mul x4, x24, x26\n\t" "adcs x14, x14, x5\n\t" - "umulh x5, x25, x27\n\t" + "umulh x5, x24, x26\n\t" "adc x15, x15, xzr\n\t" "adds x13, x13, x4\n\t" "# A[2] * A[7]\n\t" - "mul x4, x24, x29\n\t" + "mul x4, x23, x28\n\t" "adcs x14, x14, x5\n\t" - "umulh x5, x24, x29\n\t" + "umulh x5, x23, x28\n\t" "adc x15, x15, xzr\n\t" "adds x14, x14, x4\n\t" "# A[3] * A[6]\n\t" - "mul x4, x25, x28\n\t" + "mul x4, x24, x27\n\t" "adcs x15, x15, x5\n\t" - "umulh x5, x25, x28\n\t" + "umulh x5, x24, x27\n\t" "adc x16, xzr, xzr\n\t" "adds x14, x14, x4\n\t" "# A[4] * A[5]\n\t" - "mul x4, x26, x27\n\t" + "mul x4, x25, x26\n\t" "adcs x15, x15, x5\n\t" - "umulh x5, x26, x27\n\t" + "umulh x5, x25, x26\n\t" "adc x16, x16, xzr\n\t" "adds x14, x14, x4\n\t" "# A[3] * A[7]\n\t" - "mul x4, x25, x29\n\t" + "mul x4, x24, x28\n\t" "adcs x15, x15, x5\n\t" - "umulh x5, x25, x29\n\t" + "umulh x5, x24, x28\n\t" "adc x16, x16, xzr\n\t" "adds x15, x15, x4\n\t" "# A[4] * A[6]\n\t" - "mul x4, x26, x28\n\t" + "mul x4, x25, x27\n\t" "adcs x16, x16, x5\n\t" - "umulh x5, x26, x28\n\t" + "umulh x5, x25, x27\n\t" "adc x17, xzr, xzr\n\t" "adds x15, x15, x4\n\t" "# A[4] * A[7]\n\t" - "mul x4, x26, x29\n\t" + "mul x4, x25, x28\n\t" "adcs x16, x16, x5\n\t" - "umulh x5, x26, x29\n\t" + "umulh x5, x25, x28\n\t" "adc x17, x17, xzr\n\t" "adds x16, x16, x4\n\t" "# A[5] * A[6]\n\t" - "mul x4, x27, x28\n\t" + "mul x4, x26, x27\n\t" "adcs x17, x17, x5\n\t" - "umulh x5, x27, x28\n\t" + "umulh x5, x26, x27\n\t" "adc x19, xzr, xzr\n\t" "adds x16, x16, x4\n\t" "# A[5] * A[7]\n\t" - "mul x4, x27, x29\n\t" + "mul x4, x26, x28\n\t" "adcs x17, x17, x5\n\t" - "umulh x5, x27, x29\n\t" + "umulh x5, x26, x28\n\t" "adc x19, x19, xzr\n\t" "adds x17, x17, x4\n\t" "# A[6] * A[7]\n\t" - "mul x4, x28, x29\n\t" + "mul x4, x27, x28\n\t" "adcs x19, x19, x5\n\t" - "umulh x5, x28, x29\n\t" + "umulh x5, x27, x28\n\t" "adc x20, xzr, xzr\n\t" "adds x19, x19, x4\n\t" "adc x20, x20, x5\n\t" "# Double\n\t" - "adds x3, x3, x3\n\t" + "adds x6, x6, x6\n\t" "adcs x7, x7, x7\n\t" "adcs x8, x8, x8\n\t" "adcs x9, x9, x9\n\t" @@ -813,47 +815,47 @@ static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) "adcs x17, x17, x17\n\t" "adcs x19, x19, x19\n\t" "# A[0] * A[0]\n\t" - "mul x2, x22, x22\n\t" + "mul x5, x21, x21\n\t" "adcs x20, x20, x20\n\t" - "umulh x4, x22, x22\n\t" + "umulh x2, x21, x21\n\t" "cset x21, cs\n\t" "# A[1] * A[1]\n\t" - "mul 
x5, x23, x23\n\t" - "adds x3, x3, x4\n\t" - "umulh x6, x23, x23\n\t" - "adcs x7, x7, x5\n\t" + "mul x3, x22, x22\n\t" + "adds x6, x6, x2\n\t" + "umulh x4, x22, x22\n\t" + "adcs x7, x7, x3\n\t" "# A[2] * A[2]\n\t" - "mul x4, x24, x24\n\t" - "adcs x8, x8, x6\n\t" - "umulh x5, x24, x24\n\t" - "adcs x9, x9, x4\n\t" + "mul x2, x23, x23\n\t" + "adcs x8, x8, x4\n\t" + "umulh x3, x23, x23\n\t" + "adcs x9, x9, x2\n\t" "# A[3] * A[3]\n\t" - "mul x6, x25, x25\n\t" - "adcs x10, x10, x5\n\t" - "umulh x4, x25, x25\n\t" - "adcs x11, x11, x6\n\t" + "mul x4, x24, x24\n\t" + "adcs x10, x10, x3\n\t" + "umulh x2, x24, x24\n\t" + "adcs x11, x11, x4\n\t" "# A[4] * A[4]\n\t" - "mul x5, x26, x26\n\t" - "adcs x12, x12, x4\n\t" - "umulh x6, x26, x26\n\t" - "adcs x13, x13, x5\n\t" + "mul x3, x25, x25\n\t" + "adcs x12, x12, x2\n\t" + "umulh x4, x25, x25\n\t" + "adcs x13, x13, x3\n\t" "# A[5] * A[5]\n\t" - "mul x4, x27, x27\n\t" - "adcs x14, x14, x6\n\t" - "umulh x5, x27, x27\n\t" - "adcs x15, x15, x4\n\t" + "mul x2, x26, x26\n\t" + "adcs x14, x14, x4\n\t" + "umulh x3, x26, x26\n\t" + "adcs x15, x15, x2\n\t" "# A[6] * A[6]\n\t" - "mul x6, x28, x28\n\t" - "adcs x16, x16, x5\n\t" - "umulh x4, x28, x28\n\t" - "adcs x17, x17, x6\n\t" + "mul x4, x27, x27\n\t" + "adcs x16, x16, x3\n\t" + "umulh x2, x27, x27\n\t" + "adcs x17, x17, x4\n\t" "# A[7] * A[7]\n\t" - "mul x5, x29, x29\n\t" - "adcs x19, x19, x4\n\t" - "umulh x6, x29, x29\n\t" - "adcs x20, x20, x5\n\t" - "stp x2, x3, [%[r], 0]\n\t" - "adc x21, x21, x6\n\t" + "mul x3, x28, x28\n\t" + "adcs x19, x19, x2\n\t" + "umulh x4, x28, x28\n\t" + "adcs x20, x20, x3\n\t" + "stp x5, x6, [%[r], 0]\n\t" + "adc x21, x21, x4\n\t" "stp x7, x8, [%[r], 16]\n\t" "stp x9, x10, [%[r], 32]\n\t" "stp x11, x12, [%[r], 48]\n\t" @@ -863,7 +865,7 @@ static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) "stp x20, x21, [%[r], 112]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x4", "x5", "x6", "x2", "x3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } @@ -3092,7 +3094,7 @@ static int64_t sp_2048_cmp_16(const sp_digit* a, const sp_digit* b) /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -4356,7 +4358,7 @@ static int64_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -4544,7 +4546,7 @@ static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division.
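Note on the sp_2048_sqr_8 rework above: every intermediate now lives in x2..x28, skipping x18 (the platform register) and x29 (the frame pointer). The fragment below is a minimal standalone sketch, not wolfSSL code and with a hypothetical function name, of an AArch64 inline-asm block written under the same constraint; in frame-pointer-enabled builds clang reserves x29, and asm that uses or clobbers it may be rejected or miscompiled.

#include <stdint.h>

/* Hypothetical example, not part of this patch: scratch registers are
 * limited to x2..x28 so clang never sees x29 (frame pointer) or x18
 * (platform register) touched inside the asm block. */
static inline uint64_t add_two_words(const uint64_t* a)
{
    uint64_t r;
    __asm__ __volatile__ (
        "ldp x27, x28, [%[a]]\n\t"   /* load both words into safe scratch regs */
        "add %[r], x27, x28\n\t"     /* x29 is deliberately left untouched      */
        : [r] "=r" (r)
        : [a] "r" (a)
        : "memory", "x27", "x28"
    );
    return r;
}
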
@@ -5444,6 +5446,7 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) #ifdef HAVE_FFDHE_2048 static void sp_2048_lshift_32(sp_digit* r, sp_digit* a, byte n) { + word64 n64 = n; __asm__ __volatile__ ( "mov x6, 63\n\t" "sub x6, x6, %[n]\n\t" @@ -5640,7 +5643,7 @@ static void sp_2048_lshift_32(sp_digit* r, sp_digit* a, byte n) "str x2, [%[r]]\n\t" "str x3, [%[r], 8]\n\t" : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : [r] "r" (r), [a] "r" (a), [n] "r" (n64) : "memory", "x2", "x3", "x4", "x5", "x6" ); } @@ -10488,7 +10491,7 @@ static int64_t sp_3072_cmp_24(const sp_digit* a, const sp_digit* b) /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -12080,7 +12083,7 @@ static int64_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -12308,7 +12311,7 @@ static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -13236,6 +13239,7 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) #ifdef HAVE_FFDHE_3072 static void sp_3072_lshift_48(sp_digit* r, sp_digit* a, byte n) { + word64 n64 = n; __asm__ __volatile__ ( "mov x6, 63\n\t" "sub x6, x6, %[n]\n\t" @@ -13528,7 +13532,7 @@ static void sp_3072_lshift_48(sp_digit* r, sp_digit* a, byte n) "str x4, [%[r]]\n\t" "str x2, [%[r], 8]\n\t" : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : [r] "r" (r), [a] "r" (a), [n] "r" (n64) : "memory", "x2", "x3", "x4", "x5", "x6" ); } @@ -17004,7 +17008,7 @@ static int64_t sp_4096_cmp_64(const sp_digit* a, const sp_digit* b) /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -17272,7 +17276,7 @@ static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -18228,6 +18232,7 @@ int sp_ModExp_4096(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) #ifdef HAVE_FFDHE_4096 static void sp_4096_lshift_64(sp_digit* r, sp_digit* a, byte n) { + word64 n64 = n; __asm__ __volatile__ ( "mov x6, 63\n\t" "sub x6, x6, %[n]\n\t" @@ -18616,7 +18621,7 @@ static void sp_4096_lshift_64(sp_digit* r, sp_digit* a, byte n) "str x3, [%[r]]\n\t" "str x4, [%[r], 8]\n\t" : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : [r] "r" (r), [a] "r" (a), [n] "r" (n64) : "memory", "x2", "x3", "x4", "x5", "x6" ); } @@ -36663,7 +36668,7 @@ static void sp_256_mask_4(sp_digit* r, const sp_digit* a, sp_digit m) /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. 
+ * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division. @@ -39178,6 +39183,8 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) #else /* Square a and put result in r. (r = a * a) + * + * All registers version. * * r A single precision integer. * a A single precision integer. @@ -39185,93 +39192,93 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( - "ldp x17, x19, [%[a], 0]\n\t" - "ldp x20, x21, [%[a], 16]\n\t" - "ldp x22, x23, [%[a], 32]\n\t" + "ldp x16, x17, [%[a], 0]\n\t" + "ldp x19, x20, [%[a], 16]\n\t" + "ldp x21, x22, [%[a], 32]\n\t" "# A[0] * A[1]\n\t" - "mul x3, x17, x19\n\t" - "umulh x7, x17, x19\n\t" + "mul x6, x16, x17\n\t" + "umulh x7, x16, x17\n\t" "# A[0] * A[2]\n\t" - "mul x4, x17, x20\n\t" - "umulh x5, x17, x20\n\t" + "mul x4, x16, x19\n\t" + "umulh x5, x16, x19\n\t" "adds x7, x7, x4\n\t" "# A[0] * A[3]\n\t" - "mul x4, x17, x21\n\t" + "mul x4, x16, x20\n\t" "adc x8, xzr, x5\n\t" - "umulh x5, x17, x21\n\t" + "umulh x5, x16, x20\n\t" "adds x8, x8, x4\n\t" "# A[1] * A[2]\n\t" - "mul x4, x19, x20\n\t" + "mul x4, x17, x19\n\t" "adc x9, xzr, x5\n\t" - "umulh x5, x19, x20\n\t" + "umulh x5, x17, x19\n\t" "adds x8, x8, x4\n\t" "# A[0] * A[4]\n\t" - "mul x4, x17, x22\n\t" + "mul x4, x16, x21\n\t" "adcs x9, x9, x5\n\t" - "umulh x5, x17, x22\n\t" + "umulh x5, x16, x21\n\t" "adc x10, xzr, xzr\n\t" "adds x9, x9, x4\n\t" "# A[1] * A[3]\n\t" - "mul x4, x19, x21\n\t" + "mul x4, x17, x20\n\t" "adc x10, x10, x5\n\t" - "umulh x5, x19, x21\n\t" + "umulh x5, x17, x20\n\t" "adds x9, x9, x4\n\t" "# A[0] * A[5]\n\t" - "mul x4, x17, x23\n\t" + "mul x4, x16, x22\n\t" "adcs x10, x10, x5\n\t" - "umulh x5, x17, x23\n\t" + "umulh x5, x16, x22\n\t" "adc x11, xzr, xzr\n\t" "adds x10, x10, x4\n\t" "# A[1] * A[4]\n\t" - "mul x4, x19, x22\n\t" + "mul x4, x17, x21\n\t" "adc x11, x11, x5\n\t" - "umulh x5, x19, x22\n\t" + "umulh x5, x17, x21\n\t" "adds x10, x10, x4\n\t" "# A[2] * A[3]\n\t" - "mul x4, x20, x21\n\t" + "mul x4, x19, x20\n\t" "adcs x11, x11, x5\n\t" - "umulh x5, x20, x21\n\t" + "umulh x5, x19, x20\n\t" "adc x12, xzr, xzr\n\t" "adds x10, x10, x4\n\t" "# A[1] * A[5]\n\t" - "mul x4, x19, x23\n\t" + "mul x4, x17, x22\n\t" "adcs x11, x11, x5\n\t" - "umulh x5, x19, x23\n\t" + "umulh x5, x17, x22\n\t" "adc x12, x12, xzr\n\t" "adds x11, x11, x4\n\t" "# A[2] * A[4]\n\t" - "mul x4, x20, x22\n\t" + "mul x4, x19, x21\n\t" "adcs x12, x12, x5\n\t" - "umulh x5, x20, x22\n\t" + "umulh x5, x19, x21\n\t" "adc x13, xzr, xzr\n\t" "adds x11, x11, x4\n\t" "# A[2] * A[5]\n\t" - "mul x4, x20, x23\n\t" + "mul x4, x19, x22\n\t" "adcs x12, x12, x5\n\t" - "umulh x5, x20, x23\n\t" + "umulh x5, x19, x22\n\t" "adc x13, x13, xzr\n\t" "adds x12, x12, x4\n\t" "# A[3] * A[4]\n\t" - "mul x4, x21, x22\n\t" + "mul x4, x20, x21\n\t" "adcs x13, x13, x5\n\t" - "umulh x5, x21, x22\n\t" + "umulh x5, x20, x21\n\t" "adc x14, xzr, xzr\n\t" "adds x12, x12, x4\n\t" "# A[3] * A[5]\n\t" - "mul x4, x21, x23\n\t" + "mul x4, x20, x22\n\t" "adcs x13, x13, x5\n\t" - "umulh x5, x21, x23\n\t" + "umulh x5, x20, x22\n\t" "adc x14, x14, xzr\n\t" "adds x13, x13, x4\n\t" "# A[4] * A[5]\n\t" - "mul x4, x22, x23\n\t" + "mul x4, x21, x22\n\t" "adcs x14, x14, x5\n\t" - "umulh x5, x22, x23\n\t" + "umulh x5, x21, x22\n\t" "adc x15, xzr, xzr\n\t" "adds x14, x14, x4\n\t" "adc x15, x15, x5\n\t" "# Double\n\t" - "adds x3, x3, x3\n\t" + "adds x6, x6, x6\n\t" "adcs x7, x7, x7\n\t" "adcs x8, x8, x8\n\t" "adcs x9, x9, 
x9\n\t" @@ -39281,37 +39288,37 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) "adcs x13, x13, x13\n\t" "adcs x14, x14, x14\n\t" "# A[0] * A[0]\n\t" - "mul x2, x17, x17\n\t" + "mul x5, x16, x16\n\t" "adcs x15, x15, x15\n\t" - "umulh x4, x17, x17\n\t" + "umulh x2, x16, x16\n\t" "cset x16, cs\n\t" "# A[1] * A[1]\n\t" - "mul x5, x19, x19\n\t" - "adds x3, x3, x4\n\t" - "umulh x6, x19, x19\n\t" - "adcs x7, x7, x5\n\t" + "mul x3, x17, x17\n\t" + "adds x6, x6, x2\n\t" + "umulh x4, x17, x17\n\t" + "adcs x7, x7, x3\n\t" "# A[2] * A[2]\n\t" - "mul x4, x20, x20\n\t" - "adcs x8, x8, x6\n\t" - "umulh x5, x20, x20\n\t" - "adcs x9, x9, x4\n\t" + "mul x2, x19, x19\n\t" + "adcs x8, x8, x4\n\t" + "umulh x3, x19, x19\n\t" + "adcs x9, x9, x2\n\t" "# A[3] * A[3]\n\t" - "mul x6, x21, x21\n\t" - "adcs x10, x10, x5\n\t" - "umulh x4, x21, x21\n\t" - "adcs x11, x11, x6\n\t" + "mul x4, x20, x20\n\t" + "adcs x10, x10, x3\n\t" + "umulh x2, x20, x20\n\t" + "adcs x11, x11, x4\n\t" "# A[4] * A[4]\n\t" - "mul x5, x22, x22\n\t" - "adcs x12, x12, x4\n\t" - "umulh x6, x22, x22\n\t" - "adcs x13, x13, x5\n\t" + "mul x3, x21, x21\n\t" + "adcs x12, x12, x2\n\t" + "umulh x4, x21, x21\n\t" + "adcs x13, x13, x3\n\t" "# A[5] * A[5]\n\t" - "mul x4, x23, x23\n\t" - "adcs x14, x14, x6\n\t" - "umulh x5, x23, x23\n\t" - "adcs x15, x15, x4\n\t" - "stp x2, x3, [%[r], 0]\n\t" - "adc x16, x16, x5\n\t" + "mul x2, x22, x22\n\t" + "adcs x14, x14, x4\n\t" + "umulh x3, x22, x22\n\t" + "adcs x15, x15, x2\n\t" + "stp x5, x6, [%[r], 0]\n\t" + "adc x16, x16, x3\n\t" "stp x7, x8, [%[r], 16]\n\t" "stp x9, x10, [%[r], 32]\n\t" "stp x11, x12, [%[r], 48]\n\t" @@ -39319,7 +39326,7 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) "stp x15, x16, [%[r], 80]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x4", "x5", "x6", "x2", "x3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x16", "x17", "x19", "x20", "x21", "x22" ); } @@ -43213,7 +43220,7 @@ static void sp_384_mask_6(sp_digit* r, const sp_digit* a, sp_digit m) /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * a Nmber to be divided. + * a Number to be divided. * d Number to divide with. * m Multiplier result. * r Remainder from the division.