Merge pull request #6030 from SparkiDev/sp_ecc_p256_arm64

SP AArch64 ECC P256: Montgomery reduce fix
This commit is contained in:
David Garske
2023-01-30 15:07:29 -08:00
committed by GitHub

View File

@@ -22541,16 +22541,16 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const
"# - a[0] << 32 << 192\n\t" "# - a[0] << 32 << 192\n\t"
"# + (a[0] * 2) << 192\n\t" "# + (a[0] * 2) << 192\n\t"
"# a[0]-a[2] << 32\n\t" "# a[0]-a[2] << 32\n\t"
"extr x10, x10, x9, 32\n\t" "extr x22, x10, x9, 32\n\t"
"add x7, x11, x8\n\t" "add x7, x11, x8\n\t"
"extr x9, x9, x8, 32\n\t" "extr x21, x9, x8, 32\n\t"
"add x7, x7, x8\n\t" "add x7, x7, x8\n\t"
"# + a[0]-a[2] << 32 << 64\n\t" "# + a[0]-a[2] << 32 << 64\n\t"
"# - a[0] << 32 << 192\n\t" "# - a[0] << 32 << 192\n\t"
"adds x5, x5, x8, lsl #32\n\t" "adds x5, x5, x8, lsl #32\n\t"
"sub x7, x7, x8, lsl #32\n\t" "sub x7, x7, x8, lsl #32\n\t"
"adcs x6, x6, x9\n\t" "adcs x6, x6, x21\n\t"
"adc x7, x7, x10\n\t" "adc x7, x7, x22\n\t"
"# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t" "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t"
"# a += mu << 256\n\t" "# a += mu << 256\n\t"
"adds x12, x12, x4\n\t" "adds x12, x12, x4\n\t"
@@ -22689,16 +22689,16 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const
"# - a[0] << 32 << 192\n\t" "# - a[0] << 32 << 192\n\t"
"# + (a[0] * 2) << 192\n\t" "# + (a[0] * 2) << 192\n\t"
"# a[0]-a[2] << 32\n\t" "# a[0]-a[2] << 32\n\t"
"extr x10, x10, x9, 32\n\t" "extr x21, x10, x9, 32\n\t"
"add x6, x11, x8\n\t" "add x6, x11, x8\n\t"
"extr x9, x9, x8, 32\n\t" "extr x20, x9, x8, 32\n\t"
"add x6, x6, x8\n\t" "add x6, x6, x8\n\t"
"# + a[0]-a[2] << 32 << 64\n\t" "# + a[0]-a[2] << 32 << 64\n\t"
"# - a[0] << 32 << 192\n\t" "# - a[0] << 32 << 192\n\t"
"adds x4, x4, x8, lsl #32\n\t" "adds x4, x4, x8, lsl #32\n\t"
"sub x6, x6, x8, lsl #32\n\t" "sub x6, x6, x8, lsl #32\n\t"
"adcs x5, x5, x9\n\t" "adcs x5, x5, x20\n\t"
"adc x6, x6, x10\n\t" "adc x6, x6, x21\n\t"
"# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t" "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t"
"# a += mu << 256\n\t" "# a += mu << 256\n\t"
"adds x12, x12, x3\n\t" "adds x12, x12, x3\n\t"
@@ -22752,7 +22752,7 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const
"stp x14, x15, [%[r], 16]\n\t" "stp x14, x15, [%[r], 16]\n\t"
: :
: [r] "r" (r), [a] "r" (a) : [r] "r" (r), [a] "r" (a)
: "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "cc" : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc"
); );
} }
@@ -22994,16 +22994,16 @@ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m,
"# - a[0] << 32 << 192\n\t" "# - a[0] << 32 << 192\n\t"
"# + (a[0] * 2) << 192\n\t" "# + (a[0] * 2) << 192\n\t"
"# a[0]-a[2] << 32\n\t" "# a[0]-a[2] << 32\n\t"
"extr x12, x12, x11, 32\n\t" "extr x20, x12, x11, 32\n\t"
"add x6, x13, x10\n\t" "add x6, x13, x10\n\t"
"extr x11, x11, x10, 32\n\t" "extr x19, x11, x10, 32\n\t"
"add x6, x6, x10\n\t" "add x6, x6, x10\n\t"
"# + a[0]-a[2] << 32 << 64\n\t" "# + a[0]-a[2] << 32 << 64\n\t"
"# - a[0] << 32 << 192\n\t" "# - a[0] << 32 << 192\n\t"
"adds x4, x4, x10, lsl #32\n\t" "adds x4, x4, x10, lsl #32\n\t"
"sub x6, x6, x10, lsl #32\n\t" "sub x6, x6, x10, lsl #32\n\t"
"adcs x5, x5, x11\n\t" "adcs x5, x5, x19\n\t"
"adc x6, x6, x12\n\t" "adc x6, x6, x20\n\t"
"# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t" "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t"
"# a += mu << 256\n\t" "# a += mu << 256\n\t"
"adds x14, x14, x3\n\t" "adds x14, x14, x3\n\t"
@@ -23057,7 +23057,7 @@ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m,
"stp x16, x17, [%[a], 16]\n\t" "stp x16, x17, [%[a], 16]\n\t"
: :
: [a] "r" (a), [m] "r" (m), [mp] "r" (mp) : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
: "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x19", "x20", "cc"
); );
} }
/* Reduce the number back to 256 bits using Montgomery reduction. /* Reduce the number back to 256 bits using Montgomery reduction.