Merge pull request #4681 from SparkiDev/sp_arm64_p384_mr

SP ARM64: P-384 prime specific Montgomery Reduction
This commit is contained in:
David Garske
2021-12-21 13:50:33 -08:00
committed by GitHub

View File

@ -40527,8 +40527,6 @@ static void sp_384_cond_copy_6(sp_digit* r, const sp_digit* a, sp_digit m)
);
}
#define sp_384_mont_reduce_order_6 sp_384_mont_reduce_6
/* Reduce the number back to 384 bits using Montgomery reduction.
*
* a A single precision number to reduce in place.
@ -40538,6 +40536,143 @@ static void sp_384_cond_copy_6(sp_digit* r, const sp_digit* a, sp_digit m)
SP_NOINLINE static void sp_384_mont_reduce_6(sp_digit* a, const sp_digit* m,
        sp_digit mp)
{
    /*
     * In-place reduction written as straight-line AArch64 assembly.
     *
     * Memory access: reads the twelve 64-bit words a[0..11] (byte offsets
     * 0..95) and writes the six result words back to a[0..5].
     *
     * m and mp are bound as input operands but are never referenced by the
     * instruction text below; all constants are produced with shifts, adds
     * and subtracts.
     * NOTE(review): this presumably relies on m being the P-384 prime and mp
     * its matching Montgomery constant - confirm against callers.
     *
     * Register roles (as used by the code below):
     *   x7..x14  sliding window of eight words of a; the two lowest words of
     *            each round are dead afterwards and are reloaded with the
     *            next two words of a
     *   x1, x2   the two-word multiplier ("mu" in the embedded comments)
     *   x3..x5   x1:x2 shifted left by 32 bits, later reused as a subtrahend
     *   x6       carry/borrow word carried from one round to the next
     */
    __asm__ __volatile__ (
        /* Load a[0..5]; clear the running carry word. */
        "ldp	x7, x8, [%[a], #0]\n\t"
        "ldp	x9, x10, [%[a], #16]\n\t"
        "ldp	x11, x12, [%[a], #32]\n\t"
        "mov	x6, xzr\n\t"
        "# a[0-7] += m[0-5] * mu[0..1] = m[0-5] * (a[0..1] * mp)\n\t"
        /* Round 1: load a[6..7], then derive x1:x2 from x7:x8. */
        "ldp	x13, x14, [%[a], #48]\n\t"
        "lsl	x2, x8, 32\n\t"
        "lsl	x1, x7, 32\n\t"
        "orr	x2, x2, x7, lsr 32\n\t"
        "adds	x1, x1, x7\n\t"
        "adc	x2, x2, x8\n\t"
        "add	x2, x2, x7\n\t"
        /* x3:x4:x5 = x1:x2 shifted left by 32 bits. */
        "lsl	x3, x1, 32\n\t"
        "lsl	x4, x2, 32\n\t"
        "orr	x4, x4, x1, lsr 32\n\t"
        "lsr	x5, x2, 32\n\t"
        /* Add the shifted value at the bottom of the window and x1:x2 at */
        /* the top two words; the final carry goes into x6.               */
        "adds	x7, x7, x3\n\t"
        "adcs	x8, x8, x4\n\t"
        "adcs	x9, x9, x5\n\t"
        "adcs	x10, x10, xzr\n\t"
        "adcs	x11, x11, xzr\n\t"
        "adcs	x12, x12, xzr\n\t"
        "adcs	x13, x13, x1\n\t"
        "adcs	x14, x14, x2\n\t"
        "adcs	x6, x6, xzr\n\t"
        /* Build the subtrahend in x4:x5:x2 (x3 only feeds the carry) and */
        /* subtract it starting at the third window word, borrow into x6. */
        "adds	x3, x3, x2\n\t"
        "adcs	x4, x4, x1\n\t"
        "adcs	x5, x5, x2\n\t"
        "adcs	x2, xzr, xzr\n\t"
        "subs	x9, x9, x4\n\t"
        "sbcs	x10, x10, x5\n\t"
        "sbcs	x11, x11, x2\n\t"
        "sbcs	x12, x12, xzr\n\t"
        "sbcs	x13, x13, xzr\n\t"
        "sbcs	x14, x14, xzr\n\t"
        "sbc	x6, x6, xzr\n\t"
        "# a[2-9] += m[0-5] * mu[0..1] = m[0-5] * (a[2..3] * mp)\n\t"
        /* Round 2: same steps with the window moved up two words;        */
        /* x7:x8 are reloaded with a[8..9] and x1:x2 come from x9:x10.    */
        "ldp	x7, x8, [%[a], #64]\n\t"
        "lsl	x2, x10, 32\n\t"
        "lsl	x1, x9, 32\n\t"
        "orr	x2, x2, x9, lsr 32\n\t"
        "adds	x1, x1, x9\n\t"
        "adc	x2, x2, x10\n\t"
        "add	x2, x2, x9\n\t"
        "lsl	x3, x1, 32\n\t"
        "lsl	x4, x2, 32\n\t"
        "orr	x4, x4, x1, lsr 32\n\t"
        "lsr	x5, x2, 32\n\t"
        /* Fold the carry word of the previous round into the new words. */
        "adds	x7, x7, x6\n\t"
        "adcs	x8, x8, xzr\n\t"
        "adc	x6, xzr, xzr\n\t"
        "adds	x9, x9, x3\n\t"
        "adcs	x10, x10, x4\n\t"
        "adcs	x11, x11, x5\n\t"
        "adcs	x12, x12, xzr\n\t"
        "adcs	x13, x13, xzr\n\t"
        "adcs	x14, x14, xzr\n\t"
        "adcs	x7, x7, x1\n\t"
        "adcs	x8, x8, x2\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adds	x3, x3, x2\n\t"
        "adcs	x4, x4, x1\n\t"
        "adcs	x5, x5, x2\n\t"
        "adcs	x2, xzr, xzr\n\t"
        "subs	x11, x11, x4\n\t"
        "sbcs	x12, x12, x5\n\t"
        "sbcs	x13, x13, x2\n\t"
        "sbcs	x14, x14, xzr\n\t"
        "sbcs	x7, x7, xzr\n\t"
        "sbcs	x8, x8, xzr\n\t"
        "sbc	x6, x6, xzr\n\t"
        "# a[4-11] += m[0-5] * mu[0..1] = m[0-5] * (a[4..5] * mp)\n\t"
        /* Round 3: x9:x10 are reloaded with a[10..11] and x1:x2 come     */
        /* from x11:x12.                                                  */
        "ldp	x9, x10, [%[a], #80]\n\t"
        "lsl	x2, x12, 32\n\t"
        "lsl	x1, x11, 32\n\t"
        "orr	x2, x2, x11, lsr 32\n\t"
        "adds	x1, x1, x11\n\t"
        "adc	x2, x2, x12\n\t"
        "add	x2, x2, x11\n\t"
        "lsl	x3, x1, 32\n\t"
        "lsl	x4, x2, 32\n\t"
        "orr	x4, x4, x1, lsr 32\n\t"
        "lsr	x5, x2, 32\n\t"
        "adds	x9, x9, x6\n\t"
        "adcs	x10, x10, xzr\n\t"
        "adc	x6, xzr, xzr\n\t"
        "adds	x11, x11, x3\n\t"
        "adcs	x12, x12, x4\n\t"
        "adcs	x13, x13, x5\n\t"
        "adcs	x14, x14, xzr\n\t"
        "adcs	x7, x7, xzr\n\t"
        "adcs	x8, x8, xzr\n\t"
        "adcs	x9, x9, x1\n\t"
        "adcs	x10, x10, x2\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adds	x3, x3, x2\n\t"
        "adcs	x4, x4, x1\n\t"
        "adcs	x5, x5, x2\n\t"
        "adcs	x2, xzr, xzr\n\t"
        "subs	x13, x13, x4\n\t"
        "sbcs	x14, x14, x5\n\t"
        "sbcs	x7, x7, x2\n\t"
        "sbcs	x8, x8, xzr\n\t"
        "sbcs	x9, x9, xzr\n\t"
        "sbcs	x10, x10, xzr\n\t"
        "sbc	x6, x6, xzr\n\t"
        "# Subtract mod if carry\n\t"
        /* Negate x6 to form a mask, derive six words from it (mask >> 32, */
        /* mask << 32, mask & -2, mask, mask, mask) and subtract them.     */
        /* With x6 == 1 those words are 0x00000000ffffffff,                */
        /* 0xffffffff00000000, 0xfffffffffffffffe and three all-ones       */
        /* words; with x6 == 0 they are all zero and nothing is removed.   */
        /* NOTE(review): x6 is expected to be 0 or 1 here - confirm.       */
        "neg	x6, x6\n\t"
        "mov	x5, -2\n\t"
        "lsr	x3, x6, 32\n\t"
        "lsl	x4, x6, 32\n\t"
        "and	x5, x5, x6\n\t"
        "subs	x13, x13, x3\n\t"
        "sbcs	x14, x14, x4\n\t"
        "sbcs	x7, x7, x5\n\t"
        "sbcs	x8, x8, x6\n\t"
        "sbcs	x9, x9, x6\n\t"
        "sbc	x10, x10, x6\n\t"
        /* Store the six result words to a[0..5]. */
        "stp	x13, x14, [%[a], #0]\n\t"
        "stp	x7, x8, [%[a], #16]\n\t"
        "stp	x9, x10, [%[a], #32]\n\t"
        :
        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
        /* "cc": the flag-setting adds/adcs/subs/sbcs instructions above  */
        /* overwrite the condition flags, so the compiler must be told.   */
        : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
          "x10", "x11", "x12", "x13", "x14", "cc"
    );
}
/* Reduce the number back to 384 bits using Montgomery reduction.
*
* a A single precision number to reduce in place.
* m The single precision number representing the modulus.
* mp The digit representing the negative inverse of m mod 2^n.
*/
SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* m,
sp_digit mp)
{
__asm__ __volatile__ (
"ldp x14, x15, [%[m], 0]\n\t"