Mirror of https://github.com/wolfSSL/wolfssl.git (synced 2025-07-29 18:27:29 +02:00)
Merge pull request #4681 from SparkiDev/sp_arm64_p384_mr
SP ARM64: P-384 prime specific Montgomery Reduction
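For context before the diff: the new assembly is a P-384 specific form of word-wise Montgomery reduction. Below is a minimal, generic C sketch of that textbook algorithm; the name mont_reduce_384_sketch and the layout are illustrative assumptions, not wolfSSL code. The commit's assembly computes the same reduction, but two words per step, and with the mu * m products turned into 32-bit shifts and adds because of the prime's shape.

#include <stdint.h>

/* Illustrative sketch only: generic word-wise Montgomery reduction for a
 * 6-word (384-bit) modulus.
 * a  holds a 12-word value with a < m * 2^384 and is reduced in place.
 * mp is -(m[0]^-1) mod 2^64.
 */
static void mont_reduce_384_sketch(uint64_t a[12], const uint64_t m[6],
                                   uint64_t mp)
{
    uint64_t over = 0;                      /* carry beyond the current top word */
    for (int i = 0; i < 6; i++) {
        uint64_t mu = a[i] * mp;            /* chosen so that a[i] becomes 0 */
        uint64_t c = 0;
        for (int j = 0; j < 6; j++) {       /* a += mu * m * 2^(64*i) */
            unsigned __int128 t = (unsigned __int128)mu * m[j] + a[i + j] + c;
            a[i + j] = (uint64_t)t;
            c = (uint64_t)(t >> 64);
        }
        unsigned __int128 t = (unsigned __int128)a[i + 6] + c + over;
        a[i + 6] = (uint64_t)t;
        over = (uint64_t)(t >> 64);
    }
    /* The reduced value is now in a[6..11] (plus the final 'over' bit);
     * a constant-time conditional subtraction of m, as in the assembly's
     * final "Subtract mod if carry" step, brings it below m. */
}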
@@ -40527,8 +40527,6 @@ static void sp_384_cond_copy_6(sp_digit* r, const sp_digit* a, sp_digit m)
    );
}

#define sp_384_mont_reduce_order_6 sp_384_mont_reduce_6

/* Reduce the number back to 384 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
@@ -40538,6 +40536,143 @@ static void sp_384_cond_copy_6(sp_digit* r, const sp_digit* a, sp_digit m)
SP_NOINLINE static void sp_384_mont_reduce_6(sp_digit* a, const sp_digit* m,
        sp_digit mp)
{
    __asm__ __volatile__ (
        "ldp x7, x8, [%[a], #0]\n\t"
        "ldp x9, x10, [%[a], #16]\n\t"
        "ldp x11, x12, [%[a], #32]\n\t"
        "mov x6, xzr\n\t"
        "# a[0-7] += m[0-5] * mu[0..1] = m[0-5] * (a[0..1] * mp)\n\t"
        "ldp x13, x14, [%[a], #48]\n\t"
        "lsl x2, x8, 32\n\t"
        "lsl x1, x7, 32\n\t"
        "orr x2, x2, x7, lsr 32\n\t"
        "adds x1, x1, x7\n\t"
        "adc x2, x2, x8\n\t"
        "add x2, x2, x7\n\t"
        "lsl x3, x1, 32\n\t"
        "lsl x4, x2, 32\n\t"
        "orr x4, x4, x1, lsr 32\n\t"
        "lsr x5, x2, 32\n\t"
        "adds x7, x7, x3\n\t"
        "adcs x8, x8, x4\n\t"
        "adcs x9, x9, x5\n\t"
        "adcs x10, x10, xzr\n\t"
        "adcs x11, x11, xzr\n\t"
        "adcs x12, x12, xzr\n\t"
        "adcs x13, x13, x1\n\t"
        "adcs x14, x14, x2\n\t"
        "adcs x6, x6, xzr\n\t"
        "adds x3, x3, x2\n\t"
        "adcs x4, x4, x1\n\t"
        "adcs x5, x5, x2\n\t"
        "adcs x2, xzr, xzr\n\t"
        "subs x9, x9, x4\n\t"
        "sbcs x10, x10, x5\n\t"
        "sbcs x11, x11, x2\n\t"
        "sbcs x12, x12, xzr\n\t"
        "sbcs x13, x13, xzr\n\t"
        "sbcs x14, x14, xzr\n\t"
        "sbc x6, x6, xzr\n\t"
"# a[2-9] += m[0-5] * mu[0..1] = m[0-5] * (a[2..3] * mp)\n\t"
|
||||
"ldp x7, x8, [%[a], #64]\n\t"
|
||||
"lsl x2, x10, 32\n\t"
|
||||
"lsl x1, x9, 32\n\t"
|
||||
"orr x2, x2, x9, lsr 32\n\t"
|
||||
"adds x1, x1, x9\n\t"
|
||||
"adc x2, x2, x10\n\t"
|
||||
"add x2, x2, x9\n\t"
|
||||
"lsl x3, x1, 32\n\t"
|
||||
"lsl x4, x2, 32\n\t"
|
||||
"orr x4, x4, x1, lsr 32\n\t"
|
||||
"lsr x5, x2, 32\n\t"
|
||||
"adds x7, x7, x6\n\t"
|
||||
"adcs x8, x8, xzr\n\t"
|
||||
"adc x6, xzr, xzr\n\t"
|
||||
"adds x9, x9, x3\n\t"
|
||||
"adcs x10, x10, x4\n\t"
|
||||
"adcs x11, x11, x5\n\t"
|
||||
"adcs x12, x12, xzr\n\t"
|
||||
"adcs x13, x13, xzr\n\t"
|
||||
"adcs x14, x14, xzr\n\t"
|
||||
"adcs x7, x7, x1\n\t"
|
||||
"adcs x8, x8, x2\n\t"
|
||||
"adcs x6, x6, xzr\n\t"
|
||||
"adds x3, x3, x2\n\t"
|
||||
"adcs x4, x4, x1\n\t"
|
||||
"adcs x5, x5, x2\n\t"
|
||||
"adcs x2, xzr, xzr\n\t"
|
||||
"subs x11, x11, x4\n\t"
|
||||
"sbcs x12, x12, x5\n\t"
|
||||
"sbcs x13, x13, x2\n\t"
|
||||
"sbcs x14, x14, xzr\n\t"
|
||||
"sbcs x7, x7, xzr\n\t"
|
||||
"sbcs x8, x8, xzr\n\t"
|
||||
"sbc x6, x6, xzr\n\t"
|
||||
"# a[4-11] += m[0-5] * mu[0..1] = m[0-5] * (a[4..5] * mp)\n\t"
|
||||
"ldp x9, x10, [%[a], #80]\n\t"
|
||||
"lsl x2, x12, 32\n\t"
|
||||
"lsl x1, x11, 32\n\t"
|
||||
"orr x2, x2, x11, lsr 32\n\t"
|
||||
"adds x1, x1, x11\n\t"
|
||||
"adc x2, x2, x12\n\t"
|
||||
"add x2, x2, x11\n\t"
|
||||
"lsl x3, x1, 32\n\t"
|
||||
"lsl x4, x2, 32\n\t"
|
||||
"orr x4, x4, x1, lsr 32\n\t"
|
||||
"lsr x5, x2, 32\n\t"
|
||||
"adds x9, x9, x6\n\t"
|
||||
"adcs x10, x10, xzr\n\t"
|
||||
"adc x6, xzr, xzr\n\t"
|
||||
"adds x11, x11, x3\n\t"
|
||||
"adcs x12, x12, x4\n\t"
|
||||
"adcs x13, x13, x5\n\t"
|
||||
"adcs x14, x14, xzr\n\t"
|
||||
"adcs x7, x7, xzr\n\t"
|
||||
"adcs x8, x8, xzr\n\t"
|
||||
"adcs x9, x9, x1\n\t"
|
||||
"adcs x10, x10, x2\n\t"
|
||||
"adcs x6, x6, xzr\n\t"
|
||||
"adds x3, x3, x2\n\t"
|
||||
"adcs x4, x4, x1\n\t"
|
||||
"adcs x5, x5, x2\n\t"
|
||||
"adcs x2, xzr, xzr\n\t"
|
||||
"subs x13, x13, x4\n\t"
|
||||
"sbcs x14, x14, x5\n\t"
|
||||
"sbcs x7, x7, x2\n\t"
|
||||
"sbcs x8, x8, xzr\n\t"
|
||||
"sbcs x9, x9, xzr\n\t"
|
||||
"sbcs x10, x10, xzr\n\t"
|
||||
"sbc x6, x6, xzr\n\t"
|
||||
"# Subtract mod if carry\n\t"
|
||||
"neg x6, x6\n\t"
|
||||
"mov x5, -2\n\t"
|
||||
"lsr x3, x6, 32\n\t"
|
||||
"lsl x4, x6, 32\n\t"
|
||||
"and x5, x5, x6\n\t"
|
||||
"subs x13, x13, x3\n\t"
|
||||
"sbcs x14, x14, x4\n\t"
|
||||
"sbcs x7, x7, x5\n\t"
|
||||
"sbcs x8, x8, x6\n\t"
|
||||
"sbcs x9, x9, x6\n\t"
|
||||
"sbc x10, x10, x6\n\t"
|
||||
"stp x13, x14, [%[a], #0]\n\t"
|
||||
"stp x7, x8, [%[a], #16]\n\t"
|
||||
"stp x9, x10, [%[a], #32]\n\t"
|
||||
:
|
||||
: [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
|
||||
: "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14"
|
||||
);
|
||||
}
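/* Exposition (not in the original commit): the shift/add sequences above
 * work because the P-384 prime has the special form
 *     p = 2^384 - 2^128 - 2^96 + 2^32 - 1,
 * i.e. its little-endian 64-bit digits are
 *     0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe,
 *     0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
 * and the word-level Montgomery constant is mp = 0x0000000100000001,
 * since (2^32 - 1) * (2^32 + 1) = 2^64 - 1 == -1 (mod 2^64).  Multiplying
 * the two-word mu by p therefore needs only 32-bit shifts, ors and
 * add/subtract carry chains instead of full 64x64-bit multiplications.
 */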
/* Reduce the number back to 384 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* m,
        sp_digit mp)
{

    __asm__ __volatile__ (
        "ldp x14, x15, [%[m], 0]\n\t"
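For orientation, a reducer like this is normally called immediately after a full 6x6-word multiply to form a Montgomery multiplication. The sketch below shows that wrapper pattern; sp_384_mul_6 and the exact signature are assumptions for illustration and are not taken from this diff.

/* Sketch of the usual wrapper pattern (helper names assumed, not from this
 * diff): multiply two Montgomery-form values, then reduce the 12-word
 * product so the result is again a 6-word Montgomery-form value.
 */
static void sp_384_mont_mul_6_sketch(sp_digit* r, const sp_digit* a,
    const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_384_mul_6(r, a, b);          /* r[0..11] = a * b (assumed helper)  */
    sp_384_mont_reduce_6(r, m, mp); /* r[0..5] = (a * b) / 2^384 mod m    */
}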