From 9f2419246e8da59090ff00a9eb5dfe9faa2f0e19 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Tue, 21 Dec 2021 10:18:12 +1000
Subject: [PATCH] SP ARM64: P-384 prime specific Montgomery Reduction

Improves performance
---
 wolfcrypt/src/sp_arm64.c | 139 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 2 deletions(-)

diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c
index 8280e5696..adab94b4b 100644
--- a/wolfcrypt/src/sp_arm64.c
+++ b/wolfcrypt/src/sp_arm64.c
@@ -40527,8 +40527,6 @@ static void sp_384_cond_copy_6(sp_digit* r, const sp_digit* a, sp_digit m)
     );
 }
 
-#define sp_384_mont_reduce_order_6 sp_384_mont_reduce_6
-
 /* Reduce the number back to 384 bits using Montgomery reduction.
  *
  * a A single precision number to reduce in place.
@@ -40538,6 +40536,143 @@ static void sp_384_cond_copy_6(sp_digit* r, const sp_digit* a, sp_digit m)
 SP_NOINLINE static void sp_384_mont_reduce_6(sp_digit* a, const sp_digit* m,
         sp_digit mp)
 {
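+    /* Note: P-384 prime specific implementation. For this modulus,
+     * mp = 0x0000000100000001, since m[0] = 2^32 - 1 and
+     * (2^32 - 1) * (2^32 + 1) = 2^64 - 1 = -1 mod 2^64. Each
+     * mu = a * mp below is therefore formed with shifts and adds
+     * (a + (a << 32)) instead of multiplies, and the m * mu products
+     * use the fixed words of m, so m itself is never loaded. */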
+    __asm__ __volatile__ (
+        "ldp x7, x8, [%[a], #0]\n\t"
+        "ldp x9, x10, [%[a], #16]\n\t"
+        "ldp x11, x12, [%[a], #32]\n\t"
+        "mov x6, xzr\n\t"
+        "# a[0-7] += m[0-5] * mu[0..1] = m[0-5] * (a[0..1] * mp)\n\t"
+        "ldp x13, x14, [%[a], #48]\n\t"
+        "lsl x2, x8, 32\n\t"
+        "lsl x1, x7, 32\n\t"
+        "orr x2, x2, x7, lsr 32\n\t"
+        "adds x1, x1, x7\n\t"
+        "adc x2, x2, x8\n\t"
+        "add x2, x2, x7\n\t"
+        "lsl x3, x1, 32\n\t"
+        "lsl x4, x2, 32\n\t"
+        "orr x4, x4, x1, lsr 32\n\t"
+        "lsr x5, x2, 32\n\t"
+        "adds x7, x7, x3\n\t"
+        "adcs x8, x8, x4\n\t"
+        "adcs x9, x9, x5\n\t"
+        "adcs x10, x10, xzr\n\t"
+        "adcs x11, x11, xzr\n\t"
+        "adcs x12, x12, xzr\n\t"
+        "adcs x13, x13, x1\n\t"
+        "adcs x14, x14, x2\n\t"
+        "adcs x6, x6, xzr\n\t"
+        "adds x3, x3, x2\n\t"
+        "adcs x4, x4, x1\n\t"
+        "adcs x5, x5, x2\n\t"
+        "adcs x2, xzr, xzr\n\t"
+        "subs x9, x9, x4\n\t"
+        "sbcs x10, x10, x5\n\t"
+        "sbcs x11, x11, x2\n\t"
+        "sbcs x12, x12, xzr\n\t"
+        "sbcs x13, x13, xzr\n\t"
+        "sbcs x14, x14, xzr\n\t"
+        "sbc x6, x6, xzr\n\t"
+        "# a[2-9] += m[0-5] * mu[0..1] = m[0-5] * (a[2..3] * mp)\n\t"
+        "ldp x7, x8, [%[a], #64]\n\t"
+        "lsl x2, x10, 32\n\t"
+        "lsl x1, x9, 32\n\t"
+        "orr x2, x2, x9, lsr 32\n\t"
+        "adds x1, x1, x9\n\t"
+        "adc x2, x2, x10\n\t"
+        "add x2, x2, x9\n\t"
+        "lsl x3, x1, 32\n\t"
+        "lsl x4, x2, 32\n\t"
+        "orr x4, x4, x1, lsr 32\n\t"
+        "lsr x5, x2, 32\n\t"
+        "adds x7, x7, x6\n\t"
+        "adcs x8, x8, xzr\n\t"
+        "adc x6, xzr, xzr\n\t"
+        "adds x9, x9, x3\n\t"
+        "adcs x10, x10, x4\n\t"
+        "adcs x11, x11, x5\n\t"
+        "adcs x12, x12, xzr\n\t"
+        "adcs x13, x13, xzr\n\t"
+        "adcs x14, x14, xzr\n\t"
+        "adcs x7, x7, x1\n\t"
+        "adcs x8, x8, x2\n\t"
+        "adcs x6, x6, xzr\n\t"
+        "adds x3, x3, x2\n\t"
+        "adcs x4, x4, x1\n\t"
+        "adcs x5, x5, x2\n\t"
+        "adcs x2, xzr, xzr\n\t"
+        "subs x11, x11, x4\n\t"
+        "sbcs x12, x12, x5\n\t"
+        "sbcs x13, x13, x2\n\t"
+        "sbcs x14, x14, xzr\n\t"
+        "sbcs x7, x7, xzr\n\t"
+        "sbcs x8, x8, xzr\n\t"
+        "sbc x6, x6, xzr\n\t"
+        "# a[4-11] += m[0-5] * mu[0..1] = m[0-5] * (a[4..5] * mp)\n\t"
+        "ldp x9, x10, [%[a], #80]\n\t"
+        "lsl x2, x12, 32\n\t"
+        "lsl x1, x11, 32\n\t"
+        "orr x2, x2, x11, lsr 32\n\t"
+        "adds x1, x1, x11\n\t"
+        "adc x2, x2, x12\n\t"
+        "add x2, x2, x11\n\t"
+        "lsl x3, x1, 32\n\t"
+        "lsl x4, x2, 32\n\t"
+        "orr x4, x4, x1, lsr 32\n\t"
+        "lsr x5, x2, 32\n\t"
+        "adds x9, x9, x6\n\t"
+        "adcs x10, x10, xzr\n\t"
+        "adc x6, xzr, xzr\n\t"
+        "adds x11, x11, x3\n\t"
+        "adcs x12, x12, x4\n\t"
+        "adcs x13, x13, x5\n\t"
+        "adcs x14, x14, xzr\n\t"
+        "adcs x7, x7, xzr\n\t"
+        "adcs x8, x8, xzr\n\t"
+        "adcs x9, x9, x1\n\t"
+        "adcs x10, x10, x2\n\t"
+        "adcs x6, x6, xzr\n\t"
+        "adds x3, x3, x2\n\t"
+        "adcs x4, x4, x1\n\t"
+        "adcs x5, x5, x2\n\t"
+        "adcs x2, xzr, xzr\n\t"
+        "subs x13, x13, x4\n\t"
+        "sbcs x14, x14, x5\n\t"
+        "sbcs x7, x7, x2\n\t"
+        "sbcs x8, x8, xzr\n\t"
+        "sbcs x9, x9, xzr\n\t"
+        "sbcs x10, x10, xzr\n\t"
+        "sbc x6, x6, xzr\n\t"
+        "# Subtract mod if carry\n\t"
+        "neg x6, x6\n\t"
+        "mov x5, -2\n\t"
+        "lsr x3, x6, 32\n\t"
+        "lsl x4, x6, 32\n\t"
+        "and x5, x5, x6\n\t"
+        "subs x13, x13, x3\n\t"
+        "sbcs x14, x14, x4\n\t"
+        "sbcs x7, x7, x5\n\t"
+        "sbcs x8, x8, x6\n\t"
+        "sbcs x9, x9, x6\n\t"
+        "sbc x10, x10, x6\n\t"
+        "stp x13, x14, [%[a], #0]\n\t"
+        "stp x7, x8, [%[a], #16]\n\t"
+        "stp x9, x10, [%[a], #32]\n\t"
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+        : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14"
+    );
+}
+
+/* Reduce the number back to 384 bits using Montgomery reduction.
+ *
+ * a A single precision number to reduce in place.
+ * m The single precision number representing the modulus.
+ * mp The digit representing the negative inverse of m mod 2^n.
+ */
+SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* m,
+        sp_digit mp)
+{
     __asm__ __volatile__ (
         "ldp x14, x15, [%[m], 0]\n\t"
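+        "# generic reduction retained for the curve order: m is loaded\n\t"
+        "# from memory and mu is computed with mp each step, as the\n\t"
+        "# order has no special form to exploit\n\t"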