diff --git a/src/include.am b/src/include.am
index 50353c02b..ed640e63e 100644
--- a/src/include.am
+++ b/src/include.am
@@ -375,6 +375,7 @@ if BUILD_INTELASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
 else
 if BUILD_ARMASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv7-curve25519.S
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
 else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
@@ -393,6 +394,7 @@ if BUILD_INTELASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
 else
 if BUILD_ARMASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv7-curve25519.S
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
 else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
diff --git a/wolfcrypt/src/fe_operations.c b/wolfcrypt/src/fe_operations.c
index df74a350c..136f40ebf 100644
--- a/wolfcrypt/src/fe_operations.c
+++ b/wolfcrypt/src/fe_operations.c
@@ -42,7 +42,9 @@
 #endif

 #ifdef CURVED25519_X64
-/* Assumbly code in fe_x25519_asm.* */
+/* Assembly code in fe_x25519_asm.* */
+#elif defined(WOLFSSL_ARMASM)
+/* Assembly code in fe_armv[78]_x25519.* */
 #elif defined(CURVED25519_128BIT)
 #include "fe_x25519_128.i"
 #else
diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c
index 537227017..9d2ce8d4b 100644
--- a/wolfcrypt/src/ge_operations.c
+++ b/wolfcrypt/src/ge_operations.c
@@ -42,13 +42,22 @@
 #include
 #endif

-#if defined(CURVED25519_X64) || defined(WOLFSSL_ARMASM)
-    #define CURVED25519_ASM_64BIT
+#if defined(CURVED25519_X64)
+    #define CURVED25519_ASM_64BIT
+    #define CURVED25519_ASM
+#endif
+#if defined(WOLFSSL_ARMASM)
+    #if defined(__aarch64__)
+        #define CURVED25519_ASM_64BIT
+    #else
+        #define CURVED25519_ASM_32BIT
+    #endif
+    #define CURVED25519_ASM
 #endif


 static void ge_p2_0(ge_p2 *);
-#ifndef CURVED25519_ASM_64BIT
+#ifndef CURVED25519_ASM
 static void ge_precomp_0(ge_precomp *);
 #endif
 static void ge_p3_to_p2(ge_p2 *,const ge_p3 *);
@@ -86,6 +95,28 @@ Representations:
 #define ORDER_4 0x1dea2f
 #define ORDER_5 0xa6f7c

+#ifdef CURVED25519_ASM_32BIT
+uint64_t load_3(const unsigned char *in)
+{
+    uint64_t result;
+    result = (uint64_t) in[0];
+    result |= ((uint64_t) in[1]) << 8;
+    result |= ((uint64_t) in[2]) << 16;
+    return result;
+}
+
+
+uint64_t load_4(const unsigned char *in)
+{
+    uint64_t result;
+    result = (uint64_t) in[0];
+    result |= ((uint64_t) in[1]) << 8;
+    result |= ((uint64_t) in[2]) << 16;
+    result |= ((uint64_t) in[3]) << 24;
+    return result;
+}
+#endif
+
 /*
 Input:
   s[0]+256*s[1]+...+256^63*s[63] = s
@@ -932,7 +963,7 @@ r = p + q
 */
 static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
 {
-#ifndef CURVED25519_ASM_64BIT
+#ifndef CURVED25519_ASM
     ge t0;
     fe_add(r->X,p->Y,p->X);
     fe_sub(r->Y,p->Y,p->X);
@@ -952,7 +983,7 @@ static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
 }


-#ifndef CURVED25519_ASM_64BIT
+#ifndef CURVED25519_ASM
 /* ge_scalar mult base */
 static unsigned char equal(signed char b,signed char c)
 {
@@ -3673,6 +3704,2697 @@ static const ge_precomp base[64][8] = {
 },
 },
 };
+#elif defined(CURVED25519_ASM_32BIT)
+static const ge_precomp base[64][8] = {
+{ + { + { -0x0a73c47b, 0x2fbc93c6, -0x0473f1e7, -0x306cd23a, 0x643d42c2, 0x270b4898, 0x33d4ba65, 0x07cf9d3a }, + { -0x28bf6ec2, -0x62efc6fb, -0x2ebf414d, -0x02c660fb, 0x688f8a09, -0x5a3e7bcc, -0x6707ed99, 0x44fd2f92 }, + { 0x4b6fbb59, -0x2442ea99, -0x115d5a16, 0x41e13f00, -0x36a83906, -0x322b62e4, -0x50e91336, 0x4f0ebe1f } + }, + { + { -0x6cc38e29, -0x6ddb1804, 0x7a0ff5b5,
-0x60b9626a, -0x1e29f8fe, 0x5aa69a65, -0x5782d1d2, 0x590c063f }, + { 0x42b4d5a8, -0x75665aa0, 0x4e60acf6, -0x70d47ef4, -0x4e91c856, -0x1f61dc95, 0x69c92555, 0x6bb595a6 }, + { -0x252c97fe, 0x6e347eaa, -0x7c11b7fb, -0x450ca66d, -0x19f897da, 0x3bcabe10, 0x165ed1b8, 0x49314f0a } + }, + { + { 0x4cee9730, -0x50da4f58, -0x1779b476, 0x025a8430, -0x60fe98ce, -0x3ee4affe, -0x657f070c, 0x7a164e1b }, + { -0x5b032d9b, 0x56611fe8, -0x1a3e4583, 0x3bd353fd, 0x214bd6bd, -0x7ece0ce6, 0x555bda62, 0x2ab91587 }, + { -0x0e98b7cc, -0x640dee0c, -0x09d2076b, -0x47b194e9, 0x5b722a4e, -0x282190f9, 0x63bb2a21, 0x549a04b9 } + }, + { + { -0x7103f661, 0x287351b9, 0x7dfd2538, 0x6765c6f4, -0x04f56d9b, -0x35cb72c3, 0x21e58727, 0x680e9103 }, + { 0x056818bf, -0x6a01faf6, 0x5660faa9, 0x327e8971, 0x06a05073, -0x3c171c33, 0x7445a49a, 0x27933f4c }, + { -0x1aebd950, -0x40e1ba14, 0x6dba0f94, -0x1cd439c3, -0x7307ad40, -0x1bd68b2b, -0x4f19b3e8, 0x44f079b1 } + }, + { + { 0x08a5bb33, -0x5ded43bc, -0x38a112fe, -0x72afb73d, 0x5abfec44, -0x22e414f4, 0x46e206eb, 0x2945ccf1 }, + { -0x5bb82946, 0x7f9182c3, 0x4b2729b7, -0x2affeb2f, -0x479b5f79, -0x1cc30ee4, -0x14e4aa0d, 0x154a7e73 }, + { -0x182ffc4d, -0x37cd5e87, 0x00124d7e, 0x5f729d0a, 0x0e6d8ff3, 0x62c1d4a1, 0x38b27a98, 0x68b8ac59 } + }, + { + { 0x77157131, 0x3a0ceeeb, 0x00c8af88, -0x64d8ea77, -0x25a658ca, -0x7f9a4998, -0x5d33c743, 0x51e57bb6 }, + { 0x7b7d8ca4, 0x499806b6, 0x27d22739, 0x575be284, 0x204553b9, -0x44f7a319, -0x51be877c, 0x38b64c41 }, + { 0x689de3a4, -0x7062526f, -0x07046ec9, 0x175f2428, -0x60304678, 0x050ab532, 0x1354c09f, 0x7865dfa2 } + }, + { + { -0x6bb15c41, 0x6b1a5cd0, -0x4c623f2e, 0x7470353a, 0x28542e49, 0x71b25282, 0x283c927e, 0x461bea69 }, + { -0x55cdde4f, -0x4590d366, 0x3bba23a7, 0x6ca02153, -0x6de6d3c6, -0x621589b1, 0x2e5317e0, 0x1d6edd5d }, + { -0x54f025ca, 0x217a8aac, 0x3d3549c8, -0x5ad739ac, 0x13ab7568, 0x37d05b8b, 0x3a2cbc37, 0x233cef62 } + }, + { + { 0x04dd3e8f, 0x59b75966, -0x1d778fd4, 0x6cb30377, 0x5ed9c323, -0x4ecc639a, 0x61bce52f, 0x0915e760 }, + { -0x0c6dcb27, -0x1d58a213, -0x1e4aa707, -0x69c28980, 0x6e3c23fb, 0x2c2741ac, 0x320e01c3, 0x3a9024a1 }, + { -0x57cb5c82, -0x208217cb, 0x689857ea, -0x741e6326, 0x7167b326, 0x2c118536, -0x24102a3e, 0x589eb3d9 } + }, +}, +{ + { + { 0x2d9021f6, 0x322d04a5, 0x75c6bf9c, -0x463e60cd, 0x42d20b09, 0x587a3a43, -0x559b019f, 0x143b1cf8 }, + { 0x553e2df3, 0x7ec851ca, -0x59b7874d, -0x58ed7b35, 0x3288d1e7, -0x194a1be7, 0x5a9a8883, 0x4cf210ec }, + { -0x69753555, -0x60798383, 0x27092729, 0x5f54258e, -0x15e7f68b, -0x2f582cb5, 0x374126e1, 0x21b546a3 } + }, + { + { -0x2e7ade71, 0x490a7a45, 0x46049335, -0x65eac888, -0x33ce1e0a, 0x0060ea09, -0x0791169b, 0x7e041577 }, + { -0x5d777cbd, -0x56b007a8, 0x5313ed3c, -0x31f12baa, -0x4a40cb06, -0x0aa3c231, -0x36154c8f, 0x0a653ca5 }, + { -0x31a4980d, 0x66b2a496, -0x42a9686a, -0x00ab6d28, 0x4a592cd0, 0x503cec29, 0x0813acb2, 0x56694365 } + }, + { + { 0x1dabb69d, 0x5672f9eb, -0x5017ac04, -0x458f4acb, 0x2796d66d, 0x47ac0f75, -0x6bee8d8b, 0x32a53517 }, + { 0x26620798, -0x47e724f4, 0x606e354a, 0x5d5c31d9, 0x00a8cdc7, 0x0982fa4f, 0x4653e2d4, 0x17e12bcd }, + { -0x209b7bc9, -0x2c59bb5a, -0x77f04023, 0x703b6559, -0x52c5e55b, -0x347adac0, -0x71b39b98, 0x0900b3f7 } + }, + { + { -0x37e952cf, -0x12d7f042, -0x2719101d, 0x52d9595b, -0x0939dc0b, 0x0fe71772, 0x051e293c, 0x4314030b }, + { 0x679d651b, 0x0a851b9f, 0x033342f2, -0x1ef7349f, -0x1774cf5d, -0x29fe0a81, -0x12d228ec, 0x371f3aca }, + { -0x040f4353, -0x2a9fffa2, -0x2e78f3a2, -0x7148f0d2, -0x2f7b1960, 0x201f9033, -0x31849990, 0x4c3a5ae1 } + }, + { + { 
-0x36c25f23, -0x45078a1c, 0x71b9294d, -0x46cd7d59, -0x0b393ba0, -0x7f29c049, -0x15993e7f, 0x6de9c73d }, + { -0x2347056b, 0x4138a434, 0x6c96840b, -0x78f30983, 0x297be82c, -0x21c77a8c, 0x7262a55a, 0x7c814db2 }, + { -0x5fb2070e, 0x478904d5, -0x4efebd2d, -0x050451b6, 0x555d0998, -0x0937539d, 0x2f90b104, 0x5aac4a41 } + }, + { + { -0x4280aecc, 0x603a0d0a, -0x1e2c51ba, -0x7f7636ce, -0x7867429d, -0x20da6ec7, 0x74ba0235, 0x1c145cd2 }, + { 0x3ac92908, -0x39b0cd95, -0x199c1e20, 0x5551b282, 0x4a1a4b83, 0x476b35f5, 0x189f68c2, 0x1b9da3fe }, + { 0x75f3d743, 0x32e83864, 0x6ae5d9ef, 0x365b8baf, 0x385b681e, -0x7dadc74a, 0x167d65e1, 0x234929c1 } + }, + { + { 0x1d099fcf, 0x48145cc2, -0x33d7281b, 0x4535c192, 0x48247e01, -0x7f183e1b, 0x3b2973ee, 0x4a5f2874 }, + { -0x5f885218, -0x67b21355, 0x19eb389d, 0x383f77ad, 0x2954d794, -0x38139482, -0x1483c586, 0x59c77b3a }, + { 0x225ccf62, -0x2c5228db, -0x4dead3a3, -0x6ee5cc7f, 0x5b08f87d, -0x274c6053, 0x4799fe3b, 0x6f05606b } + }, + { + { -0x06e49b7d, 0x5b433149, 0x5a2cbf62, -0x524a239b, 0x632827b3, -0x78057bee, -0x54b60728, 0x60895e91 }, + { 0x177ba962, -0x6001616e, 0x0de5cae1, -0x675118e3, 0x2d831044, 0x3ff4ae94, 0x58533ac8, 0x714de12e }, + { 0x0cf86c18, -0x16130d13, 0x0735dfd4, -0x4b92f9ee, 0x04b96be7, -0x43625f68, -0x26923d95, 0x73e2e62f } + }, +}, +{ + { + { 0x632f9c1d, 0x2eccdd0e, 0x76893115, 0x51d0b696, -0x579c85a8, 0x52dfb76b, -0x5ff110c7, 0x6dd37d49 }, + { 0x49aa515e, -0x12a49cac, 0x0bc6823a, -0x579a3b61, 0x5b42d1c4, -0x7af3e017, 0x03d315b9, 0x30d76d6f }, + { 0x2106e4c7, 0x6c444417, -0x6d728097, -0x04ac2980, 0x694d3f26, -0x4b8c615c, 0x2e864bb0, 0x10c69711 } + }, + { + { -0x7ca737fb, 0x0ca62aa0, 0x7a204247, 0x6a3d4ae3, 0x3b11eddc, 0x7464d3a6, 0x550806ef, 0x03bf9baf }, + { 0x7dbe5fde, 0x6493c427, 0x19ad7ea2, 0x265d4fad, 0x46304590, 0x0e00dfc8, -0x129901f7, 0x25e61cab }, + { -0x33a799fc, 0x3f13e128, -0x4ba68b82, 0x6f5873ec, -0x33ed970b, -0x5f49c213, 0x4586e22c, 0x566d7863 } + }, + { + { -0x39a5d030, -0x5efabd7b, -0x0ce9983d, 0x6c64112a, 0x731aee58, 0x680ae240, 0x4793b22a, 0x14fba5f3 }, + { -0x633ef7cc, 0x1637a49f, -0x57643baf, -0x4371a92b, 0x7f7fd2db, 0x1cb5ec0f, 0x5ecc35d9, 0x33975bca }, + { 0x6985f7d4, 0x3cd74616, -0x3637ffa9, 0x593e5e84, 0x7b61131e, 0x2fc3f2b6, -0x7c03ad94, 0x14829cea } + }, + { + { 0x4e71ecb8, 0x21e70b2f, 0x40a477e3, -0x19a92247, -0x31e2b080, -0x409aa932, 0x535d7b7e, 0x05fc3bc4 }, + { -0x68226a3e, -0x00bc847c, -0x55b14a59, 0x6c744e30, 0x3c85e88b, -0x61f3a29f, 0x5f758173, 0x2fd9c71e }, + { 0x52afdedd, 0x24b8b3ae, -0x12c4cf31, 0x3495638c, -0x56417e6b, 0x33a4bc83, 0x5c651f04, 0x37376747 } + }, + { + { 0x14246590, 0x634095cb, 0x16c15535, -0x10edebc0, -0x76ef43a0, -0x61c7ebf4, 0x30907c8c, 0x6bf59057 }, + { 0x40d1add9, 0x2fba99fd, -0x690b2fd9, -0x4cf8e991, 0x15f03bae, 0x4363f052, 0x3b18f999, 0x1fbea56c }, + { -0x1ebea476, 0x0fa778f1, -0x453c5882, 0x06409ff7, -0x655d65b0, 0x6f52d7b8, 0x7a635a56, 0x02521cf6 } + }, + { + { 0x772f5ee4, -0x4eeb98e0, -0x69f86532, -0x17076b4f, 0x00ac824a, 0x4af8224d, -0x0832933c, 0x001753d9 }, + { 0x0a9d5294, 0x513fee0b, 0x0fdf5a66, -0x706718a4, -0x401ef832, -0x2b9e7978, 0x71382ced, 0x3fa00a7e }, + { -0x69c224cc, 0x3c69232d, -0x4b68c7a8, 0x1dde87da, -0x5f6e0d7b, -0x55282e07, -0x5fb7124a, 0x12b5fe2f } + }, + { + { -0x5290e16e, -0x20d483da, 0x504b8913, 0x4b66d323, 0x751c8bc3, -0x73bf6240, 0x0796c7b8, 0x6f7e93c2 }, + { -0x69031cb3, 0x71f0fbc4, -0x520ca413, 0x73b9826b, -0x00d73a9f, -0x2dfb8d9f, 0x6fb1206f, 0x749b76f9 }, + { -0x515951fb, 0x1f5af604, -0x411b6367, -0x3edcae0f, -0x1100949a, 0x61a808b5, 0x01e02151, 0x0fcec10f } + 
}, + { + { -0x3bdbb1bb, 0x3df2d29d, -0x6c2721f6, 0x2b020e74, -0x7df3deb3, 0x6cc8067e, 0x6feab90a, 0x41377916 }, + { 0x49fe1e44, 0x644d58a6, 0x31ad777e, 0x21fcaea2, -0x77802f2e, 0x02441c5a, -0x7c3aee0d, 0x4901aa71 }, + { -0x73e50710, 0x08b1b754, 0x246299b4, -0x31f08584, 0x1e06d939, -0x089f4f07, 0x726d1213, 0x41bb887b } + }, +}, +{ + { + { -0x55c6082e, -0x68267f20, 0x52c6b51c, 0x35d03842, 0x07cd55aa, 0x7d43f493, -0x48753c9e, 0x56bd36cf }, + { 0x567c49d8, -0x6d987f94, -0x3586e196, 0x066d04cc, -0x1c33c6b5, -0x5960a9bb, -0x5f87732e, 0x5c95b686 }, + { 0x0d14a954, 0x2ac519c1, -0x6b4a0570, -0x150b8b4c, -0x560785a6, -0x19507c7e, -0x78641f6c, 0x0dea6db1 } + }, + { + { -0x29578686, 0x15baeb74, -0x053be8ce, 0x7ef55cf1, 0x3c8b05c5, 0x29001f5a, 0x52eaccfb, 0x0ad7cc87 }, + { 0x7344e5ab, -0x559940ac, -0x70e4bcf7, -0x25eda778, -0x02a9b4d1, 0x5e87d2b3, 0x5483b1dd, 0x5b2c7888 }, + { 0x793408cf, 0x52151362, 0x19963d94, -0x14f0e8fd, -0x77c26b9a, -0x57cc4d06, 0x75003c78, 0x093a7fa7 } + }, + { + { 0x60a91286, -0x47169fbc, 0x7778d3de, 0x7f3fd804, -0x4075a1d3, 0x67d01e31, -0x3d849ac2, 0x7b038a06 }, + { 0x3a16d7be, -0x1aef821a, -0x650ccd31, -0x5c880024, 0x440b677f, 0x70d5bf18, -0x5b5cebfd, 0x6a252b19 }, + { -0x2c966f0d, -0x6126e62b, -0x24b1460e, 0x5213aebb, 0x4cb99135, -0x38f715fb, 0x72260e56, 0x58ded57f } + }, + { + { 0x5b0fd48b, -0x2592acda, -0x6c405678, -0x769f7dcf, 0x61d57e28, -0x287536ce, 0x3a5c8143, 0x79f2942d }, + { -0x16bec289, 0x78e79dad, -0x68d61983, -0x0da8062b, -0x1c85581a, 0x59db910e, -0x4461fc64, 0x6aa11b5b }, + { -0x49377217, -0x6825d0db, -0x530dfe97, 0x251ba7ea, -0x10b14b1c, 0x09b44f87, -0x4395825b, 0x7d90ab1b } + }, + { + { -0x694c3c69, 0x1a07a3f4, -0x70b1dace, 0x11ceaa18, -0x588ae410, 0x7d9498d5, 0x508dd8a0, 0x19ed161f }, + { -0x58fe9402, -0x6533597d, -0x0d3af493, -0x6fafa0b3, -0x331bca56, 0x6b610d5f, 0x6198ff96, 0x19a10d44 }, + { -0x78231936, 0x560a2cd6, -0x799b30b3, 0x7f3568c4, 0x22803a38, -0x78be16ae, 0x595653fc, 0x483bdab1 } + }, + { + { -0x4b257f0a, -0x2930b2f6, -0x07cf8020, -0x7db7c1bb, -0x5190625c, 0x05005269, -0x63087886, 0x1c705290 }, + { -0x78cb05b7, -0x0587f0ec, 0x360534e0, 0x106f0b70, -0x1c1cf843, 0x2210776f, -0x22195f02, 0x3286c109 }, + { -0x78b1672c, 0x32ee7de2, -0x4681f3a0, 0x14c362e9, 0x6a60a38a, 0x5781dcde, -0x558557c0, 0x217dd5ea } + }, + { + { -0x4173f138, -0x7420e047, -0x1cf5fd7e, 0x00bae7f8, -0x5293b094, 0x4963991d, 0x5df6f60a, 0x07058a6e }, + { 0x248e1eb0, -0x62483b30, 0x4d74bf52, -0x1f89681f, 0x3c562354, 0x1e6a9b17, 0x795a4965, 0x7fa7c21f }, + { -0x24ce0981, -0x1614fd3c, 0x10bcfb2b, -0x12da0277, 0x5c5cddb4, 0x46c8131f, -0x5f346432, 0x33b21c13 } + }, + { + { 0x5ee38c5b, -0x65504650, 0x071a13c7, -0x4062d2b2, -0x16ccd6f6, -0x71119193, -0x51ef68e9, 0x1c3bab17 }, + { 0x087d8e31, 0x360692f8, -0x2d8e9c09, -0x0b2339c9, 0x65ea5963, 0x25a4e620, 0x5ac160d9, 0x659bf72e }, + { -0x38354850, 0x1c9ab216, 0x07bbc3cc, 0x7d65d374, 0x504a58d5, 0x52744750, 0x131a2990, 0x09f2606b } + }, +}, +{ + { + { 0x7c6691ae, 0x7e234c59, 0x0a85b4c8, 0x64889d3d, 0x354afae7, -0x251d36f4, 0x0c6a9e1d, 0x0a871e07 }, + { 0x744346be, 0x40e87d44, 0x15b52b25, 0x1d48dad4, -0x5ec49fc2, 0x7c3a8a18, 0x2fcdbdf7, 0x4eb728c1 }, + { 0x4bbc8989, 0x3301b599, 0x5bdd4260, 0x736bae3a, 0x19d59e3c, 0x0d61ade2, 0x2685d464, 0x3ee7300f } + }, + { + { -0x7be18ae8, 0x43fa7947, 0x639c46d7, -0x1a3905a7, -0x1cfad48c, -0x5ef9a1e3, -0x30476fd0, 0x7d47c6a2 }, + { -0x61822949, -0x0a2daa1c, 0x610b1eac, -0x7fe9eea4, -0x6d1e7836, 0x3c99975d, -0x686eda3e, 0x13815762 }, + { -0x710f2920, 0x3fdad014, -0x6eab90c4, -0x62c18b66, 0x26bb8157, 0x71ec6210, 
0x34c9ec80, 0x148cf58d } + }, + { + { -0x651b8a93, -0x1da8d083, -0x770cb781, 0x56c345bb, 0x6960a88d, -0x602ef493, 0x4eaea1b9, 0x278febad }, + { 0x7934f027, 0x46a492f6, -0x097bf557, 0x469984be, -0x769ee7ac, 0x5ca1bc2a, -0x42a2442c, 0x3ff2fa1e }, + { -0x736cc69a, -0x4e5597e1, 0x20290c98, -0x73de6b64, 0x219d3c52, 0x39115291, -0x01639885, 0x4104dd02 } + }, + { + { -0x24f69548, -0x7edeb1fa, 0x0ce44f35, 0x21a8b6c9, 0x409e2af5, 0x6524c12a, -0x71035b7f, 0x0165b5a4 }, + { 0x1124422a, 0x72b2bf5e, -0x675cc54b, -0x5e05f3cd, -0x05ad499a, -0x6b349eff, -0x5050ac2b, 0x2c863b00 }, + { -0x5f7b958a, -0x0e6f5b8c, -0x32d08340, 0x12eff984, 0x58aa2b8f, 0x695e2906, -0x40013748, 0x591b67d9 } + }, + { + { -0x60e74aa3, -0x66464c8f, -0x5e739be2, -0x1b9a1a06, -0x3d60fa13, 0x61081136, 0x7030128b, 0x489b4f86 }, + { -0x7f4b6406, 0x312f0d1c, -0x540c1376, 0x5979515e, -0x610fe378, 0x727033c0, -0x35708435, 0x3de02ec7 }, + { 0x3aeb92ef, -0x2dcdefd3, 0x6116a861, -0x1e9dac4c, 0x190baa24, 0x3d7eabe7, 0x496cbebf, 0x49f5fbba } + }, + { + { 0x1e9c572e, 0x155d628c, -0x3a77b8bf, -0x75b27954, 0x515763eb, -0x6e5cad0a, -0x7798aea5, 0x06a1a6c2 }, + { -0x75a4302c, 0x30949a10, -0x439b8c15, -0x23bf2290, 0x307c0d1c, -0x6d3d6b3f, -0x3405918c, 0x5604a86d }, + { 0x7c1764b6, 0x7288d1d4, -0x1fbe74af, 0x72541140, 0x18acf6d1, -0x60fce5a0, -0x01d8bd3a, 0x20989e89 } + }, + { + { -0x7a1513d2, 0x1674278b, 0x7acb2bdf, 0x5621dc07, 0x61cbf45a, 0x640a4c16, -0x08fa6a2d, 0x730b9950 }, + { 0x3a2dcc7f, 0x499777fd, -0x5ab0276e, 0x32857c2c, -0x2df81c60, -0x5d86279c, 0x0ca67e29, 0x0403ed1d }, + { -0x78b13aae, -0x36b4d2cb, -0x67db9073, -0x3a193731, 0x16c035ce, -0x0834b906, 0x08303dcc, 0x5bd74543 } + }, + { + { 0x15e7792a, -0x7a3b6cdf, -0x42322237, -0x39b3765e, -0x525c289e, -0x62e1c258, 0x3067f82c, 0x5bb7db12 }, + { 0x28b24cc2, 0x7f9ad195, 0x6335c181, 0x7f6b5465, 0x4fc07236, 0x66b8b66e, 0x7380ad83, 0x133a7800 }, + { -0x39359d42, 0x0961f467, 0x211952ee, 0x04ec21d6, -0x642ab890, 0x18236077, 0x58f0e0d2, 0x740dca6d } + }, +}, +{ + { + { -0x12d9e51b, 0x3906c72a, -0x771eff09, -0x65497027, -0x0cc9fe69, -0x0a16fa66, -0x40d492b9, 0x0e53dc78 }, + { -0x2c0f50f5, 0x50b70bf5, -0x1cd18e09, 0x4feaf48a, -0x5aa442cc, 0x60e84ed3, 0x3f50d1ed, 0x00ed489b }, + { 0x7971877a, -0x46f7d641, 0x6d17e631, 0x5e444463, 0x18276893, 0x4d05c52e, 0x5a4a4af5, 0x27632d9a } + }, + { + { -0x78150025, -0x567d7a2f, -0x272f579c, -0x5a4b0445, 0x022663f7, -0x49a70d81, -0x26631d7e, 0x3bbc2b22 }, + { 0x54b260ce, -0x2ee00faf, 0x72f95270, -0x27923c72, 0x267cc138, 0x601fcd0d, 0x29e90ccd, 0x2b679164 }, + { 0x583c0a58, -0x46e836ae, 0x0fe4c6f3, 0x653ff9b8, -0x4320c3f4, -0x64f25829, -0x54ab29f2, 0x43a0eeb6 } + }, + { + { 0x57875fe8, 0x3ac63223, -0x0a043471, -0x262b0b14, 0x382bb620, -0x72117b6d, 0x4c799fdc, 0x50c5eaa1 }, + { 0x6d4a5487, 0x396966a4, -0x53d44c46, -0x07ee5e76, 0x5628b26b, 0x66e4685b, -0x626d646e, 0x70a47702 }, + { -0x290d04c4, -0x22f12375, -0x63384860, 0x54c63aa7, 0x2c8d9f1a, -0x51f4fcd5, 0x602967fb, 0x6f9ce107 } + }, + { + { 0x3520e0b5, 0x13969306, -0x7715fc02, 0x437fcf7c, -0x2c36a644, -0x082b3bf5, -0x076c2127, 0x699154d1 }, + { -0x321e3dd6, -0x52efab4f, 0x48eb32df, -0x3b5716fe, -0x53323f16, 0x5f3e7b33, -0x038669c2, 0x72364713 }, + { -0x4b4d8ada, 0x315d5c75, 0x0236daa5, -0x33347bd3, 0x345fee8e, 0x22f0c8a3, 0x7d39dbed, 0x73975a61 } + }, + { + { -0x0bbcc1ba, 0x6f37f392, 0x1f566b18, 0x0e19b9a1, 0x1fd1d662, 0x220fb78a, -0x5c7e36b3, 0x362a4258 }, + { 0x6375da10, -0x1bfdb207, 0x1830c870, 0x78d3251a, 0x658cd91c, -0x6fd4e6b8, 0x29b7438a, 0x7e18b10b }, + { 0x2b6beb2f, -0x6f8e26ed, 0x28418247, 0x0f26e9ad, 
-0x42136da3, -0x1546e137, -0x0b750d22, 0x4be65bc8 } + }, + { + { 0x57c26234, 0x1d50fba2, -0x214f9875, 0x7bd4823a, -0x59ac750b, -0x3d4f2392, 0x351da73e, 0x5665eec6 }, + { -0x5c918fd8, 0x78487feb, 0x1dd8ce34, 0x5f3f1300, 0x4b30c489, -0x6cb04ed3, 0x397f0a2b, 0x056c244d }, + { 0x43bfb210, -0x24c11ff7, 0x20800ac2, 0x49720187, 0x73bd8667, 0x26ab5d61, -0x54dfb6c8, 0x20b209c2 } + }, + { + { 0x16bd3289, 0x1fcca945, 0x41420428, 0x448d65aa, 0x16a55d62, 0x59c3b7b2, 0x4e612cd8, 0x49992cc6 }, + { -0x3f804cb5, 0x549e342a, 0x21373d93, 0x02d82208, -0x532e0a99, -0x43d9d290, -0x0435387c, 0x7a92c9fd }, + { 0x70f801de, 0x65bd1bea, -0x01b61d76, 0x1befb7c0, -0x4e4d51b6, -0x579cf933, 0x265c2a09, 0x3b7ac0cd } + }, + { + { 0x22ed39a7, -0x0f2ab1b1, 0x5608150a, -0x5d5516e2, -0x1225178b, -0x0bde4d17, 0x6b7de992, 0x31bc531d }, + { -0x73fe4314, -0x7dd411bd, -0x3f0438c5, 0x530cb525, -0x3e6ac017, 0x48519034, -0x1f65f0a5, 0x265cc261 }, + { -0x567f068f, -0x20c2ecb3, 0x221a22a7, 0x7a4fb8d1, 0x35aad6d8, 0x3df7d420, 0x6a1a125e, 0x2a14edcc } + }, +}, +{ + { + { 0x0478433c, 0x231a8c57, -0x3d7ebc63, -0x484ad8f2, -0x1c26f861, -0x24556616, 0x6c2b03d9, 0x2c03f525 }, + { 0x52cfce4e, -0x20b711f9, 0x06ec08b7, -0x3c00050d, -0x46aba63c, 0x05710b2a, -0x69c15c73, 0x161d25fa }, + { 0x7b53a47d, 0x790f1875, -0x30f3a787, 0x307b0130, 0x257ef7f9, 0x31903d77, -0x42694451, 0x699468bd } + }, + { + { 0x6aa91948, -0x2722c21a, 0x2fc0d2cc, 0x485064c2, 0x34fdea2f, -0x64b7db9a, 0x6c4a2e3a, 0x293e1c4e }, + { -0x0b250131, -0x42e0d0ba, -0x5b802909, 0x7cef0114, 0x4a47b37f, -0x2ce00226, 0x73905785, 0x525219a4 }, + { -0x6daeed1f, 0x376e134b, -0x235ea260, 0x703778b5, 0x461c3111, -0x4fba7651, 0x7f032823, 0x5b605c44 } + }, + { + { -0x0f180fb4, 0x3be9fec6, 0x75e34962, -0x7995a862, 0x1e1de61a, 0x5542ef16, -0x33a5422b, 0x2f12fef4 }, + { 0x20c47c89, -0x469a7fa7, -0x6dc47034, -0x180feff4, 0x02e2ef77, 0x00012565, -0x57514c12, 0x24a76dce }, + { -0x203f38c0, 0x0a4522b2, 0x40c9a407, 0x10d06e7f, 0x78cff668, -0x3930ebbf, 0x18a43790, 0x5e607b25 } + }, + { + { -0x5a6930ec, -0x5fd3bce4, -0x512c1c00, -0x1c3bd2c0, 0x2e0f26db, -0x2dbad980, -0x61ba8f98, 0x201f3313 }, + { 0x6cdf1818, 0x58b31d8f, -0x3c9da75e, 0x35cfa74f, 0x66e61d6e, -0x1e4c00b1, 0x6ccdd5f7, 0x5067acab }, + { 0x08039d51, -0x02ad8095, 0x017c0006, 0x18b14964, 0x2e25a4a8, -0x2addf150, 0x62460375, 0x397cba88 } + }, + { + { -0x37ec8619, 0x7815c3fb, -0x221ed50f, -0x599e6be0, -0x7a57022b, -0x00563f08, -0x3e1e3dae, 0x771b4022 }, + { -0x0fa6a64e, 0x30c13093, -0x1656868a, -0x1dc55e73, 0x721d5e26, 0x222fd491, 0x766e6c3a, 0x2339d320 }, + { 0x513a2fa7, -0x2782267a, -0x062b30f8, -0x0a53648f, 0x1ea283b3, -0x2f943ce5, 0x19971a76, 0x331a1892 } + }, + { + { -0x628a8d51, 0x26512f3a, 0x68074a9e, 0x5bcbe288, 0x1180f7c4, -0x7b123e3f, -0x09b65985, 0x1ac9619f }, + { -0x04b07f3a, -0x0ae990bb, 0x61c775cf, -0x63c93822, -0x6fbe26e4, -0x1c2b17e5, -0x7c4201df, 0x31167c6b }, + { 0x524b1068, -0x0dd4c7be, -0x11631679, 0x5068343b, 0x4a6250c8, -0x03628e7c, 0x1f08b111, 0x61243634 } + }, + { + { 0x1a2d2638, -0x749cb61d, -0x642c02cb, -0x62204900, -0x5c5f945c, 0x7f8bf1b8, 0x78d90445, 0x1522aa31 }, + { -0x78b17673, -0x2662be25, 0x6c07dc20, 0x09fea5f1, -0x2ff06444, 0x793d2c67, -0x61a100c0, 0x46ebe230 }, + { 0x69614938, 0x2c382f53, -0x48d292f0, -0x2501bf66, -0x49b90dd9, -0x1737cc6f, 0x0524306c, 0x45fe70f5 } + }, + { + { -0x376aeb6f, 0x62f24920, 0x3f630ca2, 0x05f007c8, -0x0a362b48, 0x6fbb45d2, -0x4a85ddbb, 0x16619f6d }, + { -0x69f3f474, -0x25b78a5a, -0x10f1d0e0, 0x5b68d076, 0x3d0b8fd4, 0x07fb51cf, -0x5f1c6d2c, 0x428d1623 }, + { 0x01a308fd, 0x084f4a44, 
0x76a5caac, -0x57dde63d, 0x43d1bc7d, -0x214721ba, 0x60bd38c6, 0x1d81592d } + }, +}, +{ + { + { 0x2f89c8a1, 0x3a4a369a, 0x7c8de80d, 0x63137a1d, 0x78eda015, -0x4353ff76, -0x4b7c4fc1, 0x2cb8b3a5 }, + { -0x13d5b3c8, -0x27cc2842, 0x0acc20ed, 0x2c916283, -0x6d208a7f, -0x16c5b856, 0x333c4a81, 0x702d67a3 }, + { -0x34e46f5f, 0x36e417cb, 0x7f11794e, 0x33b3ddaa, -0x77a439f9, 0x3f510808, -0x1957fdf3, 0x24141dc0 } + }, + { + { -0x427cea83, -0x6e6da234, 0x22cc8094, 0x3ca12053, 0x3f90d6e4, 0x28e57f18, -0x21d18985, 0x1a4714ce }, + { 0x3fefee9d, 0x59f73c77, -0x3e306763, -0x4c0e1077, -0x1fd1aba1, -0x1ca204be, 0x47a1b47c, 0x5766120b }, + { -0x47494801, -0x24df45f1, 0x77511fa1, -0x48cd3c4a, -0x660fd277, -0x56d4ae40, 0x489ca5f1, 0x4f3875ad } + }, + { + { -0x118c1140, 0x79ed13f6, 0x69110bb1, -0x5a39ad93, -0x79fc79f4, -0x1b76d73d, -0x028fa60b, 0x722a1446 }, + { 0x4932ab22, -0x380389d1, 0x2f4c3c1b, 0x7ac0edf7, -0x65576a18, 0x5f6b55aa, -0x52f5ff7f, 0x3680274d }, + { -0x573077e7, -0x2f6a6017, -0x7b8a5664, -0x2f566ab0, 0x20b09cc5, 0x6eac1733, 0x331b1095, 0x628ecf04 } + }, + { + { 0x5c74ccf1, -0x64be5308, 0x08265251, -0x498cce7f, 0x11adb147, -0x6636d513, 0x34ecb40f, 0x7a47d70d }, + { -0x562f2244, -0x67434ee8, 0x08b4802b, -0x11bb61cc, -0x47594efc, -0x78f76dda, 0x45c7915d, 0x685f349a }, + { -0x33bc5b0b, 0x60a0c4cb, 0x3677bea9, 0x775c66ca, 0x2ff8f5ed, -0x5e855e8b, 0x0e01fdc0, 0x11ded902 } + }, + { + { 0x3bea93b7, 0x471f95b0, 0x3313abd3, 0x0552d7d4, -0x1e81c085, -0x426c8f1e, -0x4df1a414, 0x7b120f1d }, + { -0x351018fc, -0x76f187f7, -0x1cf17394, -0x78d7d693, -0x6d514e37, 0x4c5cd2a3, 0x5771531f, 0x194263d1 }, + { -0x79afd286, 0x17d2fb3d, 0x50a69352, -0x4a9b27bc, -0x59f128a3, 0x7da962c8, 0x318736aa, 0x00d0f85b } + }, + { + { -0x0289de3f, -0x598ac3e2, 0x445671f5, 0x69c0b4a7, 0x05b23c11, -0x68e0ad8c, 0x51a8c7cd, 0x387bc748 }, + { 0x777c84fd, -0x6874ebd2, 0x05a8c062, -0x0bfd9bb9, -0x1819ed39, -0x59852ae5, -0x672295cd, 0x2f7b4596 }, + { 0x4a52a9a8, -0x7e76b4b3, -0x09477cd1, -0x5226c1ee, -0x49e429c8, 0x184d8548, -0x29360933, 0x3f1c62db } + }, + { + { 0x148f693d, 0x3fad3e40, -0x6b14658e, 0x052656e1, 0x184f4e2f, 0x2f4dcbfd, -0x3b7d1e75, 0x406f8db1 }, + { -0x6e6ef3e1, 0x2e8f1f00, -0x400d1ed4, -0x5b20b020, -0x116d8bc8, 0x60c6560a, -0x53103706, 0x6338283f }, + { 0x7f191ee4, -0x619cf2d4, -0x43c00990, 0x4fbf8301, 0x7afb73c4, 0x787d8e4e, -0x170a705b, 0x50d83d5b } + }, + { + { -0x4b2c4993, -0x3f533070, 0x61732e60, -0x58fa621b, 0x70c6b0ba, 0x033d1f78, 0x26d946e4, 0x584161cd }, + { -0x3ee5e769, -0x7a97c6ea, -0x1af92ff8, 0x2d69a4ef, -0x099b42ff, 0x39af1378, 0x361517c6, 0x65942131 }, + { 0x72d27ca2, -0x440d4e60, -0x042138fc, -0x40c6c3a7, -0x1d9d47e2, -0x16724432, 0x3029b589, 0x02eebd0b } + }, +}, +{ + { + { 0x7b85c5e8, -0x789a4961, -0x2e97454e, 0x6ff0678b, 0x1d330f9b, 0x3a70e77c, -0x4f507184, 0x3a5f6d51 }, + { -0x59f253a1, 0x61368756, -0x145423a9, 0x17e02f6a, 0x4cce0f7d, 0x7f193f2d, -0x76132310, 0x20234a77 }, + { 0x7178b252, 0x76d20db6, -0x2ae12ea0, 0x071c34f9, -0x4c1bee90, -0x09d5b5e0, 0x3cffe366, 0x7cd68235 } + }, + { + { 0x68acf4f3, -0x599a32a0, 0x3cd7e3d3, 0x42d92d18, 0x336025d9, 0x5759389d, 0x2b2cd8ff, 0x3ef0253b }, + { -0x2778054a, 0x0be1a45b, -0x45bfc492, 0x2a846a32, -0x1691a000, -0x266defee, 0x3bdc0943, 0x2838c886 }, + { 0x4a465030, -0x2e944f31, 0x15c577ab, -0x05b694bf, -0x0b54be63, -0x7d305176, 0x06a82812, 0x21dcb8a6 } + }, + { + { -0x4188ce46, -0x6572ff06, 0x629e1889, -0x7dfc9f82, 0x43f3d97f, -0x4d33fdc9, 0x6c6f678b, 0x5d840dbf }, + { -0x73626038, 0x5c600446, -0x2bd55c35, 0x2540096e, 0x12ee2f9c, 0x125b4d4c, -0x6b5ce255, 0x0bc3d081 
}, + { 0x309fe18b, 0x706e380d, -0x461e9a39, 0x6eb02da6, 0x7dae20ab, 0x57bbba99, 0x2ac196dd, 0x3a427623 } + }, + { + { -0x24bb8135, 0x3bf8c172, -0x39d7d243, 0x5fcfc41f, 0x75aa15fe, -0x7f530040, 0x24e1a9f9, 0x0770c9e8 }, + { -0x758f7b06, 0x4b42432c, -0x20461abb, -0x7675e61d, -0x63a71ba3, -0x4160ffdf, -0x5e92142f, 0x1ff177ce }, + { 0x45b5b5fd, -0x309e2666, 0x1b3a7924, -0x79f67b17, 0x303e3e89, -0x18cff6e7, 0x41500b1e, 0x39f264fd } + }, + { + { -0x01f6841f, -0x2e64b555, -0x201fe6d7, -0x5b92031f, 0x2ca6f1ff, -0x3c36f76c, 0x2c35f14e, 0x65c62127 }, + { -0x24181d64, -0x5852cbe9, 0x2b9c139c, -0x426bc896, -0x6ca68457, -0x5f16e472, 0x68889840, 0x1712d734 }, + { -0x31ce6c23, -0x18d47608, -0x5eda3f45, 0x4d103356, 0x2e1cfe83, 0x0419a93d, -0x4e631d8e, 0x22f9800a } + }, + { + { -0x65910254, 0x42029fdd, 0x34a54941, -0x46ed3142, -0x78420c85, 0x640f64b9, -0x7a67354c, 0x4171a4d3 }, + { 0x3e9ef8cb, 0x605a368a, -0x5aafb8eb, -0x1c163fde, 0x5f24248f, 0x553d48b0, 0x647626e5, 0x13f416cd }, + { -0x6636b374, -0x05d8a756, -0x4fff47f9, 0x23006f6f, -0x5225ac6e, -0x042d6e23, 0x574bd1ab, 0x508214fa } + }, + { + { 0x53d003d6, 0x461a15bb, -0x430c369b, -0x4defd778, 0x6c683a5a, 0x27c57675, -0x37934bb9, 0x3a7758a4 }, + { 0x3ed6fe4b, -0x3dfd96eb, 0x511d77c4, -0x59a598c7, 0x2c14af94, -0x3421d9ba, 0x6faba74b, 0x22f960ec }, + { -0x6c51af8a, 0x548111f6, 0x1dfd54a6, 0x1dae21df, -0x0ceea19b, 0x12248c90, -0x72180b6c, 0x5d9fd15f } + }, + { + { -0x1128ade2, 0x3f244d2a, 0x432e9615, -0x71c56fd8, 0x2e9c16d4, -0x1e9b4589, 0x47eb98d8, 0x3bc187fa }, + { 0x6d63727f, 0x031408d3, -0x28384acd, 0x6a379aef, -0x33511db5, -0x561e703b, 0x4f8fbed3, 0x332f3591 }, + { -0x15793df4, 0x6d470115, 0x6c46d125, -0x66754835, 0x3a660188, -0x2887cd4b, -0x6f9045fd, 0x450d81ce } + }, +}, +{ + { + { -0x4d351f4b, 0x23264d66, -0x14359a8a, 0x7dbaed33, -0x0f2db538, 0x030ebed6, -0x089caaf0, 0x2a887f78 }, + { -0x27bac6fe, -0x0751b2d6, -0x1724d2e3, 0x7018058e, -0x382d3ee2, -0x554c66a1, 0x24ccca79, 0x53b16d23 }, + { 0x5c012d4f, 0x2a23b9e7, -0x351e0d16, 0x0c974651, 0x675d70ca, 0x2fb63273, -0x79bbfc0b, 0x0ba7250b } + }, + { + { -0x79079264, -0x229ca76d, -0x1ec57a5c, 0x61699176, 0x4eaa7d57, 0x2e511195, -0x049f4205, 0x32c21b57 }, + { 0x029c6421, -0x44f2e703, -0x76d670fe, -0x43d2ebdf, -0x74daf16a, -0x7cb8071a, 0x032d71c9, 0x7b9f2fe8 }, + { 0x319e0780, -0x2787dc33, -0x76888a3b, -0x103b303f, -0x65f54c09, 0x4854fb12, 0x7238c371, 0x12c49d41 } + }, + { + { -0x7c866abe, 0x09b3a017, -0x552a11c1, 0x626dd08f, -0x148feb61, -0x45ff4312, -0x5f5bbb37, 0x1421b246 }, + { -0x0017c897, 0x0950b533, -0x71e2942f, 0x21861c1d, 0x1302e510, -0x0fdd27c8, 0x6391cab4, 0x2509200c }, + { -0x73db5839, 0x4aa43a8e, -0x270fa10b, 0x04c1f540, 0x0b3eb9dc, -0x5245a1f4, 0x48a49ce3, 0x2ab55044 } + }, + { + { 0x1c5d3afa, -0x23f8539d, -0x06207394, 0x58615171, -0x628c1d50, 0x72a079d8, -0x4b151ea3, 0x7301f4ce }, + { 0x6f0f5dec, 0x2ed22726, 0x5ed50824, -0x67db11bf, -0x6b972beb, -0x7f841384, -0x4ade1dc1, 0x7093bae1 }, + { -0x298dd3bf, 0x6409e759, 0x72bf729b, -0x598b1e31, 0x3c21e569, -0x43f5db15, 0x4ebacb23, 0x390167d2 } + }, + { + { -0x5d0dedf5, -0x2844fab5, -0x4efa7649, -0x1d463152, -0x0c3f1242, 0x3fe8bac8, 0x7112cb69, 0x4cbd4076 }, + { -0x45cac0e4, 0x27f58e3b, -0x4095bc9f, 0x4c47764d, 0x6e562650, -0x50443b1b, -0x551e5ba3, 0x07db2ee6 }, + { 0x29c58176, 0x0b603cc0, 0x5cb15d61, 0x5988e382, -0x230f5273, 0x2bb61413, 0x74183287, 0x7b8eec6c } + }, + { + { -0x03c7948d, 0x32fee570, -0x25c57339, -0x2574febf, -0x37697ca7, -0x68a002f6, -0x4ecd57ab, 0x6ee809a1 }, + { 0x2cd27cb0, -0x1b35bf88, -0x04169843, -0x25063cdd, -0x752be162, 
-0x4d642cb6, 0x626ede4d, 0x72810497 }, + { -0x030279c6, -0x6bbb44cf, 0x3e4e48c5, 0x2fe3690a, -0x2f7705db, -0x23d63799, -0x2e8cd6d2, 0x13bd1e38 } + }, + { + { 0x1dfac521, 0x223fb5cf, 0x6f554450, 0x325c2531, 0x659177ac, 0x030b98d7, 0x4f88a4bd, 0x1ed018b6 }, + { 0x696149b5, -0x2cd4b328, -0x7e275549, -0x1aa6c829, -0x51edd46c, 0x0bcb2127, -0x4ebf6650, 0x41e86fcf }, + { -0x47fd5950, 0x3630dfa1, 0x42ad3bd5, -0x77f078b9, -0x113a5b2c, 0x0af90d6c, 0x37cdc5d9, 0x746a247a } + }, + { + { 0x78d941ed, 0x6eccd852, -0x2dd087bd, 0x2254ae83, 0x7bbfcdb7, -0x3add2fd2, -0x400f1b1e, 0x681e3351 }, + { 0x2b7b9af6, -0x2ace4743, 0x37fc5b51, 0x50050935, -0x3a6cab93, 0x232fcf25, 0x2bb40f49, 0x20a36514 }, + { -0x7cfcb0bb, -0x749b4a63, 0x1fa20efb, 0x2f8b71f2, -0x459aaf1c, 0x69249495, 0x45d5472b, 0x539ef98e } + }, +}, +{ + { + { 0x1cae743f, -0x2f8b276a, -0x11e39c13, -0x0792e70b, -0x180b12d7, -0x68423aa5, 0x663ab108, 0x4cbad279 }, + { -0x59dfad8b, 0x6e7bb6a1, 0x413c8e83, -0x55b0de29, -0x1770a34e, 0x6f56d155, -0x59cba41f, 0x2de25d4b }, + { -0x5f28e033, -0x7f2e6fdc, -0x04d77508, -0x3ada3df6, 0x5f3a6419, -0x4e5c68b5, -0x1dff8dcd, 0x7d7fbcef } + }, + { + { -0x0c3d6f6c, -0x3283a23b, 0x2a9105ab, -0x387e5d66, 0x421c3058, -0x7f39e2ca, -0x23272b29, 0x4f9cd196 }, + { 0x266b2801, -0x0510e196, -0x2a8c60ea, -0x7993973c, 0x1b03762c, -0x0975d044, -0x7848a573, 0x5975435e }, + { 0x6a7b3768, 0x199297d8, 0x1ad17a63, -0x2f2fa7dc, 0x5c1c0c17, -0x45fd6353, 0x387a0307, 0x7ccdd084 } + }, + { + { 0x6760cc93, -0x64f37be8, 0x1ab32a99, -0x3251ff86, 0x620bda18, -0x5772137a, -0x7e6f35bc, 0x3593ca84 }, + { 0x6d260417, -0x2359bdd4, -0x6b7dbf43, -0x51eac2b0, -0x04973989, -0x563f3e4c, 0x61d0cf53, 0x428bd0ed }, + { 0x5e849aa7, -0x6dece766, 0x65d8facd, -0x2b273ccb, 0x53fdbbd1, -0x73adaba5, -0x25d29c1a, 0x27398308 } + }, + { + { 0x0a702453, -0x465ef1b4, -0x2a82e422, 0x0fa25866, -0x32d82509, -0x0046264b, 0x492c33fd, 0x572c2945 }, + { 0x435ed413, 0x42c38d28, 0x3278ccc9, -0x42af0ca0, 0x79da03ef, -0x44f854e6, -0x4173ccab, 0x269597ae }, + { -0x2932cf42, -0x388038bb, -0x1c455105, -0x1b20172d, -0x55a225f4, -0x5dd377d0, -0x3fa43580, 0x7f985498 } + }, + { + { 0x0fbf6363, -0x2ca9eaae, -0x30b2045a, 0x08045a45, -0x78c05f3e, -0x113db044, -0x2964ed19, 0x30f2653c }, + { -0x60f41ee9, 0x3849ce88, 0x7b54a288, -0x7ffa52e5, 0x23fc921c, 0x3da3c39f, 0x0a31f304, 0x76c2ec47 }, + { -0x553ef37b, -0x75f736c8, -0x24d89435, 0x46179b60, 0x0e6fac70, -0x56df3fe2, 0x596473da, 0x2f1273f1 } + }, + { + { 0x55a70bc0, 0x30488bd7, -0x0e2bbd19, 0x06d6b5a4, -0x43a69e9e, -0x152e5962, -0x123a087c, 0x38ac1997 }, + { -0x751fe1ef, 0x4739fc7c, 0x4a6aab9f, -0x02ad8b70, -0x788d70d2, 0x41d98a82, -0x27a4960e, 0x5d9e572a }, + { -0x58ae4ec5, 0x0666b517, 0x7e9b858c, 0x747d0686, 0x454dde49, -0x53533fef, -0x40161964, 0x22dfcd9c } + }, + { + { 0x103be0a1, 0x56ec59b4, -0x2da60697, 0x2ee3baec, 0x13f5cd32, 0x797cb294, 0x24cde472, 0x0fe98778 }, + { -0x3cf2f327, -0x72242d20, -0x5344bccd, -0x527199a1, 0x322a961f, -0x7094da74, 0x5448c1c7, 0x6b2916c0 }, + { 0x0aba913b, 0x7edb34d1, 0x2e6dac0e, 0x4ea3cd82, 0x6578f815, 0x66083dff, 0x7ff00a17, 0x4c303f30 } + }, + { + { 0x0dd94500, 0x29fc0358, 0x6fbbec93, -0x132d855c, -0x3d1d5808, 0x130a155f, -0x48f95e2b, 0x416b151a }, + { 0x17b28c85, -0x2cf5c42a, 0x39773bea, -0x3a2c8849, 0x1e6a5cbf, -0x39391874, -0x74d5483c, 0x0d61b8f7 }, + { -0x163ec950, 0x56a8d7ef, 0x58e44b20, -0x42f81a33, 0x1b57e0ab, -0x5019d026, 0x4277e8d2, 0x191a2af7 } + }, +}, +{ + { + { 0x2fe09a14, 0x09d4b60b, -0x244e8b82, -0x3c7b0f51, 0x78b5fd6e, 0x58e2ea89, -0x4a1f64f6, 0x519ef577 }, + { -0x5490b67b, -0x2aaff6a5, 
0x4fbfaf1a, 0x04f4cd5b, 0x2a0c7540, -0x6271d12f, -0x4ddedd7a, 0x2bc24e04 }, + { 0x1124cca9, 0x1863d7d9, -0x47758f72, 0x7ac08145, -0x7a8fce0b, 0x2bcd7309, -0x7547051b, 0x62337a6e } + }, + { + { 0x1b3a1273, -0x2e54cdb2, -0x7efaacc0, 0x18947cf1, -0x5673e692, 0x3b5d9567, -0x7fd1e198, 0x7fa00425 }, + { 0x06ffca16, 0x4bcef17f, 0x692ae16a, -0x21f91e25, 0x614f42b0, 0x0753702d, 0x5b9212d0, 0x5f6041b4 }, + { 0x028c2705, 0x7d531574, -0x24f28a02, -0x7fce8297, -0x10737223, 0x30fface8, -0x493c1668, 0x7e9de97b } + }, + { + { -0x5db2bf23, -0x0ffb419e, 0x0452d41f, -0x45f9a66f, 0x62a44234, -0x7e3ba11f, -0x5ddd9911, 0x4cb829d8 }, + { -0x619a7a5d, 0x1558967b, -0x6716746e, -0x68366320, 0x6eb3adad, 0x10af149b, -0x0b2c7306, 0x42181fe8 }, + { 0x07b86681, 0x1dbcaa84, -0x74d98ac5, 0x081f001e, -0x7bfb717f, 0x3cd7ce6a, 0x3f25f22c, 0x78af1163 } + }, + { + { 0x7d65318c, 0x3241c00e, -0x2f179219, -0x19411a24, -0x043f73da, 0x118b2dc2, -0x039fc23d, 0x680d04a7 }, + { 0x0b50babc, -0x7be9142c, 0x28208bee, 0x15087226, -0x463e3c93, -0x5ceb7051, -0x2cd282a3, 0x0d07daac }, + { 0x695aa3eb, -0x063dbeb6, 0x05a68f21, -0x255bd3b4, 0x7f93963e, 0x7c6c2398, 0x0c3954e3, 0x210e8cd3 } + }, + { + { 0x37fe6c26, 0x2b50f161, 0x56e404d8, -0x1efd4328, 0x4c561f6b, 0x12b0f141, -0x2fd7136f, 0x51b17bc8 }, + { 0x10a71c06, -0x53bdfe0e, -0x0c404fdf, 0x6a65e0ae, 0x393632f7, -0x43bd3ca4, -0x79a0f8be, 0x56ea8db1 }, + { -0x30acaee7, -0x000a04b5, -0x20eef760, -0x0b676287, -0x65c45cdb, -0x4203159b, 0x74d1a6f2, 0x18a11f11 } + }, + { + { -0x2d85a0d4, -0x0429c326, -0x755ef929, -0x0ff03b44, -0x719b5bd0, 0x53fb5c1a, 0x0c1a2e85, 0x04eaabe5 }, + { 0x3f6bba29, 0x407375ab, -0x66e1b7d2, -0x613c4928, -0x1aa06d17, -0x6637f17e, -0x04f3f51f, 0x307c13b6 }, + { -0x34754a19, 0x24751021, 0x5c5010eb, -0x03dcbbb7, 0x4e5610a1, 0x5f1e717b, -0x3d8ef32b, 0x44da5f18 } + }, + { + { -0x76271534, -0x6ea90195, -0x1dced95f, -0x19486baf, 0x3944eb4e, -0x428b9c27, 0x767203ae, 0x726373f6 }, + { -0x0e47d14b, 0x033cc55f, 0x411cae52, -0x4ea51c93, -0x7004532d, -0x45bf49e7, 0x532e861f, 0x768edce1 }, + { -0x14810976, -0x1cfa358e, 0x70eadb23, 0x662cf31f, -0x4b3ba498, 0x18f026fd, -0x4a2d1343, 0x513b5384 } + }, + { + { -0x750cb315, 0x5e270287, -0x46b92952, -0x6ff4fbf7, -0x25427aee, 0x6512ebf7, -0x77da707f, 0x61d9b769 }, + { -0x38d66762, 0x46d46280, 0x5368a5dd, 0x4b93fbd0, -0x2e89a577, 0x63df3f81, -0x465f5ddd, 0x34cebd64 }, + { 0x49b7d94b, -0x593a58ed, 0x23eb9446, -0x5c0c2ea8, 0x77484834, 0x0416fbd2, 0x2c70812f, 0x69d45e6f } + }, +}, +{ + { + { 0x4f460efb, -0x6019d4bd, -0x59c9f82a, -0x212cfc2c, -0x485f25dc, -0x0faddef2, 0x00545b93, 0x237e7dbe }, + { -0x3ac3ebcf, -0x31e908b5, 0x2072edde, 0x2b9725ce, -0x4a4dc119, -0x47463c91, 0x0b5cc908, 0x7e2e0e45 }, + { 0x6701b430, 0x013575ed, -0x60f402f0, 0x231094e6, -0x7c1b80de, 0x75320f15, -0x4eeeaa1d, 0x71afa699 } + }, + { + { 0x473b50d6, -0x15bdc3e4, 0x3b38ef10, 0x51e87a1f, -0x4d36416b, -0x647b40a1, 0x78f89a1c, 0x00731fbc }, + { 0x3953b61d, 0x65ce6f9b, -0x505ebe1a, -0x39a7c616, -0x5608a602, 0x0f435ffd, -0x3d4e3d72, 0x021142e9 }, + { 0x48f81880, -0x1bcf38e8, 0x5ecec119, -0x4069f3de, 0x6bba15e3, -0x49251f7d, 0x47e15808, 0x4c4d6f33 } + }, + { + { -0x6770e690, 0x2f0cddfc, -0x4f460ae5, 0x6b916227, 0x779176be, 0x6ec7b6c4, -0x57706058, 0x38bf9500 }, + { -0x3e82e037, 0x18f7eccf, 0x51403c14, 0x6c75f5a6, -0x0811f321, -0x24218ed5, -0x581b85de, 0x193fddaa }, + { 0x37e8876f, 0x1fd2c93c, 0x18d1462c, -0x5d09e1a6, 0x39241276, 0x5080f582, -0x40f2b697, 0x6a6fb99e } + }, + { + { -0x491bdc3a, -0x114edd4b, -0x0d790072, -0x6c628ff0, 0x1dcf5d8c, -0x6f56d57d, 0x42c5eb10, 0x136fda9f }, + 
{ 0x560855eb, 0x6a46c1bb, -0x076c0f63, 0x2416bb38, -0x708e533f, -0x28e2eec9, -0x5ce76916, 0x75f76914 }, + { -0x5cfa422f, -0x06b3204f, -0x6007d3f8, 0x0f364b9d, -0x3c44a776, 0x2a87d8a5, 0x0be8dcba, 0x02218351 } + }, + { + { 0x43307a7f, -0x62a58eff, -0x3b825ba1, -0x4f9c2162, -0x416d852d, 0x22bbfe52, -0x02bfbd94, 0x1387c441 }, + { 0x5ead2d14, 0x4af76638, -0x3583a7d0, -0x5f712780, 0x10211e3d, 0x0d13a6e6, 0x7b806c03, 0x6a071ce1 }, + { -0x78687508, -0x4a2c3c2f, 0x7f0e4413, 0x722b5a3d, -0x44b88360, 0x0d7b4848, -0x50e1236e, 0x3171b26a } + }, + { + { -0x4d75b82f, -0x59f24828, 0x1770a4f1, -0x5940eb2a, 0x53ddbd58, -0x2b5e076d, 0x344243e9, 0x6c514a63 }, + { -0x68a9b358, -0x56d0ce70, 0x2275e119, -0x008447b4, -0x5b78aeb0, 0x4f55fe37, 0x3cf0835a, 0x221fd487 }, + { 0x3a156341, 0x2322204f, -0x45f5fcd3, -0x048c1f17, 0x410f030e, -0x031f22b4, -0x046db556, 0x48daa596 } + }, + { + { -0x37b3686d, 0x14f61d5d, -0x10be7dfa, -0x66be061d, 0x346277ac, -0x320a4771, 0x0e8a79a9, 0x58c837fa }, + { 0x5ca59cc7, 0x6eca8e66, 0x2e38aca0, -0x57b8dab5, -0x2de1e832, 0x31afc708, -0x3527b509, 0x676dd6fc }, + { -0x69036fa8, 0x0cf96885, 0x7b56a01b, 0x1ddcbbf3, 0x4935d66a, -0x233d1883, -0x395a80f6, 0x1c4f73f2 } + }, + { + { -0x0383cb7c, -0x4c918f92, -0x3c3e309f, 0x73dfc9b4, 0x781cc7e5, -0x14e28637, 0x7daf675c, 0x70459adb }, + { 0x305fa0bb, 0x0e7a4fbd, 0x54c663ad, -0x7d62b320, 0x2fe33848, -0x0bde3c7d, 0x1bf64c42, 0x795ac80d }, + { -0x6e4bd44d, 0x1b91db49, 0x4b02dcca, 0x57269623, 0x1f8c78dc, -0x6020611b, -0x731de02d, 0x5fe16284 } + }, +}, +{ + { + { -0x6aeeac77, 0x315c29c7, -0x79d08b32, -0x281f1af9, -0x7a6d8bce, 0x0c4a7621, 0x4a25a1e4, 0x72de6c98 }, + { 0x4d077c41, -0x1d86f552, -0x248b965d, -0x746c7d90, -0x7542e95e, 0x6eb632dc, -0x55f9b48e, 0x720814ec }, + { -0x40955cf0, -0x51654aad, -0x7f9291e5, 0x050a50a9, -0x5200aec7, -0x6d448bfd, 0x45be618b, 0x0394d276 } + }, + { + { -0x4dcaba5c, -0x0ac69bdb, -0x67044d6a, 0x15a7a27e, 0x636fdd86, -0x5493ad44, 0x419334ee, 0x79d995a8 }, + { -0x7a81120c, 0x4d572251, -0x1e616c3b, -0x1c8db123, 0x0b797035, -0x758ebdf2, -0x785418bd, 0x3b3c8336 }, + { 0x1195dd75, -0x3275715a, 0x1dd9a82f, -0x5afb2758, -0x5ca7864a, 0x540dca81, 0x79c86a8a, 0x60dd16a3 } + }, + { + { 0x153e47b8, 0x3501d6f8, 0x14a2f60c, -0x485698ac, 0x455d9523, 0x112ee8b6, -0x7eed1576, 0x4e62a3c1 }, + { 0x7381e559, 0x35a2c848, -0x287f7d35, 0x596ffea6, -0x245849ad, -0x34688e15, -0x64b2597b, 0x5a08b501 }, + { 0x516ab786, -0x372b53fc, 0x5295b23d, 0x595af321, -0x24fdcf3f, -0x29122dcc, -0x7da4be34, 0x0929efe8 } + }, + { + { -0x52a99ae3, -0x74ce8d49, 0x3fabd717, 0x01581b7a, 0x424df6e4, 0x2dc94df6, 0x2c29284f, 0x30376e5d }, + { -0x342f0d2d, 0x5f0601d1, 0x6132bb7f, 0x736e412f, 0x238dde87, -0x7c9fbbce, -0x0a3f8ac4, 0x1e3a5272 }, + { -0x7ea65a64, -0x2d6e7259, 0x3f0713f3, 0x6bdc1cd9, 0x4acd6590, 0x565f7a93, 0x4cb4c128, 0x53daacec } + }, + { + { -0x7ad30250, -0x667ad43d, 0x59d6ed0b, 0x2cc12e95, -0x64a53d85, 0x70f9e2bf, 0x7959ae99, 0x4f3b8c11 }, + { -0x6337582a, 0x4ca73bd7, 0x47e9a9b2, 0x4d4a738f, 0x42f5fe00, -0x0b340ed7, -0x4240f8ae, 0x01a13ff9 }, + { 0x2ff26412, 0x55b6c9c8, 0x1fb667a8, 0x1ac4a8c9, -0x1488740e, -0x2ad84031, 0x7012a3be, 0x303337da } + }, + { + { -0x052d022f, -0x6892c335, 0x37a640a8, -0x34777c69, 0x6734cb25, 0x2ff00c1d, 0x789c2d2b, 0x269ff4dc }, + { -0x73e36284, -0x6aabddde, 0x1a9b340f, 0x01fac137, -0x6da4b729, 0x7e8d9177, 0x61b3e31b, 0x53f8ad56 }, + { -0x3f729873, 0x0c003fbd, 0x7ead2b17, 0x4d982fa3, -0x4d1a7d0f, -0x3f819433, -0x20bed5bc, 0x296c7291 } + }, + { + { -0x25474a62, -0x204dcdfb, -0x37f6ddb0, 0x465aeaa0, -0x658da2e8, -0x2ecc3ee8, 
0x61f117d1, 0x23273702 }, + { 0x33daf397, 0x7903de2b, -0x3659db4d, -0x2f00f9e7, 0x555b3e18, -0x75e2dad5, 0x52e0b7c0, 0x2b6d581c }, + { 0x623e7986, 0x3d0543d3, -0x3d875cac, 0x679414c2, 0x726196f6, -0x51bc0f34, -0x7dba1546, 0x7836c41f } + }, + { + { -0x7fee6c84, -0x359ae17c, 0x6ef41a28, -0x394f3b92, 0x5f3f8d52, -0x48fde459, -0x15284603, 0x119dff99 }, + { 0x49e95a81, -0x185dab25, 0x08b0ad73, 0x5192d5d0, -0x2ff503f9, 0x4d20e5b1, 0x2cf25f38, 0x5d55f801 }, + { -0x0b4ce2b3, 0x43eadfcb, 0x11148892, -0x39afc08c, 0x060d3b17, -0x0111973b, -0x22b5f538, 0x329293b3 } + }, +}, +{ + { + { 0x5d7cb208, 0x2879852d, 0x687df2e7, -0x47212290, 0x21687891, -0x23f40055, 0x677daa35, 0x2b44c043 }, + { -0x1e6b69e6, 0x4e59214f, 0x0d71cd4f, 0x49be7dc7, 0x3b50f22d, -0x6cff302e, -0x036e8dce, 0x4789d446 }, + { 0x074eb78e, 0x1a1c87ab, -0x66250b99, -0x05392e72, 0x484f9067, 0x3eacbbcd, 0x2bb9a4e4, 0x60c52eef } + }, + { + { 0x7cae6d11, 0x702bc5c2, 0x54a48cab, 0x44c7699b, -0x45b6d14e, -0x1043bfaa, -0x26499893, 0x70d77248 }, + { 0x3bfd8bf1, 0x0b5d89bc, -0x360caae6, -0x4f946dc9, -0x2acfd70b, 0x0e4c16b0, 0x2ccfcaab, 0x10bc9c31 }, + { 0x3ec2a05b, -0x557517b5, -0x12e87e20, -0x6796610c, 0x708e85d1, 0x794513e4, -0x56890bed, 0x63755bd3 } + }, + { + { -0x680e5349, 0x3dc71018, -0x3e9a4428, 0x5dda7d5e, 0x0fa1020f, 0x508e5b9c, 0x37c52a56, 0x27637517 }, + { 0x2ad10853, -0x4aa05fc2, -0x6119ca97, 0x356f7590, -0x41964770, -0x60060e03, -0x743e907c, 0x0d8cc1c4 }, + { 0x6eb419a9, 0x029402d3, 0x77b460a5, -0x0f4bb182, -0x2bc3b6aa, -0x30579dd0, 0x7ad166e7, 0x70c2dd8a } + }, + { + { -0x471281ed, -0x6e2b6983, -0x28897e86, 0x74252f0a, 0x0d852564, -0x1bf67d20, 0x16a53ce5, 0x32b86138 }, + { -0x609013f2, 0x65619450, 0x46c6518d, -0x11d18157, 0x67e09b5c, -0x68cc3e0d, 0x63948495, 0x2e0fac63 }, + { -0x1bb7329c, 0x79e7f7be, 0x087886d0, 0x6ac83a67, -0x5f1b24d2, -0x07602b27, 0x735a4f41, 0x4179215c } + }, + { + { 0x286bcd34, -0x1b51cc47, 0x559dd6dc, -0x4810814a, -0x4c2c71e1, 0x278b141f, 0x2241c286, 0x31fa8566 }, + { -0x282312d6, -0x738f6b19, 0x47d39c70, -0x6804753d, -0x56f926fe, -0x1ec41fcd, 0x0cd99d76, 0x700344a3 }, + { 0x2e3622f4, -0x507d93be, -0x67ccafd3, -0x3edfd679, 0x2b389123, -0x643e481f, -0x566adb77, 0x24bb2312 } + }, + { + { -0x0a07a395, 0x41f80c2a, 0x04fa6794, 0x687284c3, -0x5c45e453, -0x76ba2067, -0x0014a2ea, 0x0d1d2af9 }, + { 0x32de67c3, -0x4e5712e9, 0x461b4948, 0x3cb49418, 0x76cfbcd2, -0x7142bcbd, 0x1e188008, 0x0fee3e87 }, + { 0x32621edf, -0x5625755f, 0x59226579, 0x30b822a1, -0x58653e6d, 0x4004197b, 0x18531d76, 0x16acd797 } + }, + { + { 0x7887b6ad, -0x36a6393b, 0x5f90feba, -0x6b1e6153, -0x5cbd0afc, 0x16e24e62, 0x18161700, 0x164ed34b }, + { 0x2d9b1d3d, 0x72df72af, -0x5bcddba6, 0x63462a36, 0x16b39637, 0x3ecea079, -0x46cfdcf7, 0x123e0ef6 }, + { 0x192fe69a, 0x487ed94c, 0x3a911513, 0x61ae2cea, -0x465b21d9, -0x7884092d, 0x1073f3eb, 0x78da0fc6 } + }, + { + { 0x680c3a94, -0x5d607f0f, 0x1ae9e7e6, 0x71f77e15, 0x48017973, 0x1100f158, 0x16b38ddd, 0x054aa4b3 }, + { -0x1ad43996, 0x5bf15d28, 0x70f01a8e, 0x2c47e318, 0x06c28bdd, 0x2419afbc, 0x256b173a, 0x2d25deeb }, + { 0x19267cb8, -0x2037b973, 0x66e54daf, 0x0b28789c, 0x666eec17, 0x2aeb1d2a, -0x548258a0, 0x134610a6 } + }, +}, +{ + { + { -0x23fd73c4, -0x26ebcf20, 0x5217c771, 0x0eb955a8, 0x2c99a1fa, 0x4b09e1ed, -0x42958bc4, 0x42881af2 }, + { 0x7c59b23f, -0x350aa13e, 0x154d04f2, -0x665112c2, -0x1ebebe0c, 0x68441d72, 0x3932a0a2, 0x14034513 }, + { -0x54a352c3, 0x7bfec69a, 0x4cb2cfad, -0x3dc1732d, -0x04c8295e, 0x685dd14b, 0x15677a18, 0x0ad6d644 } + }, + { + { 0x47927e9f, 0x79148928, 0x370aa877, 0x33dad6ef, 0x11122703, 
0x1f8f24fa, 0x2adf9592, 0x5265ac2f }, + { 0x417becb5, 0x781a439e, -0x2ef1fd9a, 0x4ac5938c, 0x0692ac24, 0x5da38511, -0x521cedcd, 0x11b065a2 }, + { -0x65034cba, 0x405fdd30, 0x28e63f54, -0x268dc2bc, 0x5f65aaae, -0x6b3fe210, -0x1eb3f7f7, 0x43e4dc3a } + }, + { + { -0x523d395d, -0x1590853d, -0x168e836c, -0x2f16d70a, -0x29ba150b, -0x1d2c8616, -0x3ae00442, 0x46dd8785 }, + { -0x56c75ae9, -0x43ed380f, 0x3180b2e1, 0x473028ab, -0x0432dab6, 0x3f78571e, 0x6ff6f90f, 0x74e53442 }, + { 0x375c8898, 0x709801be, -0x1c027cb8, 0x4b06dab5, 0x27230714, 0x75880ced, -0x22d0b3be, 0x2b09468f } + }, + { + { -0x7d005fd6, 0x5b979465, -0x01570ab7, -0x25f695af, 0x5f77af9b, -0x5f9caec9, 0x201d1e76, 0x1bcfde61 }, + { -0x48fe346a, -0x6838b612, -0x495c963d, -0x7c0bc72c, -0x65bfd327, 0x62962b8b, -0x67772085, 0x6976c750 }, + { 0x246a59a2, 0x4a4a5490, -0x17802270, -0x29c14222, 0x0d2371fa, -0x26bc8399, -0x2cf0712a, 0x69e87308 } + }, + { + { -0x7437fcfd, 0x0f80bf02, 0x7a18cefb, 0x6aae16b3, -0x28d3295d, -0x22b815b9, -0x0b12c656, 0x61943588 }, + { 0x5656beb0, 0x435a8bb1, 0x4f4d5bca, -0x07053646, 0x1548c075, -0x464d873c, -0x176d49de, 0x3eb0ef76 }, + { -0x6efc607b, -0x2d91a3c2, -0x090cc557, -0x3f161883, 0x70066a93, -0x176973ab, 0x1faaaddd, 0x3c34d188 } + }, + { + { 0x2fffe0d9, -0x42a4f471, 0x3ed24fb9, 0x6aa25410, -0x4d97de3c, 0x2ac7d7bc, 0x60dca36a, 0x605b394b }, + { -0x5f606140, 0x3f9d2b5e, -0x49dc5770, 0x1dab3b6f, 0x72d926c4, -0x5f645c16, 0x3fd8b36d, 0x37419351 }, + { 0x5a9d1ed2, -0x4b17a91c, 0x6c97a9a2, -0x1017b78a, 0x1e5eee7d, -0x4efb309c, -0x7758e371, 0x2f50b81c } + }, + { + { -0x5825add6, 0x2b552ca0, 0x449b0250, 0x3230b336, -0x5b466047, -0x0d3b3a44, 0x58074a22, 0x7b2c6749 }, + { -0x0397ee45, 0x31723c61, 0x6211800f, -0x634bafb8, 0x47995753, 0x768933d3, 0x02752fcd, 0x3491a535 }, + { 0x3ed28cdf, -0x2aae9a78, -0x2c9d21c7, 0x12d84fd2, -0x1cc871b1, 0x0a874ad3, 0x7c763e74, 0x000d2b1f } + }, + { + { 0x3e94a8ab, -0x69db8874, -0x16587414, 0x0ad6f3ce, 0x0d743c4f, -0x6b75387f, -0x55130334, 0x76627935 }, + { -0x2f92b599, 0x3d420811, -0x6f1f001d, -0x4103fb7b, -0x42b78422, -0x078f3949, 0x319afa28, 0x6e2a7316 }, + { -0x292a6561, 0x56a8ac24, 0x3096f006, -0x37248ac2, -0x70b3ad67, 0x477f41e6, -0x09379eec, 0x588d851c } + }, +}, +{ + { + { 0x77d1f515, -0x32d59a19, -0x70559f0f, 0x54899187, -0x2543f91b, -0x4e48c444, -0x56833605, 0x654878cb }, + { -0x72094f02, 0x51138ec7, -0x1a8a0ae5, 0x5397da89, 0x717af1b9, 0x09207a1d, 0x2b20d650, 0x2102fdba }, + { 0x055ce6a1, -0x69611bfb, 0x1251ad29, 0x36bca768, -0x55825beb, 0x3a1af517, 0x29ecb2ba, 0x0ad725db } + }, + { + { -0x64fa907b, -0x013843f4, -0x180a0029, 0x537d5268, 0x4312aefa, 0x77afc662, 0x02399fd9, 0x4f675f53 }, + { -0x7cb1dba9, -0x23bd984f, 0x70ce1bc5, -0x498abb4b, -0x082ea129, 0x1af07a0b, 0x71a03650, 0x4aefcffb }, + { 0x0415171e, -0x3cd2c9ca, -0x7667b7c5, -0x32d410ef, -0x2f6baef0, -0x78f59153, -0x5d579a9f, 0x0bccbb72 } + }, + { + { 0x50fe1296, 0x186d5e4c, -0x01176082, -0x1fc6847e, 0x507031b0, 0x3bc7f6c5, 0x108f37c2, 0x6678fd69 }, + { -0x154e5638, 0x185e962f, 0x65147dcd, -0x791819cb, -0x44a4920e, -0x4f6d1fcf, 0x59d6b73e, 0x4024f0ab }, + { 0x636863c2, 0x1586fa31, 0x572d33f2, 0x07f68c48, 0x789eaefc, 0x4f73cc9f, -0x7152b8ff, 0x2d42e210 } + }, + { + { 0x0f537593, 0x21717b0d, 0x131e064c, -0x6eb196f5, 0x752ae09f, 0x1bb687ae, -0x64bdc392, 0x420bf3a7 }, + { -0x6b202d65, -0x680aeceb, 0x313f4c6a, 0x6155985d, 0x08455010, -0x145ec0f9, -0x472d2cde, 0x676b2608 }, + { 0x1c5b2b47, -0x7ec7459b, 0x311b1b80, -0x798e4914, -0x43ceca50, 0x7bff0cb1, -0x63f30e20, 0x745d2ffa } + }, + { + { 0x21d34e6a, 0x6036df57, -0x66844c30, 
-0x4e2477d9, -0x378a9506, -0x2c3df63d, 0x4c1dc839, 0x06e15be5 }, + { 0x2bc9c8bd, -0x40ada5e2, 0x26479d81, -0x15a4d9f8, -0x20feaa25, -0x2aee38f2, -0x69f30a30, 0x1ae23ceb }, + { 0x1932994a, 0x5b725d87, -0x314e2550, 0x32351cb5, -0x254835fb, 0x7dc41549, 0x278ec1f7, 0x58ded861 } + }, + { + { -0x493d3658, 0x2dfb5ba8, -0x0ad3a674, 0x48eeef8e, -0x0ed2ea8d, 0x33809107, 0x531d5bd8, 0x08ba696b }, + { -0x0d993aa4, -0x27e8c86d, -0x33bab1b7, -0x3736893b, -0x43d93c58, 0x5ce382f8, 0x5485f6f9, 0x2ff39de8 }, + { -0x3c103a86, 0x77ed3eee, -0x2b00b7ef, 0x04e05517, -0x0e598e35, -0x15c285c1, -0x6b8301ac, 0x120633b4 } + }, + { + { 0x4912100a, -0x7d42ceb9, 0x7e6fbe06, -0x21dc8493, 0x11ea79c6, -0x1ee189e7, -0x34c6c422, 0x07433be3 }, + { -0x6e9effbe, 0x0b949878, -0x13140518, 0x4ee7b13c, -0x6b0f5b40, 0x70be7395, -0x4b2a6e7b, 0x35d30a99 }, + { 0x5ce997f4, -0x0086bb40, -0x4fa3ae5d, 0x575d3de4, 0x5a76847c, 0x583381fd, 0x7af6da9f, 0x2d873ede } + }, + { + { 0x4e5df981, -0x559dfd1f, 0x5015e1f5, -0x5df2a6e9, -0x451de294, 0x18a275d3, 0x01600253, 0x0543618a }, + { 0x43373409, 0x157a3164, -0x0b557e27, -0x05474812, -0x0a59b7fa, -0x4f6c011a, 0x707fa7b6, 0x2e773654 }, + { -0x68b3dc3f, 0x0deabdf4, -0x6231b96d, -0x5590f5db, -0x5d6545d4, 0x04202cb8, 0x2d07960d, 0x4b144336 } + }, +}, +{ + { + { 0x57c5715e, 0x299b1c3f, 0x6b686d90, -0x69346d62, 0x47235ab3, 0x30048064, -0x5bb2601f, 0x2c435c24 }, + { 0x53242cec, 0x47b837f7, -0x3fbded0e, 0x256dc48c, -0x1e26d73b, -0x1ddd0405, -0x5275d3f9, 0x48ea295b }, + { -0x7f077cc1, 0x0607c97c, -0x35da13a5, 0x0e851578, 0x161ebb6f, 0x54f7450b, -0x5f2107f2, 0x7bcb4792 } + }, + { + { 0x045224c2, 0x1cecd0a0, 0x69e53952, 0x757f1b1b, 0x5289f681, 0x775b7a92, 0x16736148, 0x1b6cc620 }, + { 0x2bc73659, -0x7b781c30, 0x059979df, 0x4baf8445, -0x23529041, -0x2e8368a6, -0x2103694a, 0x57369f0b }, + { 0x75638698, -0x0e5666ff, -0x11559f2d, 0x353dd1be, 0x4c9ba488, -0x7b6b8ecd, 0x43ade311, 0x63fa6e68 } + }, + { + { -0x2db4a149, 0x2195becd, -0x3f32bb07, 0x5e41f18c, 0x41ca9ede, -0x20d7f8bc, -0x0ca48299, 0x07073b98 }, + { 0x6597c168, -0x2ea3dfad, -0x672d7877, -0x608c8c00, 0x3257ba1f, 0x18aee7f1, 0x07346f14, 0x3418bfda }, + { 0x4ce530d4, -0x2fc39894, 0x3b5df9f4, 0x0b64c047, 0x19b3a31e, 0x065cef8b, 0x533102c9, 0x3084d661 } + }, + { + { 0x760321fd, -0x6593178a, -0x6149c528, 0x7fe2b510, -0x7537fa6e, 0x00e7d4ae, -0x44908dc6, 0x73d86b7a }, + { -0x407b9653, -0x1e094862, -0x1d99cecb, 0x15801004, -0x508be7e5, -0x65b67cd0, 0x049b673c, 0x3ba2504f }, + { 0x6dba5ab6, 0x0b52b560, -0x444e1255, -0x56ecb0f1, -0x64fb59cb, 0x30a9520d, 0x7973e5db, 0x6813b8f3 } + }, + { + { -0x0cea81d7, -0x0e6b35aa, 0x5ef528a5, 0x136d3570, -0x74fa6644, -0x22b31089, 0x24f833ed, 0x7d5472af }, + { 0x334127c1, -0x67ab4fac, -0x7d0400db, 0x105d0478, 0x44186f4f, -0x24b60807, -0x412f4700, 0x1768e838 }, + { -0x50cc25b9, -0x2f1078b3, -0x491cc607, 0x00d3be5d, -0x63631132, 0x3f2a8a2f, 0x2352435a, 0x5d1aeb79 } + }, + { + { -0x49e4588b, 0x12c7bfae, -0x1d9c4003, -0x47b19de1, 0x5c840dcf, 0x0b47a5c3, -0x335079cc, 0x7e83be0b }, + { 0x19cd63ca, -0x0a61944d, 0x21d06839, 0x670c1592, 0x2150cab6, -0x4f92a9a5, 0x104f12a3, 0x20fb199d }, + { 0x6d99c120, 0x61943dee, 0x460b9fe0, -0x79efe0d2, -0x7117a673, 0x6bb2f151, -0x033b8a34, 0x76b76289 } + }, + { + { 0x522ec0b3, 0x4245f1a1, 0x2a75656d, 0x558785b2, 0x48a1b3c0, 0x1d485a25, -0x2a701f61, 0x60959ecc }, + { 0x756286fa, 0x791b4cc1, -0x28b5ea84, -0x24312ce9, -0x158d421a, 0x7e732421, 0x1131c8e9, 0x01fe1849 }, + { -0x571285f7, 0x3ebfeb7b, -0x1afd8764, 0x49fdc2bb, 0x3c119428, 0x44ebce5d, -0x416b80b6, 0x35e1eb55 } + }, + { + { 0x726ccc74, 
0x14fd6dfa, 0x2f53b965, 0x3b084cfe, 0x52a2c8b4, -0x0cc51b0b, 0x0d40166a, 0x59aab07a }, + { -0x3a8c722d, -0x242518ff, -0x4d90e412, -0x063909cb, 0x42f15ef4, 0x61e96a80, -0x509f5b28, 0x3aa1d11f }, + { -0x6da153db, 0x77bcec4c, 0x60137738, 0x18487184, -0x01560baf, 0x5b374337, -0x371955ba, 0x1865e78e } + }, +}, +{ + { + { 0x1c529ccb, -0x6983ab17, 0x64c635fb, 0x30f62692, 0x78121965, 0x2747aff4, -0x150990a4, 0x17038418 }, + { -0x4991e086, -0x333b4839, -0x0af3d082, 0x44157e25, 0x713eaf1c, 0x3ef06dfc, 0x52da63f7, 0x582f4467 }, + { 0x20324ce4, -0x39ce842d, -0x5bb7743c, -0x57efbd18, 0x4e5a1364, -0x4de10e75, -0x325d7237, 0x0c2a1c4b } + }, + { + { 0x69bd6945, -0x123b7eb8, -0x41e372de, 0x0d6d907d, -0x2aa33a55, -0x39c42dee, -0x5ceb237d, 0x5a6a9b30 }, + { 0x6f1f0447, -0x2db23830, -0x24783fa7, -0x4dd961c2, -0x044d2d71, -0x2ea4fd8e, -0x3909b789, 0x7c558bd1 }, + { -0x2c69b9c3, -0x2f13eadc, -0x3ca5db10, 0x12bb628a, 0x1cbc5fa4, -0x5af3c587, 0x0afbafc3, 0x0404a5ca } + }, + { + { 0x2a416fd1, 0x62bc9e1b, -0x1cafa675, -0x4a3908d8, 0x3d5d6967, 0x04343fd8, -0x18071168, 0x39527516 }, + { 0x0aa743d6, -0x73e0bff9, 0x5b265ee8, -0x33452f35, 0x668fd2de, 0x574b046b, -0x352269cd, 0x46395bfd }, + { 0x1a5d9a9c, 0x117fdb2d, -0x2effa3d6, -0x6388ba44, 0x54d56fea, -0x102b410f, -0x17dd2fea, 0x76579a29 } + }, + { + { 0x52b434f2, 0x333cb513, -0x6c217f1f, -0x27cdd7b7, 0x750d35ce, -0x4aaed779, 0x2a2777c1, 0x02c514bb }, + { 0x49c02a17, 0x45b68e7e, -0x43565c81, 0x23cd51a2, -0x13ddb3e5, 0x3ed65f11, -0x61fa424f, 0x43a384dc }, + { -0x740e49bb, 0x684bd5da, -0x094ab4ad, -0x04742c82, -0x564f2dad, 0x313916d7, 0x61548059, 0x11609209 } + }, + { + { 0x369b4dcd, 0x7a385616, 0x655c3563, 0x75c02ca7, -0x2b0e7fdf, 0x7dc21bf9, -0x6e191fbe, 0x2f637d74 }, + { 0x29dacfaa, -0x4bb2e997, -0x7beca671, -0x25ad60b4, 0x453d5559, -0x16109c36, -0x3a9671f5, 0x351e125b }, + { 0x1af67bbe, -0x2b4b64ba, -0x3754769f, -0x29fcfc86, -0x06596605, 0x71dee19f, -0x1831d566, 0x7f182d06 } + }, + { + { -0x71de8ade, 0x09454b72, -0x2b7b4728, -0x55a7170c, 0x7f46903c, -0x2ca7dab3, 0x241c5217, 0x44acc043 }, + { -0x54fe9714, 0x7a7c8e64, 0x15edc543, -0x34a5b5ab, 0x47cd0eda, 0x095519d3, 0x343e93b0, 0x67d4ac8c }, + { 0x4f7a5777, 0x1c7d6bbb, -0x6e7cec1f, -0x74ca012c, -0x3694b97c, 0x4adca1c6, 0x12ad71bd, 0x556d1c83 } + }, + { + { -0x4ee417df, -0x7e0f98aa, 0x10a3f3dd, 0x0faff823, 0x6a99465d, -0x074d2fab, -0x337380fb, 0x097abe38 }, + { 0x0c8d3982, 0x17ef40e3, 0x15a3fa34, 0x31f7073e, 0x0773646e, 0x4f21f3cb, 0x1d824eff, 0x746c6c6d }, + { 0x7ea52da4, 0x0c49c987, -0x6423e2bd, 0x4c436955, -0x0833142e, 0x022c3809, 0x4bee84bd, 0x577e14a3 } + }, + { + { -0x42b228d5, -0x6b013142, 0x060f2211, -0x0b95b026, -0x3f372e01, 0x124a5977, -0x04ff6d6b, 0x705304b8 }, + { 0x61a73b0a, -0x0f1d9754, 0x3791a5f5, -0x0d0505f0, 0x6b6d00e9, -0x3e1ec17e, 0x6fd78f42, 0x60fa7ee9 }, + { 0x4d296ec6, -0x49c2e2cb, 0x5fad31d8, -0x0c3cfac2, -0x4b42bd14, 0x670b958c, -0x5e9cac03, 0x21398e0c } + }, +}, +{ + { + { -0x79e48166, -0x793a03ea, 0x6a27c451, -0x095ccfb9, -0x5e16ca69, 0x01667267, 0x6082dfeb, 0x05ffb9cd }, + { -0x72582d11, 0x216ab2ca, -0x660bd7d9, 0x366ad9dd, 0x4fdd3c75, -0x519b4700, 0x53909e62, 0x403a395b }, + { -0x0ac09ec7, -0x59e80561, 0x13e66cb6, 0x60f2b5e5, -0x4cbb755c, -0x28574111, 0x6f5ea192, 0x7a293285 } + }, + { + { 0x79639302, -0x4763bbb8, 0x50c67f2c, 0x4ae4f193, -0x37e5063a, -0x0f4ca258, 0x46871017, 0x39d00035 }, + { -0x4fd21778, 0x0b39d761, -0x2dbeb1e1, 0x5f550e7e, 0x22e1a940, -0x59405ba8, -0x02bb8467, 0x050a2f7d }, + { -0x59af2489, 0x437c3b33, -0x453ad44e, 0x6bafe81d, 0x2db7d318, -0x0166bfd3, 0x372ba6ce, 0x2b5b7eec 
} + }, + { + { 0x613ac8f4, -0x596bbfb3, -0x056818d4, 0x500c3c2b, 0x1fcec210, -0x78befb2e, -0x79fb5712, 0x1b205fb3 }, + { -0x7c0af111, -0x4c43b443, -0x736d879a, 0x508f0c99, -0x37481992, 0x43e76587, -0x5b806727, 0x0f7655a3 }, + { -0x2db4ecc4, 0x55ecad37, 0x6038c90b, 0x441e147d, -0x29d39012, 0x656683a1, -0x781f1352, 0x0157d5dc } + }, + { + { -0x28e14adc, -0x6ad9aaec, 0x5df14593, -0x19fc277f, 0x0d4de6b7, 0x147cdf41, 0x0437c850, 0x5293b173 }, + { 0x0354c13d, -0x0d5850af, -0x55c8d4a0, -0x285f4ebb, 0x05a3d470, 0x2869b96a, -0x7db9fe8d, 0x6528e42d }, + { 0x4bccf226, 0x23d0e081, -0x7e69046d, -0x6d38ba33, 0x59541e5b, -0x749e8694, -0x3fde0688, 0x40a44df0 } + }, + { + { 0x4bc5d095, -0x793691af, -0x03597fb6, -0x0df2bf68, -0x37d915a3, 0x27363d89, 0x5719cacf, 0x39ca3656 }, + { 0x4f20ea6a, -0x25579677, 0x4c620618, -0x15eb5c2f, 0x090bf8be, 0x6001fccb, -0x6b816310, 0x35f4e822 }, + { 0x6f87b75c, -0x68af90d1, 0x034ae070, -0x39db5160, -0x552cb22a, 0x1ec856e3, -0x1bbf1a71, 0x055b0be0 } + }, + { + { 0x6ea33da2, 0x4d12a04b, -0x1c9ed923, 0x57cf4c15, -0x11bb2699, -0x6f13698b, 0x2a985aac, 0x64ca348d }, + { -0x768ca2ee, 0x6469a17d, -0x199d460f, -0x2490d82b, 0x6a395681, -0x60345cd8, -0x2d9650db, 0x363b8004 }, + { -0x1b3b6ed3, -0x66a771e7, 0x1ca5ce6b, -0x1033c4b2, -0x05a4672b, 0x4522ea60, 0x1de4a819, 0x7064bbab } + }, + { + { 0x42542129, -0x5d6f3f9f, -0x4172a470, -0x0d1d3d52, 0x76abfe1b, -0x30dba725, -0x7c29d941, 0x02157ade }, + { 0x5a770641, -0x46e61eaf, 0x4e7f8039, -0x565d1d39, 0x3df23109, 0x7527250b, -0x53d84875, 0x756a7330 }, + { 0x1b9a038b, 0x3e46972a, 0x7ee03fb4, 0x2e4ee66a, 0x6edbb4ca, -0x7e5db789, -0x7132fa9d, 0x1a944ee8 } + }, + { + { 0x182362d6, -0x44bf57a7, -0x75b2e545, -0x4660aa89, 0x758559f6, -0x72e74bd9, 0x4d26235a, 0x26c20fe7 }, + { 0x51039372, -0x2a56e2ef, -0x6635d922, 0x2ed377b7, -0x02c99495, -0x5e8dfd54, -0x296fe66b, 0x0730291b }, + { -0x1633dd0b, 0x648d1d9f, 0x28dd577c, 0x66bc5619, 0x652439d1, 0x47d3ed21, -0x125074b7, 0x49d271ac } + }, +}, +{ + { + { -0x4b48a9ff, 0x2798aaf9, 0x5c8dad72, 0x5eac7213, 0x61b7a023, -0x2d31559f, -0x167082b2, 0x1bbfb284 }, + { 0x382b33f3, -0x760afa76, -0x52b73f4c, 0x5ae2ba0b, -0x5ac24c92, -0x706c4afd, -0x6a5dcd1a, 0x5aa3ed9d }, + { -0x38269a9f, 0x656777e9, 0x72c78036, -0x34d4edac, -0x26af9112, 0x65053299, 0x5e8957cc, 0x4a07e14e } + }, + { + { -0x3b885b65, 0x240b58cd, 0x6447f017, -0x02c72522, -0x58379553, 0x19928d32, -0x7b505f7f, 0x50af7aed }, + { -0x67f20667, 0x4ee412cb, 0x3c6ec771, -0x5cea2891, -0x6da38803, -0x445a1222, 0x1d313402, 0x3f0bac39 }, + { 0x15f65be5, 0x6e4fde01, 0x216109b2, 0x29982621, 0x0badd6d9, 0x78020581, -0x45142ffa, 0x1921a316 } + }, + { + { -0x260c3e75, -0x28a55266, 0x60b1c19c, 0x566a0eef, 0x255c0ed9, 0x3e9a0bac, -0x5f9d380b, 0x7b049dec }, + { -0x20478f04, -0x76bdd082, 0x4f76b3bd, 0x2c296beb, 0x36c24df7, 0x0738f1d4, -0x1d8c5150, 0x6458df41 }, + { 0x35444483, -0x23341c86, 0x0fedbe93, 0x75887933, 0x12c5dd87, 0x786004c3, -0x3d6af19c, 0x6093dccb } + }, + { + { 0x6084034b, 0x6bdeeebe, 0x780fb854, 0x3199c2b6, -0x49d2f96b, -0x68cc8955, -0x749b8270, 0x6e3180c9 }, + { -0x7a1f8f93, 0x1ff39a85, -0x4c18c6cd, 0x36d0a5d8, 0x718f453b, 0x43b9f2e1, 0x4827a97c, 0x57d1ea08 }, + { -0x5ed74f8f, -0x11854919, -0x6c577456, -0x5b3ea693, -0x4dde9ed0, -0x084b217e, -0x226842e8, 0x363e999d } + }, + { + { -0x1db4513a, 0x2f1848dc, -0x454350a0, 0x769b7255, 0x3cefe931, -0x6f34c392, -0x39064cab, 0x231f979b }, + { 0x35ee1fc4, -0x6957bc3f, 0x08e4c8cf, -0x68914cab, -0x4a732cd0, -0x4bd097ff, 0x693a052b, 0x48ee9b78 }, + { -0x33d50c3a, 0x5c31de4b, -0x01df72e1, -0x4fb44fd0, -0x3eb04b9a, 
-0x48728ff7, 0x08792413, 0x079bfa9b } + }, + { + { -0x5d2abdbb, -0x0c361280, 0x77f63952, 0x0aa08b78, -0x2ef7ab8b, -0x2892539d, -0x6b8f9c95, 0x1ef4fb15 }, + { -0x25cff20c, -0x1c6fc5af, 0x3da95ab0, -0x7bc69bdd, 0x0b356480, -0x12c30ed3, -0x7b7e8e6c, 0x038c77f6 }, + { 0x5b167bec, -0x7ab1a11a, -0x692f323e, 0x59590a42, -0x67efde67, 0x72b2df34, 0x4a0bff56, 0x575ee92a } + }, + { + { 0x0aa4d801, 0x5d46bc45, -0x5acc4628, -0x3c50edd9, 0x2b8906c2, 0x389e3b26, 0x382f581b, 0x200a1e7e }, + { -0x75e7d031, -0x2b3f7f70, -0x66b76243, 0x30e170c2, 0x52f733de, 0x05babd57, 0x2cd3fd00, 0x43d4e711 }, + { -0x1506c53b, 0x518db967, 0x056652c0, 0x71bc989b, 0x567197f5, -0x01d47a27, 0x651e4e38, 0x050eca52 } + }, + { + { 0x60e668ea, -0x6853c68a, 0x153ab497, -0x64e64402, 0x34eca79f, 0x4cb179b5, -0x5ece51a9, 0x6151c09f }, + { 0x453f0c9c, -0x3cbce522, -0x008fc465, -0x160afba2, -0x127b84c3, -0x03268537, 0x1c58f4c6, 0x4b0ee6c2 }, + { -0x020fa26a, 0x3af55c0d, 0x2ab4ee7a, -0x22d9d120, 0x12171709, 0x11b2bb87, -0x7ff0fcf5, 0x1fef24fa } + }, +}, +{ + { + { -0x6fe99de0, -0x006e5996, 0x5bf1e009, -0x0ddaad52, 0x7f90df7c, 0x7dff85d8, 0x0c736fb9, 0x4f620ffe }, + { 0x6b6c6609, -0x4b69edc6, -0x7f54a6c8, -0x58af017b, -0x483d85a1, -0x0b8e40c7, 0x77ac193c, 0x507903ce }, + { -0x2021c1cc, 0x62f90d65, -0x4605a053, -0x30d73a6e, -0x39e9baf0, -0x66379107, 0x4a256c84, 0x25d44804 } + }, + { + { -0x36fdd4ab, 0x2c7c4415, -0x7ed14e02, 0x56a0d241, -0x2849a1f3, -0x0fd15e37, -0x2acdc4da, 0x4180512f }, + { -0x38164e91, -0x4297dcf2, -0x3e3a86a3, 0x0eb1b9c1, -0x6a494e01, 0x7943c8c4, 0x0bbacf5e, 0x2f9faf62 }, + { -0x75b75a25, -0x5b00c197, -0x426abfc5, -0x4595c7fa, 0x47d5b65d, -0x60831e51, 0x5939d2fb, 0x15e087e5 } + }, + { + { -0x0469c0c8, -0x776be792, -0x239c642b, 0x48a00e80, -0x1693e367, -0x5b17f6d5, -0x35a8c99f, 0x5a097d54 }, + { 0x745c1496, 0x12207543, -0x25c79ef4, -0x2500c303, 0x2c71c34f, -0x1b1868d9, 0x34bdede9, 0x39c07b19 }, + { 0x17c9e755, 0x2d45892b, -0x76cf7208, -0x2fcc028e, 0x525b8bd9, 0x6c2fe9d9, -0x3ee33f87, 0x2edbecf1 } + }, + { + { -0x2f785da1, -0x11f0f023, 0x5c3e34ee, -0x638aceab, -0x7054c54b, 0x660c572e, 0x544cd3b2, 0x0854fc44 }, + { -0x38ea5f2e, 0x1616a4e3, -0x07cbe2b3, 0x53623cb0, -0x38176635, -0x6910acd7, -0x5997455a, 0x3d4e8dbb }, + { 0x55edad19, 0x61eba0c5, -0x0f57c21a, 0x24b533fe, -0x7c455a08, 0x3b770428, -0x675b8173, 0x678f82b8 } + }, + { + { 0x57775696, 0x1e09d940, 0x3cd951db, -0x112ed9a4, 0x20bce16f, -0x056253d5, -0x172f760c, 0x0f7f76e0 }, + { -0x296ff3ac, -0x4eb6e2f5, -0x62ecd9ca, 0x3539722c, 0x0b362bc9, 0x4db92892, -0x59749621, 0x4d7cd1fe }, + { -0x2b7a4ff4, 0x36d9ebc5, -0x1b524c9b, -0x5da69b6e, -0x3dee6333, -0x3e9a6b80, 0x186e0d5f, 0x45306349 } + }, + { + { 0x2b072491, -0x695beb14, 0x27a7b65b, 0x1bb22181, 0x6e8a4af0, 0x6d284959, -0x32d889a1, 0x65f3b08c }, + { -0x593200e3, -0x6b222f3f, -0x17bdec52, 0x55f6f115, -0x66d03096, 0x6c935f85, 0x4a37f16f, 0x067ee0f5 }, + { 0x199801f7, -0x134d6001, -0x5d5f08d1, -0x62c9e2e1, 0x75fd2f49, 0x25f11d23, 0x0fe10fe2, 0x124cefe8 } + }, + { + { 0x31b16489, 0x1518e85b, -0x248ef405, -0x70552349, -0x5eb51dc7, 0x39b0bdf4, 0x503d20c1, 0x05f4cbea }, + { -0x2e720dab, 0x4c126cf9, 0x147a63b6, -0x3e2b8e17, -0x0c36c4a1, 0x2c6d3c73, -0x1c00795e, 0x6be3a6a2 }, + { -0x3fbeba44, -0x31fbf162, 0x08f6834c, -0x38e00b1e, -0x5477b85d, -0x42ab9173, -0x5b2d545b, 0x64666aa0 } + }, + { + { 0x3337e94c, -0x4f3ac409, 0x11e14f15, 0x7cb5697e, 0x1930c750, 0x4b84abac, -0x1f9bfb98, 0x28dd4abf }, + { 0x7c06d912, 0x6841435a, -0x44c07cf5, -0x35edc3df, -0x4e341d88, -0x2b4c84d9, -0x3890afba, 0x1d753b84 }, + { 0x44cb9f44, 0x7dc0b64c, 
-0x1c6da241, 0x18a3e1ac, 0x2d0457c4, 0x7a303486, -0x75f376d2, 0x4c498bf7 } + }, +}, +{ + { + { 0x30976b86, 0x22d2aff5, -0x3d2db9fc, -0x726f47fa, 0x4de5bae5, -0x235e7694, -0x37cbf3e9, 0x28005fe6 }, + { 0x1aa73196, 0x37d653fb, 0x3fd76418, 0x0f949530, -0x04c5e84e, -0x52dff4f7, 0x2fc8613e, 0x544d4929 }, + { 0x34528688, 0x6aefba9f, 0x25107da1, 0x5c1bff94, 0x66d94b36, -0x08a44433, 0x0f316dfa, 0x72e47293 } + }, + { + { -0x2cd589d9, 0x07f3f635, 0x5f6566f0, 0x7aaa4d86, 0x28d04450, 0x3c85e797, 0x0fe06438, 0x1fee7f00 }, + { -0x687ef7b1, 0x2695208c, 0x23450ee1, -0x4eafd5f5, 0x03efde02, -0x0262515a, 0x2733a34c, 0x5a9d2e8c }, + { 0x03dbf7e5, 0x765305da, 0x1434cdbd, -0x5b250db7, -0x2db57714, 0x7b4ad5cd, -0x11fbfabd, 0x00f94051 } + }, + { + { 0x07af9753, -0x28106c45, 0x3db766a7, 0x583ed0cf, 0x6e0b1ec5, -0x31966741, 0x5dd40452, 0x47b7ffd2 }, + { -0x3c2ccf4e, -0x72ca94dd, -0x4fb8e4fa, -0x0de37465, 0x6e42b83c, -0x4c93ce94, -0x74154ef3, 0x07d79c7e }, + { -0x43f722ee, -0x78040464, -0x1e113d65, -0x75f994c6, -0x24e03e41, 0x0d57242b, 0x5ea64bb6, 0x1c3520a3 } + }, + { + { 0x216bc059, -0x325790c0, 0x12bcd87e, 0x1fbb231d, 0x17c70990, -0x4b6a9562, 0x66d12e55, 0x38750c3b }, + { -0x43345cb6, -0x7f2dac5a, 0x3838219b, 0x3e61c3a1, -0x677d1c6a, -0x6f3c49ff, 0x5d0ee66f, 0x1c3d0577 }, + { -0x6bdd1ae6, 0x692ef140, 0x2b5df671, -0x343f38c4, 0x744ce029, 0x21014fe7, -0x2ccfb784, 0x0621e2c7 } + }, + { + { -0x4f240f0d, -0x4851e86a, -0x1e831e6a, 0x54dfafb9, -0x16555c4c, 0x25923071, -0x5effd163, 0x5d8e589c }, + { -0x7da67c73, -0x50679f34, -0x39606524, -0x6f15b73f, 0x65581e30, 0x65264837, 0x7bd3a5bc, 0x0007d609 }, + { 0x0842a94b, -0x3f40e26b, 0x588f2e3e, -0x4d2c3c9d, -0x44ae1d11, 0x0a961438, 0x3c1cbf86, 0x1583d778 } + }, + { + { -0x3362d739, -0x6ffcb8fc, -0x08d33a71, 0x1d1b679e, -0x41a478da, 0x16e12b5f, -0x7c3aa7f6, 0x4958064e }, + { 0x5da27ae1, -0x13115d11, 0x55670174, 0x597c3a14, 0x6609167a, -0x3659d5ee, -0x7e127090, 0x252a5f2e }, + { 0x5066e80d, 0x0d289426, 0x307c8c6b, -0x033c087b, 0x0c1112fd, 0x1b53da78, -0x27bc4c78, 0x079c170b } + }, + { + { -0x3f2a2faa, -0x322932b0, -0x44fca8c5, -0x65089793, -0x0c3c10b8, 0x3ca6723f, 0x317b8acc, 0x6768c0d7 }, + { 0x64fa6fff, 0x0506ece4, 0x6205e523, -0x411cbce2, 0x51b8ea42, 0x35794224, 0x4ac9fb00, 0x6dec05e3 }, + { -0x0eaa3e4d, -0x6b49da1b, -0x6684846f, 0x417bf3a7, 0x6d6b2600, -0x3dd34224, -0x2232ad0c, 0x51445e14 } + }, + { + { 0x2bbea455, -0x76ceb855, -0x6df86ed7, -0x73ac5db1, -0x41cf0859, 0x4b49f948, 0x6e4fd43d, 0x12e99008 }, + { 0x3b144951, 0x57502b4b, 0x444bbcb3, -0x71980095, 0x166385db, -0x474296d9, -0x1c6d6a38, 0x13186f31 }, + { 0x7fdfbb2e, -0x0ef3694d, 0x121ceaf9, -0x60656ca2, 0x3a5b983f, -0x20eec93c, 0x5d3e99af, 0x77b2e3f0 } + }, +}, +{ + { + { -0x33a32d65, -0x6acd0b71, -0x5c31c98f, 0x2ba851be, 0x51122941, 0x32dacaa0, 0x350004f2, 0x478d99d9 }, + { -0x630ed9a9, -0x02f28a79, -0x1ac5f1d7, -0x17d0106c, 0x5bbb4be7, -0x33cb5810, -0x5af3c75e, 0x0b251172 }, + { -0x6f44fd40, 0x1d5ad948, 0x0ec25115, 0x50e208b1, 0x4ef21702, -0x5d95dd77, 0x3b524805, 0x4dc92334 } + }, + { + { -0x0c93b68b, 0x3ad3e3eb, 0x37862125, -0x28a2da5b, -0x5fda5aea, -0x178c6bc3, -0x3bee37b9, 0x6bbc7cb4 }, + { 0x0f8086b6, -0x1c7d73c0, -0x6860f238, 0x3f77e6f7, 0x4df42cb4, 0x7ef6de30, -0x4954287c, 0x5265797c }, + { -0x2b5af2aa, 0x3c6f9cd1, -0x39015482, -0x49dbbf89, 0x3580972e, 0x6ff9bf48, -0x4ccd5305, 0x00375883 } + }, + { + { 0x6c75c99c, -0x3674137b, 0x00e33cf4, -0x1bbe7b40, -0x456f89cc, 0x0a676b9b, 0x71f379d7, 0x669e2cb5 }, + { 0x28cb0940, 0x0001b2cd, 0x6f1c24c9, 0x63fb51a0, -0x232a35cf, -0x4a52796f, -0x73baf9a0, 0x67238dbd }, + { 
-0x5b642cf8, -0x34ee948d, 0x2392729e, 0x025aad6b, 0x3f55d9b1, -0x4b86c106, 0x40678bb9, 0x72a10561 } + }, + { + { -0x1d1afa4a, 0x0d8d2909, -0x3fd6edd0, -0x67358755, -0x564edcd9, 0x77ef5569, -0x7ebc64b9, 0x7c77897b }, + { 0x1cc9249d, -0x5d497ed5, 0x21211f58, 0x62866eee, 0x5df10ece, 0x2cb5c5b8, -0x1d9c5200, 0x03a6b259 }, + { -0x21cce34b, -0x0e3e4a1e, 0x15fca420, 0x5a9f5d8e, 0x7bd932b1, -0x605bc70f, 0x1c6146e7, 0x2a381bf0 } + }, + { + { -0x4acbe991, -0x083f41ce, 0x19cf70d4, 0x27e6ca64, -0x56a858a7, -0x6cb20829, -0x54213d56, 0x5701461d }, + { -0x3037ee3f, -0x53646787, 0x3756e567, -0x7482d67f, 0x7c70edfc, 0x50da4e60, -0x77bbff4a, 0x5dbca62f }, + { 0x2c915c25, 0x2c674740, 0x0b0d340a, 0x1bdcd1a8, 0x07b43f5f, 0x5e5601bd, 0x5539a242, 0x2555b4e0 } + }, + { + { -0x781b9c2c, 0x78409b1d, -0x32049c63, -0x52b256a6, 0x55259b9c, -0x13d788c9, -0x3cedcf55, 0x69c806e9 }, + { 0x66ddd216, 0x6fc09f52, -0x371c8fb8, -0x231a9f59, -0x5d209d03, -0x139a6c63, -0x1ad12e6e, 0x7a869ae7 }, + { 0x14bb3f22, 0x7b48f574, -0x51233378, 0x68c7cee4, 0x79ed80be, -0x12d06c9f, 0x5f77bc4b, 0x25d70b88 } + }, + { + { -0x44e51b2c, -0x67ba62d7, 0x39f954ec, 0x56b9c4c7, -0x3d64b4c2, -0x7cd8bc0a, -0x67497876, 0x21ea8e27 }, + { 0x762bf4de, 0x4151c3d9, 0x2745d82b, 0x083f435f, 0x0d23ddd5, 0x29775a2e, 0x69a5db24, 0x138e3a62 }, + { 0x6a5a7b9c, -0x78410b4c, 0x5fc1d062, -0x2dd662e5, -0x22cde9b8, -0x7dbf67e8, -0x1a5d1fc3, 0x5c5abeb1 } + }, + { + { 0x1306a233, 0x02cde6de, 0x116f8ec7, 0x7b5a52a2, -0x3ee9c4a5, -0x1e397e0c, 0x60d32643, 0x241d3506 }, + { -0x48c3d225, 0x14722af4, 0x5a05060d, -0x43b8f3a1, 0x2581b02e, 0x00943eac, 0x1f499c8f, 0x0e434b3b }, + { 0x0ebc52c7, 0x6be4404d, -0x4e586e0b, -0x51b9dcc5, -0x2da24bd5, 0x2aec170e, 0x6645d694, 0x1d8dfd96 } + }, +}, +{ + { + { 0x12ddb0a4, -0x2a679c64, -0x3fdb7995, -0x5a2e60d0, 0x58fce460, -0x2e83d0fd, 0x2e095e8a, 0x07a19515 }, + { -0x63d13b22, 0x296fa9c5, 0x4f84f3cb, -0x43749e41, 0x17a8f908, 0x1c7706d9, 0x7ad3255d, 0x63b795fc }, + { 0x389e5fc8, -0x57c970fe, -0x30721bc5, -0x6fbcc4fe, -0x3abed9bd, -0x505e02a3, 0x032f0137, 0x3e8fe83d } + }, + { + { -0x17102ec4, 0x08704c8d, 0x33e03731, -0x203ae572, 0x1260cde3, -0x5a62a25b, -0x59da737a, 0x22d60899 }, + { 0x0570a294, 0x2f8b15b9, 0x67084549, -0x6b0dbd90, 0x61bbfd84, -0x21e3a51f, 0x7fac4007, 0x75ba3b79 }, + { 0x70cdd196, 0x6239dbc0, 0x6c7d8a9a, 0x60fe8a8b, -0x14bfeda0, -0x4c77b844, -0x788861a2, 0x0904d07b } + }, + { + { 0x48f940b9, -0x0bcdd29a, -0x42d2f3c7, 0x06952f0c, -0x5f7e06cf, 0x167697ad, -0x4508d594, 0x6240aace }, + { -0x22456e64, -0x4b31e02c, -0x38b37256, -0x30ce24c2, -0x527933af, 0x2c63cc63, -0x43e221f9, 0x43e2143f }, + { 0x5ba295a0, -0x07cb8b64, -0x35c82da6, -0x296b83a5, -0x1836ce96, 0x66f13ba7, -0x724bf354, 0x56bdaf23 } + }, + { + { -0x3e62c44e, 0x1310d36c, 0x622386b9, 0x062a6bb7, -0x285eb0a4, 0x7c9b8591, 0x7e1e5754, 0x03aa3150 }, + { -0x0acacc15, 0x362ab9e3, 0x6eb93d40, 0x338568d5, 0x1d5a5572, -0x61f1ebae, -0x7c8bece8, 0x1d24a86d }, + { -0x002b31e1, -0x0b1389b8, 0x54ac8c1c, -0x1fba1510, 0x1d09357c, -0x772dda7e, -0x6514b7a7, 0x43b261dc } + }, + { + { 0x6c951364, 0x19513d8b, 0x000bf47b, -0x6b018eda, -0x2ab06a99, 0x028d10dd, 0x42940964, 0x02b4d5e2 }, + { -0x77448645, -0x1aa4e1e7, -0x3e85ca63, -0x5f612f83, 0x603dea33, -0x4fd3d11e, 0x5b276bc2, 0x326055cf }, + { 0x28d18df2, -0x4b5eaa35, 0x186ce508, -0x1533b9ba, 0x6c824389, -0x3b630b6d, -0x51a2cbf0, 0x27a6c809 } + }, + { + { -0x3bc296ac, -0x32d3d8f6, 0x6a66cab2, -0x22b5c1a9, 0x69d7036c, 0x79fa5924, 0x3d8c2599, 0x22150360 }, + { 0x1f0db188, -0x74591433, 0x675a5be8, 0x37d3d73a, 0x15f5585a, -0x0dd1205d, 
-0x009f5e82, 0x2cb67174 }, + { 0x390be1d0, 0x59eecdf9, 0x728ce3f1, -0x56bddfbc, 0x7a94f0f4, -0x7d76e39a, 0x3890f436, 0x7b1df4b7 } + }, + { + { 0x07f8f58c, 0x5f2e2218, -0x2b6bf62c, -0x1caaa361, 0x1fb6a630, -0x4d555773, -0x2cad1fc3, 0x68698245 }, + { -0x4c4d5ddc, -0x1b6d0d20, 0x2b551160, 0x7c6c9e06, 0x0d7f7b0e, 0x15eb8fe2, 0x58fc5992, 0x61fcef26 }, + { 0x2a18187a, -0x244ea27b, -0x79225329, -0x0c1b552d, 0x0ff6c482, 0x44bae281, 0x3daf01cf, 0x46cf4c47 } + }, + { + { -0x0eb67ec0, 0x213c6ea7, 0x392b4854, 0x7c1e7ef8, 0x5629ceba, 0x2488c38c, 0x0d8cc5bb, 0x1065aae5 }, + { -0x613b1a07, 0x426525ed, 0x16903303, 0x0e5eda01, -0x341a3524, 0x72b1a7f2, 0x14eb5f40, 0x29387bcd }, + { -0x20dff2a9, 0x1c2c4525, -0x403598b6, 0x5c3b2dd6, -0x1e7cbfd0, 0x0a07e7b1, 0x4f1ce716, 0x69a198e6 } + }, +}, +{ + { + { -0x61d2b8cc, 0x7b26e56b, -0x7e39e98b, -0x3b38ecd5, -0x13632181, -0x10a36adb, -0x18e8bc53, 0x39c80b16 }, + { -0x10562969, 0x7afcd613, 0x1c067959, 0x0cc45aa4, -0x3e05256a, -0x5a901efc, 0x72e40365, 0x3a73b704 }, + { 0x1b826c68, 0x0f196e0d, 0x4960e3db, -0x08e00f1e, 0x23b7436c, 0x61131670, 0x77da7282, 0x0cf0ea58 } + }, + { + { 0x3ba6945a, -0x1ccd312c, -0x177e3fa3, -0x21f4ec9f, 0x5e67ed3b, 0x1ad40f09, -0x4739c2a3, 0x5da8acda }, + { -0x222b3343, 0x196c80a4, -0x6a0d2263, 0x22e6f55d, 0x40d6c71b, -0x38a1cc39, -0x34c3fbd1, 0x7bb51279 }, + { 0x3a70159f, -0x3b4999b6, 0x0a904e14, 0x76194f0f, -0x5bf693ed, -0x5a9eb3c7, -0x68601313, 0x6cd0ff50 } + }, + { + { -0x4fb45e72, 0x7fecfabd, 0x3bddbcf7, -0x2f038404, 0x057a131c, -0x5be2b792, -0x0dddc59f, 0x641a4391 }, + { -0x70bbd754, -0x3f1f9819, -0x59eeca1d, 0x14835ab0, 0x38062935, -0x0de2eb0d, -0x20fb7b64, 0x6390a4c8 }, + { -0x59f95725, -0x3a3946a6, -0x4f97da0f, -0x6eb48062, 0x44fc9eff, 0x2a731f6b, 0x62705cfc, 0x30ddf385 } + }, + { + { 0x68bcd52c, 0x33bef2bd, 0x69482ef2, -0x39b62450, 0x41cb1aee, -0x4a4911f4, 0x0212a7e5, 0x5c294d27 }, + { -0x2e400807, 0x4e3dcbda, 0x20645717, -0x36ee717e, 0x0f189d56, -0x45333144, -0x2bb98998, 0x1b4822e9 }, + { 0x25563781, -0x54c9f581, 0x480f7958, 0x2512228a, 0x6114b4e3, -0x38a2fad9, -0x268901d6, 0x222d9625 } + }, + { + { 0x0a344f85, 0x0f94be7e, -0x780dd3c8, -0x14d05574, 0x4ee16f0f, -0x631e18a2, 0x18a08dea, 0x43e64e54 }, + { -0x4c8d531f, 0x1c717f85, 0x4638bf18, -0x7e6cf197, 0x6bc08b58, 0x239cad05, -0x7807000c, 0x0b34271c }, + { 0x1a35ce63, -0x7eaa1dae, -0x06edfd72, -0x41eff2b3, -0x5a822314, -0x4007f408, 0x6d6bc6e4, 0x57342dc9 } + }, + { + { 0x1e707bf6, -0x0c3c4349, 0x7291a762, 0x351d9b8c, -0x252965cd, 0x00502e6e, 0x1ec8807f, 0x522f521f }, + { -0x3731a668, -0x10110f9b, -0x4a34155e, -0x40fd6af0, 0x20b7c458, -0x739b5efa, 0x31c24855, 0x35134fb2 }, + { -0x065c6fd5, 0x272c1f46, -0x669a8434, -0x36e45c49, 0x4f8a1c0e, -0x519eb4d0, 0x0b99017b, 0x7afcaad7 } + }, + { + { -0x107bd495, -0x577ebe14, -0x6854193b, 0x55e7b147, 0x03784ffe, -0x738b7069, -0x5032ff49, 0x5b50a1f7 }, + { -0x5b4741bf, -0x3da212ac, 0x1bb0e2dd, -0x6fd2ec1f, -0x3217d54e, 0x41f43233, -0x3c551835, 0x1085faa5 }, + { -0x0ec9eceb, -0x647bf09a, 0x701003e9, 0x18462242, -0x1b5daf80, 0x65ed45fa, 0x3fda7320, 0x0a286239 } + }, + { + { 0x6ecb9d17, -0x69f18c85, -0x2983151f, -0x050db6b8, -0x2aa1e477, 0x37e7a9b4, -0x4b93a615, 0x5cb7173c }, + { 0x347cbc9d, 0x46ab13c8, -0x663edc7d, 0x3849e8d4, -0x7829b537, 0x4cea3140, -0x4e5d6119, 0x1f354134 }, + { -0x7d485410, 0x4a89e68b, -0x64594847, -0x0be326d9, -0x1e727891, 0x16e6c210, 0x7f1b09c6, 0x7cacdb0f } + }, +}, +{ + { + { -0x233a3513, -0x1efebbcc, 0x3c84fb33, 0x47ed5d96, -0x12795f19, 0x70019576, -0x2d98061c, 0x25b2697b }, + { -0x26e58744, -0x6f9d4d20, -0x37af6999, 
0x47c9889c, 0x405070b8, -0x620ab59a, 0x2493a1bf, 0x7369e6a9 }, + { 0x13986864, -0x6298c005, 0x415dc7b8, 0x3ca5fbd9, -0x20d8c4a2, -0x1fb133c5, -0x4ab1b32e, 0x1420683d } + }, + { + { -0x3e33a530, 0x34eebb6f, -0x69b95375, 0x6a1b0ce9, -0x599421ad, -0x2c4f25b7, 0x61d081c1, 0x31e83b41 }, + { 0x249dd197, -0x4b8742e2, 0x5e58c102, 0x620c3500, -0x334553a4, -0x04fd2cd1, -0x0af758d3, 0x60b63beb }, + { -0x61f9d4b1, -0x681738ee, 0x29320ad8, 0x49e48f4f, 0x6f18683f, 0x5bece14b, 0x2d550317, 0x55cf1eb6 } + }, + { + { 0x7df58c52, 0x3076b5e3, -0x186633ca, -0x28c54623, 0x4913ee20, -0x427ce31d, 0x62ba0133, 0x1a56fbaa }, + { 0x65c23d58, 0x58791010, 0x5094819c, -0x7462f793, 0x12c55fa7, -0x1dbfd057, 0x570891d4, 0x669a6564 }, + { 0x5c9dc9ec, -0x6bc194b0, -0x5883c8e6, 0x302557bb, 0x41347651, -0x678c51aa, -0x663a75a4, 0x13c48367 } + }, + { + { 0x5d8bd080, -0x3b230496, 0x571a4842, -0x21143b14, -0x471aac9b, -0x2b4d177d, -0x371a47d9, 0x50bdc87d }, + { 0x5ab3e1b9, 0x423a5d46, -0x380ec09f, -0x03ec3e79, -0x134a464a, 0x19f83664, -0x59c849f9, 0x66f80c93 }, + { 0x6edfe111, 0x606d3783, -0x0fee5427, 0x32353e15, 0x25b73b96, 0x64b03ac3, 0x725fd5ae, 0x1dd56444 } + }, + { + { 0x08bac89a, -0x3d681a00, -0x151e3c20, 0x7d4cea11, -0x60186884, -0x0c1c741f, 0x63a305cd, 0x3a3a450f }, + { 0x3362127d, -0x705b8008, 0x71cd7c15, -0x4360953c, 0x49220c8b, 0x6e714543, 0x219f732e, 0x0e645912 }, + { -0x27c6b9d9, 0x078f2f31, -0x216b5af0, 0x389d3183, 0x17996f80, -0x2e1c9393, -0x6c565785, 0x318c8d93 } + }, + { + { -0x54e22c68, 0x5d669e29, 0x342d9e3b, -0x036de9a8, -0x0ca68c33, 0x55851dfd, 0x25950af6, 0x509a41c3 }, + { 0x2afffe19, -0x0d8ba2fd, 0x7f24db66, 0x0c9f3c49, -0x457a6711, -0x43672c1d, -0x65e2acec, 0x224c7c67 }, + { -0x5906da17, -0x423f9124, 0x641b1f33, 0x793ef3f4, -0x627cc177, -0x7d13ed80, 0x28a11389, 0x05bff023 } + }, + { + { 0x0dc512e4, 0x6881a0dd, 0x44a5fafe, 0x4fe70dc8, -0x70b5adc0, 0x1f748e6b, -0x11fe5c16, 0x576277cd }, + { 0x23cae00b, 0x36321370, -0x2e5330a7, 0x544acf0a, -0x2de5e378, -0x698befb7, -0x05d5bb59, 0x780b8cc3 }, + { 0x234f305f, 0x1ef38abc, 0x1405de08, -0x65a88043, 0x34e62a0d, 0x5e82a514, 0x6271b7a1, 0x5ff41872 } + }, + { + { 0x13b69540, -0x1a24b818, 0x432610e1, -0x0ca2d5c5, 0x38781276, -0x53e0d917, -0x5f5f3497, 0x29d4db8c }, + { 0x1789db9d, 0x398e080c, -0x0c18870b, -0x589fdfdb, 0x06bd035d, -0x056776b4, 0x25a966be, 0x106a03dc }, + { 0x333353d0, -0x2652f551, -0x532cf61b, 0x38669da5, -0x37770810, 0x3c57658a, 0x052cbefa, 0x4ab38a51 } + }, +}, +{ + { + { -0x7f621fac, -0x09701d18, -0x637d452f, -0x1c43f696, 0x0aadbf45, 0x076353d4, -0x215e6a62, 0x7b9b1fb5 }, + { 0x4324c0e9, -0x20253412, 0x3f955bb7, 0x05444288, -0x15ce9f61, -0x21085558, 0x42287cff, 0x68aee706 }, + { 0x7471cc0c, -0x0fe3370f, 0x579082bb, -0x6adbd1c9, -0x2c1b94a1, 0x27776093, 0x28bd85fb, 0x2d13d55a } + }, + { + { 0x7aee7a52, -0x40fe6332, -0x1bab152d, -0x57212d4a, -0x785744e7, 0x3c619f0b, 0x560916d8, 0x3619b5d7 }, + { 0x5b35b8da, -0x053a2dfa, -0x7a9db449, -0x57257566, 0x3d21cd0f, -0x332d356f, -0x7406f2a8, 0x6b8341ee }, + { 0x0282c4b2, 0x3579f26b, 0x4fafefae, 0x64d592f2, 0x28c8c7c0, -0x48321285, 0x7173a8d7, 0x6a927b6b } + }, + { + { 0x3ece88eb, -0x728fbf7a, -0x7f113f74, -0x0f1cf857, 0x0d788fda, -0x53ddaf9f, 0x3a0d478d, 0x056d92a4 }, + { -0x6791b9aa, 0x1f6db24f, -0x2e16efa5, 0x1021c02e, 0x2cc0a375, -0x0700c001, -0x3937da6e, 0x1d2a6bf8 }, + { -0x03c25a5f, 0x1b05a196, 0x43b59ed0, 0x77d7a8c2, -0x682e86e8, 0x06da3d62, -0x0edcac09, 0x66fbb494 } + }, + { + { -0x0edcf62a, -0x2928f66a, -0x163c2ac7, -0x2404dc7b, -0x08aadbef, 0x46d602b0, 0x57843e0c, 0x270a0b05 }, + { -0x27a3f048, 
0x751a50b9, -0x7430f685, -0x2e5023db, -0x7cf65697, 0x2f16a6a3, -0x1a4ff9a7, 0x14ddff9e }, + { -0x5879d434, 0x61ff0640, 0x5f11abfe, -0x7e353f66, 0x55d12abb, -0x6fb87cfc, -0x6ba5178d, 0x19a4bde1 } + }, + { + { -0x3f893b61, 0x40c709de, 0x7f3e53f6, 0x657bfaf2, -0x135fbd3c, 0x40662331, 0x7eb4df04, 0x14b37548 }, + { 0x20a6200a, -0x6460d90b, -0x30ec1508, 0x64804443, -0x79ce122d, -0x759c98c1, 0x1ed39dc1, 0x72bbbce1 }, + { -0x549923b9, -0x517ac36c, -0x2089d292, -0x149dcbc2, 0x6fb2f7d1, -0x0f71f1e8, 0x700ab37a, 0x4f0b1c02 } + }, + { + { -0x3e4d1dc1, 0x79fd21cc, 0x453df52a, 0x4ae7c281, -0x2eaeb795, -0x37e8d137, 0x3e0a7534, 0x68abe944 }, + { -0x27e6ae06, -0x1e8f9879, -0x4d6f3885, -0x5ef5d372, 0x3ed66773, -0x18c7d060, 0x0bcc4b54, 0x0a4d8471 }, + { 0x07831dcb, -0x25ed393c, 0x4d5c510d, 0x0da230d7, 0x6bd404e1, 0x4ab1531e, -0x430bbf11, 0x4106b166 } + }, + { + { 0x39e4ecf2, -0x5b7a332b, 0x0555bab5, 0x5aa3f3ad, -0x6c8207d3, 0x145e3439, 0x1214283f, 0x1238b51e }, + { 0x1cd23668, 0x02e57a42, 0x0eaef6fd, 0x4ad9fb5d, -0x4edbbb80, -0x6ab198d9, 0x2699f331, 0x7f792f9d }, + { 0x5fd4d924, 0x0b886b92, 0x3626a80d, 0x60906f7a, -0x467542ee, -0x132c984c, -0x210cbb31, 0x2876beb1 } + }, + { + { 0x3a8a85f8, -0x2a6b4ccd, -0x187282a8, 0x4ea37689, 0x5e8e351f, 0x73bf9f45, -0x43be144c, 0x5507d7d2 }, + { 0x63144691, -0x237b16cb, -0x29e0dc0c, 0x632fe8a0, 0x12a9a8d5, 0x4caa8006, 0x0e9918d3, 0x48f9dbfa }, + { 0x299572fc, 0x1ceb2903, -0x6afd2f12, 0x7c8ccaa2, 0x11cce67b, -0x6e405bcc, 0x64a831e7, 0x57844819 } + }, +}, +{ + { + { 0x5fddc09c, -0x29302e11, -0x08a8a232, -0x17d4c103, 0x201634c2, 0x25d56b5d, 0x04ed2b9b, 0x3041c6bb }, + { 0x6768d593, -0x2583d4db, 0x4422ca13, -0x673e3fa9, -0x35f531e3, -0x0e57f42b, -0x3f775970, 0x29cdd1ad }, + { -0x26a91eb8, 0x0ff2f2f9, -0x60ca94d2, -0x5218688b, 0x5f6c025c, 0x1a4698bb, 0x14049a7b, 0x104bbd68 } + }, + { + { -0x29800e9d, -0x56a265a1, 0x4cc75681, -0x16d41963, -0x21df0da9, -0x4807fdb4, -0x04f8d20b, 0x204f2a20 }, + { 0x68f1ed67, 0x51f0fd31, -0x2790c43e, 0x2c811dcd, 0x04d2f2de, 0x44dc5c43, 0x092a7149, 0x5be8cc57 }, + { 0x30ebb079, -0x37ebc4c3, -0x429ad1d0, 0x7589155a, -0x7092a3cf, 0x653c3c31, -0x3d86e9e1, 0x2570fb17 } + }, + { + { 0x0bb8245a, 0x192ea955, -0x706faf2f, -0x37190458, -0x775b36cb, 0x7986ea2d, -0x21fe7998, 0x241c5f91 }, + { 0x2cb61575, 0x3efa367f, 0x1cd6026c, -0x0a06908a, 0x65b52562, -0x1738ebd6, 0x53030acd, 0x3dcb65ea }, + { 0x40de6caa, 0x28d81729, 0x22d9733a, -0x7040d310, 0x235b01d1, 0x16d7fcdd, 0x5fcdf0e5, 0x08420edd } + }, + { + { 0x04f410ce, 0x0358c34e, 0x276e0685, -0x49eca4a6, -0x1446eadf, 0x5d9670c7, 0x21db889c, 0x04d654f3 }, + { -0x7c9d05b6, -0x3200df55, -0x1de5c192, 0x57e118d4, -0x03c619d5, -0x1ce869e9, -0x43e89603, 0x0d9a53ef }, + { -0x22424a2b, 0x5e7dc116, -0x725a22d3, 0x2954deb6, 0x3334a292, 0x1cb60817, 0x18991ad7, 0x4a7a4f26 } + }, + { + { -0x50c8d5b5, 0x24c3b291, 0x718147f2, -0x6c257d90, -0x7976610e, -0x227b7a9c, 0x23e0ee33, 0x4a963142 }, + { 0x5fb15f95, -0x0b58e7fe, 0x6b5c1b8f, 0x3df65f34, 0x00e01112, -0x32030f7b, -0x222ce7b8, 0x11b50c4c }, + { 0x08a4ffd6, -0x5917d8bc, -0x63ea8927, 0x738e177e, 0x3d02b3f2, 0x773348b6, -0x319433af, 0x4f4bce4d } + }, + { + { -0x3b62f491, 0x30e2616e, -0x3513dce9, -0x1ba98e71, -0x0d94b05a, 0x48eb409b, 0x61595f37, 0x3042cee5 }, + { -0x1ddbda7c, -0x58e031a6, -0x6d0a7562, 0x26ea7256, 0x1cea3cf4, -0x2de5f629, -0x48e3fe1a, 0x73fcdd14 }, + { 0x449bac41, 0x427e7079, -0x431dcef6, -0x7aa51c93, 0x5f841a7c, 0x4cae7621, -0x65631e2a, 0x389e740c } + }, + { + { 0x570eac28, -0x3642870a, 0x27919ce1, -0x1aa4f4ce, -0x5e646e13, 0x65fc3eab, -0x29d9c970, 0x25c425e5 }, + 
{ 0x34dcb9ce, 0x64fcb3ae, -0x1cb72f53, -0x68affcdd, 0x62c6381b, 0x45b3f07d, 0x465a6788, 0x61545379 }, + { -0x0e282192, 0x3f3e06a6, -0x71f9dcf8, 0x3ef97627, 0x4e8a6c77, -0x73eb09da, 0x15484759, 0x6539a089 } + }, + { + { 0x14bb4a19, -0x223b242c, -0x67bdb072, 0x19b2bc3c, 0x36ca7169, 0x48a89fd7, -0x0fe64270, 0x0f65320e }, + { -0x3c2d088d, -0x162de08c, 0x25c46845, -0x3eafabbf, -0x064661cd, 0x624e5ce8, -0x3a32e794, 0x11c5e4aa }, + { -0x35021f3a, -0x2b792e4f, 0x163b5181, 0x4f3fe6e3, -0x050d6c66, 0x59a8af0d, -0x13ccf8d6, 0x4cabc7bd } + }, +}, +{ + { + { 0x1a54a044, -0x083f5e64, 0x77bd9fbb, 0x4a1c5e24, 0x5af22972, -0x591c35ef, 0x3f2e9e0d, 0x1819bb95 }, + { 0x532f7428, 0x16faa8fb, 0x46a4e272, -0x242bd160, -0x74615b80, 0x5337653b, 0x23973f03, 0x40659472 }, + { 0x5e042e84, 0x498fbb79, 0x7698b714, 0x7d0dd89a, 0x27fe6295, -0x7404f45c, 0x21200524, 0x36ba82e7 } + }, + { + { 0x57274ed5, -0x372962f6, 0x60804b17, 0x45ba8032, 0x2255dfac, -0x20c325f0, 0x2709b339, 0x77d22123 }, + { 0x4245ec41, -0x29f13449, 0x34348716, -0x02641762, -0x1bdd7b22, -0x36dbf502, -0x2face24c, 0x4472f648 }, + { 0x64ad94d8, 0x498a6d70, -0x6509dd9d, -0x5a4a3703, 0x45c141f4, -0x735712fb, 0x662d358c, 0x2c63bec3 } + }, + { + { -0x7a790741, -0x65ae74c6, -0x344e6910, -0x6118e50a, -0x5dc7a30e, -0x55f9da1a, -0x2228372f, 0x1deb2176 }, + { -0x158786ab, 0x7fe60d8b, -0x4a0bfe49, -0x4623ee82, 0x19355cce, -0x6e383f66, -0x6bbd4121, 0x22692ef5 }, + { 0x2066cf6c, -0x7a9c2e66, 0x4dcc7cd7, 0x401bfd8c, -0x32f2709e, -0x26895942, -0x5d874fa2, 0x67cfd773 } + }, + { + { 0x5a4e586a, 0x2d5fa985, 0x49beab7e, 0x65f8f7a4, -0x0de2cc2d, -0x55f8b223, 0x1bcb9dee, 0x185cba72 }, + { -0x10c11b8b, -0x7213ce06, -0x61dd026e, -0x66240076, 0x4e26cab1, 0x512d1159, -0x13bcef47, 0x0cde561e }, + { -0x0b1c34bf, -0x6c79625d, 0x40f7977e, -0x40fc6d0b, -0x2fb9c47d, 0x026204fc, -0x61139113, 0x3ec91a76 } + }, + { + { -0x4f5cbfd1, 0x0fad2fb7, -0x04960b58, 0x46615ecb, -0x3a07155a, -0x08ba4338, 0x4a94e896, 0x7a5fa879 }, + { -0x087e9953, 0x1e9df75b, -0x14f32851, 0x4dfda838, -0x3e150678, -0x45ffd128, 0x11f33cfc, 0x13fedb3e }, + { 0x13cd67a1, 0x52958faa, -0x74244ae9, -0x69a11f7f, 0x2e8845b3, 0x16e58daa, 0x5499da8f, 0x357d397d } + }, + { + { 0x194bfbf8, 0x481dacb4, -0x451a7d67, 0x4d77e3f1, 0x7d1372a0, 0x1ef4612e, 0x70ff69e1, 0x3a8d867e }, + { -0x4f453194, 0x1ebfa05f, 0x1caf9a1e, -0x36cb9df4, 0x1d82b61a, -0x3388e33c, -0x5a08b014, 0x2d94a16a }, + { 0x55aff958, 0x6f58cd5d, 0x75567721, -0x45c155a4, -0x6e9add83, 0x75c12399, -0x3d0d4ca2, 0x69be1343 } + }, + { + { 0x684b8de3, -0x7d444254, 0x3fca0718, -0x5d0b3830, -0x1f695558, 0x337f92fb, 0x63587376, 0x200d4d8c }, + { -0x1e6836d6, 0x0e091d5e, 0x2945119f, 0x4f51019f, -0x0fcb1664, 0x143679b9, 0x4d24c696, 0x7d88112e }, + { 0x4893b32b, 0x208aed4b, -0x41a6469c, 0x3efbf23e, -0x245a1af9, -0x289d2150, -0x7e42626c, 0x69607bd6 } + }, + { + { -0x6cdc56fe, 0x3b7f3bd4, 0x6b2c6e53, 0x7c21b556, 0x3a7852a7, -0x1a45700b, -0x7c713200, 0x28bc77a5 }, + { 0x68de1ce1, -0x0941fdf0, 0x0edcbc1f, -0x172ae719, 0x1b5505a5, -0x1c100230, -0x2c13c030, 0x35f63353 }, + { -0x1da27fca, 0x63ba78a8, -0x6bcccb70, 0x63651e00, 0x288ce532, 0x48d82f20, 0x36b57524, 0x3a31abfa } + }, +}, +{ + { + { 0x3f78d289, -0x3f708771, -0x5ebfb261, -0x01cf58d4, -0x309a3363, -0x0d887404, 0x5acb2021, 0x7ee49816 }, + { 0x089c0a2e, 0x239e9624, 0x3afe4738, -0x38b73b40, 0x764fa12a, 0x17dbed2a, 0x321c8582, 0x639b93f0 }, + { -0x6eee5e3d, 0x7bd508e3, -0x7f6f8b77, 0x2b2b90d4, -0x518d02e7, -0x182d513e, -0x7a49fd5a, 0x0edf493c } + }, + { + { -0x7b89beed, 0x6767c4d2, -0x080a07cb, -0x5f6fbfc1, -0x35194122, 0x1c8fcffa, 
-0x2e205c97, 0x04c00c54 }, + { 0x599b5a68, -0x51337ea8, -0x14521df2, -0x15a8b0f1, 0x22b67f07, 0x4fe41d74, 0x019d4fb4, 0x403b92e3 }, + { -0x74b9a308, 0x4dc22f81, 0x1480eff8, 0x71a0f35a, 0x04c7d657, -0x51174053, -0x4d9e890c, 0x355bb12a } + }, + { + { 0x5a8c7318, -0x5cfe2539, -0x4c3155ef, -0x126ffc63, 0x3bae3f2d, 0x6f077cbf, -0x1fad5272, 0x7518eaf8 }, + { 0x7493bbf4, -0x58e19b34, -0x135c4f3d, -0x1a427b27, -0x05fa187b, 0x0a6bc50c, 0x182ec312, 0x0f9b8132 }, + { 0x1b7f6c32, -0x5b77a63c, -0x0bc7cd68, 0x0f2d60bc, -0x364e2e27, 0x1815a929, -0x44e8aa3c, 0x47c3871b } + }, + { + { -0x37af9950, -0x0419a2b0, -0x4c5d6650, 0x62ecc4b0, 0x441ae8e0, -0x1ac8ab16, -0x172b72a1, 0x08fea02c }, + { 0x71ec4f48, 0x51445397, -0x3673a292, -0x07fa4e83, 0x47c3c66b, -0x089d3ee6, 0x764699dc, 0x00b89b85 }, + { 0x68deead0, -0x7db2228a, 0x4b685d23, -0x379bbae0, 0x5d89d665, -0x4aeb3033, 0x4f75d537, 0x473829a7 } + }, + { + { -0x52c6fd37, 0x23d9533a, -0x10fca771, 0x64c2ddce, -0x301ed04c, 0x15257390, 0x44e4d390, 0x6c668b4d }, + { 0x4679c418, -0x7d2d258b, -0x4d9e7210, -0x19c42828, -0x53b814f6, 0x355eef24, 0x4833c6b4, 0x2078684c }, + { 0x7a78820c, 0x3b48cf21, -0x7ed8c169, -0x0895f54e, -0x73711285, -0x56939a59, 0x4f8a433f, 0x7411a605 } + }, + { + { 0x18b175b4, 0x579ae53d, -0x0c6d5efe, 0x68713159, 0x1eef35f5, -0x7baa1346, 0x458c398f, 0x1ec9a872 }, + { -0x46623793, 0x4d659d32, 0x603af115, 0x044cdc75, -0x233d1b78, -0x4cb38ed4, -0x047ecb01, 0x7c136574 }, + { 0x00a2509b, -0x47195b2c, 0x0bc882b4, -0x647e28fe, -0x0e6a8a9f, 0x57e7cc9b, -0x38329ba0, 0x3add88a5 } + }, + { + { 0x59393046, -0x7a3d672c, 0x5ff659ec, -0x7081ca68, -0x0d0991c6, 0x1d2ca22a, -0x5bf958e0, 0x61ba1131 }, + { -0x49ca230e, -0x5476a890, -0x0993e044, 0x02dfef6c, -0x41492e79, -0x7aacfd98, -0x3378618c, 0x249929fc }, + { 0x16959029, -0x5c2f5f0f, -0x45814277, 0x023b6b6c, 0x26783307, 0x7bf15a3e, -0x44271319, 0x5620310c } + }, + { + { 0x77e285d6, 0x6646b5f4, 0x6c8f6193, 0x40e8ff67, -0x544a6b23, -0x59138cef, 0x658cec4d, 0x7ec846f3 }, + { 0x4934d643, 0x52899343, -0x5aeddd0b, -0x462407fa, -0x3c0be3de, -0x70927871, 0x4d9d9730, 0x37676a2a }, + { 0x1da22ec7, -0x64a170c1, 0x6c01cd13, 0x130f1d77, -0x5d676048, 0x214c8fcf, 0x399b9dd5, 0x6daaf723 } + }, +}, +{ + { + { 0x2cd13070, -0x7e514423, -0x07a5f162, -0x69d1bcdb, -0x35200135, -0x216c6e56, 0x52c230e6, 0x53177fda }, + { 0x10628564, 0x591e4a56, -0x574b20cc, 0x2a4bb87c, -0x185c71bd, -0x21d5da8e, -0x011afb92, 0x3cbdabd9 }, + { 0x50b9de79, -0x584368fa, -0x3cfe4a65, 0x3d12a7fb, -0x2c951c74, 0x02652e68, 0x5a6199dc, 0x79d73983 } + }, + { + { 0x0d591737, 0x21c9d992, -0x164b932a, -0x6415be2e, 0x0d89bfca, -0x1df17be0, 0x6eae5ff8, 0x79d99f94 }, + { 0x4131c1bd, -0x26cab20a, -0x7913a7de, 0x758094a1, -0x1ba60c3e, 0x4464ee12, -0x34eccd7e, 0x6c11fce4 }, + { 0x68673205, -0x0e84b7cb, 0x3caad96c, 0x387deae8, 0x56ffe386, 0x61b471fd, -0x48ba5a67, 0x31741195 } + }, + { + { 0x3b02a047, 0x17f8ba68, -0x01104938, 0x50212096, 0x1556cbe2, 0x70139be2, 0x1d98915b, 0x203e44a1 }, + { -0x4885c9f5, -0x172efe70, -0x666a18fe, -0x66467ce0, -0x05fdb856, -0x42b02008, -0x1f2c9579, 0x2772e344 }, + { 0x37b9e39f, -0x2979c146, 0x723b5a23, 0x105bc169, -0x59a3f89e, 0x104f6459, 0x5b4d38d4, 0x56795129 } + }, + { + { 0x0d4b497f, 0x07242eb3, -0x46433379, 0x1ef96306, -0x27ee90bb, 0x37950934, 0x01405b04, 0x05468d62 }, + { 0x13037524, 0x535fd606, -0x4f043d96, -0x1def520a, 0x23e990ae, -0x5372f565, -0x28d02407, 0x47204d08 }, + { -0x06cd9822, 0x00f565a9, -0x3f2a7176, -0x31302873, -0x0ce71d72, -0x5dea1d24, -0x649cccae, 0x4599ee91 } + }, + { + { -0x79e51a87, -0x538b9295, -0x09515624, 0x31ab0650, 
0x40256d4c, 0x241d6611, 0x3d21a5de, 0x2f485e85 }, + { 0x70e0e76b, -0x2c3ddf36, -0x1560cf6c, -0x4ed415a8, -0x3cd8ed7e, 0x294ddec8, -0x5e2e2fd8, 0x0c3539e1 }, + { -0x63f7cc0d, 0x32974483, -0x2d543b7c, 0x6fe6257f, 0x4b358817, 0x5327d181, -0x76c01644, 0x65712585 } + }, + { + { -0x28f711c1, -0x7e3d60e5, -0x519bf830, -0x2234a5fb, -0x2d5c1459, -0x68513e29, -0x6e2af7cf, 0x1590521a }, + { 0x32a61161, -0x63efd049, 0x34d520a8, -0x1b71ef23, 0x6f9a9176, 0x365c6354, 0x046f6006, 0x32f6fe4c }, + { -0x386ef534, 0x40a3a11e, -0x0e92d852, -0x6fec2008, -0x544e6a2c, 0x1a9720d8, 0x2ea98463, 0x1bb9fe45 } + }, + { + { -0x33c98b84, -0x30a1936b, 0x6b0bc30d, 0x29420153, -0x11868510, 0x453ac67c, 0x2a8bb3c9, 0x5eae6ab3 }, + { -0x4c2ab062, -0x162e26b0, -0x1ff2cc3f, 0x2d5f9cbe, -0x5fb03954, 0x51c2c656, 0x3c1cbcc9, 0x65c091ee }, + { 0x14f118ea, 0x70836611, -0x6bcb6353, 0x2b37b87b, -0x4b1660c0, 0x7273f51c, 0x23d75698, 0x78a2a958 } + }, + { + { 0x5ef83207, -0x4b0dc3be, -0x3656cb4b, -0x54076b2d, 0x39fd87f7, -0x2f8f73ed, 0x17166130, 0x18767891 }, + { 0x5c8c2ace, -0x5d4f8d17, 0x651e9c4b, 0x69cffc96, 0x42e7b42b, 0x44328ef8, 0x22aadeb3, 0x5dd996c1 }, + { 0x670c507c, -0x6da4a110, -0x46c3cc41, -0x7e6437be, 0x70dd003f, 0x10792e9a, 0x6e28dc74, 0x59ad4b7a } + }, +}, +{ + { + { -0x5352715e, 0x583b04bf, 0x148be884, 0x29b743e8, 0x0810c5db, 0x2b1e583b, -0x714c4456, 0x2b5449e5 }, + { -0x14c241b9, 0x5f3a7562, -0x71425f48, -0x0815c7ac, 0x45747299, 0x00c3e531, 0x1627d551, 0x1304e9e7 }, + { 0x6adc9cfe, 0x789814d2, -0x74b722f5, 0x3c1bab3f, -0x068639f6, -0x25f01e01, 0x7c2dd693, 0x4468de2d } + }, + { + { -0x079cf832, 0x4b9ad8c6, 0x435d0c28, 0x21113531, 0x657a772c, -0x2b57993b, 0x63247352, 0x5da6427e }, + { -0x6be6b962, 0x51bb355e, 0x23ddc754, 0x33e6dc4c, 0x447f9962, -0x6c5a492a, -0x04bb429d, 0x6cce7c6f }, + { -0x2153dd36, 0x1a94c688, -0x4451e008, -0x46f99109, -0x72a6a7f1, -0x775273c8, -0x1860d358, 0x58f29abf } + }, + { + { 0x710ecdf6, 0x4b5a64bf, 0x462c293c, -0x4eb31ac8, -0x2af4c547, 0x3643d056, 0x185b4870, 0x6af93724 }, + { -0x7218c198, -0x16f13055, 0x377e76a5, 0x54036f9f, -0x41fea67e, -0x0fb6a4f5, -0x580be1ca, 0x577629c4 }, + { 0x09c6a888, 0x32200245, 0x4b558973, -0x2d1fc9ed, 0x3c33289f, -0x7c1dc9dd, 0x0caec18f, 0x701f25bb } + }, + { + { 0x7cbec113, -0x62e70927, 0x74bfdbe4, -0x7bb5f91a, -0x53b19f2a, 0x20f5b522, 0x50955e51, 0x720a5bc0 }, + { -0x1b9e9313, -0x3c574f08, -0x61da5783, -0x08ff99f2, -0x0b435a64, 0x61e3061f, -0x423bf417, 0x2e0c92bf }, + { -0x647fa5cb, 0x0c3f0943, 0x6242abfc, -0x17b174c9, 0x5c229346, 0x691417f3, 0x144ef0ec, 0x0e9b9cbb } + }, + { + { 0x5db1beee, -0x7211642b, 0x0a723fb9, -0x363c54c9, 0x1c68d791, 0x44a8f1bf, 0x1cfd3cde, 0x366d4419 }, + { -0x04a8df53, -0x04452b71, -0x2406f2f2, -0x117e6e95, 0x635543bf, -0x2b7eceae, 0x3f337bd8, 0x221104eb }, + { -0x0d4373ec, -0x61c3e8bd, -0x4a7a93c5, 0x2eda26fc, 0x68a7fb97, -0x3347d0f2, -0x43a6cdbc, 0x4167a4e6 } + }, + { + { -0x07317012, -0x3d41d99b, -0x177f29d4, -0x169800ec, 0x2f364eee, -0x0ed19182, -0x34812d0a, 0x34b33370 }, + { 0x76f62700, 0x643b9d28, 0x0e7668eb, 0x5d1d9d40, 0x21fc0684, 0x1b4b4303, 0x2255246a, 0x7938bb7e }, + { -0x797e2934, -0x323a6e12, -0x127a58ad, -0x31fdef64, 0x58808883, -0x128b7a3f, 0x2dfe65e4, 0x1176fc6e } + }, + { + { 0x49770eb8, -0x246f1d77, -0x530bbf5d, -0x670433d6, -0x21287865, 0x21354ffe, -0x0d96f94a, 0x1f6a3e54 }, + { 0x5b9c619b, -0x4b509330, -0x4d5a7b80, 0x2ddfc9f4, -0x1416b23c, 0x3d4fa502, 0x677d5f34, 0x08fc3a4c }, + { -0x2cf8cb16, 0x60a4c199, 0x31165cd6, 0x40c085b6, -0x08a67d6b, -0x1dccc1dd, 0x16b900d1, 0x4f2fad01 } + }, + { + { -0x48c449c8, -0x69d326e3, 
-0x03ed63f8, -0x19fa8856, -0x0c49e977, 0x6f619b39, 0x2944ee81, 0x3451995f }, + { -0x6b51b1ac, 0x44beb241, 0x1857ef6c, 0x5f541c51, 0x368d0498, -0x59e194d3, -0x68d10855, 0x445484a4 }, + { -0x60158284, -0x6ead0330, -0x4f6ca30a, 0x4a816c94, 0x47285c40, 0x258e9aaa, 0x042893b7, 0x10b89ca6 } + }, +}, +{ + { + { 0x79d34aa0, -0x2983212a, -0x33b24c61, -0x33f46140, -0x1ca2e6f1, -0x5aca5baa, -0x09e09011, 0x2e05d9ea }, + { 0x3b646025, -0x64d5bd92, 0x385ce4cf, 0x32127190, -0x229215bb, -0x5da3003e, -0x4157218b, 0x06409010 }, + { -0x29e414a7, -0x3bb86fe6, -0x1a2377f6, 0x661f19bc, -0x483597d9, 0x24685482, -0x101f80da, 0x293c778c } + }, + { + { -0x5ee00e00, 0x16c795d6, -0x4ea7ea37, -0x348f2f1e, -0x64ac6a4b, -0x760d6ce0, 0x31e47b4f, 0x50b8c2d0 }, + { 0x07069096, -0x797f6190, -0x1b1afe77, -0x5528a4eb, -0x5de5feb9, 0x07f35715, 0x12815d5e, 0x0487f3f1 }, + { 0x068a4962, 0x48350c08, 0x51092c9a, 0x6ffdd053, -0x50903723, 0x17af4f4a, 0x3cdba58b, 0x4b0553b5 } + }, + { + { 0x27c152d4, -0x40fadee5, -0x42e509c7, 0x5ec26849, -0x71905468, 0x5e0b2caa, 0x50bd0840, 0x054c8bdd }, + { 0x1b32ff79, -0x639a0342, 0x03b50f9b, -0x148a1561, 0x6c07e606, -0x0312d594, 0x51717908, 0x35106cd5 }, + { 0x1dcf073d, 0x38a0b12f, -0x48095d8a, 0x4b60a8a3, -0x2cbfb066, -0x012a53db, 0x5505c229, 0x72e82d5e } + }, + { + { 0x69771d02, 0x00d9cdfd, 0x6cfbf17e, 0x410276cd, 0x1cb12ec7, 0x4c45306c, 0x27500861, 0x2857bf16 }, + { -0x0f27bb38, 0x6b0b697f, -0x268634b7, -0x44ed07a4, -0x3e25f0e1, -0x2d5abe3a, 0x58ce7211, 0x7b7c2429 }, + { 0x0101689e, -0x60de6fc1, -0x4079effb, -0x2886202d, 0x3deb0f1b, -0x5edd11a1, 0x485a00d4, 0x510df84b } + }, + { + { -0x38f53ea2, 0x24b3c887, -0x047e48ce, -0x4f0c5aa9, -0x1a8733e5, -0x64d321d1, 0x03b54f8e, 0x4cf7ed07 }, + { -0x6d885e06, -0x5abecc45, 0x63991237, 0x74ec3b62, 0x35d2f15a, 0x1a3c54dc, -0x1b7d45c6, 0x2d347144 }, + { -0x670411f1, 0x6bd47c65, -0x54aa41d3, -0x61b8cc1e, 0x127610c5, 0x1093f624, -0x2f5e155c, 0x4e05e26a } + }, + { + { -0x1e701940, 0x1833c773, -0x2c378d9b, -0x1c3b8ee6, 0x0116b283, 0x3bfd3c4f, -0x4b32b248, 0x1955875e }, + { 0x4b531f20, -0x2564949e, 0x77509abb, 0x429a760e, -0x17dc3480, -0x24160ade, -0x77f3707e, 0x618f1856 }, + { 0x0e399799, 0x6da6de8f, 0x40fda178, 0x7ad61aa4, 0x5e3563dd, -0x4cd327f0, 0x2ae340ae, 0x15f6beae } + }, + { + { -0x6dba1deb, -0x4565f085, -0x2673f245, -0x0c979ed3, -0x0ddf4fe0, 0x2e84e4cb, 0x62d90eda, 0x6ba92fe9 }, + { 0x31ec3a62, -0x79d434f4, 0x1138f3c2, -0x7ef1d4bb, 0x39dac2a4, 0x788ec4b8, -0x51d56d7f, 0x28f76867 }, + { 0x5884e2aa, 0x3e4df965, -0x242b9a5b, -0x429d0425, 0x0de9e524, -0x28a69356, -0x4d4e4c29, 0x6e8042cc } + }, + { + { 0x16521f7e, 0x15306536, -0x69dfc246, 0x660d06b8, 0x545f0879, 0x2d3989bc, 0x78ebd7b0, 0x4b5303af }, + { -0x31d73592, -0x0ef2c3d7, -0x0349f6c3, -0x452cbac0, -0x5d15d2c1, -0x18bd9129, 0x4ff298b9, 0x08af9d4e }, + { -0x41434218, 0x72f8a6c3, -0x23c57177, 0x4f0fca4a, -0x38402086, 0x6fa9d4e8, -0x649db149, 0x0dcf2d67 } + }, +}, +{ + { + { 0x5a45f06e, 0x753941be, 0x6d9c5f65, -0x2f835113, 0x72ff51b6, 0x11776b9c, -0x10f2b257, 0x17d2d1d9 }, + { -0x68e7d764, 0x3d594749, 0x24533f26, 0x12ebf8c5, 0x14c3ef15, 0x0262bfcb, 0x77b7518e, 0x20b878d5 }, + { 0x073f3e6a, 0x27f2af18, -0x28adef97, -0x02c01ae7, 0x3ca60022, 0x22e3b72c, -0x339a3959, 0x72214f63 } + }, + { + { -0x0bc4d637, 0x1d9db7b9, 0x4f518f75, -0x29fa7db6, 0x312f9dc4, -0x0d3f8d43, 0x5a1545b0, 0x1f24ac85 }, + { 0x5307a693, -0x4b1c80c0, 0x2f336795, -0x5458eb29, 0x73761099, -0x29042f59, -0x7e8e3437, 0x5fdf48c5 }, + { -0x716afa56, 0x24d60832, 0x0c1420ee, 0x4748c1d1, 0x06fb25a2, -0x38001ba4, 0x2ae395e6, 0x00ba739e } + }, + { + { 
-0x157744da, -0x51bbd90b, -0x7b68c405, 0x360679d9, 0x26694e50, 0x5c9f030c, -0x2ae72dda, 0x72297de7 }, + { 0x5c8790d6, 0x592e98de, 0x45c2a2df, -0x1a40482d, -0x064b66de, 0x115a3b60, 0x67ad78f3, 0x03283a3e }, + { -0x41f346c7, 0x48241dc7, -0x749ccf80, 0x32f19b4d, 0x02289308, -0x2c2036f3, 0x46271945, 0x05e12968 } + }, + { + { 0x242c4550, -0x52404438, -0x2fcf7e27, -0x4337f314, -0x0a37206e, -0x7bca995a, -0x7da731b4, 0x78cf25d3 }, + { 0x2d9c495a, -0x457d114d, -0x0ed44684, -0x31103704, -0x6c4a2e20, -0x4fd25452, 0x13698d9b, 0x39c00c9c }, + { 0x31489d68, 0x15ae6b8e, -0x63d40f79, -0x557ae355, -0x0fb105fb, -0x3658a569, 0x6b3ff832, 0x006b5207 } + }, + { + { -0x4631f7d3, -0x0a3481ea, 0x417abc29, 0x3407f14c, 0x2bf4a7ab, -0x2b4c9432, 0x1a9f75ce, 0x7de2e956 }, + { -0x626a87e4, 0x29e0cfe1, -0x699cef1e, -0x497e20e8, 0x70516b39, 0x57df39d3, 0x3bc76122, 0x4d57e344 }, + { -0x495aa135, -0x218f2b0c, 0x5d85db99, 0x4801527f, -0x2c11657f, -0x24363bc0, 0x1a6029ed, 0x6b2a90af } + }, + { + { 0x5bb2d80a, 0x77ebf324, 0x2fb9079b, -0x27cfe4b9, 0x4cee7333, -0x39b8190e, 0x276c2109, 0x465812c8 }, + { -0x6519e169, 0x6923f4fc, -0x1fc0a02f, 0x5735281d, -0x19122ed3, -0x589b51bd, -0x2ed2c1b6, 0x5fd8f4e9 }, + { 0x2a1062d9, 0x4d43beb2, 0x3831dc16, 0x7065fb75, -0x21d69729, 0x180d4a7b, 0x1cb16790, 0x05b32c2b } + }, + { + { 0x7ad58195, -0x08035bd4, 0x4333f3cc, 0x3214286e, 0x340b979d, -0x493d62f3, 0x567307e1, 0x31771a48 }, + { -0x2db25703, -0x373fa134, 0x05dfef83, -0x5e30e554, 0x7df9cd61, -0x2441100e, 0x7b471e99, 0x3b5556a3 }, + { -0x1eb22b7e, 0x32b0c524, 0x1a2ba4b6, -0x124caeac, 0x282b5af3, -0x5c2e9fb8, 0x7a7336eb, 0x4fc079d2 } + }, + { + { 0x0c86c50d, -0x23cb74bc, -0x336b19af, 0x1337cbc9, 0x643e3cb9, 0x6422f74d, -0x451c32f8, 0x241170c2 }, + { -0x7640d081, 0x51c938b0, 0x02dfe9a7, 0x2497bd65, 0x7880e453, -0x00003f64, -0x3506716e, 0x124567ce }, + { 0x0ac473b4, 0x3ff9ab86, 0x0113e435, -0x0f6ee212, -0x14393b51, 0x4ae75060, 0x6c87000d, 0x3f861296 } + }, +}, +{ + { + { 0x638c7bf3, 0x529fdffe, 0x388b4995, -0x20d461a0, 0x1bad0249, -0x1fd84cb1, -0x46058b13, 0x7bc92fc9 }, + { -0x086a841c, 0x0c9c5303, -0x1f7a3ebb, -0x5c3ce5e0, -0x2f7affb0, -0x4f8de28f, -0x54f40d26, 0x0aba390e }, + { -0x7fe52607, -0x606810d2, 0x79afda3a, -0x7c9682ac, -0x42a694b0, -0x16f94c01, -0x22c04720, 0x02672b37 } + }, + { + { 0x398ca7f5, -0x116458d7, 0x7a4849db, -0x146359db, 0x7ec544e1, 0x29eb29ce, -0x08c91d38, 0x232ca21e }, + { 0x260885e4, 0x48b2ca8b, -0x7d4cb3e4, -0x5bd79414, 0x17f58f74, -0x6c81e5da, -0x54d35d5b, 0x741d1fcb }, + { 0x253fcb17, -0x409ebdc3, -0x05c614ec, 0x08803cea, -0x67ae3851, -0x0e79fd21, 0x49e3414b, 0x0400f3a0 } + }, + { + { -0x5f9184fa, 0x2efba412, 0x2c8d2560, 0x14678545, -0x29856e39, -0x2068ec15, 0x157eadf3, 0x32830ac7 }, + { -0x459e3aa5, -0x5431fb8a, -0x3b2c68ea, 0x36a3d6d7, -0x1727d2f7, 0x6eb259d5, -0x7b28a905, 0x0c9176e9 }, + { -0x48c89618, 0x0e782a7a, 0x75b18e2c, 0x04a05d78, -0x1433151f, 0x29525226, -0x7c1457e0, 0x0d794f83 } + }, + { + { -0x585d1e54, 0x7be44ce7, -0x052e4749, 0x411fd93e, 0x0d5f7c9b, 0x1734a1d7, 0x3127db16, 0x0d659223 }, + { -0x61eae90c, -0x00ca0a35, 0x648aae45, -0x117fa431, -0x46c5610d, -0x0f28c3d5, 0x2092a6c2, 0x097b0bf2 }, + { 0x21a9d733, -0x3b7454eb, -0x29e544db, -0x593d1516, -0x3934bcfb, 0x625c6c1c, -0x6c14c599, 0x7fc90fea } + }, + { + { -0x63834dc3, -0x3ad8214b, 0x5328404e, -0x6aac6e97, 0x7ccf2c7a, -0x29bc6d7f, -0x082705ef, 0x6ce97dab }, + { 0x1f5c5926, 0x0408f1fe, 0x3b258bf4, 0x1a8f2f5e, -0x0238e997, 0x40a951a2, -0x3674a882, 0x6598ee93 }, + { 0x0ef7c48f, 0x25b5a8e5, 0x6f2ce532, -0x149fcbef, -0x1ac21ac9, -0x3a18ae8d, -0x73ed44fd, 
0x73119fa0 } + }, + { + { 0x21f4774d, 0x7845b94d, 0x7897b727, -0x409d0e94, 0x3c56522b, 0x671857c0, -0x6a9dedee, 0x3cd6a852 }, + { 0x53f1a4cb, -0x12cfed6c, -0x370ac879, -0x4319de37, 0x38bee7b9, -0x0534d4ed, -0x6157bd74, 0x3025798a }, + { 0x3aeca999, 0x3fecde92, 0x62e8c12f, -0x4255a500, -0x69677522, 0x67b99dfc, 0x52661036, 0x3f52c028 } + }, + { + { -0x113be93a, -0x6da74067, -0x562d098f, -0x5375afe9, 0x16dea4ab, 0x629549ab, -0x66f6ea97, 0x05d0e85c }, + { 0x2a1351c6, -0x00155b72, -0x0580ac29, 0x28624754, 0x7582ddf1, 0x0b5ba9e5, -0x596953a7, 0x60c0104b }, + { -0x21634169, 0x051de020, -0x4af4308c, -0x05f803aa, 0x0f11df65, 0x378cec9f, -0x546921b3, 0x36853c69 } + }, + { + { -0x053a1842, 0x4433c0b0, 0x4c08dcbe, 0x724bae85, 0x46978f9b, -0x0e0db33c, 0x62825fc8, 0x4a0aff6d }, + { 0x78f39b2d, 0x36d9b8de, -0x57b84614, 0x7f42ed71, 0x79bd3fde, 0x241cd1d6, -0x6d043195, 0x6a704fec }, + { 0x61095301, -0x16e80462, 0x02a092f8, -0x3efd206c, -0x0599e6f5, -0x40f61d0b, -0x1f2301c9, 0x681109be } + }, +}, +{ + { + { 0x36048d13, -0x63e70306, 0x73899ddd, 0x29159db3, -0x606d2f56, -0x2360caf5, -0x7875e62c, 0x26f57eee }, + { 0x782a0dde, 0x559a0cc9, -0x158e7c7b, 0x551dcdb2, 0x31ef238c, 0x7f62865b, 0x7973613d, 0x504aa776 }, + { 0x5687efb1, 0x0cab2cd5, 0x247af17b, 0x5180d162, 0x4f5a2467, -0x7a3ea5cc, -0x6245cf97, 0x4041943d } + }, + { + { -0x5d935523, 0x4b217743, 0x648ab7ce, 0x47a6b424, 0x03fbc9e3, -0x34e2b086, -0x67ff2fe7, 0x12d93142 }, + { 0x43ebcc96, -0x3c3f1146, 0x26ea9caf, -0x728b6364, 0x1c77ccc6, -0x26056a12, 0x7684340f, 0x1420a1d9 }, + { -0x2cc8a6b1, 0x00c67799, -0x4dc55b85, 0x5e3c5140, -0x1ca00c6b, 0x44182854, 0x4359a012, 0x1b4f9231 } + }, + { + { -0x5b67994f, 0x33cf3030, 0x215f4859, 0x251f73d2, 0x51def4f6, -0x547d55c0, 0x6f9a23f6, 0x5ff191d5 }, + { -0x76eaf6af, 0x3e5c109d, 0x2de9696a, 0x39cefa91, -0x68a0cfe0, 0x20eae43f, 0x7f132dae, 0x239b572a }, + { -0x53d26f98, -0x7e612bcd, 0x5fc98523, 0x2883ab79, 0x5593eb3d, -0x10ba8d80, 0x758f36cb, 0x020c526a } + }, + { + { -0x0fbd3377, -0x16ce10a7, -0x71edb44a, 0x2c589c9d, -0x5138a669, -0x52371e76, 0x5602c50c, 0x452cfe0a }, + { -0x61272444, 0x779834f8, -0x23835b94, -0x370d5507, -0x5c1e4f8c, -0x56adb324, 0x15313877, 0x02aacc46 }, + { 0x647877df, -0x795f0860, 0x0e607c9f, -0x443b9bd9, -0x0e04ee37, -0x54e815db, 0x304b877b, 0x4cfb7d7b } + }, + { + { -0x687610ee, -0x1d79663e, -0x20a8e6f3, 0x2b6ecd71, -0x13368f30, -0x3cbc37a9, 0x434d3ac5, 0x5b1d4cbc }, + { -0x47648a02, 0x72b43d6c, -0x63952380, 0x54c694d9, 0x3ee34c9f, -0x473c55c9, 0x39075364, 0x14b4622b }, + { -0x33f560da, -0x4904d9eb, -0x4772331b, 0x3a4f0e2b, 0x3369a705, 0x1301498b, 0x58592dd1, 0x2f98f712 } + }, + { + { 0x4f54a701, 0x2e12ae44, -0x56342822, -0x0301c110, 0x75835de0, -0x314076f3, -0x189ebaac, 0x1d8062e9 }, + { -0x4af061aa, 0x0c94a74c, -0x7171ece0, 0x5b1ff4a9, -0x7dcff099, -0x65d533df, -0x27f95507, 0x3a6ae249 }, + { -0x566f83a6, 0x657ada85, -0x6e46f09e, 0x1a0ea8b5, -0x20cb4b17, -0x72f1e205, -0x510da00d, 0x298b8ce8 } + }, + { + { 0x0a2165de, -0x7c858d16, 0x0bcf79f6, 0x3fab07b4, 0x7738ae70, 0x521636c7, 0x03a7d7dc, 0x6ba62718 }, + { -0x1008f34e, 0x2a927953, 0x79157076, 0x4b89c92a, 0x30a7cf6a, -0x6be7ba86, 0x4d5ce485, 0x34b8a840 }, + { -0x7c96cccb, -0x3d91134b, 0x63b5fefd, -0x2a57ec21, -0x5b4dda8d, -0x5d6c5566, 0x465e1c6a, 0x71d62bdd } + }, + { + { -0x4e08a10b, -0x32d24a26, 0x16b065f5, -0x28806a31, 0x3f49f085, 0x14571fea, 0x262b2b3d, 0x1c333621 }, + { -0x2c872080, 0x6533cc28, 0x0a0fa4b4, -0x0924bc87, -0x08fe25a6, -0x1c9ba007, -0x0ce8d45c, 0x74d5f317 }, + { 0x67d9ca81, -0x57901aac, 0x2b298c37, 0x398b7c75, -0x1c539dc5, 
-0x2592f76e, 0x47e9d98c, 0x4aebcc45 } + }, +}, +{ + { + { -0x5fa65bbb, 0x0de9b204, 0x4b17ad0f, -0x1ea34b56, 0x1f79c557, -0x1e4413ae, -0x2f8ef7e5, 0x2633f1b9 }, + { 0x05d21a77, 0x53175a72, -0x2c46cb2c, -0x4f3fbbde, -0x22a21524, -0x52260db5, -0x60ef0074, 0x074f46e6 }, + { 0x018b9910, -0x3e04be89, 0x6c0fe140, -0x5915df24, 0x4354c6ff, -0x299e0c19, -0x0e5cbf86, 0x5ecb72e6 } + }, + { + { -0x17179669, -0x01151efa, -0x672f6c7d, -0x679ccc81, -0x55f91411, -0x6b8fb7f2, -0x2b3a3d30, 0x038b6898 }, + { 0x2259fb4e, -0x5aea5ce5, 0x2bcac52f, 0x0960f397, -0x72cbab35, -0x124ad014, -0x3b893fe7, 0x382e2720 }, + { -0x7531af5a, -0x0c6e3ae3, -0x51d2d6b8, 0x3142d0b9, 0x7f24ca80, -0x24b2a5e6, 0x59250ea8, 0x21aeba8b } + }, + { + { -0x0ff780dd, 0x53853600, -0x2582a87c, 0x4c461879, -0x4be097a0, 0x6af303de, -0x3d83e713, 0x0a3c16c5 }, + { -0x30bfaad0, 0x24f13b34, 0x43088af7, 0x3c44ea4a, 0x0006a482, 0x5dd5c517, -0x76f4f793, 0x118eb8f8 }, + { -0x336b80c3, 0x17e49c17, -0x553e2d85, -0x3339125a, -0x4f0f71aa, -0x209f6d32, 0x2c67c36b, 0x4909b3e2 } + }, + { + { 0x706ff64e, 0x59a16676, 0x0d86a53d, 0x10b953dd, -0x31a3f46a, 0x5848e1e6, 0x12780c68, 0x2d8b78e7 }, + { 0x63fe2e89, -0x63637a16, 0x0e9412ec, -0x41e4506f, -0x79040185, -0x70845576, -0x10697494, 0x0fb17f9f }, + { -0x503c6fd5, 0x79d5c62e, -0x7617f8d8, 0x773a2152, -0x1efedf47, -0x3c7519c0, 0x7b2b1a6d, 0x09ae2371 } + }, + { + { -0x52cd4e30, 0x10ab8fa1, -0x1d8874dc, -0x165312e5, 0x373de90f, -0x577a9440, -0x225ac66a, 0x66f35ddd }, + { 0x4e4d083c, -0x4495e6d6, 0x0029e192, 0x34ace063, -0x55054515, -0x67dba5a7, -0x25680554, 0x6d9c8a9a }, + { 0x24997323, -0x2d826505, -0x090fe2d2, 0x1bb7e07e, -0x0ad13381, 0x2ba7472d, 0x646f9dc8, 0x03019b4f } + }, + { + { -0x194c2395, -0x50f64dec, -0x5282d09b, 0x3f7573b5, 0x100a23b0, -0x2fe62678, -0x74a3ca09, 0x392b63a5 }, + { 0x565345cd, 0x04a186b5, -0x433bee96, -0x111899f0, 0x78fb2a45, 0x689c73b4, 0x65697512, 0x387dcbff }, + { -0x63f83dfb, 0x4093addc, -0x0acd3c82, -0x3a9a41eb, 0x1583402a, 0x63dbecfd, -0x10d1fcd2, 0x61722b4a } + }, + { + { -0x7e34f1c4, -0x294f85ab, -0x26bbb697, 0x290ff006, 0x16dcda1f, 0x08680b6a, 0x5a06de59, 0x5568d2b7 }, + { -0x1342b851, 0x0012aafe, 0x1cd46309, 0x55a266fb, 0x0967c72c, -0x0dfc1498, -0x35c3ebd7, 0x39633944 }, + { 0x1b37cfe1, -0x72f34774, 0x053818f3, 0x05b6a5a3, -0x487826a7, -0x0d1643fc, -0x6522809c, 0x6beba124 } + }, + { + { 0x43f5a53b, 0x5c3cecb9, 0x06c08df2, -0x633659e3, -0x7a76abb9, -0x30459c66, 0x0df09fd5, 0x5a845ae8 }, + { -0x5a4e4ebd, 0x1d06005c, 0x7fd1cda2, 0x6d4c6bb8, 0x53fcffe7, 0x6ef59676, -0x3e31e15b, 0x097c29e8 }, + { 0x5deb94ca, 0x4ce97dbe, -0x738f63b8, 0x38d0a438, -0x5e962f69, -0x3bc1312c, -0x081a783d, 0x0a1249ff } + }, +}, +{ + { + { 0x7354b610, 0x0b408d9e, 0x5ba85b6e, -0x7f94cdad, 0x4a58a207, -0x2419c5fd, -0x365e20d4, 0x173bd9dd }, + { 0x276d01c9, 0x12f0071b, -0x793b7390, -0x1847453b, 0x71d6fba9, 0x5308129b, 0x5a3db792, 0x5d88fbf9 }, + { -0x01a78d21, 0x2b500f1e, -0x2bc6e73f, 0x58d6582e, -0x3698c520, -0x1912d872, -0x4e615ce7, 0x06e1cd13 } + }, + { + { -0x61a4fcad, 0x472baf62, 0x278d0447, 0x3baa0b90, -0x69bc40d9, 0x0c785f46, -0x727c84ed, 0x7f3a6a1a }, + { 0x6f166f23, 0x40d0ad51, 0x1fab6abe, 0x118e3293, -0x5fb2f772, 0x3fe35e14, 0x26e16266, 0x30806035 }, + { 0x5d3d800b, -0x0819bbc7, -0x36fe120a, -0x6a572aab, 0x592c6339, 0x68cd7830, 0x2e51307e, 0x30d0fded } + }, + { + { 0x68b84750, -0x634b68e2, 0x6664bbcf, -0x5f6a8dd7, 0x72fa412b, 0x5c8de726, 0x51c589d9, 0x46150843 }, + { -0x0dedcc4d, -0x1fa6b2e6, -0x0f33b264, 0x1bdbe78e, -0x70b66589, 0x6965187f, 0x2c099868, 0x0a921420 }, + { -0x51465fd2, -0x436fe640, 
0x16034cae, 0x55c7110d, 0x659932ec, 0x0e6df501, -0x6a35a202, 0x3bca0d28 } + }, + { + { -0x6133fe41, -0x6397714a, -0x59bb7691, -0x0f437c53, 0x5f7a9fe2, -0x35d26aa1, -0x720d7dbf, 0x4ea8b403 }, + { 0x3c5d62a4, 0x40f031bc, -0x300f85a0, 0x19fc8b3e, 0x130fb545, -0x67e7c25e, -0x5170ec33, 0x5631dedd }, + { -0x0e352dfe, 0x2aed460a, -0x5b73117d, 0x46305305, 0x49f11a5f, -0x6ede88bb, 0x542ca463, 0x24ce0930 } + }, + { + { -0x020cf47b, 0x3fcfa155, 0x36372ea4, -0x2d08e972, 0x6492f844, -0x4d1f9b22, 0x324f4280, 0x549928a7 }, + { -0x02f93efa, 0x1fe890f5, 0x5d8810f2, -0x4a3b97cb, 0x6e8caf3e, -0x7d87f702, -0x75f928b5, 0x41d4e3c2 }, + { 0x63ee1a2e, -0x0d91cd59, -0x2da00216, -0x516e1b49, -0x2e80b297, -0x43c42cc5, -0x3f230096, 0x491b66de } + }, + { + { -0x2f259b5f, 0x75f04a8e, 0x67e2284b, -0x12ddd351, 0x1f7b7ba4, -0x7dcb5c87, -0x48fe7499, 0x4cf6b8b0 }, + { -0x3815cd59, -0x670a4ec3, 0x7e16db98, -0x1c2a0734, -0x340726b9, -0x53f540ae, -0x37a11b54, 0x08f338d0 }, + { -0x66e58c43, -0x3c7c57df, -0x20cdf386, -0x54d843ff, -0x7b888f9d, -0x3ec2cce5, -0x14f87567, 0x530d4a82 } + }, + { + { 0x6c9abf9e, 0x6d697345, 0x4900a880, 0x257fb2fc, -0x373047b0, 0x2bacf412, 0x0cbfbd5b, 0x0db3e7e0 }, + { -0x1e06b7db, 0x004c3630, -0x7354aca6, 0x7e2d7826, -0x337b0075, -0x38b7dcdd, 0x101770b9, 0x65ea753f }, + { -0x1df69c9d, 0x3d66fc3e, 0x61b5cb6b, -0x7e29d381, 0x13443b1a, 0x0fbe0442, 0x21e1a1db, 0x02a4ec19 } + }, + { + { -0x0e3086a1, -0x0a379e9e, 0x26ee57f2, 0x118c8619, 0x1c063578, 0x17212485, -0x13f98031, 0x36d12b5d }, + { 0x3b24b8a2, 0x5ce6259a, 0x45afa0b8, -0x47a88534, -0x745f8fc9, -0x33341918, 0x127809bf, 0x3d143c51 }, + { 0x79154557, 0x126d2791, -0x0387c5f6, -0x2a1b70a4, -0x20e86454, 0x36bdb6e8, 0x5ba82859, 0x2ef51788 } + }, +}, +{ + { + { 0x7c6da1e9, 0x1ea43683, 0x1fb9bdbe, -0x063e7651, -0x31a22eab, 0x303001fc, -0x43a841ae, 0x28a7c99e }, + { -0x2ee1f2b6, -0x7742bc74, 0x43ccf308, 0x30cb610d, -0x6e6c8434, -0x1f65f1c9, 0x25b1720c, 0x4559135b }, + { -0x172e6163, -0x47026c67, -0x69dbdc01, -0x6f7e6e35, 0x47c742a3, -0x4d46b729, -0x2804bb3c, 0x37f33226 } + }, + { + { -0x37de4ee3, 0x33912553, 0x41e301df, 0x66ed42c2, 0x104222fd, 0x066fcc11, -0x3e6de971, 0x307a3b41 }, + { -0x4aa091f8, 0x0dae8767, 0x5b203a02, 0x4a43b3b3, -0x7f507387, -0x1c8da592, 0x705fa7a3, 0x0f7a7fd1 }, + { 0x6eb55ce0, -0x7114a2f9, -0x55f26da6, 0x2fc536bf, -0x23493918, -0x417e7cf1, -0x7d8450ae, 0x556c7045 } + }, + { + { 0x2bf44406, -0x46b46ffe, -0x006f4acc, -0x542bdc82, -0x050792c6, 0x7600a960, -0x3dcdd11d, 0x2f45abda }, + { 0x02e9d8b7, -0x71d4ae8d, 0x248714e8, -0x1c1add97, 0x4ca960b5, -0x42b04289, -0x3a135257, 0x6f4b4199 }, + { -0x37107596, 0x61af4912, 0x43fb6e5e, -0x1a705b02, 0x6fd427cf, -0x4a5033a3, 0x1e1e11eb, 0x6a539328 } + }, + { + { 0x149443cf, 0x0fff04fe, -0x79a32229, 0x53cac6d9, 0x531ed1b7, 0x31385b03, -0x532efc63, 0x5846a27c }, + { -0x5a2e1177, -0x0c25aec7, -0x006c9678, -0x7ebaba84, 0x00e188c4, 0x3f622fed, -0x2474a5c3, 0x0f513815 }, + { 0x1eb08717, 0x4ff5cdac, -0x6f0d1644, 0x67e8b295, 0x237afa99, 0x44093b5e, -0x78f7474e, 0x0d414bed } + }, + { + { 0x294ac9e8, -0x7e77956e, -0x2aaab842, 0x23162b45, 0x03715983, -0x6b3043bc, 0x134bc401, 0x50eb8fdb }, + { -0x02f18a0a, -0x30497d9b, -0x446f18f9, -0x1ba4c1d8, -0x6006d386, 0x7242a8de, -0x6ccdfd23, 0x685b3201 }, + { -0x294ccf33, -0x3f48c13a, 0x132faff1, -0x7b1bb7f9, -0x3b5a211f, 0x732b7352, -0x55832d2e, 0x5d7c7cf1 } + }, + { + { -0x648c5a9e, 0x33d1013e, 0x48ec26e1, -0x6da310a9, -0x22b97fa8, -0x580319ec, 0x1e9aa438, 0x78b0fad4 }, + { 0x7a4aafa2, -0x50c4b941, 0x4d40d411, -0x4878fa14, -0x3583ea1d, 0x114f0c6a, -0x56b762b3, 0x3f364faa 
}, + { -0x12fa4b78, -0x40a95bcf, -0x63b6a382, -0x5acc1994, -0x780c9ae6, -0x179ad451, 0x59d66c33, 0x02418000 } + }, + { + { -0x30c715ff, 0x28350c7d, -0x4d6e854a, 0x7c6cdbc0, -0x7a8f7d09, -0x53183042, -0x5d265e20, 0x4d2845ab }, + { -0x5c85a41c, -0x314f8802, -0x1a5a1149, -0x249bd0fe, 0x471270b8, -0x3d192f3b, 0x38e4529c, 0x4771b655 }, + { 0x447070de, -0x44ac8020, 0x6dd557df, -0x3458bbbd, 0x3600dbcb, -0x2c4a5cb9, -0x06002808, 0x4aeabbe6 } + }, + { + { -0x3b56370e, 0x6a2134bc, -0x7531d1c9, -0x040702e4, -0x66ee5f46, 0x000ae304, 0x6bc89b9e, 0x046e3a61 }, + { 0x40d8f78c, 0x4630119e, 0x3c710e11, -0x5fe5643b, -0x76ef2287, 0x486d2b25, -0x24fcdb1b, 0x1e6c47b3 }, + { -0x0fc6f942, 0x14e65442, -0x1c9d41d6, 0x4a019d54, -0x723dcf39, 0x68ccdfec, -0x509479e4, 0x7cfb7e3f } + }, +}, +{ + { + { 0x305b2f51, -0x69114005, -0x776a6948, -0x2c06c753, 0x46d5dd25, -0x0f0ad239, -0x44c5ff6b, 0x57968290 }, + { -0x73a75124, 0x4637974e, -0x540fbe5c, -0x4610dd05, -0x167f8e76, -0x1e7a26aa, -0x4ebc575a, 0x2f1b78fa }, + { 0x0a20e101, -0x08e547bd, 0x24f0ec47, -0x0c6c9a73, 0x6ee2eed1, -0x308af658, -0x23d55c1f, 0x7dc43e35 } + }, + { + { 0x273e9718, 0x5a782a5c, 0x5e4efd94, 0x3576c699, 0x1f237d3e, 0x0f2ed805, -0x7d2af567, 0x044fb81d }, + { -0x7782263d, -0x7a69999b, 0x4bb05355, -0x36f064cf, -0x10df864f, -0x391f7208, 0x758cc12f, 0x7ef72016 }, + { -0x56f81c27, -0x3e20e73b, -0x31b39ca7, 0x57b3371d, -0x4dfe44b7, -0x358fbacc, -0x63cf22d2, 0x7f79823f } + }, + { + { 0x68f587ba, 0x6a9c1ff0, 0x0050c8de, 0x0827894e, 0x7ded5be7, 0x3cbf9955, 0x1c06d6f0, 0x64a9b043 }, + { -0x5c4aec18, -0x7ccb2dc7, -0x46e05728, -0x3ec98f2c, -0x0a6f42cd, 0x12b54136, -0x287b264c, 0x0a4e0373 }, + { 0x5b7d2919, 0x2eb3d6a1, -0x2ac57dcb, -0x4f4b0960, -0x765ba2b9, 0x7156ce43, -0x31e7cb94, 0x071a7d0a } + }, + { + { 0x20e14431, -0x33f3caae, 0x09b15141, 0x0d659507, 0x209d5f36, -0x650a9de5, 0x617755d3, 0x7c69bcf7 }, + { -0x377845f5, -0x2cf8d256, -0x405a9d12, 0x01262905, -0x3f108975, -0x30abcffe, 0x46ea7e9c, 0x2c3bcc71 }, + { 0x04e8295f, 0x07f0d7eb, 0x2f50f37d, 0x10db1825, 0x171798d7, -0x16ae565d, 0x22aca51d, 0x6f5a9a73 } + }, + { + { -0x5c26bb42, -0x18d62b15, -0x7f875062, -0x7261f6c0, 0x47869c03, 0x4525567a, -0x1172c4dc, 0x02ab9680 }, + { 0x2f41c6c5, -0x745efff4, 0x0cfefb9b, -0x3b60863f, 0x3cc51c9f, 0x4efa4770, -0x1eb85036, 0x494e21a2 }, + { -0x221af266, -0x105b757b, 0x0fb9a249, 0x219a224e, -0x26e10927, -0x05f6e0e3, -0x15b944cc, 0x6b5d76cb } + }, + { + { 0x1e782522, -0x1f06bee9, 0x036936d3, -0x0e19518c, -0x2f0338ba, 0x408b3ea2, 0x03dd313e, 0x16fb869c }, + { -0x13f3266c, -0x77a8aa94, 0x5cd01dba, 0x6472dc6f, -0x70bd4b89, -0x50fe96ec, -0x7ad88cac, 0x0ae333f6 }, + { 0x33b60962, 0x288e1997, -0x27541ecd, 0x24fc72b4, 0x0991d03e, 0x4811f7ed, -0x708f2f8b, 0x3f81e38b } + }, + { + { 0x5f17c824, 0x0adb7f35, -0x28bd665c, 0x74b923c3, -0x34071509, -0x2a83c175, 0x4cdedc3d, 0x0ad3e2d3 }, + { 0x7ed9affe, 0x7f910fcc, 0x2465874b, 0x545cb8a1, 0x4b0c4704, -0x57c6812e, 0x04f50993, 0x50510fc1 }, + { 0x336e249d, 0x6f0c0fc5, -0x3cce3027, 0x745ede19, 0x09eefe1c, -0x0d290300, -0x0f05e142, 0x127c158b } + }, + { + { -0x51ae468c, -0x215d703c, 0x744dfe96, 0x1d9973d3, -0x78c7b758, 0x6240680b, -0x2e98206b, 0x4ed82479 }, + { 0x2e9879a2, -0x09e683be, 0x52ca3647, -0x5bb5222c, 0x4b4eaccb, -0x64bec03f, 0x07ef4f68, 0x354ef87d }, + { 0x60c5d975, -0x011c4ade, -0x14be4f48, 0x50352efc, -0x56099ac4, -0x77f753d0, 0x0539236d, 0x302d92d2 } + }, +}, +{ + { + { 0x0df53c30, -0x6a847475, -0x719f0f68, 0x2a1c770a, 0x345796de, -0x44385990, -0x6f366437, 0x22a48f9a }, + { -0x34c10484, 0x4c59023f, -0x39c3d56c, 0x6c2fcb99, 
-0x3c381f7c, -0x45be6f1e, -0x5ae78b27, 0x0e545dae }, + { -0x72c053a8, 0x6b7dc0dc, -0x191bd403, 0x5497cd6c, -0x0bff2cfb, 0x542f7d1b, 0x048d9136, 0x4159f47f } + }, + { + { -0x442db7c7, 0x748515a8, -0x504fd4ab, 0x77128347, 0x49a2a17f, 0x50ba2ac6, 0x3ad730f1, 0x06052551 }, + { 0x39e31e32, 0x20ad6608, -0x7bfa41b0, -0x07e1e42b, -0x0b254397, -0x07f9bfaa, -0x318e468b, 0x14d23dd4 }, + { -0x755d807e, -0x0dc671f7, -0x765e4fdc, 0x6d7982bb, 0x214dd24c, -0x0596bf7c, -0x5cdcfe3d, 0x71ab966f } + }, + { + { 0x02809955, -0x4ef775f9, 0x0b43c391, 0x43b273ea, -0x01f97913, -0x35649852, -0x7cca0b13, 0x605eecbf }, + { 0x4ded02fc, 0x2dcbd8e3, 0x596f22aa, 0x1151f3ec, 0x4e0328da, -0x435daabd, -0x6dbee4de, 0x35768fbe }, + { 0x6c340431, -0x7cdff59b, -0x711a63d1, -0x60328e99, 0x71300f8a, 0x75d4613f, 0x60f542f9, 0x7a912faf } + }, + { + { -0x05d2aa69, 0x253f4f8d, 0x5477130c, 0x25e49c40, -0x6694eefe, 0x00c052e5, 0x33bb6c4a, 0x33cb966e }, + { 0x5edc1a43, -0x4dfba7a2, 0x5897c73c, -0x60f1e912, 0x4e70483c, 0x5b82c0ae, 0x2bddf9be, 0x624a170e }, + { 0x7f116909, 0x59702804, 0x1e564467, -0x7d753be4, -0x19de8c79, 0x70417dbd, -0x0453bc7c, 0x721627ae } + }, + { + { 0x410b2f22, -0x02cf6844, -0x4a3057bc, -0x0e5fa259, -0x10a8358c, 0x61289a1d, -0x447de6fe, 0x245ea199 }, + { -0x78c9522b, -0x682fc43d, -0x3acd4ed0, 0x2f1422af, 0x7101bbc4, 0x3aa68a05, -0x18b06059, 0x4c946cf7 }, + { 0x78d477f8, -0x51235997, 0x29117fe1, 0x1898ba3c, 0x720cbd58, -0x308c067d, -0x474a9caf, 0x67da12e6 } + }, + { + { -0x7137cf74, 0x2b7ef3d3, 0x71eb94ab, -0x7d702814, -0x3af9d543, -0x7f83c4ca, 0x31a94141, 0x0cb64cb8 }, + { -0x4b4291f9, 0x7067e187, -0x382e018c, 0x6e8f0203, 0x38c85a30, -0x6c3955d1, 0x3d75a78a, 0x76297d1f }, + { 0x534c6378, 0x3030fc33, -0x1abe179f, -0x469ca3a4, -0x264d38d8, 0x15d9a9be, -0x0c88a235, 0x49233ea3 } + }, + { + { 0x1c9f249b, 0x7b3985fe, -0x5edccd6d, 0x4fd6b2d5, 0x1adf4d62, -0x314cba6c, 0x542de50c, 0x6987ff6f }, + { -0x724003c6, 0x629398fa, -0x2ab24bab, -0x1ed01ad3, -0x250dad6b, -0x0c41ee21, -0x31a184af, 0x628b140d }, + { -0x707c8ac4, 0x47e24142, -0x79950669, 0x6317bebc, 0x3d1a9829, -0x2544a4bd, 0x5287fb2d, 0x074d8d24 } + }, + { + { -0x3f1ceb78, 0x481875c6, -0x1ddfcb4c, 0x219429b2, 0x31283b65, 0x7223c98a, 0x342277f9, 0x3420d60b }, + { 0x440bfc31, -0x7cc82633, -0x50ce7029, 0x729d2ca1, 0x772c2070, -0x5fbf5b5c, 0x3a7349be, 0x46002ef0 }, + { -0x50019a09, -0x055dc522, 0x5be0764c, 0x78261ed4, 0x2f164403, 0x441c0a1e, 0x7a87d395, 0x5aea8e56 } + }, +}, +{ + { + { -0x1b1f0e89, 0x2dbc6fb6, -0x5b42956d, 0x04e1bf29, 0x787af6e8, 0x5e1966d4, -0x4bd92fa0, 0x0edc5f5e }, + { -0x435bd7c3, 0x7813c1a2, -0x5e79c227, -0x129d0f6f, -0x3d97057a, -0x51384348, 0x6f1cae4c, 0x10e5d3b7 }, + { 0x53da8e67, 0x5453bfd6, 0x24a9f641, -0x1623e114, 0x03578a23, -0x4078d9c5, 0x361cba72, 0x45b46c51 } + }, + { + { -0x75801c1c, -0x3162b223, 0x76620e30, -0x54ec9baa, -0x4cf166a8, 0x4b594f7b, 0x321229df, 0x5c1c0aef }, + { 0x314f7fa1, -0x56bfd541, -0x71730bb0, -0x1da80e24, 0x23a8be84, 0x1dbbd54b, 0x6dcb713b, 0x2177bfa3 }, + { -0x05862471, 0x37081bbc, -0x3da0a64d, 0x6048811e, -0x637cdb79, 0x087a7665, 0x7d8ab5bb, 0x4ae61938 } + }, + { + { -0x67a4047d, 0x61117e44, 0x71963136, -0x031fb9d6, -0x2bda6fb5, -0x7c53cbb8, 0x5ba43d64, 0x75685abe }, + { 0x5344a32e, -0x72240956, -0x4be4bf88, 0x7d88eab4, 0x4a130d60, 0x5eb0eb97, 0x17bf3e03, 0x1a00d91b }, + { -0x149e0d4e, 0x6e960933, -0x3600b6ae, 0x543d0fa8, 0x7af66569, -0x208d8af0, 0x23b0e6aa, 0x135529b6 } + }, + { + { -0x1dd17c02, -0x0a38e944, -0x17f67a3f, -0x4bd414e7, 0x14254aae, -0x136259c9, 0x1590a613, 0x5972ea05 }, + { -0x522e2ae8, 0x18f0dbd7, 
-0x303ee0ef, -0x68608778, 0x7114759b, -0x78cd1e10, 0x65ca3a01, 0x79b5b81a }, + { -0x237087ef, 0x0fd4ac20, -0x53b2b058, -0x65652d6c, -0x4cc9fbcc, -0x3fe4d29c, -0x6fa0c425, 0x4f7e9c95 } + }, + { + { 0x355299fe, 0x71c8443d, -0x24141529, -0x7432c4e4, -0x0e5b6b9a, -0x7f6db662, -0x5ebb5238, 0x1942eec4 }, + { 0x5781302e, 0x62674bbc, -0x765223f1, -0x27adf0c7, 0x53fbd9c6, -0x73d66652, 0x2e638e4c, 0x31993ad9 }, + { -0x51dcb66e, 0x7dac5319, 0x0cea3e92, 0x2c1b3d91, 0x253c1122, 0x553ce494, 0x4ef9ca75, 0x2a0a6531 } + }, + { + { 0x3c1c793a, -0x30c9e533, 0x5a35bc3b, 0x2f9ebcac, -0x57325955, 0x60e860e9, 0x6dea1a13, 0x055dc39b }, + { -0x0806d83e, 0x2db7937f, 0x17d0a635, -0x248be0fa, 0x1155af76, 0x5982f3a2, 0x647c2ded, 0x4cf6e218 }, + { -0x3d72a44a, -0x4ee6dd84, 0x774dffab, 0x07e24ebc, -0x1b5cd377, -0x57c38732, 0x10aa24b6, 0x121a3077 } + }, + { + { -0x388b7c37, -0x29a68ec2, -0x47d46951, -0x77401f89, 0x1097bcd3, 0x289e2823, 0x6ced3a9b, 0x527bb94a }, + { -0x60fcb569, -0x1b24a2a2, 0x3034bc2d, -0x1eac03f7, -0x6aae2c4f, 0x46054691, 0x7a40e52d, 0x333fc76c }, + { -0x66a4b7d2, 0x563d992a, 0x6e383801, 0x3405d07c, 0x2f64d8e5, 0x485035de, 0x20a7a9f7, 0x6b89069b } + }, + { + { -0x4a382489, 0x4082fa8c, -0x38cb3eab, 0x068686f8, -0x09185a82, 0x29e6c8d9, -0x589c6431, 0x0473d308 }, + { 0x6270220d, -0x7ed55fbf, -0x06dba4b2, -0x66a57606, 0x5072ef05, -0x00523b32, -0x558c148d, 0x23bc2103 }, + { 0x03589e05, -0x351186da, 0x46dcc492, 0x2b4b4212, -0x19fe56b1, 0x02a1ef74, -0x21fbcbe6, 0x102f73bf } + }, +}, +{ + { + { -0x6c5c9db9, 0x358ecba2, -0x4d97029b, -0x5070679e, 0x68a01c89, 0x412f7e99, -0x328abadc, 0x5786f312 }, + { 0x7ec20d3e, -0x4a5d2af4, -0x5f368d9d, -0x39b42292, -0x3e008cb3, 0x56e89052, 0x2b2ffaba, 0x4929c6f7 }, + { -0x35ebfcd4, 0x337788ff, 0x447f1ee3, -0x0c6defd8, 0x231bccad, -0x74ebf8e1, -0x0dcbb87d, 0x4c817b4b } + }, + { + { -0x5bf4bb7c, 0x413ba057, 0x4f5f6a43, -0x45b3d1e6, -0x511e29e4, 0x614ba0a5, -0x74fa23ad, 0x78a1531a }, + { 0x2871b96e, 0x0ff85385, 0x60c3f1bb, -0x1ec16055, 0x25344402, -0x1102a6ad, 0x75b7744b, 0x0a37c370 }, + { 0x3ad0562b, 0x6cbdf170, -0x36dade5d, -0x7130b7d0, -0x027bdb19, -0x25142cfd, 0x2e5ec56f, 0x72ad82a4 } + }, + { + { 0x67024bc3, -0x3c976c6f, 0x49502fda, -0x71962e93, -0x1ba0b4d7, -0x030d13c4, -0x5c4b343c, 0x065f669e }, + { -0x45049a0a, 0x3f9e8e35, -0x0d8d6c5f, 0x39d69ec8, -0x73095c30, 0x6cb8cd95, 0x73adae6d, 0x17347781 }, + { 0x5532db4d, -0x75ff5139, 0x43e31bb1, -0x47965b1c, -0x2c580aeb, 0x4a0f8552, 0x303d7c08, 0x19adeb7c } + }, + { + { 0x43c31794, -0x62fa4583, -0x6ccddada, 0x2470c8ff, 0x16197438, -0x7cdc2138, -0x7ea964ad, 0x28527098 }, + { 0x53ead9a3, -0x38df349f, 0x512b636e, 0x55b2c97f, -0x2bfd6f4f, -0x4e1ca4a1, 0x3b530ee2, 0x2fd9ccf1 }, + { 0x47f796b8, 0x07bd475b, 0x542c8f54, -0x2d384fed, 0x3b24f87e, 0x2dbd23f4, 0x7b0901d6, 0x6551afd7 } + }, + { + { -0x5e2a3654, 0x68a24ce3, 0x10ff6461, -0x44885cc3, 0x25d3166e, 0x0f86ce44, 0x50b9623b, 0x56507c09 }, + { 0x54aac27f, 0x4546baaf, -0x4d5ba5d8, -0x09099014, 0x562bcfe8, 0x582d1b5b, -0x6df087a1, 0x44b123f3 }, + { -0x2e8ec19d, 0x1206f0b7, 0x15bafc74, 0x353fe3d9, 0x0ad9d94d, 0x194ceb97, -0x062fc52d, 0x62fadd7c } + }, + { + { -0x1831ba6c, 0x3cd7bc61, -0x4822d982, -0x3294ca57, 0x4366ef27, -0x5f7f5438, 0x59c79711, 0x6ec7c46f }, + { 0x5598a074, -0x394a6985, -0x71b6c1db, 0x5efe91ce, 0x49280888, -0x2b48d3bb, -0x5d98bf3e, 0x20ef1149 }, + { 0x6f09a8a2, 0x2f07ad63, 0x24205e7d, -0x79681932, -0x11ca5ec7, -0x3f5103fb, -0x4a062769, 0x15e80958 } + }, + { + { 0x5bb061c4, 0x4dd1ed35, -0x6be3f900, 0x42dc0cef, -0x0279cbf2, 0x61305dc1, 0x0e55a443, 0x56b2cc93 }, + { 
0x0c3e235b, 0x25a5ef7d, -0x41ecb119, 0x6c39c17f, 0x2dc5c327, -0x388b1ecc, -0x6dfde0c7, 0x021354b8 }, + { -0x59403a5e, 0x1df79da6, -0x6021bc97, 0x02f3a274, -0x325c6f59, -0x4cdc260e, -0x788b2c9d, 0x7be0847b } + }, + { + { 0x5307fa11, 0x1466f5af, -0x1293f50e, -0x7e803383, -0x3c5b5c05, 0x0a6de44e, -0x436d82f5, 0x74071475 }, + { -0x74c0aa3d, -0x736633a6, 0x3fded2a0, 0x0611d725, 0x36b70a36, -0x12d66a01, -0x2875d9e7, 0x1f699a54 }, + { 0x73e7ea8a, -0x188d6d0d, -0x34fba5cf, 0x296537d2, -0x2cd8b022, 0x1bd0653e, 0x76bd2966, 0x2f9a2c44 } + }, +}, +{ + { + { -0x4aaee366, -0x5d4b2520, 0x2bffff06, 0x7ac86029, -0x0aafbdcc, -0x67e0c8a3, -0x25b15ed3, 0x3f6bd725 }, + { 0x7f5745c6, -0x14e74655, 0x5787c690, 0x023a8aee, 0x2df7afa9, -0x48d8ed26, -0x15a3fec3, 0x36597d25 }, + { 0x106058ac, 0x734d8d7b, 0x6fc6905f, -0x26bfa862, -0x6dfd6cd3, 0x6466f8f9, -0x259f2930, 0x7b7ecc19 } + }, + { + { -0x58830565, 0x6dae4a51, -0x185c79b0, -0x7dd9c9ac, -0x70d27d25, 0x09bbffcd, 0x1bf5caba, 0x03bedc66 }, + { 0x695c690d, 0x78c2373c, 0x0642906e, -0x22dad19a, 0x4ae12bd2, -0x6ae2bbbc, 0x01743956, 0x4235ad76 }, + { 0x078975f5, 0x6258cb0d, -0x6e760d68, 0x49294254, -0x1d1c911c, -0x5f354bdd, -0x320f995f, 0x0e7ce2b0 } + }, + { + { -0x26b48f07, -0x01590121, -0x3e0345d3, -0x0ecf3faf, 0x7f2fab89, 0x4882d47e, -0x7513114b, 0x61525613 }, + { -0x3b737a5d, -0x3b6b9bc6, 0x3c6139ad, -0x02c9e20c, 0x3ae94d48, 0x09db17dd, -0x704b98b6, 0x666e0a5d }, + { 0x4870cb0d, 0x2abbf64e, -0x55ba7495, -0x329a4310, 0x75e8985d, -0x6541b146, -0x2aeb211c, 0x7f0bc810 } + }, + { + { 0x737213a0, -0x7c536253, 0x2ef72e98, -0x60090746, 0x43ec6957, 0x311e2edd, -0x213a548b, 0x1d3a907d }, + { 0x26f4136f, -0x46ff945c, 0x57e03035, -0x7298c962, 0x4f463c28, -0x34372027, -0x0711240b, 0x0d1f8dbc }, + { 0x3ed081dc, -0x45e96ccf, -0x7ae4cb80, 0x29329fad, 0x030321cb, 0x0128013c, -0x5ce4021d, 0x00011b44 } + }, + { + { 0x6a0aa75c, 0x16561f69, 0x5852bd6a, -0x3e408da4, -0x65869953, 0x11a8dd7f, -0x2d7aefda, 0x63d988a2 }, + { 0x3fc66c0c, 0x3fdfa06c, 0x4dd60dd2, 0x5d40e38e, 0x268e4d71, 0x7ae38b38, 0x6e8357e1, 0x3ac48d91 }, + { -0x5042dcd2, 0x00120753, -0x0227097d, -0x16d43148, -0x7b18d46f, -0x07e9964d, 0x2368a066, 0x33fad52b } + }, + { + { -0x3bdd3018, -0x72d33730, 0x05a13acb, 0x072b4f7b, -0x13095a91, -0x5c01491a, -0x46f58e1e, 0x3cc355cc }, + { -0x3a1be1ea, 0x540649c6, 0x333f7735, 0x0af86430, -0x0cfa18ba, -0x4d53032e, -0x5da92359, 0x16c0f429 }, + { -0x6fc16ecf, -0x16496bbd, 0x7a5637ce, -0x475b6b35, -0x45456dbc, -0x37832e5c, 0x6bae7568, 0x631eaf42 } + }, + { + { -0x5c8ff218, 0x47d975b9, -0x1d07faae, 0x7280c5fb, 0x32e45de1, 0x53658f27, 0x665f80b5, 0x431f2c7f }, + { -0x25990161, -0x4c16fbf0, 0x6c16e5a6, -0x7a22b4ae, 0x1ef9bf83, -0x43c2689f, 0x1ea919b5, 0x5599648b }, + { -0x7a7084e7, -0x29fd9cbc, -0x5e15aeb6, 0x14ab352f, 0x2090a9d7, -0x76ffbbe6, -0x6edac4da, 0x7b04715f } + }, + { + { -0x3b19453a, -0x4c893d80, 0x6d1d9b0b, -0x68f12c23, 0x450bf944, -0x4f656aa8, 0x57cde223, 0x48d0acfa }, + { -0x530951bd, -0x7c1242d8, 0x7d5c7ab4, -0x79ca8375, -0x4814d3bc, -0x3fbfb897, -0x3d09a7c1, 0x59b37bf5 }, + { 0x7dabe671, -0x49f0d91c, 0x622f3a37, -0x0e2e5e69, -0x1669fc6c, 0x4208ce7e, 0x336d3bdb, 0x16234191 } + }, +}, +{ + { + { 0x3d578bbe, -0x7ad22e03, -0x3cd79ef8, 0x2b65ce72, -0x1531dd8d, 0x658c07f4, -0x13c754c0, 0x0933f804 }, + { 0x33a63aef, -0x0e651539, 0x4442454e, 0x2c7fba5d, 0x4795e441, 0x5da87aa0, -0x5b1f4f0b, 0x413051e1 }, + { -0x72b69b8a, -0x58549687, -0x034a5438, -0x7ede5522, 0x7b539472, -0x5a23ed11, 0x5e45351a, 0x07fd4706 } + }, + { + { -0x6517183d, 0x30421155, -0x6bb77d5b, -0x0d7e4dd7, 0x378250e4, 
-0x75ec53d2, 0x54ba48f4, 0x014afa09 }, + { 0x258d2bcd, -0x37a7c3c3, -0x509f48c1, 0x17029a4d, 0x416a3781, -0x05f0362a, 0x38b3fb23, 0x1c1e5fba }, + { 0x1bb3666c, -0x34ce6900, 0x4bffecb9, 0x33006052, 0x1a88233c, 0x29371199, 0x3d4ed364, 0x29188436 } + }, + { + { -0x43e54915, -0x0462c83d, 0x4d57a240, 0x02be1453, -0x075a1e0a, -0x0b28cbeb, 0x0ccc8188, 0x5964f430 }, + { -0x23b45406, 0x033c6805, 0x5596ecc1, 0x2c15bf5e, -0x4a64e2c5, 0x1bc70624, -0x5e60f13b, 0x3ede9850 }, + { 0x2d096800, -0x1bb5dceb, 0x70866996, 0x5c08c559, 0x46affb6e, -0x20d249f6, -0x07a90277, 0x579155c1 } + }, + { + { 0x0817e7a6, -0x4a0e949d, 0x3c351026, -0x7f7396dd, 0x54cef201, 0x324a983b, 0x4a485345, 0x53c09208 }, + { 0x12e0c9ef, -0x69cdb123, -0x0dbdfd69, 0x468b878d, -0x5b0a8c42, 0x199a3776, -0x716e16d6, 0x1e7fbcf1 }, + { -0x0e345041, -0x2d2beb7f, 0x716174e5, 0x231d2db6, -0x1d5aa368, 0x0b7d7656, 0x2aa495f6, 0x3e955cd8 } + }, + { + { 0x61bb3a3f, -0x54c60c11, 0x2eb9193e, -0x714bff9b, 0x38c11f74, -0x4a219134, 0x26f3c49f, 0x654d7e96 }, + { 0x3ed15433, -0x1b70aca2, 0x0d7270a3, -0x2f8a96d6, -0x55219c79, 0x40fbd21d, -0x30bb6a0b, 0x14264887 }, + { 0x5c7d2ceb, -0x1a9b3023, -0x28c83347, -0x7d115022, -0x2e064f55, 0x6107db62, -0x4bca7245, 0x0b6baac3 } + }, + { + { 0x3700a93b, 0x204abad6, -0x25886c8d, -0x41ffdc2d, 0x633ab709, -0x27a0fcba, -0x6f7dfbee, 0x00496dc4 }, + { -0x79dd0168, 0x7ae62bcb, -0x31476e51, 0x47762256, -0x0d1bf94c, 0x1a5a92bc, -0x7b1beaff, 0x7d294017 }, + { -0x3d819ca0, 0x1c74b88d, -0x72eb7af4, 0x07485426, 0x3e0dcb30, -0x5eba0485, 0x43803b23, 0x10843f1b } + }, + { + { -0x1cdb9765, -0x2a9098d3, -0x4c6b567f, -0x2e257513, -0x6e973013, -0x2284a702, 0x4d56c1e8, 0x7ce246cd }, + { 0x376276dd, -0x3a06fbab, -0x289ba327, -0x31a6ea73, 0x1d366b39, -0x6d09a2af, 0x526996c4, 0x11574b6e }, + { 0x7f80be53, -0x470bcf72, 0x34a9d397, 0x5f3cb8cb, 0x33cc2b2c, 0x18a961bd, 0x3a9af671, 0x710045fb } + }, + { + { 0x059d699e, -0x5fc0379e, -0x659e6197, 0x2370cfa1, 0x2f823deb, -0x3b01c4ee, -0x580f7bb2, 0x1d1b056f }, + { 0x101b95eb, 0x73f93d36, 0x4f6f4486, -0x0510cc87, -0x70ea1a9e, 0x5651735f, 0x58b40da1, 0x7fa3f190 }, + { -0x1a9409e1, 0x1bc64631, 0x6e5382a3, -0x2c8654f0, 0x0540168d, 0x4d58c57e, -0x7bbd271c, 0x56625662 } + }, +}, +{ + { + { 0x1ff38640, -0x22b6632a, 0x063625a0, 0x29cd9bc3, 0x3dd73dc3, 0x51e2d802, 0x203b9231, 0x4a25707a }, + { -0x09d9800a, -0x461b6622, 0x742c0843, 0x7772ca7b, -0x165b0d4f, 0x23a0153f, -0x2a2faffa, 0x2cdfdfec }, + { 0x53f6ed6a, 0x2ab7668a, 0x1dd170a1, 0x30424258, 0x3ae20161, 0x4000144c, 0x248e49fc, 0x5721896d } + }, + { + { -0x5e2f25b2, 0x285d5091, -0x4a01c1f8, 0x4baa6fa7, -0x1e6c6c4d, 0x63e5177c, -0x3b4fcf03, 0x03c935af }, + { -0x02e7e452, 0x0b6e5517, 0x2bb963b4, -0x6fdd9d61, 0x32064625, 0x5509bce9, -0x09c3ec26, 0x578edd74 }, + { 0x492b0c3d, -0x668d893a, -0x201dfa04, 0x47ccc2c4, -0x229dc5c4, -0x232d647c, 0x0288c7a2, 0x3ec2ab59 } + }, + { + { -0x51cd2e35, -0x58dec5f7, 0x40f5c2d5, 0x0f2b87df, -0x17e154d7, 0x0baea4c6, 0x6adbac5e, 0x0e1bf66c }, + { -0x1b278447, -0x5e5f2d85, 0x61391aed, -0x5674b215, 0x73cb9b83, -0x665f2230, 0x200fcace, 0x2dd5c25a }, + { 0x792c887e, -0x1d542a17, -0x346d92a3, 0x1a020018, -0x4551a0e2, -0x40459633, 0x5ae88f5f, 0x730548b3 } + }, + { + { -0x5e291ccc, -0x7fa4f6b5, 0x09353f19, -0x40c10e89, 0x0622702b, 0x423f06cb, -0x2787ba23, 0x585a2277 }, + { -0x34574712, -0x3bcaae5d, -0x4deea0ea, 0x65a26f1d, -0x5473c7b0, 0x760f4f52, 0x411db8ca, 0x3043443b }, + { 0x33d48962, -0x5e75a07e, -0x1387da81, 0x6698c4b5, 0x373e41ff, -0x5871905b, 0x50ef981f, 0x76562789 } + }, + { + { -0x15793063, -0x1e8f8c5d, 0x07155fdc, 
0x3a8cfbb7, 0x31838a8e, 0x4853e7fc, -0x49ec09ea, 0x28bbf484 }, + { -0x2ae03740, 0x38c3cf59, 0x0506b6f2, -0x64122d03, -0x54a8f171, 0x26bf109f, -0x3e47b95a, 0x3f4160a8 }, + { 0x6f136c7c, -0x0d9ed0a4, -0x0922ee42, -0x50152ef9, 0x13de6f33, 0x527e9ad2, -0x7e7708a3, 0x1e79cb35 } + }, + { + { -0x0a1f7e7f, 0x77e953d8, 0x299dded9, -0x7b5af3bc, -0x79bada1b, -0x2393d2f4, 0x39d1f2f4, 0x478ab52d }, + { -0x11081c0f, 0x013436c3, -0x0161ef08, -0x7d749581, -0x43062104, 0x7ff908e5, 0x3a3b3831, 0x65d7951b }, + { -0x6dad2ea7, 0x66a6a4d3, -0x78e537f9, -0x1a221e44, -0x593e3691, -0x47d394c0, 0x1a212214, 0x16d87a41 } + }, + { + { -0x2ab1fa7d, -0x045b2a1e, 0x2ebd99fa, -0x1de05029, 0x6ee9778f, 0x497ac273, 0x7a5a6dde, 0x1f990b57 }, + { 0x42066215, -0x4c4281a6, 0x0c5a24c1, -0x78641c33, -0x29066b49, 0x57c05db1, 0x65f38ca6, 0x28f87c81 }, + { 0x1be8f7d6, -0x5ccbb153, -0x53158671, 0x7d1e50eb, 0x520de052, 0x77c6569e, 0x534d6d3e, 0x45882fe1 } + }, + { + { -0x6bc3901c, -0x275366d7, -0x5c7c6d5e, -0x4a060e9f, -0x4137650d, 0x2699db13, -0x1bfa0f8c, 0x7dcf843c }, + { 0x757983d6, 0x6669345d, 0x17aa11a6, 0x62b6ed11, -0x67a1ed71, 0x7ddd1857, -0x09d90923, 0x688fe5b8 }, + { 0x4a4732c0, 0x6c90d648, -0x35a9cd67, -0x2adebc03, -0x6ea2391f, -0x4c41d73d, 0x7327191b, 0x6739687e } + }, +}, +{ + { + { -0x363468e1, -0x731a5530, -0x602ab5d7, 0x1156aaa9, 0x15af9b78, 0x41f72470, 0x420f49aa, 0x1fe8cca8 }, + { 0x200814cf, -0x609a3a16, 0x69a31740, -0x7bfac91f, 0x25c8b4ad, -0x74f12ec7, -0x16c9c9e3, 0x0080dbaf }, + { 0x3c0cc82a, 0x72a1848f, -0x788361ac, 0x38c560c2, -0x31aabec0, 0x5004e228, 0x03429d71, 0x042418a1 } + }, + { + { 0x20816247, 0x58e84c6f, -0x1c90286d, -0x724d4d4a, 0x1d484d85, -0x688e7daa, -0x79cd5429, 0x0822024f }, + { -0x540c00a1, -0x766215af, 0x2fc2d8ba, -0x646c5799, -0x419142a4, 0x2c38cb97, -0x68d9c4a3, 0x114d5784 }, + { 0x6b1beca3, -0x4cfe4484, -0x3914ec8b, 0x55393f6d, -0x68491b15, -0x6ef2d7f0, -0x62b8615d, 0x1ad4548d } + }, + { + { 0x0fe9fed3, -0x5f901993, 0x1c587909, -0x578cc5c0, 0x0df98953, 0x30d14d80, -0x384cfda8, 0x41ce5876 }, + { 0x389a48fd, -0x32a58260, -0x6587c8e2, -0x4c705b56, 0x2cdb8e6c, -0x392689e5, -0x3681ebbd, 0x35cf51db }, + { -0x298f3fde, 0x59ac3bc5, -0x64ee6bfa, -0x151983f0, -0x4c87d026, -0x68674210, -0x02f8bf6e, 0x651e3201 } + }, + { + { 0x1efcae9e, -0x5a845b60, -0x23cf756c, 0x769f4bee, 0x3603cb2e, -0x2e0ef115, 0x7e441278, 0x4099ce5e }, + { -0x10cf3a31, -0x29c27b7d, 0x2361cc0c, 0x4cd4b496, -0x5b7bd954, -0x116f1b00, 0x18c14eeb, 0x0af51d7d }, + { -0x75aede17, 0x1ac98e4f, -0x2405d020, 0x7dae9544, -0x29bcf207, -0x7cdf55f3, 0x2c4a2fb5, 0x66728265 } + }, + { + { 0x2946db23, -0x52574920, 0x7b253ab7, 0x1c0ce51a, 0x66dd485b, -0x7bb737a6, -0x2f98a521, 0x7f1fc025 }, + { -0x27943655, -0x78b9de0c, 0x56fe6fea, -0x4ab38442, 0x7fadc22c, 0x077a2425, 0x19b90d39, 0x1ab53be4 }, + { 0x319ea6aa, -0x2711e4e8, 0x3a21f0da, 0x004d8808, -0x77c5b0b5, 0x3bd6aa1d, -0x202602ec, 0x4db9a3a6 } + }, + { + { -0x34488398, -0x26a4ff45, -0x6e0e87b7, -0x22437b96, -0x41d7264d, 0x7cf700ae, -0x7a2ce0c2, 0x5ce1285c }, + { -0x4663f8ab, -0x73184dc5, -0x3b0af086, 0x35c5d6ed, -0x1264af3d, 0x7e1e2ed2, -0x176cb25f, 0x36305f16 }, + { -0x674f4218, 0x31b6972d, -0x535921a5, 0x7d920706, -0x6f759a61, -0x198cef08, -0x1020fdcb, 0x50fac2a6 } + }, + { + { -0x090bb644, 0x295b1c86, 0x1f0ab4dd, 0x51b2e84a, -0x5571aae3, -0x3ffe34d0, 0x44f43662, 0x6a28d359 }, + { 0x5b880f5a, -0x0c2c560d, -0x24fc183e, -0x1213faf4, -0x060f4e5e, -0x576967e1, -0x53a1cb5c, 0x49a4ae2b }, + { 0x04a740e0, 0x28bb12ee, -0x64317e8c, 0x14313bbd, -0x173ef3c0, 0x72f5b5e4, 0x36adcd5b, 0x7cbfb199 } + }, + { + { 
-0x33c91920, -0x7186c586, 0x7d586eed, -0x0605485d, -0x451e0b1c, 0x3a4f9692, -0x00a0bb82, 0x1c14b03e }, + { 0x6b89792d, -0x5cee223e, -0x25aed99c, 0x1b30b4c6, -0x30eaf7a7, 0x0ca77b4c, 0x1b009408, 0x1de443df }, + { 0x14a85291, 0x19647bd1, 0x1034d3af, 0x57b76cb2, 0x0f9d6dfa, 0x6329db44, 0x6a571493, 0x5ef43e58 } + }, +}, +{ + { + { -0x37f3e540, -0x59923363, 0x1b38a436, -0x685fa30c, -0x6a24283a, -0x58140c42, -0x72818255, 0x7da0b8f6 }, + { 0x385675a6, -0x1087dfec, -0x55025618, -0x5d9b60d0, 0x5cdfa8cb, 0x4cd1eb50, 0x1d4dc0b3, 0x46115aba }, + { -0x3c4a258a, -0x2bf0e6ad, 0x21119e9b, 0x1dac6f73, -0x014da6a0, 0x03cc6021, -0x7c98b4b5, 0x5a5f887e } + }, + { + { -0x5f59bc47, -0x6169d72d, -0x193cdf9c, -0x4a3c3500, 0x7c2dec32, -0x64acfd77, -0x2a2e38f4, 0x43e37ae2 }, + { 0x70a13d11, -0x709cfe31, 0x350dd0c4, -0x303147eb, -0x5b435b82, -0x08fd682c, -0x1bb2ebcc, 0x3669b656 }, + { -0x12591ecd, 0x387e3f06, -0x665ec540, 0x67301d51, 0x36263811, -0x42a52708, 0x4fd5e9be, 0x6a21e6cd } + }, + { + { 0x6699b2e3, -0x10bed6ee, 0x708d1301, 0x71d30847, 0x1182b0bd, 0x325432d0, 0x001e8b36, 0x45371b07 }, + { 0x3046e65f, -0x0e39e8f6, 0x00d23524, 0x58712a2a, -0x737d48ab, 0x69dbbd3c, -0x5e6a00a9, 0x586bf9f1 }, + { 0x5ef8790b, -0x5924f773, 0x610937e5, 0x5278f0dc, 0x61a16eb8, -0x53fcb62e, -0x6f1ade87, 0x0eafb037 } + }, + { + { 0x0f75ae1d, 0x5140805e, 0x2662cc30, -0x13fd041d, -0x156dc693, 0x2cebdf1e, -0x3abca44d, 0x44ae3344 }, + { 0x3748042f, -0x69faaa3f, -0x7df455ef, 0x219a41e6, 0x73486d0c, 0x1c81f738, 0x5a02c661, 0x309acc67 }, + { -0x445abc12, -0x630d7647, 0x5ac97142, -0x0c89f163, 0x4f9360aa, 0x1d82e5c6, 0x7f94678f, 0x62d5221b } + }, + { + { 0x3af77a3c, 0x7585d426, -0x0116ebb3, -0x205184ef, 0x59f7193d, -0x5af98f80, -0x7c6ddfc9, 0x14f29a53 }, + { 0x18d0936d, 0x524c299c, -0x75f3e5f4, -0x37944a94, -0x24b579cf, -0x5c8afad2, -0x438aba9e, 0x5c0efde4 }, + { 0x25b2d7f5, -0x208e8124, -0x664acfc0, 0x21f970db, -0x3c12b39e, -0x256dcb49, 0x7bee093e, 0x5e72365c } + }, + { + { 0x2f08b33e, 0x7d933906, -0x2060cd42, 0x5b9659e5, 0x1f9ebdfd, -0x5300c253, -0x348cb649, 0x70b20555 }, + { 0x4571217f, 0x575bfc07, 0x0694d95b, 0x3779675d, -0x0be6e1cd, -0x65f5c845, 0x47b4eabc, 0x77f1104c }, + { 0x55112c4c, -0x41aeec3b, -0x6577e033, 0x6688423a, 0x5e503b47, 0x44667785, 0x4a06404a, 0x0e34398f } + }, + { + { 0x3e4b1928, 0x18930b09, 0x73f3f640, 0x7de3e10e, 0x73395d6f, -0x0bcde826, -0x35c863c2, 0x6f8aded6 }, + { 0x3ecebde8, -0x4982dd27, 0x27822f07, 0x09b3e841, -0x4fa49273, 0x743fa61f, -0x75c9dc8e, 0x5e540536 }, + { -0x02484d66, -0x1cbfedc3, -0x5de54d6f, 0x487b97e1, -0x02196b62, -0x066982fe, -0x372c2169, 0x780de72e } + }, + { + { 0x00f42772, 0x671feaf3, 0x2a8c41aa, -0x708d14d6, -0x68c8cd6e, 0x29a17fd7, 0x32b587a6, 0x1defc6ad }, + { 0x089ae7bc, 0x0ae28545, 0x1c7f4d06, 0x388ddecf, 0x0a4811b8, 0x38ac1551, 0x71928ce4, 0x0eb28bf6 }, + { -0x10ae6a59, -0x50a441e6, -0x6e84ea13, 0x148c1277, 0x7ae5da2e, 0x2991f7fb, -0x0722d799, 0x467d201b } + }, +}, +{ + { + { 0x296bc318, 0x745f9d56, -0x27ead19b, -0x66ca7f2c, 0x5839e9ce, -0x4f1a4ec1, -0x2bc6de40, 0x51fc2b28 }, + { -0x0842d195, 0x7906ee72, 0x109abf4e, 0x05d270d6, -0x46be575c, -0x72a301bb, 0x1c974287, 0x44c21867 }, + { -0x6a1d5674, 0x1b8fd117, 0x2b6b6291, 0x1c4e5ee1, 0x7424b572, 0x5b30e710, 0x4c4f4ac6, 0x6e6b9de8 } + }, + { + { -0x07f34f78, 0x6b7c5f10, 0x56e42151, 0x736b54dc, -0x3910663c, -0x3d49df5b, -0x3c5f90be, 0x5f4c802c }, + { 0x4b1de151, -0x200da032, -0x1ee3bfdb, -0x27be3f39, 0x54749c87, 0x2554b3c8, -0x6f71f207, 0x2d292459 }, + { 0x7d0752da, -0x649a370f, -0x38811800, -0x77e31cc8, 0x5b62f9e3, -0x3c4aeb10, -0x413ef2b8, 
0x66ed5dd5 } + }, + { + { -0x3435fb83, -0x0f520c37, -0x0baad095, -0x7e3c4d35, 0x44735f93, -0x3025eed3, 0x7e20048c, 0x1f23a0c7 }, + { 0x0bb2089d, 0x7d38a1c2, -0x69332bee, -0x7f7ccb1f, 0x6c97d313, -0x3b58f474, 0x03007f20, 0x2eacf8bc }, + { -0x1a43ea90, -0x0dcab985, 0x0dbab38c, 0x03d2d902, -0x03061f62, 0x27529aa2, -0x62cb43b0, 0x0840bef2 } + }, + { + { 0x7f37e4eb, -0x32ab1f95, -0x0a169336, -0x733ea079, -0x2ca68232, -0x47db7450, 0x6074400c, 0x246affa0 }, + { -0x23ef4d79, 0x796dfb35, 0x5c7ff29d, 0x27176bcd, -0x384db6fb, 0x7f3d43e8, -0x6e3abd8a, 0x0304f5a1 }, + { -0x041bacdf, 0x37d88e68, -0x3f28afce, -0x79f68ab8, -0x76b5f2cb, 0x4e9b13ef, 0x5753d325, 0x25a83cac } + }, + { + { 0x3952b6e2, -0x60f099d7, 0x0934267b, 0x33db5e0e, -0x29f60124, -0x00badad5, -0x3af91f37, 0x06be10f5 }, + { -0x1127e9a2, 0x10222f48, 0x4b8bcf3a, 0x623fc123, -0x3dde1710, 0x1e145c09, -0x3587d9d0, 0x7ccfa59f }, + { -0x49d5cba1, 0x1a9615a9, 0x4a52fecc, 0x22050c56, 0x28bc0dfe, -0x585d877b, 0x1a1ee71d, 0x5e82770a } + }, + { + { 0x42339c74, -0x17fd17f6, -0x5800051b, 0x34175166, 0x1c408cae, 0x34865d1f, 0x605bc5ee, 0x2cca982c }, + { -0x527695a4, 0x35425183, -0x1872ad0a, -0x1798c505, -0x6d5ca09c, 0x2c66f25f, 0x3b86b102, 0x09d04f3b }, + { 0x197dbe6e, -0x02d2a2cb, -0x741b005d, 0x207c2eea, 0x325ae918, 0x2613d8db, 0x27741d3e, 0x7a325d17 } + }, + { + { 0x7e2a076a, -0x132d82ff, 0x1636495e, -0x28779761, -0x6e6dcc1b, 0x52a61af0, 0x7bb1ae64, 0x2a479df1 }, + { -0x2e92021e, -0x2fc94645, -0x3b6857d7, -0x5dfaa8a9, -0x580ed999, -0x7193369a, 0x1239c180, 0x4d3b1a79 }, + { 0x33db2710, -0x61a11172, -0x293bc35b, 0x189854de, -0x6d8e7ec8, -0x5be3dd3b, -0x5bc5a165, 0x27ad5538 } + }, + { + { -0x71b8f884, -0x34a5829d, 0x20a1c059, -0x7248ac9f, -0x74120234, 0x549e1e4d, 0x503b179d, 0x080153b7 }, + { 0x15350d61, 0x2746dd4b, -0x116ade49, -0x2fc03438, 0x138672ca, -0x1791c9a6, 0x7e7d89e2, 0x510e987f }, + { 0x0a3ed3e3, -0x2259626d, -0x329f58de, 0x3d386ef1, -0x4255b11a, -0x37e852a8, 0x4fe7372a, 0x23be8d55 } + }, +}, +{ + { + { 0x567ae7a9, -0x43e10b43, -0x29bb6743, 0x3f624cb2, 0x2c1f4ec8, -0x1bef9b2e, -0x45c7bfff, 0x2ef9c5a5 }, + { 0x74ef4fad, -0x6a016e66, -0x095cf75e, 0x3a827bec, 0x09a47b01, -0x69b1fe2d, 0x5ba3c797, 0x71c43c4f }, + { -0x05618b33, -0x4902920a, -0x1b50d986, -0x0e7d8744, -0x0e1066f2, -0x7daa4c30, -0x6f3a0d6d, 0x5a758ca3 } + }, + { + { 0x1d61dc94, -0x731f6e75, -0x657ecf9a, -0x7212c9ba, -0x5017552d, -0x2b1957d7, -0x09c62bc1, 0x0a738027 }, + { -0x26b9db6b, -0x5d48d8f0, -0x2a82affd, 0x3aa8c6d2, -0x5f4b7836, -0x1c2bff41, -0x4c148d14, 0x2dbae244 }, + { 0x57ffe1cc, -0x67f0b5d1, -0x1e7c67bd, 0x00670d0d, 0x49fb15fd, 0x105c3f4a, 0x5126a69c, 0x2698ca63 } + }, + { + { 0x5e3dd90e, 0x2e3d702f, -0x1b2dac7a, -0x61c0f6e8, 0x024da96a, 0x5e773ef6, 0x4afa3332, 0x3c004b0c }, + { 0x32b0ba78, -0x189ace78, -0x6da30075, 0x381831f7, -0x5fd6e034, 0x08a81b91, 0x49caeb07, 0x1fb43dcc }, + { 0x06f4b82b, -0x6556b954, -0x57f93b0d, 0x1ca284a5, -0x3932b879, 0x3ed3265f, -0x32e02de9, 0x6b43fd01 } + }, + { + { 0x3e760ef3, -0x4a38bda8, -0x11f54670, 0x75dc52b9, 0x072b923f, -0x40ebd83e, 0x6ff0d9f0, 0x73420b2d }, + { 0x4697c544, -0x3858a2b5, -0x20f00041, 0x15fdf848, -0x55b987a6, 0x2868b9eb, 0x5b52f714, 0x5a68d710 }, + { -0x617ae1fa, -0x50d30935, -0x39ddc73c, -0x70a6c6ed, -0x66040c8d, -0x2575476a, -0x15cb4362, 0x3db5632f } + }, + { + { -0x7d67da2b, 0x2e4990b1, 0x3e9a8991, -0x12151479, 0x4c704af8, -0x110fc2c7, -0x6a20d4f2, 0x59197ea4 }, + { -0x08a22628, -0x0b9111d5, 0x396759a5, 0x0d17b1f6, 0x499e7273, 0x1bf2d131, 0x49d75f13, 0x04321adf }, + { -0x1b1aa552, 0x04e16019, 0x7e2f92e9, -0x1884bc86, 
0x6f159aa4, -0x3831d23f, -0x0b28f340, 0x45eafdc1 } + }, + { + { -0x30334e13, -0x49f1b9dc, -0x42a3fc6b, 0x59dbc292, -0x23fb7e37, 0x31a09d1d, 0x5d56d940, 0x3f73ceea }, + { -0x7fba28d5, 0x69840185, -0x30d0f9af, 0x4c22faa2, 0x6b222dc6, -0x6be5c99b, 0x0362dade, 0x5a5eebc8 }, + { 0x0a4e8dc6, -0x4858402f, 0x44c9b339, -0x41a8ff82, 0x1557aefa, 0x60c1207f, 0x266218db, 0x26058891 } + }, + { + { -0x39891abe, 0x4c818e3c, 0x03ceccad, 0x5e422c93, -0x4bed60f8, -0x13f83336, -0x4dbbbc48, 0x0dedfa10 }, + { -0x7c9f00fc, 0x59f704a6, 0x7661e6f4, -0x3c26c022, 0x12873551, -0x7ce4d58d, 0x4e615d57, 0x54ad0c2e }, + { -0x47d4add6, -0x11c4982b, -0x605a3e15, 0x36f16346, 0x6ec19fd3, -0x5a4b2d0e, -0x58856bf8, 0x62ecb2ba } + }, + { + { -0x5049d78c, -0x6df8d7ca, 0x79e104a5, 0x5fcd5e85, -0x39cf5eb6, 0x5aad01ad, 0x75663f98, 0x61913d50 }, + { 0x61152b3d, -0x1a1286ae, 0x0eddd7d1, 0x4962357d, -0x4694b38f, 0x7482c8d0, -0x56992742, 0x2e59f919 }, + { 0x1a3231da, 0x0dc62d36, -0x6bdffd90, -0x05b8a7ce, 0x3f9594ce, 0x02d80151, 0x31c05d5c, 0x3ddbc2a1 } + }, +}, +{ + { + { 0x004a35d1, -0x048ca53e, 0x3a6607c3, 0x31de0f43, -0x3ad72a67, 0x7b8591bf, -0x0a44faf4, 0x55be9a25 }, + { 0x4ffb81ef, 0x3f50a50a, 0x3bf420bf, -0x4e1fcaf7, -0x3955d330, -0x645571e4, -0x05dc85c0, 0x32239861 }, + { 0x33db3dbf, 0x0d005acd, -0x7f53ca1e, 0x0111b37c, 0x6f88ebeb, 0x4892d66c, 0x6508fbcd, 0x770eadb1 } + }, + { + { -0x5faf8e47, -0x0e2c497f, 0x3592ff3a, 0x2207659a, 0x7881e40e, 0x5f016929, -0x7945c8b2, 0x16bedd0e }, + { 0x5e4e89dd, -0x7bae0620, -0x4386c6c9, -0x3f9cfd01, 0x56a6495c, 0x5d227495, -0x5fa9fc05, 0x09a6755c }, + { 0x2c2737b5, 0x5ecccc4f, 0x2dccb703, 0x43b79e0c, 0x4ec43df3, 0x33e008bc, -0x0f8a9940, 0x06c1b840 } + }, + { + { -0x64fd7fa4, 0x69ee9e7f, 0x547d1640, -0x34007d76, -0x4dbcf698, 0x3d93a869, 0x3fe26972, 0x46b7b8cd }, + { -0x5c770789, 0x7688a5c6, -0x214d4954, 0x02a96c14, 0x1b8c2af8, 0x64c9f343, 0x54a1eed6, 0x36284355 }, + { -0x01811420, -0x167edf7a, 0x2f515437, 0x4cba6be7, 0x516efae9, 0x1d04168b, 0x43982cb9, 0x5ea13910 } + }, + { + { -0x2a2c4ffe, 0x6f2b3be4, 0x6a09c880, -0x5013cc27, -0x57433b34, 0x035f73a4, 0x4662198b, 0x22c5b928 }, + { -0x0b8fd11f, 0x49125c9c, -0x74da4cd3, 0x4520b71f, 0x501fef7e, 0x33193026, -0x372d14d5, 0x656d8997 }, + { 0x433d8939, -0x34a73702, 0x6a8d7e50, -0x765f34d2, 0x09fbbe5a, 0x79ca9553, -0x32803efa, 0x0c626616 } + }, + { + { -0x040bab4f, -0x70203c87, -0x0e5b488f, 0x45a5a970, -0x452ca6eb, -0x536de109, -0x57e3de6e, 0x42d088dc }, + { 0x4879b61f, 0x1ffeb80a, 0x4ada21ed, 0x6396726e, 0x368025ba, 0x33c7b093, -0x0c3ce878, 0x471aa0c6 }, + { -0x5fe9ae67, -0x7025f0c9, -0x375f1cbd, 0x0adadb77, -0x378a17e0, 0x20fbfdfc, 0x0c2206e7, 0x1cf2bea8 } + }, + { + { 0x02c0412f, -0x67d291e6, -0x24a71702, -0x6f05b37d, -0x234e7440, 0x01c2f5bc, 0x216abc66, 0x686e0c90 }, + { -0x4c9dfd54, -0x3d220e22, -0x2d1d855b, -0x6d5a01f7, -0x03f60e2d, 0x7d1648f6, 0x13bc4959, 0x74c2cc05 }, + { -0x5abc6a59, 0x1fadbadb, -0x51f25996, -0x4be5fd60, -0x445c83f9, -0x40e60a68, -0x21b7bcf3, 0x6a12b8ac } + }, + { + { 0x1aaeeb5f, 0x793bdd80, -0x3eae778f, 0x00a2a0aa, 0x1f2136b4, -0x175c8c5d, -0x036e10e7, 0x48aab888 }, + { 0x39d495d9, -0x072515e1, 0x525f1dfc, 0x592c190e, -0x3666e2e5, -0x247342fc, -0x2770f349, 0x11f7fda3 }, + { 0x5830f40e, 0x041f7e92, 0x79661c06, 0x002d6ca9, 0x2b046a2e, -0x79236007, -0x74fb6c2f, 0x76036092 } + }, + { + { 0x695a0b05, -0x4bcef71b, -0x52c85c75, 0x6cb00ee8, -0x5cac8c7f, 0x5edad6ee, -0x4923cddc, 0x3f2602d4 }, + { 0x120cf9c6, 0x21bb41c6, -0x21325a65, -0x154d55ee, 0x0aa48b34, -0x3e58d2fe, -0x1782c498, 0x215d4d27 }, + { 0x5bcaf19c, -0x374db84a, 
-0x4e4d39ae, 0x49779dc3, -0x2a131d1e, -0x765e7f45, -0x31371fc7, 0x13f098a3 } + }, +}, +{ + { + { 0x2796bb14, -0x0c55a85e, -0x64f825df, -0x77c54549, 0x31a0391c, -0x1ab41de8, -0x27cdfa07, 0x5ee7fb38 }, + { -0x31a13ab5, -0x6523f007, -0x73d0ecf3, 0x039c2a6b, -0x0f076aeb, 0x028007c7, -0x53fb4c95, 0x78968314 }, + { 0x41446a8e, 0x538dfdcb, 0x434937f9, -0x5a530257, 0x263c8c78, 0x46af908d, -0x6435f2f7, 0x61d0633c } + }, + { + { -0x07038c21, -0x525cd744, -0x590fc804, -0x117b96a3, 0x38c2a909, 0x637fb4db, -0x07f98424, 0x5b23ac2d }, + { -0x0024da9a, 0x63744935, 0x780b68bb, -0x3a429477, 0x553eec03, 0x6f1b3280, 0x47aed7f5, 0x6e965fd8 }, + { -0x117fad85, -0x652d46ad, -0x05219273, -0x1770e656, 0x150e82cf, 0x0e711704, -0x226a2124, 0x79b9bbb9 } + }, + { + { -0x71608c8c, -0x2e668252, -0x3044f7ea, -0x5fcd5d08, 0x6d445f0a, -0x329345ee, 0x0accb834, 0x1ba81146 }, + { 0x6a3126c2, -0x144caac0, 0x68c8c393, -0x2d9c7c58, -0x1a46857e, 0x6c0c6429, -0x3602deb9, 0x5065f158 }, + { 0x0c429954, 0x708169fb, -0x28913099, -0x1eb9ff54, 0x70e645ba, 0x2eaab98a, 0x58a4faf2, 0x3981f39e } + }, + { + { 0x6de66fde, -0x37ba205b, 0x2c40483a, -0x1ead5b00, -0x384b09ce, -0x162d1e9d, -0x2343e49b, 0x30f4452e }, + { 0x59230a93, 0x18fb8a75, 0x60e6f45d, 0x1d168f69, 0x14a93cb5, 0x3a85a945, 0x05acd0fd, 0x38dc0837 }, + { -0x3a8a68c0, -0x7a92d87e, -0x06634134, -0x05ecba97, -0x3f15b18f, -0x77bb038d, 0x593f2469, 0x632d9a1a } + }, + { + { -0x12f37b59, -0x40f602ef, 0x0d9f693a, 0x63f07181, 0x57cf8779, 0x21908c2d, -0x7509b45e, 0x3a5a7df2 }, + { -0x47f8345a, -0x094494eb, -0x43ab0f29, 0x1823c7df, 0x6e29670b, -0x44e268fd, 0x47ed4a57, 0x0b24f488 }, + { 0x511beac7, -0x23252b42, -0x12d9330e, -0x5bac7f8b, 0x005f9a65, -0x1e630061, 0x75481f63, 0x34fcf744 } + }, + { + { 0x78cfaa98, -0x5a44e255, 0x190b72f2, 0x5ceda267, 0x0a92608e, -0x6cf636ef, 0x2fb374b0, 0x0119a304 }, + { 0x789767ca, -0x3e681fb4, 0x38d9467d, -0x478eb235, -0x7c06a058, 0x55de8882, 0x4dfa63f7, 0x3d3bdc16 }, + { -0x173de883, 0x67a2d89c, 0x6895d0c1, 0x669da5f6, -0x4d7d5d50, -0x0a9a671b, -0x121df58d, 0x56c088f1 } + }, + { + { 0x24f38f02, 0x581b5fac, -0x451cf343, -0x56f41602, -0x75306d10, -0x65de96fe, -0x7ca6fc71, 0x038b7ea4 }, + { 0x10a86e17, 0x336d3d11, 0x0b75b2fa, -0x280c77ce, 0x25072988, -0x06eacc8a, -0x66ef7479, 0x09674c6b }, + { -0x66ce9008, -0x60b107df, -0x155872b1, 0x2f49d282, 0x5aef3174, 0x0971a5ab, 0x5969eb65, 0x6e5e3102 } + }, + { + { 0x63066222, 0x3304fb0e, -0x785345c1, -0x04caf977, -0x73ef9e5d, -0x42e6db89, -0x2e7c79e0, 0x3058ad43 }, + { -0x781a6c05, -0x4e939d0b, -0x35a2c18f, 0x4999edde, 0x14cc3e6d, -0x4b6e3e20, -0x76572458, 0x08f51147 }, + { -0x1a899c30, 0x323c0ffd, -0x5dd159f0, 0x05c3df38, -0x5366b066, -0x42387543, -0x101c2367, 0x26549fa4 } + }, +}, +{ + { + { -0x08ac6947, 0x04dbbc17, -0x2d0798ba, 0x69e6a2d7, -0x0ac1543a, -0x39bf6267, 0x332e25d2, 0x606175f6 }, + { -0x78317077, 0x738b38d7, 0x4179a88d, -0x49d9a71e, -0x0eaece93, 0x30738c9c, 0x727275c9, 0x49128c7f }, + { -0x0abf1823, 0x4021370e, -0x5e0e2f5b, 0x0910d6f5, 0x5b06b807, 0x4634aacd, 0x6944f235, 0x6a39e635 } + }, + { + { 0x74049e9d, 0x1da19657, -0x6701cad5, -0x0432915f, -0x33adc95a, -0x4e3432b0, 0x3f9846e2, 0x1f5ec83d }, + { -0x206f0c19, -0x6932a9c0, -0x2405da16, 0x6c3a760e, 0x59e33cc4, 0x24f3ef09, 0x530d2e58, 0x42889e7e }, + { 0x328ccb75, -0x7104dc3d, -0x22789117, -0x50bd5df9, 0x5dfae796, 0x20fbdadc, 0x06bf9f51, 0x241e246b } + }, + { + { 0x6280bbb8, 0x7eaafc9a, -0x0bfc27f7, 0x22a70f12, 0x1bfc8d20, 0x31ce40bb, -0x1742ac12, 0x2bc65635 }, + { -0x5291670a, 0x29e68e57, 0x0b462065, 0x4c9260c8, -0x5ae144b5, 0x3f00862e, -0x4c726f69, 
0x5bc2c77f }, + { -0x5694526d, -0x172a2361, -0x21e6b824, -0x1a704e83, 0x65185fa3, 0x681532ea, 0x034a7830, 0x1fdd6c3b } + }, + { + { 0x2dd8f7a9, -0x63ec595b, 0x3efdcabf, 0x2dbb1f8c, 0x5e08f7b5, -0x69e1cdc0, -0x4419361b, 0x48c8a121 }, + { 0x55dc18fe, 0x0a64e28c, 0x3399ebdd, -0x1c206167, 0x70e2e652, 0x79ac4323, 0x3ae4cc0e, 0x35ff7fc3 }, + { 0x59646445, -0x03bea584, -0x3ed749eb, -0x2ddb4d29, 0x05fbb912, 0x6035c9c9, 0x74429fab, 0x42d7a912 } + }, + { + { -0x6cc25a44, -0x565b76b9, -0x3d168614, 0x4a58920e, 0x13e5ac4c, -0x69278000, 0x4b48b147, 0x453692d7 }, + { -0x1508d12d, 0x4e6213e3, 0x43acd4e7, 0x6794981a, 0x6eb508cb, -0x00ab8322, 0x10fcb532, 0x6fed19dd }, + { -0x57aa6391, -0x2288a267, -0x20ffc1dc, -0x0bd5dec0, -0x256d759a, 0x5223e229, 0x6d38f22c, 0x063f46ba } + }, + { + { 0x37346921, 0x39843cb7, 0x38c89447, -0x58b804f9, -0x5dbacf82, -0x34727fcf, 0x6d82f068, 0x67810f8e }, + { 0x5f536694, -0x2d2dbd77, 0x42939b2c, -0x35cc5d3b, -0x382246a4, -0x6790525a, 0x2f712d5d, 0x5a152c04 }, + { -0x2dd7824c, 0x3eeb8fbc, 0x01a03e93, 0x72c7d3a3, -0x4267d9a6, 0x5473e88c, 0x5921b403, 0x7324aa51 } + }, + { + { -0x17dcab35, -0x52dc0926, -0x49a8e593, 0x6962502a, -0x1c71c82f, -0x649ae9ca, -0x2e5cced1, 0x5cac5005 }, + { 0x6c3cbe8e, -0x7a86bd0c, 0x4730c046, -0x5e2c9b4f, -0x2dc3be41, 0x1c8ed914, -0x11092a2e, 0x0838e161 }, + { -0x161c66fc, -0x733eab34, -0x7b2197ba, 0x5b3a040b, -0x4e41a292, -0x3b2759e4, -0x2779e0fe, 0x40fb897b } + }, + { + { 0x5ab10761, -0x1a8127b9, 0x6fd13746, 0x71435e20, -0x32fda9ce, 0x342f824e, -0x5786e185, 0x4b16281e }, + { 0x62de37a1, -0x7b3a5570, 0x0d1d96e1, 0x421da500, 0x6a9242d9, 0x78828630, 0x690d10da, 0x3c5e464a }, + { 0x0b813381, -0x2e3efe2b, 0x76ee6828, -0x2119f0ef, 0x383f6409, 0x0cb68893, -0x0900b7b6, 0x6183c565 } + }, +}, +{ + { + { -0x50c09992, -0x24b97ab7, -0x0eb5f15b, -0x288030fc, -0x5b45f3b9, 0x3df23ff7, 0x32ce3c85, 0x3a10dfe1 }, + { 0x1e6bf9d6, 0x741d5a46, 0x7777a581, 0x2305b3fc, 0x6474d3d9, -0x2baa8b5e, 0x6401e0ff, 0x1926e1dc }, + { -0x15e83160, -0x1f80b176, 0x3a1fc1fd, 0x2fd51546, 0x31f2c0f1, 0x175322fd, -0x79e1a2eb, 0x1fa1d01d } + }, + { + { -0x2e206b55, 0x38dcac00, -0x2ef7f217, 0x2e712bdd, -0x022a1d9e, 0x7f13e93e, -0x1165fe1b, 0x73fced18 }, + { 0x7d599832, -0x337faa6c, 0x37f15520, 0x1e4656da, 0x4e059320, -0x6609088c, 0x6a75cf33, 0x773563bc }, + { 0x63139cb3, 0x06b1e908, -0x3a5fc133, -0x5b6c2599, -0x529c76ce, -0x72883138, 0x1b864f44, 0x1f426b70 } + }, + { + { -0x6e5edaae, -0x0e81ca38, 0x575e9c76, -0x48947ead, 0x0d9b723e, -0x057cbf91, 0x3fa7e438, 0x0b76bb1b }, + { 0x41911c01, -0x1036d9b4, 0x17a22c25, -0x0e5c4848, -0x0cf0ebb9, 0x5875da6b, 0x1d31b090, 0x4e1af527 }, + { 0x7f92939b, 0x08b8c1f9, -0x2bbb5492, -0x41988e35, -0x66447fe9, 0x22e56463, -0x488d56ab, 0x7b6dd61e } + }, + { + { -0x54fe2d39, 0x5730abf9, 0x40143b18, 0x16fb76dc, -0x5f344d7f, -0x7993419b, -0x64009502, 0x53fa9b65 }, + { 0x50f33d92, -0x48523e18, 0x608cd5cf, 0x7998fa4f, -0x7203a425, -0x5269d243, -0x50e2d0b1, 0x703e9bce }, + { -0x6b77abab, 0x6c14c8e9, 0x65aed4e5, -0x7bc5a29a, -0x4329a50f, 0x181bb73e, -0x3b39e0b0, 0x398d93e5 } + }, + { + { -0x2d181c0e, -0x3c7883a0, 0x30828bb1, 0x3b34aaa0, 0x739ef138, 0x283e26e7, 0x02c30577, 0x699c9c90 }, + { 0x33e248f3, 0x1c4bd167, 0x15bf0a5f, -0x4261ed79, -0x5ef4fc8a, -0x2bc07310, -0x20e6e4ed, 0x53b09b5d }, + { 0x5946f1cc, -0x0cf958dd, -0x331a2683, -0x6de8e74b, -0x7e4b168b, 0x28cdd247, 0x6fcdd907, 0x51caf30c } + }, + { + { 0x18ac54c7, 0x737af99a, -0x3ae34cf1, -0x6fcc8724, 0x4ce10cc7, 0x2b89bc33, -0x76071666, 0x12ae29c1 }, + { 0x7674e00a, -0x59f458be, -0x5e85840d, 0x630e8570, -0x30ccdb34, 
0x3758563d, 0x2383fdaa, 0x5504aa29 }, + { 0x1f0d01cf, -0x56613f35, 0x3a34f7ae, 0x0dd1efcc, -0x2f63b1de, 0x55ca7521, 0x58eba5ea, 0x5fd14fe9 } + }, + { + { -0x406c3472, 0x3c42fe5e, 0x36d4565f, -0x412057af, -0x77bddf18, -0x1f0f7a62, 0x0725d128, 0x7dd73f96 }, + { 0x2845ab2c, -0x4a23d221, 0x0a7fe993, 0x069491b1, 0x4002e346, 0x4daaf3d6, 0x586474d1, 0x093ff26e }, + { 0x68059829, -0x4ef2db02, -0x2450dc1b, 0x75730672, -0x4ba853d7, 0x1367253a, -0x794b8f5c, 0x2f59bcbc } + }, + { + { -0x496e3cff, 0x7041d560, -0x522818e2, -0x7adfe4c1, 0x11335585, 0x16c2e163, 0x010828b1, 0x2aa55e3d }, + { -0x66e8eca1, -0x7c7b82be, 0x567d03d7, -0x52e46ee1, -0x4188552f, 0x7e7748d9, 0x2e51af4a, 0x5458b42e }, + { 0x0c07444f, -0x12ae6d1a, 0x74421d10, 0x42c54e2d, -0x024a379c, 0x352b4c82, -0x7589799c, 0x13e9004a } + }, +}, +{ + { + { -0x7f94b984, 0x1e6284c5, -0x18a29f85, -0x3a096685, -0x4c872d9e, -0x749826a8, -0x7e327490, 0x3d88d66a }, + { 0x6c032bff, -0x344a4aab, 0x29297a3a, -0x208e6e49, -0x52127e45, -0x3e008cda, 0x68be03f5, 0x71ade8bb }, + { 0x204ed789, -0x7489856d, -0x605f51d6, 0x762fcacb, 0x6dce4887, 0x771febcc, -0x700fa04d, 0x34306215 } + }, + { + { 0x2a7b31b4, -0x031de6f9, -0x55a87fea, 0x4d7adc75, -0x78b86cdc, 0x0ec276a6, 0x1fda4beb, 0x6d6d9d5d }, + { -0x1e0a40b7, -0x1fa25e59, -0x2b8c9f6e, 0x26457d6d, 0x73cc32f6, 0x77dcb077, -0x6322a033, 0x0a5d9496 }, + { -0x164f7e7d, 0x22b1a58a, -0x3ea3c775, -0x026a2f8f, -0x7af5fae9, -0x567edc8a, -0x4480cca2, 0x33384cba } + }, + { + { 0x26218b8d, 0x33bc627a, -0x3857f39f, -0x157f4de1, 0x173e9ee6, -0x6ba74ed5, 0x0e2f3059, 0x076247be }, + { 0x0ca2c7b5, 0x3c6fa268, 0x6fb64fda, 0x1b508204, 0x5431d6de, -0x14accb64, 0x6b879c89, 0x5278b38f }, + { 0x1416375a, 0x52e105f6, -0x7a54145c, -0x136850ca, 0x23a67c36, 0x26e6b506, -0x0c2b04ff, 0x5cf0e856 } + }, + { + { 0x3db342a8, -0x415131cf, -0x7bd24812, -0x345c9ca5, -0x7e80ec11, -0x177399e0, 0x4e76d5c6, 0x1b9438aa }, + { 0x1ae8cab4, -0x0936978d, -0x34b06d3b, 0x5e20741e, -0x733243c2, 0x2da53be5, 0x69970df7, 0x2dddfea2 }, + { 0x166f031a, -0x75af8882, 0x0fb7a328, 0x067b39f1, 0x010fbd76, 0x1925c9a6, -0x338bf6fb, 0x6df9b575 } + }, + { + { 0x48cade41, -0x13203ca5, -0x4dcd7d90, 0x6a88471f, 0x40a01b6a, 0x740a4a24, 0x003b5f29, 0x471e5796 }, + { 0x27f6bdcf, 0x42c11929, 0x403d61ca, -0x706e6e86, -0x7461e09f, -0x23e3a59a, 0x04ec0f8d, 0x15960478 }, + { -0x5312c854, -0x2569444d, -0x16df7316, 0x7a2423b5, 0x38aebae2, 0x24cc5c30, -0x23a251d1, 0x50c356af } + }, + { + { 0x1b31b964, -0x30126321, -0x735ae50d, -0x0b79567b, -0x1573e07c, 0x14897265, -0x6cd53400, 0x784a53dd }, + { 0x41c30318, 0x09dcbf43, -0x7ce7e232, -0x1145f9ef, -0x23e1d65f, -0x3e863f32, 0x073f35b0, 0x1dbf7b89 }, + { 0x14fc4920, 0x2d99f9df, -0x3bb6601b, 0x76ccb60c, -0x1a30fffd, -0x5becd345, 0x54f000ea, 0x3f93d823 } + }, + { + { 0x79e14978, -0x1553ed2f, -0x441400a2, -0x006dc00d, 0x0663ce27, 0x4af663e4, 0x11a5f5ff, 0x0fd381a8 }, + { -0x61fb317b, -0x7e7c1898, 0x04465341, 0x678fb71e, 0x6688edac, -0x526dfa71, 0x532b099a, 0x5da350d3 }, + { -0x5bc920ac, -0x0da95314, -0x51962918, 0x108b6168, 0x6b5d036c, 0x20d986cb, -0x011d50b0, 0x655957b9 } + }, + { + { -0x2ffd2f54, -0x423ebf65, -0x4a33265a, 0x66660245, -0x05217a14, -0x7dce823c, 0x6ad7df0d, 0x02fe934b }, + { -0x56fdfcf1, -0x51574f81, -0x0b9c2ebd, -0x07738996, 0x3c787a60, 0x15b08366, -0x7d985b58, 0x08eab114 }, + { -0x3048158c, -0x10a30f00, -0x5e34bd54, 0x22897633, -0x310d7a1e, -0x2b31f3ac, -0x75eb95ab, 0x30408c04 } + }, +}, +{ + { + { 0x193b877f, -0x44d1ff37, -0x1f23af95, -0x131c5770, 0x36de649f, -0x130c4840, -0x672161e6, 0x5f460408 }, + { -0x7cd03125, 0x739d8845, 
-0x5194079d, -0x05c72937, -0x48b00109, 0x32bc0dca, 0x14bce45e, 0x73937e88 }, + { 0x297bf48d, -0x46fc8eea, -0x2b0f97cc, -0x562ec4de, 0x4696bdc6, -0x1e68eaa9, -0x6e2a17cb, 0x2cf8a4e8 } + }, + { + { 0x17d06ba2, 0x2cb5487e, 0x3950196b, 0x24d2381c, -0x7a6875d0, -0x289a637f, -0x6e295b0a, 0x7a6f7f28 }, + { 0x07110f67, 0x6d93fd87, 0x7c38b549, -0x22b3f62d, -0x3d8c957a, 0x7cb16a4c, 0x58252a09, 0x2049bd6e }, + { 0x6a9aef49, 0x7d09fd8d, 0x5b3db90b, -0x0f119f42, 0x519ebfd4, 0x4c21b52c, -0x3aba6be3, 0x6011aadf } + }, + { + { 0x02cbf890, 0x63ded0c8, 0x0dff6aaa, -0x042f6736, -0x46491267, 0x624d0afd, 0x79340b1e, 0x69ce18b7 }, + { -0x306a07c4, 0x5f67926d, 0x71289071, 0x7c7e8561, -0x667085a5, -0x295e180d, 0x0b62f9e0, 0x6fc5cc1b }, + { -0x4d678635, -0x2e10aad8, -0x2b816f6e, -0x22e551c4, 0x189f2352, 0x127e0442, -0x1a8efe0f, 0x15596b3a } + }, + { + { 0x7e5124ca, 0x09ff3116, -0x2638ba21, 0x0be4158b, 0x7ef556e5, 0x292b7d22, -0x50492ec8, 0x3aa4e241 }, + { 0x3f9179a2, 0x462739d2, -0x68292231, -0x007cedcf, 0x53f2148a, 0x1307deb5, 0x7b5f4dda, 0x0d223768 }, + { 0x2a3305f5, 0x2cc138bf, -0x5d16d93d, 0x48583f8f, 0x5549d2eb, 0x083ab1a2, 0x4687a36c, 0x32fcaa6e } + }, + { + { 0x2787ccdf, 0x3207a473, -0x0dec1c08, 0x17e31908, -0x09f269b2, -0x2a4d1329, -0x3d9ff417, 0x746f6336 }, + { -0x3a82650b, 0x7bc56e8d, -0x620f420e, 0x3e0bd2ed, 0x22efe4a3, -0x553feb22, -0x014295a4, 0x4627e9ce }, + { -0x549368e4, 0x3f4af345, -0x66bc8ce1, -0x1d77148e, 0x0344186d, 0x33596a8a, 0x7ed66293, 0x7b491700 } + }, + { + { -0x22ac5d23, 0x54341b28, -0x20bd03c1, -0x55e86fa5, 0x4dd2f8f4, 0x0ff592d9, -0x1f732c83, 0x1d03620f }, + { -0x547b4f9c, 0x2d85fb5c, -0x760c43ec, 0x497810d2, 0x7b15ce0c, 0x476adc44, -0x07bb0285, 0x122ba376 }, + { -0x5d4b1aac, -0x3dfdcd33, 0x115d187f, -0x612f02be, 0x7dd479d9, 0x2eabb4be, 0x2b68ec4c, 0x02c70bf5 } + }, + { + { 0x458d72e1, -0x531acd41, 0x7cb73cb5, 0x5be768e0, -0x11744219, 0x56cf7d94, -0x014bc5fd, 0x6b0697e3 }, + { 0x5d0b2fbb, -0x5d7813b5, 0x074882ca, 0x415c5790, -0x3e2f7ea4, -0x1fbb59e2, 0x409ef5e0, 0x26334f0a }, + { -0x209d5c40, -0x49370fb6, 0x076da45d, 0x3ef000ef, 0x49f0d2a9, -0x636346a8, 0x441b2fae, 0x1cc37f43 } + }, + { + { -0x36315147, -0x2899a90f, 0x18e5656a, 0x1c5b15f8, -0x7bb3dccc, 0x26e72832, 0x2f196838, 0x3a346f77 }, + { 0x5cc7324f, 0x508f565a, -0x1af956de, -0x2f9e3b40, 0x5c45ac19, -0x04e75425, 0x0380314a, 0x6c6809c1 }, + { -0x1d259538, -0x2d2aaeee, -0x4e17ae13, -0x1642fccf, -0x71398d9e, -0x69f8b923, 0x6ef7c5d0, 0x05911b9f } + }, +}, +{ + { + { -0x3a01606c, 0x01c18980, 0x716fd5c8, -0x329a9897, -0x2e6a5f7a, -0x7e9fba3d, 0x66cc7982, 0x6e2b7f32 }, + { -0x49c800d3, -0x162328aa, -0x36780f3c, -0x13b3cb71, -0x0c043849, -0x312a6d7b, -0x6c1e1579, 0x33053547 }, + { -0x083ca971, -0x337fdb98, 0x19974cb3, -0x6216457e, -0x4a47eca0, -0x5448dd64, 0x6fbeba62, 0x44e2017a } + }, + { + { -0x49359133, -0x7807d30d, 0x18f4a0c2, 0x580f893e, 0x2604e557, 0x05893007, 0x56d19c1d, 0x6cab6ac2 }, + { 0x54dab774, -0x3b3d58bd, 0x4eaf031a, -0x71a2b3c4, 0x42838f17, -0x4893dc2e, 0x68dce4ea, 0x749a098f }, + { 0x2cc1de60, -0x23201f60, 0x51c5575b, 0x032665ff, 0x073abeeb, 0x2c0c32f1, -0x328479fa, 0x6a882014 } + }, + { + { -0x50b01492, -0x2eee2e84, -0x4cc55b5d, 0x050bba42, -0x114b93d0, 0x17514c3c, 0x1bc27d75, 0x54bedb8b }, + { -0x5b8b804b, -0x5ad56d02, 0x1fa5ab89, -0x23ed5bb7, -0x47b85b32, -0x27d256b5, -0x6aed33b2, 0x4d77edce }, + { 0x77e2189c, 0x77c8e145, -0x00663bbb, -0x5c1b9096, 0x6d335343, 0x3144dfc8, 0x7c4216a9, 0x3a96559e } + }, + { + { -0x7f4555ae, 0x44938968, -0x0d7a6bf2, 0x4c98afc4, -0x5babb74a, -0x10b55865, -0x5a855181, 0x5278c510 }, + { 
-0x0bd52d12, 0x12550d37, -0x675e040b, -0x74871ffc, 0x33894cb2, 0x5d530782, 0x3e498d0c, 0x02c84e4e }, + { 0x294c0b94, -0x5ab22f8c, -0x20e7004a, -0x0aa2b948, -0x72517c9a, -0x0f90133b, -0x7e6f2e9b, 0x58865766 } + }, + { + { 0x3de25cc3, -0x40a7cb10, -0x297eab6a, -0x47783752, -0x6b7e176e, 0x5105221a, -0x088dc06d, 0x6760ed19 }, + { 0x1aef7117, -0x2b88edcf, 0x229e92c7, 0x50343101, -0x62ea6469, 0x7a95e184, -0x74a2d637, 0x2449959b }, + { -0x53ca1ea0, 0x669ba3b7, -0x457bdfaa, 0x2eccf73f, -0x3f7fb0f9, 0x1aec1f17, 0x1856f4e7, 0x0d96bc03 } + }, + { + { -0x338afa1f, -0x4e2acb50, 0x16c35288, 0x32cd0034, 0x0762c29d, -0x34c95a80, 0x237a0bf8, 0x5bfe69b9 }, + { 0x75c52d82, 0x3318be77, 0x54d0aab9, 0x4cb764b5, -0x3388c26f, -0x5430c2d9, -0x7edcd776, 0x3bf4d184 }, + { 0x78a151ab, 0x183eab7e, -0x66f6c89d, -0x44166f37, 0x4ac7e335, -0x008e8292, 0x25f39f88, 0x4c5cddb3 } + }, + { + { -0x185606fe, 0x57750967, 0x4f5b467e, 0x2c37fdfc, 0x3177ba46, -0x4d9e99c6, -0x23d2acd5, 0x3a375e78 }, + { 0x6190a6eb, -0x3f0948b3, 0x2db8f4e4, 0x20ea81a4, -0x68cea8a0, -0x57429083, 0x62ac7c21, 0x33b1d602 }, + { 0x2d4dddea, -0x7ebe18d1, 0x62c607c8, -0x19150168, 0x573cafd0, 0x23c28458, 0x4ff97346, 0x46b9476f } + }, + { + { 0x0d58359f, 0x1215505c, -0x03d73b95, 0x2a2013c7, -0x761599b2, 0x24a0a1af, -0x5eecf1e1, 0x4400b638 }, + { 0x4f901e5c, 0x0c1ffea4, 0x2184b782, 0x2b0b6fb7, 0x0114db88, -0x1a78006f, 0x4785a142, 0x37130f36 }, + { -0x6912e63d, 0x3a01b764, -0x12cd8dd0, 0x31e00ab0, -0x7c35ea4f, 0x520a8857, 0x5accbec7, 0x06aab987 } + }, +}, +{ + { + { 0x512eeaef, 0x5349acf3, 0x1cc1cb49, 0x20c141d3, -0x56659773, 0x24180c07, -0x39b4d2e9, 0x555ef9d1 }, + { -0x0a20f145, -0x3ecc667d, 0x512c4cac, -0x3f0c8a71, 0x0bb398e1, 0x2cf1130a, -0x55d8f39e, 0x6b3cecf9 }, + { 0x3b73bd08, 0x36a770ba, -0x5c5040f4, 0x624aef08, -0x4bf6b90e, 0x5737ff98, 0x3381749d, 0x675f4de1 } + }, + { + { 0x3bdab31d, -0x5ed00927, -0x629ad202, 0x0725d80f, -0x65416b79, 0x019c4ff3, -0x7d32c3bd, 0x60f450b8 }, + { 0x6b1782fc, 0x0e2c5203, 0x6cad83b4, 0x64816c81, 0x6964073e, -0x2f234227, 0x0164c520, 0x13d99df7 }, + { 0x21e5c0ca, 0x014b5ec3, -0x28e6405e, 0x4fcb69c9, 0x750023a0, 0x4e5f1c18, 0x55edac80, 0x1c06de9e } + }, + { + { -0x00929656, -0x002ad4c0, -0x23bfb645, 0x34530b18, -0x5cb26769, 0x5e4a5c2f, 0x7d32ba2d, 0x78096f8e }, + { -0x5cc13b1e, -0x66f0852a, -0x41d11f72, 0x6608f938, 0x63284515, -0x635ebc3b, -0x13d249f3, 0x4cf38a1f }, + { 0x0dfa5ce7, -0x5f55559b, 0x48b5478c, -0x063b61d6, 0x7003725b, 0x4f09cc7d, 0x26091abe, 0x373cad3a } + }, + { + { -0x76224453, -0x0e415705, 0x61aeaecb, 0x3bcb2cbc, 0x1f9b8d9d, -0x70a75845, 0x5112a686, 0x21547eda }, + { -0x7d360a84, -0x4d6b9cb3, 0x24934536, 0x1fcbfde1, 0x418cdb5a, -0x6163b24d, 0x454419fc, 0x0040f3d9 }, + { -0x02a6792d, -0x210216c7, 0x510a380c, -0x0bd8d377, -0x44cee647, -0x48d45bf9, 0x4a254df4, 0x63550a33 } + }, + { + { 0x72547b49, -0x6445a7bb, -0x1d3bf720, -0x0cfa3906, -0x38cb0e73, 0x60e8fa69, -0x55828986, 0x39a92baf }, + { -0x4a9630c9, 0x6507d6ed, 0x0ca52ee1, 0x178429b0, -0x149429a3, -0x1583ff70, -0x250870af, 0x3eea62c7 }, + { -0x196cd8b2, -0x62db38ed, 0x68dbd375, 0x5f638577, -0x14754c66, 0x70525560, 0x65c9c4cd, 0x68436a06 } + }, + { + { -0x17dfef84, 0x1e56d317, -0x7bf5169b, -0x3ad997bc, 0x320ffc7a, -0x3e1f5e3a, -0x6e9eeb8e, 0x5373669c }, + { 0x202f3f27, -0x43fdca18, 0x64f975b0, -0x38a3ff1e, -0x5c73dbea, -0x6e5b162b, -0x75487607, 0x17b6e7f6 }, + { -0x65f1ada9, 0x5d2814ab, -0x36354c04, -0x6f70df7c, 0x5b2d1eca, -0x50350a78, 0x78f87d11, 0x1cb4b5a6 } + }, + { + { -0x5d5ff819, 0x6b74aa62, -0x0f8e384f, -0x0cee1f50, 0x000be223, 0x5707e438, -0x7d109154, 
0x2dc0fd2d }, + { 0x394afc6c, -0x499b3f95, -0x6725a04f, 0x0c88de24, 0x4bcad834, 0x4f8d0316, -0x218bcb5e, 0x330bca78 }, + { 0x1119744e, -0x67d1007c, 0x2b074724, -0x0696a16a, -0x4036ac05, -0x3a753eb1, 0x369f1cf5, 0x3c31be1b } + }, + { + { -0x0634bd8e, -0x3e97436d, -0x38312468, -0x51478ee1, 0x34ac8d7a, 0x7f0e52aa, 0x7e7d55bb, 0x41cec109 }, + { 0x08948aee, -0x4f0b79b3, -0x6e45e391, 0x07dc19ee, -0x59535ea8, 0x7975cdae, 0x4262d4bb, 0x330b6113 }, + { -0x5d927f76, -0x0869e629, 0x1d9e156d, -0x44e02b62, -0x245e20d9, 0x73d7c36c, 0x1f28777d, 0x26b44cd9 } + }, +}, +{ + { + { -0x4fd7a0c9, -0x50bb7bd3, 0x47efc8df, -0x78ace770, -0x07df6866, -0x6a8b1f6f, 0x69615579, 0x0e378d60 }, + { 0x393aa6d8, 0x300a9035, -0x5ed44e33, 0x2b501131, -0x0f6c3dde, 0x7b1ff677, -0x3547d453, 0x4309c1f8 }, + { -0x7cf8a5ab, -0x26056e8f, 0x6b009fdc, 0x4bdb5ad2, -0x29c210f2, 0x7829ad2c, 0x75fd3877, 0x078fc549 } + }, + { + { -0x47cc5676, -0x1dffb4a5, 0x2d4c3330, 0x44775dec, 0x7eace913, 0x3aa24406, -0x2a71ff57, 0x272630e3 }, + { 0x28878f2d, -0x782042ec, 0x1e9421a1, 0x134636dd, 0x257341a3, 0x4f17c951, -0x52d69348, 0x5df98d4b }, + { -0x1336f4ac, -0x0c987030, 0x12043599, -0x0ffeba65, 0x3758b89b, 0x26725fbc, 0x73a719ae, 0x4325e4aa } + }, + { + { -0x30960a63, -0x12db9d66, -0x22a5440c, 0x2a4a1cce, 0x56b2d67b, 0x3535ca1f, 0x43b1b42d, 0x5d8c68d0 }, + { 0x433c3493, 0x657dc6ef, -0x7f24073d, 0x65375e9f, 0x5b372dae, 0x47fd2d46, 0x796e7947, 0x4966ab79 }, + { -0x1c4bd4f6, -0x11ccd2b3, 0x16a4601c, -0x27b1a5d5, 0x078ba3e4, 0x78243877, 0x184ee437, 0x77ed1eb4 } + }, + { + { -0x616d12e6, 0x185d43f8, -0x01b8e63a, -0x4fb5e116, -0x590fc0b1, 0x499fbe88, 0x3c859bdd, 0x5d8b0d2f }, + { 0x201839a0, -0x402b1ec1, 0x3e3df161, -0x5110001e, 0x6b5d1fe3, -0x49a4fb10, 0x2b62fbc0, 0x52e085fb }, + { -0x5ab30d46, 0x124079ea, 0x001b26e7, -0x28db9a15, -0x36850803, 0x6843bcfd, 0x55eacd02, 0x0524b42b } + }, + { + { -0x647d6154, -0x43e72353, -0x4a0a8630, 0x23ae7d28, 0x69384233, -0x3cb9edd6, -0x182b5377, 0x1a6110b2 }, + { -0x1babb850, -0x02f2a242, 0x092005ee, 0x6cec351a, 0x567579cb, -0x665b87bc, 0x16e7fa45, 0x59d242a2 }, + { -0x19966854, 0x4f833f6a, 0x361839a4, 0x6849762a, -0x68f54adb, 0x6985dec1, -0x234e0aba, 0x53045e89 } + }, + { + { -0x72ba01ee, -0x7b25c322, -0x1bbb1d2e, -0x42bd3de8, 0x1f7e3598, -0x57ae6988, 0x5616e2b2, 0x7642c93f }, + { -0x28acac25, -0x34744cba, -0x51aee1de, -0x03034db5, -0x2af51911, -0x345b72c0, -0x0b0834a3, 0x26e3bae5 }, + { 0x4595f8e4, 0x2323daa7, -0x7a85414c, -0x21977375, 0x1c59326e, 0x3fc48e96, 0x15c9b8ba, 0x0b2e73ca } + }, + { + { 0x79c03a55, 0x0e3fbfaf, 0x4cbb5acf, 0x3077af05, -0x24c21c61, -0x2a3aadbb, 0x476a4af7, 0x015e68c1 }, + { -0x3e80afda, -0x2944bbd8, -0x04a56359, -0x614d8ddd, 0x1919c644, -0x1c845afd, -0x4a6599fe, 0x21ce380d }, + { 0x20066a38, -0x3e2ad7ae, 0x3570aef3, -0x6a9fc1ae, 0x226b8a4d, -0x7cd9a659, 0x1f8eedc9, 0x5dd68909 } + }, + { + { -0x5acecf7c, 0x1d022591, -0x29d8f78e, -0x35d2b552, 0x2f0bfd20, -0x795ed47b, -0x528258b8, 0x56e6c439 }, + { -0x402c37aa, -0x34537b22, -0x4ca00dbc, 0x1624c348, 0x5d9cad07, -0x48077236, -0x5d3d1418, 0x3b0e574d }, + { 0x42bdbae6, -0x38fb00b7, -0x4d21e087, 0x5e21ade2, 0x5652fad8, -0x16a24c0d, -0x70f7143f, 0x0822b537 } + }, +}, +{ + { + { 0x62730383, -0x1e480d6d, -0x143575d4, 0x4b5279ff, -0x402becec, -0x25038876, -0x638d9ef1, 0x7deb1014 }, + { -0x70c78b8b, 0x51f04847, -0x634134c4, -0x4da2430c, -0x2660dfab, -0x6554edbc, 0x1c10a5d6, 0x2c709e6c }, + { -0x78991186, -0x349d5096, 0x5553cd0e, 0x66cbec04, 0x0f0be4b5, 0x58800138, -0x09d31d16, 0x08e68e9f } + }, + { + { 0x0ab8f2f9, 0x2f2d09d5, -0x3aa6dc21, -0x5346de73, 
0x73766cb9, 0x4a8f3426, 0x38f719f5, 0x4cb13bd7 }, + { 0x4bc130ad, 0x34ad500a, 0x3d0bd49c, -0x72c724b7, 0x500a89be, -0x5da3c268, -0x1145c4f7, 0x2f1f3f87 }, + { -0x1aea49b6, -0x087b738b, -0x24b56fc8, -0x5a6afe46, 0x3f751b50, -0x3df2cec1, -0x3f51d118, 0x19a1e353 } + }, + { + { -0x2a694243, -0x4bde8d33, -0x671103c0, -0x6c1fbabd, -0x4bbef64b, -0x604eacb9, 0x0266ae34, 0x736bd399 }, + { -0x4505fa3d, 0x7d1c7560, -0x391aa19f, -0x4c1e5f60, -0x3f299b8d, -0x1cad68e8, -0x3df3cb7a, 0x41546b11 }, + { -0x6ccb4c4c, -0x7aacd2b0, 0x60816573, 0x46fd114b, 0x425c8375, -0x33a0a0d0, -0x478054a4, 0x412295a2 } + }, + { + { -0x1d6c153a, 0x2e655261, 0x2133acdb, -0x7ba56dfd, 0x7900996b, 0x460975cb, 0x195add80, 0x0760bb8d }, + { -0x0a812917, 0x19c99b88, 0x6df8c825, 0x5393cb26, -0x4cf52d8d, 0x5cee3213, -0x4ad2d1cc, 0x14e153eb }, + { -0x32197e76, 0x413e1a17, -0x12965f7c, 0x57156da9, 0x46caccb1, 0x2cbf268f, -0x3cc53a0e, 0x6b34be9b } + }, + { + { 0x6571f2d3, 0x11fc6965, 0x530e737a, -0x393617bb, -0x2b01afcb, -0x1cc5185e, 0x2e6dd30b, 0x01b9c7b6 }, + { 0x3a78c0b2, -0x0c20d09c, -0x0dd1fd84, 0x4c3e971e, 0x49c1b5a3, -0x1382e3a2, 0x0922dd2d, 0x2012c18f }, + { 0x5ac89d29, -0x77f4aa1b, 0x45a0a763, 0x1483241f, -0x3d1893e1, 0x3d36efdf, 0x4e4bade8, 0x08af5b78 } + }, + { + { -0x7633d3b5, -0x1d8ceb2e, -0x5d78e873, 0x4be4bd11, -0x05cc9b32, 0x18d528d6, -0x50267d92, 0x6423c1d5 }, + { -0x77e0dacd, 0x283499dc, 0x779323b6, -0x62fada26, 0x673441f4, -0x76852205, 0x163a168d, 0x32b79d71 }, + { -0x12034c96, -0x337a0727, 0x3746e5f9, 0x22bcc28f, -0x061a2c33, -0x1b621cc8, -0x3ec1d234, 0x480a5efb } + }, + { + { 0x42ce221f, -0x499eb31c, 0x4c053928, 0x6e199dcc, -0x23e341fd, 0x663fb4a4, 0x691c8e06, 0x24b31d47 }, + { 0x01622071, 0x0b51e70b, -0x74e2503b, 0x06b505cf, -0x10a55433, 0x2c6bb061, 0x0cb7bf31, 0x47aa2760 }, + { -0x3fea073d, 0x2a541eed, 0x7c693f7c, 0x11a4fe7e, 0x4ea278d6, -0x0f5099ed, 0x14dda094, 0x545b585d } + }, + { + { -0x1c4cde1f, 0x6204e4d0, 0x28ff1e95, 0x3baa637a, 0x5b99bd9e, 0x0b0ccffd, 0x64c8d071, 0x4d22dc3e }, + { -0x5f2bc5f1, 0x67bf275e, 0x089beebe, -0x521971cc, -0x2b8618d2, 0x4289134c, 0x32ba5454, 0x0f62f9c3 }, + { -0x29c4a0c7, -0x034b9a77, 0x57cbcf61, 0x5cae6a3f, -0x6ac505fb, -0x01453d2e, 0x36371436, 0x1c0fa01a } + }, +}, +{ + { + { 0x54c53fae, -0x3ee11a18, 0x2b4f3ff4, 0x6a0b06c1, -0x1f49858e, 0x33540f80, -0x32f81c11, 0x15f18fc3 }, + { -0x4383296e, -0x18ab8bb7, -0x1908c221, 0x0f9abeaa, 0x00837e29, 0x4af01ca7, 0x3f1bc183, 0x63ab1b5d }, + { -0x4fd70b74, 0x32750763, 0x556a065f, 0x06020740, -0x3cb6a4a8, -0x2ac427ee, -0x79a0af73, 0x08706c9b } + }, + { + { 0x38b41246, -0x3366e4bf, 0x6f9ac26b, 0x243b9c52, -0x48345443, -0x4610b6b3, -0x2f7d1300, 0x5fba433d }, + { 0x3d343dff, -0x0c835d55, -0x7f5439e9, 0x1a8c6a2d, -0x2b330036, -0x71b61fcb, -0x455e2e47, 0x48b46bee }, + { -0x366be530, -0x63b61cab, 0x74498f84, -0x468cb522, 0x66663e5c, 0x41c3fed0, -0x1718ef4d, 0x0ecfedf8 } + }, + { + { -0x16bfc89e, 0x744f7463, -0x72033637, -0x08657212, 0x55e4cde3, 0x163a6496, -0x4d7b0bcb, 0x3b61788d }, + { -0x632b8f27, 0x76430f9f, -0x5bd09ff8, -0x49d53365, 0x59adad5e, 0x1898297c, -0x4873af80, 0x7789dd2d }, + { 0x0d6ef6b2, -0x4dddd7e7, 0x46ce4bfa, -0x56b5994e, 0x4f0b6cc7, 0x46c1a77a, -0x148cc731, 0x4236ccff } + }, + { + { -0x2588820a, 0x3bd82dbf, 0x0b98369e, 0x71b177cc, -0x7af3c967, 0x1d0e8463, 0x48e2d1f1, 0x5a71945b }, + { 0x0d55e274, -0x7b68bfb3, -0x3b52d4ad, 0x6c6663d9, -0x5256a8cc, -0x13d04f27, -0x324708c4, 0x2617e120 }, + { 0x405b4b42, 0x6f203dd5, 0x10b24509, 0x327ec604, -0x53d577ba, -0x63cb8dd0, 0x11ffeb6a, 0x77de29fc } + }, + { + { -0x13312d36, -0x7ca1ec71, 
-0x1569c466, -0x736150ed, -0x4de9f15a, -0x36a04040, -0x5278876e, 0x575e66f3 }, + { -0x7c488758, -0x4f53a837, -0x28016ed4, 0x53cdcca9, -0x00e0a624, 0x61c2b854, -0x0f218254, 0x3a1a2cf0 }, + { -0x377034c6, -0x667fc5d9, 0x275ec0b0, 0x345a6789, -0x0093d41b, 0x459789d0, 0x1e70a8b2, 0x62f88265 } + }, + { + { 0x698a19e0, 0x6d822986, 0x74d78a71, -0x2367de1f, -0x0934e0b9, 0x41a85f31, -0x432563af, 0x352721c2 }, + { 0x59ff1be4, 0x085ae2c7, 0x3b0e40b7, 0x149145c9, 0x7ff27379, -0x3b981806, -0x2a38c56b, 0x4eeecf0a }, + { 0x213fc985, 0x48329952, 0x368a1746, 0x1087cf0d, 0x66c15aa5, -0x71ad9e4f, 0x2ed24c21, 0x2d5b2d84 } + }, + { + { 0x196ac533, 0x5eb7d13d, -0x247f41d5, 0x377234ec, 0x7cf5ae24, -0x1ebb3004, -0x3bbe5314, 0x5226bcf9 }, + { -0x142c212f, 0x02cfebd9, 0x39021974, -0x2ba4de89, -0x01cf5e49, 0x7576f813, -0x5cb1093e, 0x5691b6f9 }, + { 0x23e5b547, 0x79ee6c72, -0x7ccf2987, 0x6f5f5076, 0x6d8adce9, -0x128c1e17, 0x1d8ccc03, 0x27c3da1e } + }, + { + { 0x630ef9f6, 0x28302e71, 0x2b64cee0, -0x3d2b5dfd, 0x4b6292be, 0x09082030, -0x57d520e8, 0x5fca747a }, + { 0x3fe24c74, 0x7eb9efb2, 0x1651be01, 0x3e50f49f, 0x21858dea, 0x3ea732dc, 0x5bb810f9, 0x17377bd7 }, + { 0x5c258ea5, 0x232a03c3, 0x6bcb0cf1, -0x790dc5d4, 0x2e442166, 0x3dad8d0d, -0x548979d5, 0x04a8933c } + }, +}, +{ + { + { -0x736c95b0, 0x69082b0e, -0x3e253a4a, -0x06365fcb, -0x3b2049cc, 0x6fb73e54, 0x1d2bc140, 0x4005419b }, + { 0x22943dff, -0x2d39fb4a, 0x44cfb3a0, -0x43734132, -0x687f7988, 0x5d254ff3, 0x3b1ca6bf, 0x0fa3614f }, + { -0x46417d10, -0x5ffc0143, 0x3a44ac90, 0x2089c1af, 0x1954fa8e, -0x07b6606f, -0x10bf54be, 0x1fba218a } + }, + { + { 0x3e7b0194, 0x4f3e5704, 0x08daaf7f, -0x57e2c112, -0x6623210f, -0x37c63955, -0x00889e2b, 0x6c535d13 }, + { -0x05370ac2, -0x54ab6bb8, 0x7ba63741, -0x7e091766, 0x6c2b5e01, 0x74fd6c7d, -0x573791be, 0x392e3aca }, + { 0x3e8a35af, 0x4cbd34e9, 0x5887e816, 0x2e078144, -0x0d654f55, 0x19319c76, -0x2af53ec5, 0x25e17fe4 } + }, + { + { 0x76f121a7, -0x6ea0800b, 0x2fcd87e3, -0x3cb5cdd9, 0x4d1be526, -0x3345d022, -0x76967665, 0x6bba828f }, + { 0x1e04f676, 0x0a289bd7, -0x29bdf06b, 0x208e1c52, 0x34691fab, 0x5186d8b0, 0x2a9fb351, 0x25575144 }, + { -0x6f01c6ff, -0x1d2e439a, -0x5f66852b, 0x4cb54a18, -0x507b9f2c, -0x68e296ec, 0x7f6b7be4, 0x559d504f } + }, + { + { -0x092d9903, -0x63b76e19, 0x0307781b, 0x0744a19b, 0x6061e23b, -0x77c770e3, 0x354bd50e, 0x123ea6a3 }, + { -0x4c14ab2b, -0x588c7c88, -0x5aaac384, 0x1d69d366, -0x06d7ff46, 0x0a26cf62, -0x7f81cde9, 0x01ab12d5 }, + { 0x41e32d96, 0x118d1890, -0x27cea7b8, -0x46121c3e, -0x27cdba27, 0x1eab4271, -0x36e75eac, 0x4a3961e2 } + }, + { + { -0x0cdcc0e2, 0x0327d644, 0x34fcf016, 0x499a260e, -0x0d254687, -0x7c4a58ea, -0x642beee1, 0x68aceead }, + { -0x07194460, 0x71dc3be0, 0x7effe30a, -0x293107cc, -0x1ec5b896, -0x566dbda1, -0x04e2489d, 0x2cd6bce3 }, + { -0x0c283df0, 0x38b4c90e, -0x4852fbf4, 0x308e6e24, -0x4818c1dd, 0x3860d9f1, -0x4af70a69, 0x595760d5 } + }, + { + { -0x02fdd870, -0x77d53415, -0x3beea8a0, -0x7650ccfb, 0x7d3473f4, 0x65f492e3, 0x54515a2b, 0x2cb2c5df }, + { 0x04aa6397, 0x6129bfe1, -0x5b580335, -0x7069fff8, 0x7d909458, 0x3f8bc089, -0x234d6e57, 0x709fa43e }, + { 0x63fd2aca, -0x14f5a274, 0x2e694eff, -0x2dd43e9a, -0x07344fc6, 0x2723f36e, -0x0f37ece1, 0x70f029ec } + }, + { + { 0x5e10b0b9, 0x2a6aafaa, -0x10fbe557, 0x78f0a370, -0x55c529e1, 0x773efb77, -0x58b4261f, 0x44eca5a2 }, + { 0x2eed3e33, 0x461307b3, -0x5baa7e19, -0x51fbd0cd, 0x195f0366, -0x36bbb62d, 0x6c314858, 0x0b7d5d8a }, + { 0x7b95d543, 0x25d44832, -0x5ccbf0e3, 0x70d38300, 0x60e1c52b, -0x21e3ace4, 0x2c7de9e4, 0x27222451 } + }, + { + { 
0x42a975fc, -0x40844476, -0x69525ca8, -0x73a3c689, -0x321255b8, -0x1d803891, -0x0943df5a, 0x19735fd7 }, + { 0x49c5342e, 0x1abc92af, -0x4d190530, -0x001127ef, -0x0337b1d7, -0x105d7373, -0x5bb33abd, 0x11b5df18 }, + { 0x42c84266, -0x1c546f30, 0x7f19547e, -0x147b71f1, 0x65a497b9, 0x2503a1d0, -0x6e2076a1, 0x0fef9111 } + }, +}, +{ + { + { 0x5b1c16b7, 0x6ab5dcb8, 0x3c7b27a5, -0x6b3f0318, 0x735517be, -0x5b4ee3e6, -0x45f15056, 0x499238d0 }, + { -0x54e39147, -0x4eaf835f, 0x16b687b3, -0x42bb70c2, 0x2c7a91ab, 0x3455fb7f, 0x2f2adec1, 0x7579229e }, + { 0x7aba8b57, -0x130b91ae, -0x742e9b85, 0x15a08c47, 0x5f706fef, 0x7af1c6a6, -0x0fc5cf2b, 0x6345fa78 } + }, + { + { -0x42270f5c, -0x6c2c3417, -0x02e88cfe, -0x24ead3e5, 0x7f17a875, 0x7dbddc6d, -0x70bd9102, 0x3e1a71cc }, + { 0x1015e7a1, -0x20fd06a1, -0x564bfd9d, 0x790ec41d, 0x33ea1107, 0x4d3a0ea1, -0x1cc50737, 0x54f70be7 }, + { -0x6f45429e, -0x37c35c1d, 0x0291c833, -0x7f121c99, -0x2c86ff3c, -0x377fc734, 0x1ec31fa1, 0x2c5fc023 } + }, + { + { 0x02456e65, -0x3bdd1b2f, -0x352b846f, -0x78beb53f, -0x5d490023, 0x1592e2bb, -0x0a3deff1, 0x75d9d2bf }, + { 0x17038b4f, -0x01456ee9, -0x3621107f, -0x1aedc8df, 0x5d0d8834, 0x1c97e4e7, 0x23dc3bc6, 0x68afae7a }, + { 0x3626e81c, 0x5bd9b476, -0x435fd123, -0x766996ca, 0x61f077b3, 0x0a41193d, 0x00ce5471, 0x3097a242 } + }, + { + { 0x6695c486, -0x5e9d18dc, 0x35a89607, 0x131d6334, -0x5f2ed5c9, 0x30521561, -0x59504c9d, 0x56704bad }, + { -0x380747b4, 0x57427734, 0x01b270e9, -0x0ebe5ec2, -0x4b1a9b5a, 0x02d1adfe, -0x317c42b8, 0x4bb23d92 }, + { 0x52f912b9, -0x5093b559, -0x27988f38, 0x5e665f6c, -0x5c3732a8, 0x4c35ac83, 0x10a58a7e, 0x2b7a29c0 } + }, + { + { -0x40fff792, 0x33810a23, -0x18c90084, -0x50316da2, -0x1db6dd2c, 0x3d60e670, 0x4f96061b, 0x11ce9e71 }, + { -0x2f3e313d, -0x3bff8089, -0x453b6d08, -0x72efdf4a, 0x7e69daaf, 0x32ec29d5, -0x626a0320, 0x59940875 }, + { -0x27ea453f, 0x219ef713, 0x485be25c, -0x0ebeb9a3, 0x4e513c51, 0x6d5447cc, 0x5ef44393, 0x174926be } + }, + { + { -0x6c15fdd2, 0x3ef5d415, 0x0ed0eed6, 0x5cbcc1a2, 0x07382c8c, -0x702db131, 0x06d8e1ad, 0x6fa42ead }, + { -0x03a42a45, -0x4a214d07, -0x1e27ef1f, -0x6d2558d6, -0x48d5e3a7, -0x503b3024, 0x3fc22a24, 0x497d7881 }, + { 0x1f73371f, -0x1d897db6, 0x4f5b6736, 0x7f7cf01c, 0x04fa46e7, 0x7e201fe3, 0x57808c96, 0x785a36a3 } + }, + { + { 0x5d517bc3, 0x07044298, -0x519ac988, 0x6acd56c7, -0x67a5889d, 0x00a27983, -0x1aed99d5, 0x5167effa }, + { 0x63014d2b, -0x7da04203, 0x6ca7578b, -0x37adc964, 0x5c0b5df0, 0x5b2fcd28, 0x58048c8f, 0x12ab214c }, + { 0x0f53c4b6, -0x42b1561f, -0x7536e5ec, 0x1673dc5f, 0x2acc1aba, -0x5707e5b2, 0x24332a25, 0x33a92a79 } + }, + { + { 0x218f2ada, 0x7ba95ba0, 0x330fb9ca, -0x300bdd79, 0x56c6d907, -0x2525b693, -0x0b4111ac, 0x5380c296 }, + { 0x27996c02, -0x622e0b67, -0x1fb2e8ae, 0x0cb3b058, 0x7fd02c3e, 0x1f7e8896, -0x3474c14f, 0x2f964268 }, + { 0x66898d0a, -0x62b0d8fc, 0x0aff3f7a, 0x3d098799, 0x67daba45, -0x2f610c9e, 0x7b1c669c, 0x7761455e } + }, +}, +}; #elif defined(CURVED25519_128BIT) static const ge_precomp base[32][8] = { { @@ -6373,7 +9095,7 @@ static const ge_precomp base[32][8] = { static void ge_select(ge_precomp *t,int pos,signed char b) { -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM ge_precomp minust; unsigned char bnegative = negative(b); unsigned char babs = b - (((-bnegative) & b) << 1); @@ -6408,7 +9130,7 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a) signed char e[64]; signed char carry; ge_p1p1 r; -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM ge_p2 s; #endif ge_precomp t; @@ -6431,7 +9153,7 @@ void 
ge_scalarmult_base(ge_p3 *h,const unsigned char *a) e[63] += carry; /* each e[i] is between -8 and 8 */ -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM ge_select(&t,0,e[1]); fe_sub(h->X, t.yplusx, t.yminusx); fe_add(h->Y, t.yplusx, t.yminusx); @@ -6545,6 +9267,49 @@ static const ge_precomp Bi[8] = { { 0x5d9a762f9bd0b516, -0x14c750b1c8c02112, 0x032e5a7d93d64270, 0x511d61210ae4d842, }, }, }; +#elif defined(CURVED25519_ASM_32BIT) +static const ge_precomp Bi[8] = { + { + { -0x0a73c47b, 0x2fbc93c6, -0x0473f1e7, -0x306cd23a, 0x643d42c2, 0x270b4898, 0x33d4ba65, 0x07cf9d3a, }, + { -0x28bf6ec2, -0x62efc6fb, -0x2ebf414d, -0x02c660fb, 0x688f8a09, -0x5a3e7bcc, -0x6707ed99, 0x44fd2f92, }, + { -0x78855598, -0x5436edfb, -0x33553b62, 0x26d9e823, -0x22bca674, 0x5a1b7dcb, -0x60f39a58, 0x6f117b68, }, + }, + { + { 0x4cee9730, -0x50da4f58, -0x1779b476, 0x025a8430, -0x60fe98ce, -0x3ee4affe, -0x657f070c, 0x7a164e1b, }, + { -0x5b032d9b, 0x56611fe8, -0x1a3e4583, 0x3bd353fd, 0x214bd6bd, -0x7ece0ce6, 0x555bda62, 0x2ab91587, }, + { 0x0dd0d889, 0x14ae933f, 0x1c35da62, 0x58942322, -0x730d24b4, -0x2e8f1abb, 0x12b9b4c6, 0x5a2826af, }, + }, + { + { 0x08a5bb33, -0x5ded43bc, -0x38a112fe, -0x72afb73d, 0x5abfec44, -0x22e414f4, 0x46e206eb, 0x2945ccf1, }, + { -0x5bb82946, 0x7f9182c3, 0x4b2729b7, -0x2affeb2f, -0x479b5f79, -0x1cc30ee4, -0x14e4aa0d, 0x154a7e73, }, + { -0x7ed57d7b, -0x4344240f, -0x2f422e04, 0x270e0807, 0x1bbda72d, -0x4be498f5, 0x6b3bb69a, 0x43aabe69, }, + }, + { + { -0x6bb15c41, 0x6b1a5cd0, -0x4c623f2e, 0x7470353a, 0x28542e49, 0x71b25282, 0x283c927e, 0x461bea69, }, + { -0x55cdde4f, -0x4590d366, 0x3bba23a7, 0x6ca02153, -0x6de6d3c6, -0x621589b1, 0x2e5317e0, 0x1d6edd5d, }, + { 0x01b8b3a2, -0x0e7c9238, 0x053ea49a, -0x4cfca0b9, 0x5877adf3, 0x529c41ba, 0x6a0f90a7, 0x7a9fbb1c, }, + }, + { + { -0x59579cd1, -0x64d19876, 0x51bc46c5, -0x59af6191, -0x39790a4b, -0x314dcc37, -0x752280a7, 0x34b9ed33, }, + { 0x039d8064, -0x0c91de82, -0x0adfbe65, -0x675f7e4a, -0x18a14fbc, -0x693439f8, -0x05236371, 0x49c05a51, }, + { -0x6fba50e5, 0x06b4e8bf, -0x58e62dd1, -0x1d007c18, -0x6c2b30ea, -0x550903d7, 0x1b008b06, 0x73c17202, }, + }, + { + { -0x757fd522, 0x2fbf0084, 0x02302e27, -0x1a260131, 0x17703406, 0x113e8471, 0x546d8faf, 0x4275aae2, }, + { 0x49864348, 0x315f5b02, 0x77088381, 0x3ed6b369, 0x6a8deb95, -0x5c5f8aab, 0x29d5c77f, 0x18ab5980, }, + { -0x029f7617, -0x27d4d33b, 0x3282e4a4, 0x031eb4a1, -0x4ae579de, 0x44311199, -0x4ac206b8, 0x3dc65522, }, + }, + { + { -0x5dff8093, -0x408f3dde, -0x4a432125, -0x407b4c66, -0x04f845f9, 0x537a0e12, -0x3cb90dbf, 0x234fd7ee, }, + { 0x327fbf93, 0x506f013b, -0x64889095, -0x51031437, -0x5552a698, -0x62ed4dce, 0x176024a7, 0x0267882d, }, + { 0x732ea378, 0x5360a119, -0x20722b8f, 0x2437e6b1, -0x6e581acd, -0x5d10c808, -0x55f6879d, 0x497ba6fd, }, + }, + { + { 0x13cfeaa0, 0x24cecc03, 0x189c246d, -0x79b73d73, -0x3e0d2b30, 0x2dbdbdfa, -0x0ed218d5, 0x61e22917, }, + { 0x468ccf0b, 0x040bcd86, 0x2a9910d6, -0x2c7d645c, 0x07b25192, 0x75083008, 0x18d05ebf, 0x43b5cd42, }, + { -0x642f4aea, 0x5d9a762f, 0x373fdeee, -0x14c750b2, -0x6c29bd90, 0x032e5a7d, 0x0ae4d842, 0x511d6121, }, + }, +}; #elif defined(CURVED25519_128BIT) static const ge_precomp Bi[8] = { { @@ -6698,7 +9463,11 @@ int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, #ifdef CURVED25519_ASM_64BIT static const ge d = { 0x75eb4dca135978a3, 0x00700a4d4141d8ab, -0x7338bf8688861768, 0x52036cee2b6ffe73, - }; +}; +#elif defined(CURVED25519_ASM_32BIT) +static const ge d = { + 0x135978a3, 0x75eb4dca, 0x4141d8ab, 0x00700a4d, 0x7779e898, -0x7338bf87, 
0x2b6ffe73, 0x52036cee, +}; #elif defined(CURVED25519_128BIT) static const ge d = { 0x34dca135978a3, 0x1a8283b156ebd, 0x5e7a26001c029, 0x739c663a03cbb, @@ -6708,14 +9477,18 @@ static const ge d = { static const ge d = { -10913610,13857413,-15372611,6949391,114729, -8787816,-6275908,-3247719,-18696448,-12055116 -} ; +}; #endif #ifdef CURVED25519_ASM_64BIT static const ge sqrtm1 = { -0x3b11e4d8b5f15f50, 0x2f431806ad2fe478, 0x2b4d00993dfbd7a7, 0x2b8324804fc1df0b, - }; +}; +#elif defined(CURVED25519_ASM_32BIT) +static const ge sqrtm1 = { + 0x4a0ea0b0, -0x3b11e4d9, -0x52d01b88, 0x2f431806, 0x3dfbd7a7, 0x2b4d0099, 0x4fc1df0b, 0x2b832480, +}; #elif defined(CURVED25519_128BIT) static const ge sqrtm1 = { 0x61b274a0ea0b0, 0x0d5a5fc8f189d, 0x7ef5e9cbd0c60, 0x78595a6804c9e, @@ -6725,7 +9498,7 @@ static const ge sqrtm1 = { static const ge sqrtm1 = { -32595792,-7943725,9377950,3500415,12389472, -272473,-25146209,-2005654,326686,11406482 -} ; +}; #endif @@ -6779,7 +9552,7 @@ r = p + q static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) { -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM ge t0; fe_add(r->X,p->Y,p->X); fe_sub(r->Y,p->Y,p->X); @@ -6806,7 +9579,7 @@ r = p - q static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) { -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM ge t0; fe_add(r->X,p->Y,p->X); fe_sub(r->Y,p->Y,p->X); @@ -6832,7 +9605,7 @@ r = p static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p) { -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM fe_mul(r->X,p->X,p->T); fe_mul(r->Y,p->Y,p->Z); fe_mul(r->Z,p->Z,p->T); @@ -6850,7 +9623,7 @@ r = p static WC_INLINE void ge_p1p1_to_p3(ge_p3 *r,const ge_p1p1 *p) { -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM fe_mul(r->X,p->X,p->T); fe_mul(r->Y,p->Y,p->Z); fe_mul(r->Z,p->Z,p->T); @@ -6879,7 +9652,7 @@ r = 2 * p static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p) { -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM ge t0; fe_sq(r->X,p->X); fe_sq(r->Z,p->Y); @@ -6919,7 +9692,11 @@ r = p #ifdef CURVED25519_ASM_64BIT static const ge d2 = { -0x1429646bd94d0ea7, 0x00e0149a8283b156, 0x198e80f2eef3d130, 0x2406d9dc56dffce7, - }; +}; +#elif defined(CURVED25519_ASM_32BIT) +static const ge d2 = { + 0x26b2f159, -0x1429646c, -0x7d7c4eaa, 0x00e0149a, -0x110c2ed0, 0x198e80f2, 0x56dffce7, 0x2406d9dc, +}; #elif defined(CURVED25519_128BIT) static const ge d2 = { 0x69b9426b2f159, 0x35050762add7a, 0x3cf44c0038052, 0x6738cc7407977, @@ -6970,7 +9747,7 @@ void ge_p3_tobytes(unsigned char *s,const ge_p3 *h) } -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM /* ge_precomp_0 */ static void ge_precomp_0(ge_precomp *h) { @@ -6988,7 +9765,7 @@ r = p - q static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q) { -#ifndef CURVED25519_ASM_64BIT +#ifndef CURVED25519_ASM ge t0; fe_add(r->X,p->Y,p->X); fe_sub(r->Y,p->Y,p->X); diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am index fe5e27027..4274916ba 100644 --- a/wolfcrypt/src/include.am +++ b/wolfcrypt/src/include.am @@ -52,6 +52,8 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \ wolfcrypt/src/port/arm/armv8-sha256.c \ wolfcrypt/src/port/arm/armv8-curve25519.c \ wolfcrypt/src/port/arm/armv8-curve25519.S \ + wolfcrypt/src/port/arm/armv7-curve25519.c \ + wolfcrypt/src/port/arm/armv7-curve25519.S \ wolfcrypt/src/port/nxp/ksdk_port.c \ wolfcrypt/src/port/atmel/README.md \ wolfcrypt/src/port/xilinx/xil-sha3.c \ diff --git a/wolfcrypt/src/port/arm/armv7-curve25519.S b/wolfcrypt/src/port/arm/armv7-curve25519.S 
new file mode 100644 index 000000000..223a67c22 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv7-curve25519.S @@ -0,0 +1,6005 @@ +/* armv7-curve25519 + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifndef __aarch64__ +.text +.globl fe_init +.type fe_init, %function +.align 2 +fe_init: + bx lr +.size fe_init,.-fe_init +.text +.globl fe_frombytes +.type fe_frombytes, %function +.align 2 +fe_frombytes: + push {r4, r5, r6, r7, lr} + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + ldrd r4, r5, [r1, #16] + ldrd r6, r7, [r1, #24] + and r7, r7, #0x7fffffff + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + strd r4, r5, [r0, #16] + strd r6, r7, [r0, #24] + pop {r4, r5, r6, r7, pc} +.size fe_frombytes,.-fe_frombytes +.text +.globl fe_tobytes +.type fe_tobytes, %function +.align 2 +fe_tobytes: + push {r4, r5, r6, r7, r8, lr} + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + ldrd r4, r5, [r1, #16] + ldrd r6, r7, [r1, #24] + adds r8, r2, #19 + adcs r8, r3, #0 + adcs r8, r12, #0 + adcs r8, lr, #0 + adcs r8, r4, #0 + adcs r8, r5, #0 + adcs r8, r6, #0 + adc r8, r7, #0 + asr r8, r8, #31 + and r8, r8, #19 + adds r2, r2, r8 + adcs r3, r3, #0 + adcs r12, r12, #0 + adcs lr, lr, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adc r7, r7, #0 + and r7, r7, #0x7fffffff + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + strd r4, r5, [r0, #16] + strd r6, r7, [r0, #24] + pop {r4, r5, r6, r7, r8, pc} +.size fe_tobytes,.-fe_tobytes +.text +.globl fe_1 +.type fe_1, %function +.align 2 +fe_1: + # Set one + mov r2, #1 + mov r1, #0 + str r2, [r0] + str r1, [r0, #4] + str r1, [r0, #8] + str r1, [r0, #12] + str r1, [r0, #16] + str r1, [r0, #20] + str r1, [r0, #24] + str r1, [r0, #28] + bx lr +.size fe_1,.-fe_1 +.text +.globl fe_0 +.type fe_0, %function +.align 2 +fe_0: + # Set zero + mov r1, #0 + str r1, [r0] + str r1, [r0, #4] + str r1, [r0, #8] + str r1, [r0, #12] + str r1, [r0, #16] + str r1, [r0, #20] + str r1, [r0, #24] + str r1, [r0, #28] + bx lr +.size fe_0,.-fe_0 +.text +.globl fe_copy +.type fe_copy, %function +.align 2 +fe_copy: + push {lr} + # Copy + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + ldrd r2, r3, [r1, #16] + ldr r12, [r1, #24] + ldr lr, [r1, #28] + strd r2, r3, [r0, #16] + str r12, [r0, #24] + str lr, [r0, #28] + pop {pc} +.size fe_copy,.-fe_copy +.text +.globl fe_sub +.type fe_sub, %function +.align 2 +fe_sub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + # Sub + ldr r12, [r1] + ldr lr, [r1, #4] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r2] + ldrd r8, r9, [r2, #8] + subs r6, r12, r6 + sbcs r7, lr, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + strd r6, r7, [r0] + strd r8, r9, [r0, #8] + ldr r12, [r1, #16] + ldr lr, [r1, #20] + 
ldrd r4, r5, [r1, #24] + ldrd r6, r7, [r2, #16] + ldrd r8, r9, [r2, #24] + sbcs r6, r12, r6 + sbcs r7, lr, r7 + sbcs r8, r4, r8 + sbc r9, r5, r9 + mov r10, #-19 + asr r3, r9, #31 + # Mask the modulus + and r10, r3, r10 + and r11, r3, #0x7fffffff + # Add modulus (if underflow) + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + adds r12, r12, r10 + adcs lr, lr, r3 + adcs r4, r4, r3 + adcs r5, r5, r3 + adcs r6, r6, r3 + adcs r7, r7, r3 + adcs r8, r8, r3 + adc r9, r9, r11 + str r12, [r0] + str lr, [r0, #4] + strd r4, r5, [r0, #8] + strd r6, r7, [r0, #16] + strd r8, r9, [r0, #24] + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_sub,.-fe_sub +.text +.globl fe_add +.type fe_add, %function +.align 2 +fe_add: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + # Add + ldr r12, [r1] + ldr lr, [r1, #4] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r2] + ldrd r8, r9, [r2, #8] + adds r6, r12, r6 + adcs r7, lr, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + strd r6, r7, [r0] + strd r8, r9, [r0, #8] + ldr r12, [r1, #16] + ldr lr, [r1, #20] + ldrd r4, r5, [r1, #24] + ldrd r6, r7, [r2, #16] + ldrd r8, r9, [r2, #24] + adcs r6, r12, r6 + adcs r7, lr, r7 + adcs r8, r4, r8 + adc r9, r5, r9 + mov r10, #-19 + asr r3, r9, #31 + # Mask the modulus + and r10, r3, r10 + and r11, r3, #0x7fffffff + # Sub modulus (if overflow) + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + subs r12, r12, r10 + sbcs lr, lr, r3 + sbcs r4, r4, r3 + sbcs r5, r5, r3 + sbcs r6, r6, r3 + sbcs r7, r7, r3 + sbcs r8, r8, r3 + sbc r9, r9, r11 + str r12, [r0] + str lr, [r0, #4] + strd r4, r5, [r0, #8] + strd r6, r7, [r0, #16] + strd r8, r9, [r0, #24] + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_add,.-fe_add +.text +.globl fe_neg +.type fe_neg, %function +.align 2 +fe_neg: + push {r4, r5, lr} + mov r5, #-1 + mov r4, #-19 + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + subs r2, r4, r2 + sbcs r3, r5, r3 + sbcs r12, r5, r12 + sbcs lr, r5, lr + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + mov r4, #0x7fffffff + ldrd r2, r3, [r1, #16] + ldr r12, [r1, #24] + ldr lr, [r1, #28] + sbcs r2, r5, r2 + sbcs r3, r5, r3 + sbcs r12, r5, r12 + sbc lr, r4, lr + strd r2, r3, [r0, #16] + str r12, [r0, #24] + str lr, [r0, #28] + pop {r4, r5, pc} +.size fe_neg,.-fe_neg +.text +.globl fe_isnonzero +.type fe_isnonzero, %function +.align 2 +fe_isnonzero: + push {r4, r5, r6, r7, r8, lr} + ldrd r2, r3, [r0] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + adds r1, r2, #19 + adcs r1, r3, #0 + adcs r1, r12, #0 + adcs r1, lr, #0 + adcs r1, r4, #0 + adcs r1, r5, #0 + adcs r1, r6, #0 + adc r1, r7, #0 + asr r1, r1, #31 + and r1, r1, #19 + adds r2, r2, r1 + adcs r3, r3, #0 + adcs r12, r12, #0 + adcs lr, lr, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adc r7, r7, #0 + and r7, r7, #0x7fffffff + orr r2, r2, r3 + orr r12, r12, lr + orr r4, r4, r5 + orr r6, r6, r7 + orr r12, r12, r4 + orr r2, r2, r6 + orr r0, r2, r12 + pop {r4, r5, r6, r7, r8, pc} +.size fe_isnonzero,.-fe_isnonzero +.text +.globl fe_isnegative +.type fe_isnegative, %function +.align 2 +fe_isnegative: + push {lr} + ldrd r2, r3, [r0] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + adds r1, r2, #19 + adcs r1, r3, #0 + adcs r1, r12, #0 + adcs r1, lr, #0 + ldrd r2, r3, [r0, #16] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + adcs r1, r2, #0 + adcs r1, r3, #0 + adcs r1, r12, #0 + ldr r2, [r0] + adc r1, lr, #0 + and r0, r2, #1 + lsr r1, r1, #31 + eor r0, r0, r1 + pop {pc} +.size fe_isnegative,.-fe_isnegative +.text +.globl 
fe_cmov_table +.type fe_cmov_table, %function +.align 2 +fe_cmov_table: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sxtb r2, r2 + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #1 + mov r12, #0 + mov lr, #1 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, 
r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-19 + mov r9, #-1 + subs r8, r8, r5 + sbcs r9, r9, r6 + sbc r11, r11, r11 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, r9, r10 + eor r6, r6, r9 + str r3, [r0] + str r12, [r0, #4] + str lr, [r0, #32] + str r4, [r0, #36] + str r5, [r0, #64] + str r6, [r0, #68] + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #0 + mov r12, #0 + mov lr, #0 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + 
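The repeated blocks above implement a constant-time table lookup: all eight precomputed entries are read, and a mask that is all-ones only for the wanted index decides which one survives, so neither the memory-access pattern nor the timing depends on the secret window digit. A rough C equivalent of the idea, with hypothetical names (the assembly derives each per-entry mask from a rotated 0x80000000 instead of an explicit compare, and walks the table two words at a time):

    #include <stdint.h>

    /* Table entry laid out as in the assembly: three 8-word field elements,
     * 96 bytes per entry (struct and field names are hypothetical). */
    typedef struct {
        uint32_t yplusx[8];
        uint32_t yminusx[8];
        uint32_t xy2d[8];
    } ge_precomp32;

    /* all-ones when a == b, zero otherwise, with no data-dependent branch */
    static uint32_t ct_eq_mask(uint32_t a, uint32_t b)
    {
        uint32_t d = a ^ b;
        return (uint32_t)0 - (((d | (0u - d)) >> 31) ^ 1u);
    }

    /* Visit every entry and keep the one whose index matches; idx == 0 leaves
     * the neutral element the caller preloaded (y+x = y-x = 1, xy2d = 0). */
    static void ct_select(ge_precomp32 *r, const ge_precomp32 table[8],
                          uint32_t idx)
    {
        unsigned int j, k;
        for (j = 0; j < 8; j++) {
            uint32_t m = ct_eq_mask(idx, j + 1);
            for (k = 0; k < 8; k++) {
                r->yplusx[k]  ^= (r->yplusx[k]  ^ table[j].yplusx[k])  & m;
                r->yminusx[k] ^= (r->yminusx[k] ^ table[j].yminusx[k]) & m;
                r->xy2d[k]    ^= (r->xy2d[k]    ^ table[j].xy2d[k])    & m;
            }
        }
    }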
ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-1 + mov r9, #-1 + rsbs r11, r11, #0 + sbcs r8, r8, r5 + sbcs r9, r9, r6 + sbc r11, r11, r11 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, r9, r10 + eor r6, r6, r9 + str r3, [r0, #8] + str r12, [r0, #12] + str lr, [r0, #40] + str r4, [r0, #44] + str r5, [r0, #72] + str r6, [r0, #76] + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #0 + mov r12, #0 + mov lr, #0 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, 
[r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-1 + mov r9, #-1 + rsbs r11, r11, #0 + sbcs r8, r8, r5 + sbcs r9, r9, r6 + sbc r11, r11, r11 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, r9, r10 + eor r6, r6, r9 + str r3, [r0, #16] + str r12, [r0, #20] + str lr, [r0, #48] + str r4, [r0, #52] + str r5, [r0, #80] + str r6, [r0, #84] + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #0 + mov r12, #0 + mov lr, #0 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + 
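Each pass above also has to honour the sign of the signed window digit: it prepares p - xy2d with a subtract-with-borrow chain (the borrow is carried between passes in r11) and then, under a mask taken from the sign bit of the digit, swaps y+x with y-x and substitutes the negated xy2d, because for P = (y+x, y-x, 2dxy) the negative is -P = (y-x, y+x, p-2dxy). A sketch of that sign handling with hypothetical helpers, assuming the negated third coordinate has already been computed:

    #include <stdint.h>

    /* |b| for the table lookup (mirrors the sbfx/eor/sub sequence). */
    static uint32_t abs_digit(int8_t b)
    {
        int32_t s = b >> 7;                       /* 0 or -1            */
        return (uint32_t)((b ^ s) - s);
    }

    /* Conditionally swap/negate the selected entry when the digit is
     * negative; neg_xy2d must hold p - xy2d of that entry. */
    static void ge32_apply_sign(uint32_t yplusx[8], uint32_t yminusx[8],
                                uint32_t xy2d[8], const uint32_t neg_xy2d[8],
                                int8_t b)
    {
        uint32_t sign = (uint32_t)(int32_t)(b >> 7);  /* 0 or 0xffffffff */
        int i;

        for (i = 0; i < 8; i++) {
            uint32_t t = (yplusx[i] ^ yminusx[i]) & sign;
            yplusx[i]  ^= t;                          /* swap y+x, y-x ...  */
            yminusx[i] ^= t;                          /* ... only when b<0  */
            xy2d[i]    ^= (xy2d[i] ^ neg_xy2d[i]) & sign;
        }
    }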
and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-1 + mov r9, #0x7fffffff + rsbs r11, r11, #0 + sbcs r8, r8, r5 + sbc r9, r9, r6 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, 
r9, r10 + eor r6, r6, r9 + str r3, [r0, #24] + str r12, [r0, #28] + str lr, [r0, #56] + str r4, [r0, #60] + str r5, [r0, #88] + str r6, [r0, #92] + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_cmov_table,.-fe_cmov_table +.text +.globl fe_mul +.type fe_mul, %function +.align 2 +fe_mul: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x40 + # Multiply + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r2] + ldr lr, [r2, #4] + # A[0] * B[0] = 0 + umull r4, r5, r7, r9 + str r4, [sp] + # A[0] * B[1] = 1 + umull r3, r6, r7, lr + adds r5, r5, r3 + adc r6, r6, #0 + # A[1] * B[0] = 1 + umull r3, r12, r8, r9 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #4] + # A[2] * B[0] = 2 + ldr r10, [r1, #8] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adc r4, r4, r12 + # A[1] * B[1] = 2 + umull r3, r12, r8, lr + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[0] * B[2] = 2 + ldr r11, [r2, #8] + umull r3, r12, r7, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #8] + # A[0] * B[3] = 3 + ldr r11, [r2, #12] + umull r3, r12, r7, r11 + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[1] * B[2] = 3 + ldr r11, [r2, #8] + umull r3, r12, r8, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[2] * B[1] = 3 + umull r3, r12, r10, lr + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[3] * B[0] = 3 + ldr r10, [r1, #12] + umull r3, r12, r10, r9 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #12] + # A[4] * B[0] = 4 + ldr r10, [r1, #16] + umull r3, r12, r10, r9 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[3] * B[1] = 4 + ldr r10, [r1, #12] + umull r3, r12, r10, lr + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[2] * B[2] = 4 + ldr r10, [r1, #8] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[1] * B[3] = 4 + ldr r11, [r2, #12] + umull r3, r12, r8, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[0] * B[4] = 4 + ldr r11, [r2, #16] + umull r3, r12, r7, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #16] + # A[0] * B[5] = 5 + ldr r11, [r2, #20] + umull r3, r12, r7, r11 + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[1] * B[4] = 5 + ldr r11, [r2, #16] + umull r3, r12, r8, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[2] * B[3] = 5 + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[3] * B[2] = 5 + ldr r10, [r1, #12] + ldr r11, [r2, #8] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[4] * B[1] = 5 + ldr r10, [r1, #16] + umull r3, r12, r10, lr + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[5] * B[0] = 5 + ldr r10, [r1, #20] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #20] + # A[6] * B[0] = 6 + ldr r10, [r1, #24] + umull r3, r12, r10, r9 + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[5] * B[1] = 6 + ldr r10, [r1, #20] + umull r3, r12, r10, lr + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[4] * B[2] = 6 + ldr r10, [r1, #16] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[3] * B[3] = 6 + ldr r10, [r1, #12] + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[2] * B[4] = 6 + ldr r10, [r1, #8] + ldr r11, [r2, #16] + umull r3, r12, r10, 
r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[1] * B[5] = 6 + ldr r11, [r2, #20] + umull r3, r12, r8, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[0] * B[6] = 6 + ldr r11, [r2, #24] + umull r3, r12, r7, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #24] + # A[0] * B[7] = 7 + ldr r11, [r2, #28] + umull r3, r12, r7, r11 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[1] * B[6] = 7 + ldr r11, [r2, #24] + umull r3, r12, r8, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[2] * B[5] = 7 + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[3] * B[4] = 7 + ldr r10, [r1, #12] + ldr r11, [r2, #16] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[4] * B[3] = 7 + ldr r10, [r1, #16] + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[5] * B[2] = 7 + ldr r10, [r1, #20] + ldr r11, [r2, #8] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[6] * B[1] = 7 + ldr r10, [r1, #24] + umull r3, r12, r10, lr + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[7] * B[0] = 7 + ldr r10, [r1, #28] + umull r3, r12, r10, r9 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #28] + ldr r7, [r1, #24] + ldr r9, [r2, #24] + # A[7] * B[1] = 8 + umull r3, r12, r10, lr + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[6] * B[2] = 8 + umull r3, r12, r7, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[5] * B[3] = 8 + ldr r10, [r1, #20] + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[4] * B[4] = 8 + ldr r10, [r1, #16] + ldr r11, [r2, #16] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[3] * B[5] = 8 + ldr r10, [r1, #12] + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[2] * B[6] = 8 + ldr r10, [r1, #8] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[1] * B[7] = 8 + ldr r11, [r2, #28] + umull r3, r12, r8, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #32] + ldr r8, [r1, #28] + mov lr, r11 + # A[2] * B[7] = 9 + umull r3, r12, r10, lr + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[3] * B[6] = 9 + ldr r10, [r1, #12] + umull r3, r12, r10, r9 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[4] * B[5] = 9 + ldr r10, [r1, #16] + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[5] * B[4] = 9 + ldr r10, [r1, #20] + ldr r11, [r2, #16] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[6] * B[3] = 9 + ldr r11, [r2, #12] + umull r3, r12, r7, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[7] * B[2] = 9 + ldr r11, [r2, #8] + umull r3, r12, r8, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #36] + # A[7] * B[3] = 10 + ldr r11, [r2, #12] + umull r3, r12, r8, r11 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[6] * B[4] = 10 + ldr r11, [r2, #16] + umull r3, r12, r7, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[5] * B[5] = 10 + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[4] * B[6] = 10 + ldr r10, [r1, #16] + umull r3, 
r12, r10, r9 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[3] * B[7] = 10 + ldr r10, [r1, #12] + umull r3, r12, r10, lr + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #40] + # A[4] * B[7] = 11 + ldr r10, [r1, #16] + umull r3, r12, r10, lr + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[5] * B[6] = 11 + ldr r10, [r1, #20] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[6] * B[5] = 11 + umull r3, r12, r7, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[7] * B[4] = 11 + ldr r11, [r2, #16] + umull r3, r12, r8, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #44] + # A[7] * B[5] = 12 + ldr r11, [r2, #20] + umull r3, r12, r8, r11 + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[6] * B[6] = 12 + umull r3, r12, r7, r9 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[5] * B[7] = 12 + umull r3, r12, r10, lr + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #48] + # A[6] * B[7] = 13 + umull r3, r12, r7, lr + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[7] * B[6] = 13 + umull r3, r12, r8, r9 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #52] + # A[7] * B[7] = 14 + umull r3, r12, r8, lr + adds r6, r6, r3 + adc r4, r4, r12 + str r6, [sp, #56] + str r4, [sp, #60] + # Reduce + # Load bottom half + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #8] + ldrd r8, r9, [sp, #16] + ldrd r10, r11, [sp, #24] + lsr r3, r11, #31 + and r11, r11, #0x7fffffff + mov lr, #19 + ldr r1, [sp, #32] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + adds r4, r4, r3 + mov r2, #0 + adcs r5, r5, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #36] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r5, r5, r3 + mov r2, #0 + adcs r6, r6, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #40] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r6, r6, r3 + mov r2, #0 + adcs r7, r7, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #44] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r7, r7, r3 + mov r2, #0 + adcs r8, r8, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #48] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r8, r8, r3 + mov r2, #0 + adcs r9, r9, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #52] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r9, r9, r3 + mov r2, #0 + adcs r10, r10, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #56] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r10, r10, r3 + mov r2, #0 + adcs r11, r11, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #60] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + adds r11, r11, r3 + adc r3, r12, r2 + # Overflow + lsl r3, r3, #1 + orr r3, r3, r11, lsr #31 + mul r3, r3, lr + and r11, r11, #0x7fffffff + adds r4, r4, r3 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Reduce if top bit set + asr r3, r11, #31 + and r3, r3, lr + and r11, r11, #0x7fffffff + adds r4, r4, r3 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Store + strd r4, r5, [r0] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] + add sp, sp, #0x40 + pop {r4, r5, r6, r7, r8, r9, r10, 
r11, pc} +.size fe_mul,.-fe_mul +.text +.globl fe_sq +.type fe_sq, %function +.align 2 +fe_sq: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x40 + # Square + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + ldr r12, [r1, #16] + # A[0] * A[0] = 0 + umull r4, r5, r7, r7 + str r4, [sp] + # A[0] * A[1] = 1 + umull r2, r3, r7, r8 + mov r6, #0 + adds r5, r5, r2 + adc r6, r6, r3 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #4] + # A[1] * A[1] = 2 + umull r2, r3, r8, r8 + adds r6, r6, r2 + adc r4, r4, r3 + # A[0] * A[2] = 2 + umull r2, r3, r7, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #8] + # A[0] * A[3] = 3 + umull r2, r3, r7, r10 + adds r4, r4, r2 + adc r5, r5, r3 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[2] = 3 + umull r2, r3, r8, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #12] + # A[2] * A[2] = 4 + umull r2, r3, r9, r9 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[3] = 4 + umull r2, r3, r8, r10 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[0] * A[4] = 4 + umull r2, r3, r7, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #16] + # A[0] * A[5] = 5 + ldr r11, [r1, #20] + umull r2, r3, r7, r11 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[4] = 5 + umull r2, r3, r8, r12 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[2] * A[3] = 5 + umull r2, r3, r9, r10 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #20] + # A[3] * A[3] = 6 + umull r2, r3, r10, r10 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[2] * A[4] = 6 + umull r2, r3, r9, r12 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[5] = 6 + umull r2, r3, r8, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[0] * A[6] = 6 + ldr r11, [r1, #24] + umull r2, r3, r7, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #24] + # A[0] * A[7] = 7 + ldr r11, [r1, #28] + umull r2, r3, r7, r11 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[6] = 7 + ldr r11, [r1, #24] + umull r2, r3, r8, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[2] * A[5] = 7 + ldr r11, [r1, #20] + umull r2, r3, r9, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[4] = 7 + umull r2, r3, r10, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #28] + # A[4] * A[4] = 8 + umull r2, r3, r12, r12 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[3] * A[5] = 8 + umull r2, r3, r10, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + 
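The "Reduce" section of fe_mul above (and the matching one in fe_sq) folds the upper 256 bits of the 512-bit product back into the lower half: since 2^255 is congruent to 19 mod p, the high part is re-packed one bit to the left and multiplied by 19, the remaining spill is folded once more, and a final conditional fold handles the case where bit 255 is still set. A C sketch of the same reduction over a 16-word product (hypothetical name; the assembly interleaves this with the column products rather than calling a helper, and like the assembly the result is kept below 2^255 but not necessarily fully canonical):

    #include <stdint.h>

    static void fe32_reduce512(uint32_t r[8], const uint32_t t[16])
    {
        uint32_t lo[8], hi[9];
        uint64_t c;
        int i;

        for (i = 0; i < 8; i++)
            lo[i] = t[i];
        /* bits 255 and up of the product, re-packed so word i sits at 32*i */
        hi[0] = (t[7] >> 31) | (t[8] << 1);
        for (i = 1; i < 8; i++)
            hi[i] = (t[7 + i] >> 31) | (t[8 + i] << 1);
        hi[8] = t[15] >> 31;
        lo[7] &= 0x7fffffff;

        /* first fold: add 19 * hi into lo (2^255 = 19 mod p) */
        c = 0;
        for (i = 0; i < 8; i++) {
            c += (uint64_t)lo[i] + (uint64_t)hi[i] * 19;
            lo[i] = (uint32_t)c;
            c >>= 32;
        }
        c += (uint64_t)hi[8] * 19;

        /* second fold: whatever spilled past bit 255 goes back in the same way */
        c = (c << 1) | (lo[7] >> 31);
        lo[7] &= 0x7fffffff;
        c *= 19;
        for (i = 0; i < 8; i++) {
            c += lo[i];
            lo[i] = (uint32_t)c;
            c >>= 32;
        }

        /* final conditional fold if bit 255 was set again (rare) */
        c = (lo[7] >> 31) * 19;
        lo[7] &= 0x7fffffff;
        for (i = 0; i < 8; i++) {
            c += lo[i];
            lo[i] = (uint32_t)c;
            c >>= 32;
        }

        for (i = 0; i < 8; i++)
            r[i] = lo[i];
    }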
adc r5, r5, #0 + # A[2] * A[6] = 8 + ldr r11, [r1, #24] + umull r2, r3, r9, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[7] = 8 + ldr r11, [r1, #28] + umull r2, r3, r8, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #32] + ldr r7, [r1, #20] + # A[2] * A[7] = 9 + umull r2, r3, r9, r11 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[3] * A[6] = 9 + ldr r11, [r1, #24] + umull r2, r3, r10, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[4] * A[5] = 9 + umull r2, r3, r12, r7 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #36] + mov r8, r11 + # A[5] * A[5] = 10 + umull r2, r3, r7, r7 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[4] * A[6] = 10 + umull r2, r3, r12, r8 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[7] = 10 + ldr r11, [r1, #28] + umull r2, r3, r10, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #40] + mov r9, r11 + # A[4] * A[7] = 11 + umull r2, r3, r12, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[5] * A[6] = 11 + umull r2, r3, r7, r8 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #44] + # A[6] * A[6] = 12 + umull r2, r3, r8, r8 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[5] * A[7] = 12 + umull r2, r3, r7, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #48] + # A[6] * A[7] = 13 + umull r2, r3, r8, r9 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #52] + # A[7] * A[7] = 14 + umull r2, r3, r9, r9 + adds r6, r6, r2 + adc r4, r4, r3 + str r6, [sp, #56] + str r4, [sp, #60] + # Reduce + # Load bottom half + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #8] + ldrd r8, r9, [sp, #16] + ldrd r10, r11, [sp, #24] + lsr r2, r11, #31 + and r11, r11, #0x7fffffff + mov r12, #19 + ldr r1, [sp, #32] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + adds r4, r4, r2 + mov lr, #0 + adcs r5, r5, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #36] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r5, r5, r2 + mov lr, #0 + adcs r6, r6, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #40] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r6, r6, r2 + mov lr, #0 + adcs r7, r7, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #44] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r7, r7, r2 + mov lr, #0 + adcs r8, r8, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #48] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r8, r8, r2 + mov lr, #0 + adcs r9, r9, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #52] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r9, r9, r2 + mov lr, #0 + adcs r10, r10, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #56] + orr r2, r2, r1, lsl #1 + umull r2, 
r3, r12, r2 + add r3, r3, lr + adds r10, r10, r2 + mov lr, #0 + adcs r11, r11, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #60] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + adds r11, r11, r2 + adc r2, r3, lr + # Overflow + lsl r2, r2, #1 + orr r2, r2, r11, lsr #31 + mul r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Reduce if top bit set + asr r2, r11, #31 + and r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Store + strd r4, r5, [r0] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] + add sp, sp, #0x40 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_sq,.-fe_sq +.text +.globl fe_mul121666 +.type fe_mul121666, %function +.align 2 +fe_mul121666: + push {r4, r5, r6, r7, r8, r9, r10, lr} + # Multiply by 121666 + ldrd r2, r3, [r1] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r1, #16] + ldrd r8, r9, [r1, #24] + movw lr, #0xdb42 + movt lr, #1 + umull r2, r10, r2, lr + umull r3, r12, r3, lr + adds r3, r3, r10 + adc r10, r12, #0 + umull r4, r12, r4, lr + adds r4, r4, r10 + adc r10, r12, #0 + umull r5, r12, r5, lr + adds r5, r5, r10 + adc r10, r12, #0 + umull r6, r12, r6, lr + adds r6, r6, r10 + adc r10, r12, #0 + umull r7, r12, r7, lr + adds r7, r7, r10 + adc r10, r12, #0 + umull r8, r12, r8, lr + adds r8, r8, r10 + adc r10, r12, #0 + umull r9, r12, r9, lr + adds r9, r9, r10 + adc r10, r12, #0 + mov lr, #19 + lsl r10, r10, #1 + orr r10, r10, r9, lsr #31 + mul r10, r10, lr + and r9, r9, #0x7fffffff + adds r2, r2, r10 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + strd r2, r3, [r0] + strd r4, r5, [r0, #8] + strd r6, r7, [r0, #16] + strd r8, r9, [r0, #24] + pop {r4, r5, r6, r7, r8, r9, r10, pc} +.size fe_mul121666,.-fe_mul121666 +.text +.globl fe_sq2 +.type fe_sq2, %function +.align 2 +fe_sq2: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x40 + # Square * 2 + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + ldr r12, [r1, #16] + # A[0] * A[0] = 0 + umull r4, r5, r7, r7 + str r4, [sp] + # A[0] * A[1] = 1 + umull r2, r3, r7, r8 + mov r6, #0 + adds r5, r5, r2 + adc r6, r6, r3 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #4] + # A[1] * A[1] = 2 + umull r2, r3, r8, r8 + adds r6, r6, r2 + adc r4, r4, r3 + # A[0] * A[2] = 2 + umull r2, r3, r7, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #8] + # A[0] * A[3] = 3 + umull r2, r3, r7, r10 + adds r4, r4, r2 + adc r5, r5, r3 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[2] = 3 + umull r2, r3, r8, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #12] + # A[2] * A[2] = 4 + umull r2, r3, r9, r9 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[3] = 4 + umull r2, r3, r8, r10 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[0] * A[4] = 4 + umull r2, r3, r7, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #16] + # A[0] * A[5] = 5 + 
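fe_mul121666 above multiplies by the Montgomery-ladder constant a24 = (486662 - 2)/4 = 121666 (loaded as 0x1db42 via movw/movt) and then folds the small spill above bit 255 back in with weight 19. A compact C sketch of the same computation, under a hypothetical name:

    #include <stdint.h>

    static void fe32_mul121666(uint32_t r[8], const uint32_t a[8])
    {
        uint64_t c = 0;
        uint32_t top;
        int i;

        for (i = 0; i < 8; i++) {
            c += (uint64_t)a[i] * 121666;
            r[i] = (uint32_t)c;
            c >>= 32;
        }
        /* bits 255 and up of the product, folded back in at weight 19 */
        top = ((uint32_t)c << 1) | (r[7] >> 31);
        r[7] &= 0x7fffffff;
        c = (uint64_t)top * 19;
        for (i = 0; i < 8; i++) {
            c += r[i];
            r[i] = (uint32_t)c;
            c >>= 32;
        }
    }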
ldr r11, [r1, #20] + umull r2, r3, r7, r11 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[4] = 5 + umull r2, r3, r8, r12 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[2] * A[3] = 5 + umull r2, r3, r9, r10 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #20] + # A[3] * A[3] = 6 + umull r2, r3, r10, r10 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[2] * A[4] = 6 + umull r2, r3, r9, r12 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[5] = 6 + umull r2, r3, r8, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[0] * A[6] = 6 + ldr r11, [r1, #24] + umull r2, r3, r7, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #24] + # A[0] * A[7] = 7 + ldr r11, [r1, #28] + umull r2, r3, r7, r11 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[6] = 7 + ldr r11, [r1, #24] + umull r2, r3, r8, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[2] * A[5] = 7 + ldr r11, [r1, #20] + umull r2, r3, r9, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[4] = 7 + umull r2, r3, r10, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #28] + # A[4] * A[4] = 8 + umull r2, r3, r12, r12 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[3] * A[5] = 8 + umull r2, r3, r10, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[2] * A[6] = 8 + ldr r11, [r1, #24] + umull r2, r3, r9, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[7] = 8 + ldr r11, [r1, #28] + umull r2, r3, r8, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #32] + ldr r7, [r1, #20] + # A[2] * A[7] = 9 + umull r2, r3, r9, r11 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[3] * A[6] = 9 + ldr r11, [r1, #24] + umull r2, r3, r10, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[4] * A[5] = 9 + umull r2, r3, r12, r7 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #36] + mov r8, r11 + # A[5] * A[5] = 10 + umull r2, r3, r7, r7 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[4] * A[6] = 10 + umull r2, r3, r12, r8 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[7] = 10 + ldr r11, [r1, #28] + umull r2, r3, r10, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #40] + mov r9, r11 + # A[4] * A[7] = 11 + umull r2, r3, r12, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, 
r4, r3 + adc r5, r5, #0 + # A[5] * A[6] = 11 + umull r2, r3, r7, r8 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #44] + # A[6] * A[6] = 12 + umull r2, r3, r8, r8 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[5] * A[7] = 12 + umull r2, r3, r7, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #48] + # A[6] * A[7] = 13 + umull r2, r3, r8, r9 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #52] + # A[7] * A[7] = 14 + umull r2, r3, r9, r9 + adds r6, r6, r2 + adc r4, r4, r3 + str r6, [sp, #56] + str r4, [sp, #60] + # Double and Reduce + # Load bottom half + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #8] + ldrd r8, r9, [sp, #16] + ldrd r10, r11, [sp, #24] + lsr r2, r11, #30 + lsl r11, r11, #1 + orr r11, r11, r10, lsr #31 + lsl r10, r10, #1 + orr r10, r10, r9, lsr #31 + lsl r9, r9, #1 + orr r9, r9, r8, lsr #31 + lsl r8, r8, #1 + orr r8, r8, r7, lsr #31 + lsl r7, r7, #1 + orr r7, r7, r6, lsr #31 + lsl r6, r6, #1 + orr r6, r6, r5, lsr #31 + lsl r5, r5, #1 + orr r5, r5, r4, lsr #31 + lsl r4, r4, #1 + and r11, r11, #0x7fffffff + mov r12, #19 + ldr r1, [sp, #32] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + adds r4, r4, r2 + mov lr, #0 + adcs r5, r5, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #36] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r5, r5, r2 + mov lr, #0 + adcs r6, r6, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #40] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r6, r6, r2 + mov lr, #0 + adcs r7, r7, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #44] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r7, r7, r2 + mov lr, #0 + adcs r8, r8, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #48] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r8, r8, r2 + mov lr, #0 + adcs r9, r9, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #52] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r9, r9, r2 + mov lr, #0 + adcs r10, r10, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #56] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r10, r10, r2 + mov lr, #0 + adcs r11, r11, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #60] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + adds r11, r11, r2 + adc r2, r3, lr + # Overflow + lsl r2, r2, #1 + orr r2, r2, r11, lsr #31 + mul r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Reduce if top bit set + asr r2, r11, #31 + and r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Store + strd r4, r5, [r0] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] + add sp, sp, #0x40 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_sq2,.-fe_sq2 +.text +.globl fe_invert +.type fe_invert, %function +.align 2 +fe_invert: + push {r4, lr} + sub sp, sp, #0x88 + # Invert + str r0, [sp, #128] + str r1, [sp, #132] + mov r0, sp + ldr r1, [sp, #132] + bl fe_sq + add r0, sp, #32 + mov r1, sp + bl fe_sq + add r0, sp, #32 + add r1, sp, 
#32 + bl fe_sq + add r0, sp, #32 + ldr r1, [sp, #132] + add r2, sp, #32 + bl fe_mul + mov r0, sp + mov r1, sp + add r2, sp, #32 + bl fe_mul + add r0, sp, #64 + mov r1, sp + bl fe_sq + add r0, sp, #32 + add r1, sp, #32 + add r2, sp, #64 + bl fe_mul + add r0, sp, #64 + add r1, sp, #32 + bl fe_sq + mov r4, #4 +L_fe_invert1: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert1 + add r0, sp, #32 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + add r0, sp, #64 + add r1, sp, #32 + bl fe_sq + mov r4, #9 +L_fe_invert2: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert2 + add r0, sp, #64 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + add r0, sp, #96 + add r1, sp, #64 + bl fe_sq + mov r4, #19 +L_fe_invert3: + add r0, sp, #96 + add r1, sp, #96 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert3 + add r0, sp, #64 + add r1, sp, #96 + add r2, sp, #64 + bl fe_mul + mov r4, #10 +L_fe_invert4: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert4 + add r0, sp, #32 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + add r0, sp, #64 + add r1, sp, #32 + bl fe_sq + mov r4, #0x31 +L_fe_invert5: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert5 + add r0, sp, #64 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + add r0, sp, #96 + add r1, sp, #64 + bl fe_sq + mov r4, #0x63 +L_fe_invert6: + add r0, sp, #96 + add r1, sp, #96 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert6 + add r0, sp, #64 + add r1, sp, #96 + add r2, sp, #64 + bl fe_mul + mov r4, #0x32 +L_fe_invert7: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert7 + add r0, sp, #32 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + mov r4, #5 +L_fe_invert8: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert8 + ldr r0, [sp, #128] + add r1, sp, #32 + mov r2, sp + bl fe_mul + ldr r1, [sp, #132] + ldr r0, [sp, #128] + add sp, sp, #0x88 + pop {r4, pc} +.size fe_invert,.-fe_invert +.text +.globl curve25519 +.type curve25519, %function +.align 2 +curve25519: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0xbc + str r0, [sp, #160] + str r1, [sp, #164] + str r2, [sp, #168] + mov r1, #0 + str r1, [sp, #172] + # Set one + mov r11, #1 + mov r10, #0 + str r11, [r0] + str r10, [r0, #4] + str r10, [r0, #8] + str r10, [r0, #12] + str r10, [r0, #16] + str r10, [r0, #20] + str r10, [r0, #24] + str r10, [r0, #28] + # Set zero + mov r10, #0 + str r10, [sp] + str r10, [sp, #4] + str r10, [sp, #8] + str r10, [sp, #12] + str r10, [sp, #16] + str r10, [sp, #20] + str r10, [sp, #24] + str r10, [sp, #28] + # Set one + mov r11, #1 + mov r10, #0 + str r11, [sp, #32] + str r10, [sp, #36] + str r10, [sp, #40] + str r10, [sp, #44] + str r10, [sp, #48] + str r10, [sp, #52] + str r10, [sp, #56] + str r10, [sp, #60] + # Copy + ldrd r4, r5, [r2] + ldrd r6, r7, [r2, #8] + strd r4, r5, [sp, #64] + strd r6, r7, [sp, #72] + ldrd r4, r5, [r2, #16] + ldrd r6, r7, [r2, #24] + strd r4, r5, [sp, #80] + strd r6, r7, [sp, #88] + mov r1, #30 + str r1, [sp, #180] + mov r2, #28 + str r2, [sp, #176] +L_curve25519_words: +L_curve25519_bits: + ldr r1, [sp, #164] + ldr r2, [r1, r2] + ldr r1, [sp, #180] + lsr r2, r2, r1 + and r2, r2, #1 + str r2, [sp, #184] + ldr r1, [sp, #172] + eor r1, r1, r2 + str r1, [sp, #172] + ldr r0, [sp, #160] + # Conditional Swap + neg r1, r1 + ldrd r4, r5, [r0] + ldrd r6, r7, [sp, #64] + eor r8, 
r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0] + strd r6, r7, [sp, #64] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [sp, #72] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0, #8] + strd r6, r7, [sp, #72] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [sp, #80] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0, #16] + strd r6, r7, [sp, #80] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [sp, #88] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0, #24] + strd r6, r7, [sp, #88] + ldr r1, [sp, #172] + # Conditional Swap + neg r1, r1 + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #32] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp] + strd r6, r7, [sp, #32] + ldrd r4, r5, [sp, #8] + ldrd r6, r7, [sp, #40] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp, #8] + strd r6, r7, [sp, #40] + ldrd r4, r5, [sp, #16] + ldrd r6, r7, [sp, #48] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp, #16] + strd r6, r7, [sp, #48] + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #56] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp, #24] + strd r6, r7, [sp, #56] + ldr r1, [sp, #184] + str r1, [sp, #172] + # Add-Sub + # Add + ldrd r4, r5, [r0] + ldrd r6, r7, [sp] + adds r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [r0] + # Sub + subs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #128] + # Add + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [sp, #8] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [r0, #8] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #136] + # Add + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [sp, #16] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [r0, #16] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #144] + # Add + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [sp, #24] + adds r3, r3, #-1 + adcs r8, r4, r6 + adc r9, r5, r7 + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + sbc r11, r5, r7 + mov r3, #-19 + asr r2, r9, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [r0] + subs r4, r4, r3 + sbcs r5, r5, r2 + strd r4, r5, [r0] + ldrd r4, r5, [r0, #8] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [r0, #8] + ldrd r4, r5, [r0, #16] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [r0, #16] + sbcs r8, r8, r2 + sbc r9, r9, r12 + strd r8, r9, [r0, #24] + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp, #128] + adds r4, r4, r3 + adcs r5, r5, r2 + 
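The ladder loop above never branches on the key: each iteration XORs the current scalar bit with the previous one and uses the result, widened into a full mask with neg, to conditionally exchange the two working points limb pair by limb pair. A C sketch of that conditional swap and of how such a swap is typically driven (variable names are hypothetical):

    #include <stdint.h>

    /* Swap a and b when swap == 1, leave them when swap == 0, without branches. */
    static void fe32_cswap(uint32_t a[8], uint32_t b[8], uint32_t swap)
    {
        uint32_t m = (uint32_t)0 - swap;          /* 0 or 0xffffffff */
        int i;
        for (i = 0; i < 8; i++) {
            uint32_t t = (a[i] ^ b[i]) & m;
            a[i] ^= t;
            b[i] ^= t;
        }
    }

    /* Typical use inside the ladder loop, swapping on the XOR of successive
     * scalar bits (x2, z2, x3, z3 are the working field elements):
     *     bit   = (k[i >> 3] >> (i & 7)) & 1;
     *     swap ^= bit;
     *     fe32_cswap(x2, x3, swap);
     *     fe32_cswap(z2, z3, swap);
     *     swap  = bit;
     *     ... ladder step ...
     */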
strd r4, r5, [sp, #128] + ldrd r4, r5, [sp, #136] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #136] + ldrd r4, r5, [sp, #144] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #144] + adcs r10, r10, r2 + adc r11, r11, r12 + strd r10, r11, [sp, #152] + # Add-Sub + # Add + ldrd r4, r5, [sp, #64] + ldrd r6, r7, [sp, #32] + adds r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp] + # Sub + subs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #96] + # Add + ldrd r4, r5, [sp, #72] + ldrd r6, r7, [sp, #40] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #8] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #104] + # Add + ldrd r4, r5, [sp, #80] + ldrd r6, r7, [sp, #48] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #16] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #112] + # Add + ldrd r4, r5, [sp, #88] + ldrd r6, r7, [sp, #56] + adds r3, r3, #-1 + adcs r8, r4, r6 + adc r9, r5, r7 + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + sbc r11, r5, r7 + mov r3, #-19 + asr r2, r9, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [sp] + subs r4, r4, r3 + sbcs r5, r5, r2 + strd r4, r5, [sp] + ldrd r4, r5, [sp, #8] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #8] + ldrd r4, r5, [sp, #16] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #16] + sbcs r8, r8, r2 + sbc r9, r9, r12 + strd r8, r9, [sp, #24] + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp, #96] + adds r4, r4, r3 + adcs r5, r5, r2 + strd r4, r5, [sp, #96] + ldrd r4, r5, [sp, #104] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #104] + ldrd r4, r5, [sp, #112] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #112] + adcs r10, r10, r2 + adc r11, r11, r12 + strd r10, r11, [sp, #120] + ldr r2, [sp, #160] + add r1, sp, #0x60 + add r0, sp, #0x20 + bl fe_mul + add r2, sp, #0x80 + add r1, sp, #0 + add r0, sp, #0 + bl fe_mul + add r1, sp, #0x80 + add r0, sp, #0x60 + bl fe_sq + ldr r1, [sp, #160] + add r0, sp, #0x80 + bl fe_sq + # Add-Sub + # Add + ldrd r4, r5, [sp, #32] + ldrd r6, r7, [sp] + adds r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #64] + # Sub + subs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp] + # Add + ldrd r4, r5, [sp, #40] + ldrd r6, r7, [sp, #8] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #72] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #8] + # Add + ldrd r4, r5, [sp, #48] + ldrd r6, r7, [sp, #16] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #80] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #16] + # Add + ldrd r4, r5, [sp, #56] + ldrd r6, r7, [sp, #24] + adds r3, r3, #-1 + adcs r8, r4, r6 + adc r9, r5, r7 + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + sbc r11, r5, r7 + mov r3, #-19 + asr r2, r9, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, 
#0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [sp, #64] + subs r4, r4, r3 + sbcs r5, r5, r2 + strd r4, r5, [sp, #64] + ldrd r4, r5, [sp, #72] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #72] + ldrd r4, r5, [sp, #80] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #80] + sbcs r8, r8, r2 + sbc r9, r9, r12 + strd r8, r9, [sp, #88] + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp] + adds r4, r4, r3 + adcs r5, r5, r2 + strd r4, r5, [sp] + ldrd r4, r5, [sp, #8] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #8] + ldrd r4, r5, [sp, #16] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #16] + adcs r10, r10, r2 + adc r11, r11, r12 + strd r10, r11, [sp, #24] + add r2, sp, #0x60 + add r1, sp, #0x80 + ldr r0, [sp, #160] + bl fe_mul + # Sub + ldrd r4, r5, [sp, #128] + ldrd r6, r7, [sp, #136] + ldrd r8, r9, [sp, #96] + ldrd r10, r11, [sp, #104] + subs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + sbcs r11, r7, r11 + strd r8, r9, [sp, #128] + strd r10, r11, [sp, #136] + ldrd r4, r5, [sp, #144] + ldrd r6, r7, [sp, #152] + ldrd r8, r9, [sp, #112] + ldrd r10, r11, [sp, #120] + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + sbc r11, r7, r11 + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp, #128] + ldrd r6, r7, [sp, #136] + adds r4, r4, r3 + adcs r5, r5, r2 + adcs r6, r6, r2 + adcs r7, r7, r2 + adcs r8, r8, r2 + adcs r9, r9, r2 + adcs r10, r10, r2 + adc r11, r11, r12 + strd r4, r5, [sp, #128] + strd r6, r7, [sp, #136] + strd r8, r9, [sp, #144] + strd r10, r11, [sp, #152] + add r1, sp, #0 + add r0, sp, #0 + bl fe_sq + # Multiply by 121666 + ldrd r4, r5, [sp, #128] + ldrd r6, r7, [sp, #136] + ldrd r8, r9, [sp, #144] + ldrd r10, r11, [sp, #152] + movw r12, #0xdb42 + movt r12, #1 + umull r4, r2, r4, r12 + umull r5, r3, r5, r12 + adds r5, r5, r2 + adc r2, r3, #0 + umull r6, r3, r6, r12 + adds r6, r6, r2 + adc r2, r3, #0 + umull r7, r3, r7, r12 + adds r7, r7, r2 + adc r2, r3, #0 + umull r8, r3, r8, r12 + adds r8, r8, r2 + adc r2, r3, #0 + umull r9, r3, r9, r12 + adds r9, r9, r2 + adc r2, r3, #0 + umull r10, r3, r10, r12 + adds r10, r10, r2 + adc r2, r3, #0 + umull r11, r3, r11, r12 + adds r11, r11, r2 + adc r2, r3, #0 + mov r12, #19 + lsl r2, r2, #1 + orr r2, r2, r11, lsr #31 + mul r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + strd r4, r5, [sp, #32] + strd r6, r7, [sp, #40] + strd r8, r9, [sp, #48] + strd r10, r11, [sp, #56] + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_sq + # Add + ldrd r4, r5, [sp, #96] + ldrd r6, r7, [sp, #104] + ldrd r8, r9, [sp, #32] + ldrd r10, r11, [sp, #40] + adds r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + adcs r11, r7, r11 + strd r8, r9, [sp, #96] + strd r10, r11, [sp, #104] + ldrd r4, r5, [sp, #112] + ldrd r6, r7, [sp, #120] + ldrd r8, r9, [sp, #48] + ldrd r10, r11, [sp, #56] + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + adc r11, r7, r11 + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [sp, #96] + ldrd r6, r7, [sp, #104] + subs r4, r4, r3 + sbcs r5, r5, r2 + sbcs r6, r6, r2 + sbcs r7, r7, r2 + sbcs r8, r8, r2 + sbcs r9, r9, r2 + sbcs r10, r10, r2 + sbc r11, r11, r12 + strd r4, 
r5, [sp, #96] + strd r6, r7, [sp, #104] + strd r8, r9, [sp, #112] + strd r10, r11, [sp, #120] + add r2, sp, #0 + ldr r1, [sp, #168] + add r0, sp, #0x20 + bl fe_mul + add r2, sp, #0x60 + add r1, sp, #0x80 + add r0, sp, #0 + bl fe_mul + ldr r2, [sp, #176] + ldr r1, [sp, #180] + subs r1, r1, #1 + str r1, [sp, #180] + bge L_curve25519_bits + mov r1, #31 + str r1, [sp, #180] + subs r2, r2, #4 + str r2, [sp, #176] + bge L_curve25519_words + # Invert + add r0, sp, #32 + add r1, sp, #0 + bl fe_sq + add r0, sp, #64 + add r1, sp, #32 + bl fe_sq + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + add r0, sp, #64 + add r1, sp, #0 + add r2, sp, #64 + bl fe_mul + add r0, sp, #32 + add r1, sp, #32 + add r2, sp, #64 + bl fe_mul + add r0, sp, #96 + add r1, sp, #32 + bl fe_sq + add r0, sp, #64 + add r1, sp, #64 + add r2, sp, #96 + bl fe_mul + add r0, sp, #96 + add r1, sp, #64 + bl fe_sq + mov r4, #4 +L_curve25519_inv_1: + add r0, sp, #96 + add r1, sp, #96 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_1 + add r0, sp, #64 + add r1, sp, #96 + add r2, sp, #64 + bl fe_mul + add r0, sp, #96 + add r1, sp, #64 + bl fe_sq + mov r4, #9 +L_curve25519_inv_2: + add r0, sp, #96 + add r1, sp, #96 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_2 + add r0, sp, #96 + add r1, sp, #96 + add r2, sp, #64 + bl fe_mul + add r0, sp, #128 + add r1, sp, #96 + bl fe_sq + mov r4, #19 +L_curve25519_inv_3: + add r0, sp, #128 + add r1, sp, #128 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_3 + add r0, sp, #96 + add r1, sp, #128 + add r2, sp, #96 + bl fe_mul + mov r4, #10 +L_curve25519_inv_4: + add r0, sp, #96 + add r1, sp, #96 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_4 + add r0, sp, #64 + add r1, sp, #96 + add r2, sp, #64 + bl fe_mul + add r0, sp, #96 + add r1, sp, #64 + bl fe_sq + mov r4, #0x31 +L_curve25519_inv_5: + add r0, sp, #96 + add r1, sp, #96 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_5 + add r0, sp, #96 + add r1, sp, #96 + add r2, sp, #64 + bl fe_mul + add r0, sp, #128 + add r1, sp, #96 + bl fe_sq + mov r4, #0x63 +L_curve25519_inv_6: + add r0, sp, #128 + add r1, sp, #128 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_6 + add r0, sp, #96 + add r1, sp, #128 + add r2, sp, #96 + bl fe_mul + mov r4, #0x32 +L_curve25519_inv_7: + add r0, sp, #96 + add r1, sp, #96 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_7 + add r0, sp, #64 + add r1, sp, #96 + add r2, sp, #64 + bl fe_mul + mov r4, #5 +L_curve25519_inv_8: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_8 + add r0, sp, #0 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + add r2, sp, #0 + ldr r1, [sp, #160] + ldr r0, [sp, #160] + bl fe_mul + mov r0, #0 + add sp, sp, #0xbc + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size curve25519,.-curve25519 +.text +.globl fe_pow22523 +.type fe_pow22523, %function +.align 2 +fe_pow22523: + push {r4, lr} + sub sp, sp, #0x68 + # pow22523 + str r0, [sp, #96] + str r1, [sp, #100] + mov r0, sp + ldr r1, [sp, #100] + bl fe_sq + add r0, sp, #32 + mov r1, sp + bl fe_sq + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + add r0, sp, #32 + ldr r1, [sp, #100] + add r2, sp, #32 + bl fe_mul + mov r0, sp + mov r1, sp + add r2, sp, #32 + bl fe_mul + mov r0, sp + mov r1, sp + bl fe_sq + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #32 + mov r1, sp + bl fe_sq + mov r4, #4 +L_fe_pow22523_1: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 
+ bne L_fe_pow22523_1 + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #32 + mov r1, sp + bl fe_sq + mov r4, #9 +L_fe_pow22523_2: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_2 + add r0, sp, #32 + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #64 + add r1, sp, #32 + bl fe_sq + mov r4, #19 +L_fe_pow22523_3: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_3 + add r0, sp, #32 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + mov r4, #10 +L_fe_pow22523_4: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_4 + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #32 + mov r1, sp + bl fe_sq + mov r4, #0x31 +L_fe_pow22523_5: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_5 + add r0, sp, #32 + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #64 + add r1, sp, #32 + bl fe_sq + mov r4, #0x63 +L_fe_pow22523_6: + add r0, sp, #64 + add r1, sp, #64 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_6 + add r0, sp, #32 + add r1, sp, #64 + add r2, sp, #32 + bl fe_mul + mov r4, #0x32 +L_fe_pow22523_7: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_7 + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + mov r4, #2 +L_fe_pow22523_8: + mov r0, sp + mov r1, sp + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_8 + ldr r0, [sp, #96] + mov r1, sp + ldr r2, [sp, #100] + bl fe_mul + ldr r1, [sp, #100] + ldr r0, [sp, #96] + add sp, sp, #0x68 + pop {r4, pc} +.size fe_pow22523,.-fe_pow22523 +.text +.globl fe_ge_to_p2 +.type fe_ge_to_p2, %function +.align 2 +fe_ge_to_p2: + push {lr} + sub sp, sp, #16 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r2, [sp, #28] + ldr r1, [sp, #12] + ldr r0, [sp] + bl fe_mul + ldr r2, [sp, #24] + ldr r1, [sp, #20] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #28] + ldr r1, [sp, #24] + ldr r0, [sp, #8] + bl fe_mul + add sp, sp, #16 + pop {pc} +.size fe_ge_to_p2,.-fe_ge_to_p2 +.text +.globl fe_ge_to_p3 +.type fe_ge_to_p3, %function +.align 2 +fe_ge_to_p3: + push {lr} + sub sp, sp, #16 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r2, [sp, #32] + ldr r1, [sp, #20] + ldr r0, [sp] + bl fe_mul + ldr r2, [sp, #28] + ldr r1, [sp, #24] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #32] + ldr r1, [sp, #28] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #24] + ldr r1, [sp, #20] + ldr r0, [sp, #12] + bl fe_mul + add sp, sp, #16 + pop {pc} +.size fe_ge_to_p3,.-fe_ge_to_p3 +.text +.globl fe_ge_dbl +.type fe_ge_dbl, %function +.align 2 +fe_ge_dbl: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #16 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r1, [sp, #52] + ldr r0, [sp] + bl fe_sq + ldr r1, [sp, #56] + ldr r0, [sp, #8] + bl fe_sq + ldr r0, [sp, #4] + ldr r1, [sp, #52] + ldr r2, [sp, #56] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, 
r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r1, [sp, #4] + ldr r0, [sp, #12] + bl fe_sq + ldr r0, [sp, #4] + ldr r1, [sp, #8] + ldr r2, [sp] + # Add-Sub + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r2] + ldr r6, [r2, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r1, #8] + ldr r4, [r1, #12] + ldr r5, [r2, #8] + ldr r6, [r2, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r2, #16] + ldr r6, [r2, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r1, #24] + ldr r4, [r1, #28] + ldr r5, [r2, #24] + ldr r6, [r2, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp] + ldr r1, [sp, #12] + ldr r2, [sp, #4] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, 
r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r1, [sp, #60] + ldr r0, [sp, #12] + bl fe_sq2 + ldr r0, [sp, #12] + ldr r1, [sp, #8] + # Sub + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + add sp, sp, #16 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_ge_dbl,.-fe_ge_dbl +.text +.globl fe_ge_madd +.type fe_ge_madd, %function +.align 2 +fe_ge_madd: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x20 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, 
#20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #88] + ldr r1, [sp] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #92] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #80] + ldr r1, [sp, #84] + ldr r0, [sp, #12] + bl fe_mul + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #76] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, 
r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #12] + # Add-Sub + # Add + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r1] + ldr r6, [r1, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r1, #16] + ldr r6, [r1, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #0x20 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_ge_madd,.-fe_ge_madd +.text +.globl fe_ge_msub +.type fe_ge_msub, %function +.align 2 +fe_ge_msub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x20 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr 
r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #92] + ldr r1, [sp] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #88] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #80] + ldr r1, [sp, #84] + ldr r0, [sp, #12] + bl fe_mul + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, 
r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #76] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #12] + ldr r1, [sp, #8] + # Add-Sub + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r1, #8] + ldr r4, [r1, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r1, #24] + ldr r4, [r1, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds 
r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #0x20 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_ge_msub,.-fe_ge_msub +.text +.globl fe_ge_add +.type fe_ge_add, %function +.align 2 +fe_ge_add: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x60 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #156] + ldr r1, [sp] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #160] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #144] + ldr r1, [sp, #152] + ldr r0, [sp, #12] + bl fe_mul + ldr r2, [sp, #148] + ldr r1, [sp, #140] + ldr r0, [sp] + bl fe_mul + add r0, sp, #16 + ldr r1, [sp] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, r10 + mov r12, #-19 + asr 
r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #12] + add r2, sp, #16 + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r1] + ldr r6, [r1, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r1, 
#16] + ldr r6, [r1, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #0x60 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_ge_add,.-fe_ge_add +.text +.globl fe_ge_sub +.type fe_ge_sub, %function +.align 2 +fe_ge_sub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x60 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, 
r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #160] + ldr r1, [sp] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #156] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #144] + ldr r1, [sp, #152] + ldr r0, [sp, #12] + bl fe_mul + ldr r2, [sp, #148] + ldr r1, [sp, #140] + ldr r0, [sp] + bl fe_mul + add r0, sp, #16 + ldr r1, [sp] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + 
# Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #12] + ldr r1, [sp, #8] + add r2, sp, #16 + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #0x60 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +.size fe_ge_sub,.-fe_ge_sub +#endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv7-curve25519.c b/wolfcrypt/src/port/arm/armv7-curve25519.c new file mode 100644 index 000000000..15c0cdf86 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv7-curve25519.c @@ -0,0 +1,5623 @@ +/* armv7-curve25519 + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifndef __aarch64__ +#ifdef HAVE_CONFIG_H + #include +#endif + +#include +#include +#include + +void fe_init() +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + "\n\t" + "add sp, sp, #0\n\t" + : + : + : "memory" + ); +} + +void fe_frombytes(fe out, const unsigned char* in) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + "ldrd r2, r3, [r1]\n\t" + "ldrd r12, r4, [r1, #8]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "ldrd r7, r8, [r1, #24]\n\t" + "and r8, r8, #0x7fffffff\n\t" + "strd r2, r3, [r0]\n\t" + "strd r12, r4, [r0, #8]\n\t" + "strd r5, r6, [r0, #16]\n\t" + "strd r7, r8, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [out] "+r" (out), [in] "+r" (in) + : + : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8" + ); +} + +void fe_tobytes(unsigned char* out, const fe n) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + "ldrd r2, r3, [r1]\n\t" + "ldrd r12, r4, [r1, #8]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "ldrd r7, r8, [r1, #24]\n\t" + "adds r9, r2, #19\n\t" + "adcs r9, r3, #0\n\t" + "adcs r9, r12, #0\n\t" + "adcs r9, r4, #0\n\t" + "adcs r9, r5, #0\n\t" + "adcs r9, r6, #0\n\t" + "adcs r9, r7, #0\n\t" + "adc r9, r8, #0\n\t" + "asr r9, r9, #31\n\t" + "and r9, r9, #19\n\t" + "adds r2, r2, r9\n\t" + "adcs r3, r3, #0\n\t" + "adcs r12, r12, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "and r8, r8, #0x7fffffff\n\t" + "strd r2, r3, [r0]\n\t" + "strd r12, r4, [r0, #8]\n\t" + "strd r5, r6, [r0, #16]\n\t" + "strd r7, r8, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [out] "+r" (out), [n] "+r" (n) + : + : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9" + ); +} + +void fe_1(fe n) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + /* Set one */ + "mov r2, #1\n\t" + "mov r1, #0\n\t" + "strd r2, r1, [r0]\n\t" + "strd r1, r1, [r0, #8]\n\t" + "strd r1, r1, [r0, #16]\n\t" + "strd r1, r1, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [n] "+r" (n) + : + : "memory", "r1", "r2" + ); +} + +void fe_0(fe n) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + /* Set zero */ + "mov r1, #0\n\t" + "strd r1, r1, [r0]\n\t" + "strd r1, r1, [r0, #8]\n\t" + "strd r1, r1, [r0, #16]\n\t" + "strd r1, r1, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [n] "+r" (n) + : + : "memory", "r1" + ); +} + +void fe_copy(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + /* Copy */ + "ldrd r2, r3, [r1]\n\t" + "ldrd r12, r4, [r1, #8]\n\t" + "strd r2, r3, [r0]\n\t" + "strd r12, r4, [r0, #8]\n\t" + "ldrd r2, r3, [r1, #16]\n\t" + "ldrd r12, r4, [r1, #24]\n\t" + "strd r2, r3, [r0, #16]\n\t" + "strd r12, r4, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "r4" + ); +} + +void fe_sub(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + /* Sub */ + "ldrd r12, r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, r12, r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" 
+ "strd r9, r10, [r0, #8]\n\t" + "ldrd r12, r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, r12, r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r11, #-19\n\t" + "asr r3, r10, #31\n\t" + /* Mask the modulus */ + "and r11, r3, r11\n\t" + "and lr, r3, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r12, r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, r11\n\t" + "adcs r4, r4, r3\n\t" + "adcs r5, r5, r3\n\t" + "adcs r6, r6, r3\n\t" + "adcs r7, r7, r3\n\t" + "adcs r8, r8, r3\n\t" + "adcs r9, r9, r3\n\t" + "adc r10, r10, lr\n\t" + "strd r12, r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); +} + +void fe_add(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + /* Add */ + "ldrd r12, r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, r12, r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd r12, r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, r12, r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r11, #-19\n\t" + "asr r3, r10, #31\n\t" + /* Mask the modulus */ + "and r11, r3, r11\n\t" + "and lr, r3, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r12, r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs r12, r12, r11\n\t" + "sbcs r4, r4, r3\n\t" + "sbcs r5, r5, r3\n\t" + "sbcs r6, r6, r3\n\t" + "sbcs r7, r7, r3\n\t" + "sbcs r8, r8, r3\n\t" + "sbcs r9, r9, r3\n\t" + "sbc r10, r10, lr\n\t" + "strd r12, r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); +} + +void fe_neg(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + "mov r6, #-1\n\t" + "mov r5, #-19\n\t" + "ldrd r2, r3, [r1]\n\t" + "ldrd r12, r4, [r1, #8]\n\t" + "subs r2, r5, r2\n\t" + "sbcs r3, r6, r3\n\t" + "sbcs r12, r6, r12\n\t" + "sbcs r4, r6, r4\n\t" + "strd r2, r3, [r0]\n\t" + "strd r12, r4, [r0, #8]\n\t" + "mov r5, #0x7fffffff\n\t" + "ldrd r2, r3, [r1, #16]\n\t" + "ldrd r12, r4, [r1, #24]\n\t" + "sbcs r2, r6, r2\n\t" + "sbcs r3, r6, r3\n\t" + "sbcs r12, r6, r12\n\t" + "sbc r4, r5, r4\n\t" + "strd r2, r3, [r0, #16]\n\t" + "strd r12, r4, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "r4", "r5", "r6" + ); +} + +int fe_isnonzero(const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + "ldrd r2, r3, [r0]\n\t" + "ldrd r12, r4, [r0, #8]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "ldrd r7, r8, [r0, #24]\n\t" + "adds r1, r2, #19\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "adcs r1, r4, #0\n\t" + "adcs r1, r5, #0\n\t" + "adcs r1, r6, #0\n\t" + "adcs r1, r7, #0\n\t" + "adc r1, r8, #0\n\t" + "asr r1, r1, #31\n\t" + "and r1, r1, #19\n\t" + "adds r2, r2, r1\n\t" + "adcs r3, r3, #0\n\t" + "adcs r12, r12, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs 
r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "and r8, r8, #0x7fffffff\n\t" + "orr r2, r2, r3\n\t" + "orr r12, r12, r4\n\t" + "orr r5, r5, r6\n\t" + "orr r7, r7, r8\n\t" + "orr r12, r12, r5\n\t" + "orr r2, r2, r7\n\t" + "orr %[a], r2, r12\n\t" + "add sp, sp, #0\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)a; +} + +int fe_isnegative(const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + "ldrd r2, r3, [r0]\n\t" + "ldrd r12, r4, [r0, #8]\n\t" + "adds r1, r2, #19\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "adcs r1, r4, #0\n\t" + "ldrd r2, r3, [r0, #16]\n\t" + "ldrd r12, r4, [r0, #24]\n\t" + "adcs r1, r2, #0\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "ldr r2, [r0]\n\t" + "adc r1, r4, #0\n\t" + "and %[a], r2, #1\n\t" + "lsr r1, r1, #31\n\t" + "eor %[a], %[a], r1\n\t" + "add sp, sp, #0\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r12", "r4" + ); + return (uint32_t)(size_t)a; +} + +void fe_cmov_table(fe* r, fe* base, signed char b) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + "sxtb %[b], %[b]\n\t" + "sbfx r8, %[b], #7, #1\n\t" + "eor r11, %[b], r8\n\t" + "sub r11, r11, r8\n\t" + "mov r3, #1\n\t" + "mov r12, #0\n\t" + "mov r4, #1\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #31\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #30\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #29\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #28\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, 
r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #27\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #26\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #25\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #24\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #32]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #64]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r9, #-19\n\t" + "mov r10, #-1\n\t" + "subs r9, r9, r6\n\t" + "sbcs r10, r10, r7\n\t" + "sbc lr, lr, lr\n\t" + "asr r11, %[b], #31\n\t" + "eor r8, r3, r4\n\t" + "and r8, r8, r11\n\t" + "eor r3, r3, r8\n\t" + "eor r4, r4, r8\n\t" + "eor r8, r12, r5\n\t" + "and r8, r8, r11\n\t" + "eor r12, r12, r8\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r11\n\t" + "eor r6, r6, r9\n\t" + "eor r10, r10, r7\n\t" + "and r10, r10, r11\n\t" + "eor r7, r7, r10\n\t" + "strd r3, r12, 
[r0]\n\t" + "strd r4, r5, [r0, #32]\n\t" + "strd r6, r7, [r0, #64]\n\t" + "sbfx r8, %[b], #7, #1\n\t" + "eor r11, %[b], r8\n\t" + "sub r11, r11, r8\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #31\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #30\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #29\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #28\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #27\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add 
%[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #26\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #25\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #24\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #40]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #72]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r9, #-1\n\t" + "mov r10, #-1\n\t" + "rsbs lr, lr, #0\n\t" + "sbcs r9, r9, r6\n\t" + "sbcs r10, r10, r7\n\t" + "sbc lr, lr, lr\n\t" + "asr r11, %[b], #31\n\t" + "eor r8, r3, r4\n\t" + "and r8, r8, r11\n\t" + "eor r3, r3, r8\n\t" + "eor r4, r4, r8\n\t" + "eor r8, r12, r5\n\t" + "and r8, r8, r11\n\t" + "eor r12, r12, r8\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r11\n\t" + "eor r6, r6, r9\n\t" + "eor r10, r10, r7\n\t" + "and r10, r10, r11\n\t" + "eor r7, r7, r10\n\t" + "strd r3, r12, [r0, #8]\n\t" + "strd r4, r5, [r0, #40]\n\t" + "strd r6, r7, [r0, #72]\n\t" + "sbfx r8, %[b], #7, #1\n\t" + "eor r11, %[b], r8\n\t" + "sub r11, r11, r8\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #31\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #30\n\t" + "ror r8, r8, r11\n\t" + "asr 
r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #29\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #28\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #27\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #26\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #25\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor 
r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #24\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #16]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #48]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #80]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r9, #-1\n\t" + "mov r10, #-1\n\t" + "rsbs lr, lr, #0\n\t" + "sbcs r9, r9, r6\n\t" + "sbcs r10, r10, r7\n\t" + "sbc lr, lr, lr\n\t" + "asr r11, %[b], #31\n\t" + "eor r8, r3, r4\n\t" + "and r8, r8, r11\n\t" + "eor r3, r3, r8\n\t" + "eor r4, r4, r8\n\t" + "eor r8, r12, r5\n\t" + "and r8, r8, r11\n\t" + "eor r12, r12, r8\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r11\n\t" + "eor r6, r6, r9\n\t" + "eor r10, r10, r7\n\t" + "and r10, r10, r11\n\t" + "eor r7, r7, r10\n\t" + "strd r3, r12, [r0, #16]\n\t" + "strd r4, r5, [r0, #48]\n\t" + "strd r6, r7, [r0, #80]\n\t" + "sbfx r8, %[b], #7, #1\n\t" + "eor r11, %[b], r8\n\t" + "sub r11, r11, r8\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #31\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #30\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #29\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, 
r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #28\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #27\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #26\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #25\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r8, #0x80000000\n\t" + "ror r8, r8, #24\n\t" + "ror r8, r8, r11\n\t" + "asr r8, r8, #31\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r12\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r12, r12, r10\n\t" + "ldrd r9, r10, [r1, #56]\n\t" + "eor r9, r9, r4\n\t" + "eor r10, r10, r5\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r4, r4, r9\n\t" + "eor r5, r5, r10\n\t" + "ldrd r9, r10, [r1, #88]\n\t" + "eor r9, r9, r6\n\t" + "eor r10, r10, r7\n\t" + "and r9, r9, r8\n\t" + "and r10, r10, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r10\n\t" + "sub %[base], %[base], 
#0x2a0\n\t" + "mov r9, #-1\n\t" + "mov r10, #0x7fffffff\n\t" + "rsbs lr, lr, #0\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r10, r10, r7\n\t" + "asr r11, %[b], #31\n\t" + "eor r8, r3, r4\n\t" + "and r8, r8, r11\n\t" + "eor r3, r3, r8\n\t" + "eor r4, r4, r8\n\t" + "eor r8, r12, r5\n\t" + "and r8, r8, r11\n\t" + "eor r12, r12, r8\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r11\n\t" + "eor r6, r6, r9\n\t" + "eor r10, r10, r7\n\t" + "and r10, r10, r11\n\t" + "eor r7, r7, r10\n\t" + "strd r3, r12, [r0, #24]\n\t" + "strd r4, r5, [r0, #56]\n\t" + "strd r6, r7, [r0, #88]\n\t" + "add sp, sp, #0\n\t" + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : + : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); +} + +void fe_mul(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Multiply */ + "ldr r8, [r1]\n\t" + "ldr r9, [r1, #4]\n\t" + "ldr r10, [r2]\n\t" + "ldr r4, [r2, #4]\n\t" + /* A[0] * B[0] = 0 */ + "umull r5, r6, r8, r10\n\t" + "str r5, [sp]\n\t" + /* A[0] * B[1] = 1 */ + "umull r3, r7, r8, r4\n\t" + "adds r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[1] * B[0] = 1 */ + "umull r3, r12, r9, r10\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #4]\n\t" + /* A[2] * B[0] = 2 */ + "ldr r11, [r1, #8]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r7, r7, r3\n\t" + "adc r5, r5, r12\n\t" + /* A[1] * B[1] = 2 */ + "umull r3, r12, r9, r4\n\t" + "adds r7, r7, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * B[2] = 2 */ + "ldr lr, [r2, #8]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #8]\n\t" + /* A[0] * B[3] = 3 */ + "ldr lr, [r2, #12]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r5, r5, r3\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[1] * B[2] = 3 */ + "ldr lr, [r2, #8]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[2] * B[1] = 3 */ + "umull r3, r12, r11, r4\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[3] * B[0] = 3 */ + "ldr r11, [r1, #12]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #12]\n\t" + /* A[4] * B[0] = 4 */ + "ldr r11, [r1, #16]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[1] = 4 */ + "ldr r11, [r1, #12]\n\t" + "umull r3, r12, r11, r4\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[2] = 4 */ + "ldr r11, [r1, #8]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[3] = 4 */ + "ldr lr, [r2, #12]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[0] * B[4] = 4 */ + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #16]\n\t" + /* A[0] * B[5] = 5 */ + "ldr lr, [r2, #20]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r7, r7, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[4] = 5 */ + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[3] = 5 */ + 
"ldr lr, [r2, #12]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[2] = 5 */ + "ldr r11, [r1, #12]\n\t" + "ldr lr, [r2, #8]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[1] = 5 */ + "ldr r11, [r1, #16]\n\t" + "umull r3, r12, r11, r4\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[0] = 5 */ + "ldr r11, [r1, #20]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #20]\n\t" + /* A[6] * B[0] = 6 */ + "ldr r11, [r1, #24]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r5, r5, r3\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[5] * B[1] = 6 */ + "ldr r11, [r1, #20]\n\t" + "umull r3, r12, r11, r4\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[4] * B[2] = 6 */ + "ldr r11, [r1, #16]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[3] * B[3] = 6 */ + "ldr r11, [r1, #12]\n\t" + "ldr lr, [r2, #12]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[2] * B[4] = 6 */ + "ldr r11, [r1, #8]\n\t" + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[1] * B[5] = 6 */ + "ldr lr, [r2, #20]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[0] * B[6] = 6 */ + "ldr lr, [r2, #24]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #24]\n\t" + /* A[0] * B[7] = 7 */ + "ldr lr, [r2, #28]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[6] = 7 */ + "ldr lr, [r2, #24]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[5] = 7 */ + "ldr lr, [r2, #20]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[4] = 7 */ + "ldr r11, [r1, #12]\n\t" + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[3] = 7 */ + "ldr r11, [r1, #16]\n\t" + "ldr lr, [r2, #12]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[2] = 7 */ + "ldr r11, [r1, #20]\n\t" + "ldr lr, [r2, #8]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[1] = 7 */ + "ldr r11, [r1, #24]\n\t" + "umull r3, r12, r11, r4\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[7] * B[0] = 7 */ + "ldr r11, [r1, #28]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #28]\n\t" + "ldr r8, [r1, #24]\n\t" + "ldr r10, [r2, #24]\n\t" + /* A[7] * B[1] = 8 */ + "umull r3, r12, r11, r4\n\t" + "adds r7, r7, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[2] = 8 */ + "umull r3, r12, r8, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[3] = 8 */ + "ldr r11, [r1, #20]\n\t" + "ldr lr, [r2, 
#12]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[4] = 8 */ + "ldr r11, [r1, #16]\n\t" + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[5] = 8 */ + "ldr r11, [r1, #12]\n\t" + "ldr lr, [r2, #20]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[6] = 8 */ + "ldr r11, [r1, #8]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[7] = 8 */ + "ldr lr, [r2, #28]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #32]\n\t" + "ldr r9, [r1, #28]\n\t" + "mov r4, lr\n\t" + /* A[2] * B[7] = 9 */ + "umull r3, r12, r11, r4\n\t" + "adds r5, r5, r3\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[3] * B[6] = 9 */ + "ldr r11, [r1, #12]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[4] * B[5] = 9 */ + "ldr r11, [r1, #16]\n\t" + "ldr lr, [r2, #20]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[5] * B[4] = 9 */ + "ldr r11, [r1, #20]\n\t" + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[6] * B[3] = 9 */ + "ldr lr, [r2, #12]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[7] * B[2] = 9 */ + "ldr lr, [r2, #8]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #36]\n\t" + /* A[7] * B[3] = 10 */ + "ldr lr, [r2, #12]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[4] = 10 */ + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[5] = 10 */ + "ldr lr, [r2, #20]\n\t" + "umull r3, r12, r11, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[6] = 10 */ + "ldr r11, [r1, #16]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[7] = 10 */ + "ldr r11, [r1, #12]\n\t" + "umull r3, r12, r11, r4\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #40]\n\t" + /* A[4] * B[7] = 11 */ + "ldr r11, [r1, #16]\n\t" + "umull r3, r12, r11, r4\n\t" + "adds r7, r7, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[6] = 11 */ + "ldr r11, [r1, #20]\n\t" + "umull r3, r12, r11, r10\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[5] = 11 */ + "umull r3, r12, r8, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[7] * B[4] = 11 */ + "ldr lr, [r2, #16]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r7, r7, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #44]\n\t" + /* A[7] * B[5] = 12 */ + "ldr lr, [r2, #20]\n\t" + "umull r3, r12, r9, lr\n\t" + "adds r5, r5, r3\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[6] * B[6] = 12 */ + "umull r3, r12, r8, r10\n\t" + "adds r5, r5, r3\n\t" + "adcs 
r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + /* A[5] * B[7] = 12 */ + "umull r3, r12, r11, r4\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #48]\n\t" + /* A[6] * B[7] = 13 */ + "umull r3, r12, r8, r4\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[7] * B[6] = 13 */ + "umull r3, r12, r9, r10\n\t" + "adds r6, r6, r3\n\t" + "adcs r7, r7, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #52]\n\t" + /* A[7] * B[7] = 14 */ + "umull r3, r12, r9, r4\n\t" + "adds r7, r7, r3\n\t" + "adc r5, r5, r12\n\t" + "str r7, [sp, #56]\n\t" + "str r5, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r5, r6, [sp]\n\t" + "ldrd r7, r8, [sp, #8]\n\t" + "ldrd r9, r10, [sp, #16]\n\t" + "ldrd r11, lr, [sp, #24]\n\t" + "lsr r3, lr, #31\n\t" + "and lr, lr, #0x7fffffff\n\t" + "mov r4, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "adds r5, r5, r3\n\t" + "mov %[b], #0\n\t" + "adcs r6, r6, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r6, r6, r3\n\t" + "mov %[b], #0\n\t" + "adcs r7, r7, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r7, r7, r3\n\t" + "mov %[b], #0\n\t" + "adcs r8, r8, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r8, r8, r3\n\t" + "mov %[b], #0\n\t" + "adcs r9, r9, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r9, r9, r3\n\t" + "mov %[b], #0\n\t" + "adcs r10, r10, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r10, r10, r3\n\t" + "mov %[b], #0\n\t" + "adcs r11, r11, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r11, r11, r3\n\t" + "mov %[b], #0\n\t" + "adcs lr, lr, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, r4, r3\n\t" + "adds lr, lr, r3\n\t" + "adc r3, r12, %[b]\n\t" + /* Overflow */ + "lsl r3, r3, #1\n\t" + "orr r3, r3, lr, lsr #31\n\t" + "mul r3, r3, r4\n\t" + "and lr, lr, #0x7fffffff\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adc lr, lr, #0\n\t" + /* Reduce if top bit set */ + "asr r3, lr, #31\n\t" + "and r3, r3, r4\n\t" + "and lr, lr, #0x7fffffff\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adc lr, lr, #0\n\t" + /* Store */ + "strd r5, r6, [r0]\n\t" + "strd r7, r8, [r0, #8]\n\t" + "strd r9, r10, [r0, #16]\n\t" + "strd r11, lr, [r0, #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", 
"r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); +} + +void fe_sq(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square */ + "ldr r8, [r1]\n\t" + "ldr r9, [r1, #4]\n\t" + "ldr r10, [r1, #8]\n\t" + "ldr r11, [r1, #12]\n\t" + "ldr r12, [r1, #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull r5, r6, r8, r8\n\t" + "str r5, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r8, r9\n\t" + "mov r7, #0\n\t" + "adds r6, r6, r2\n\t" + "adc r7, r7, r3\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r9, r9\n\t" + "adds r7, r7, r2\n\t" + "adc r5, r5, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r8, r10\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r9, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r10, r10\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr lr, [r1, #20]\n\t" + "umull r2, r3, r8, lr\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r9, r12\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r10, r11\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r11, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r9, lr\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r8, lr\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr lr, [r1, #28]\n\t" + "umull r2, r3, r8, lr\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, 
r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r9, lr\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr lr, [r1, #20]\n\t" + "umull r2, r3, r10, lr\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r11, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r11, lr\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r10, lr\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr lr, [r1, #28]\n\t" + "umull r2, r3, r9, lr\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #32]\n\t" + "ldr r8, [r1, #20]\n\t" + /* A[2] * A[7] = 9 */ + "umull r2, r3, r10, lr\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r11, lr\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #36]\n\t" + "mov r9, lr\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr lr, [r1, #28]\n\t" + "umull r2, r3, r11, lr\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #40]\n\t" + "mov r10, lr\n\t" + /* A[4] * A[7] = 11 */ + "umull r2, r3, r12, r10\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r8, r9\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" 
+ "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r10, r10\n\t" + "adds r7, r7, r2\n\t" + "adc r5, r5, r3\n\t" + "str r7, [sp, #56]\n\t" + "str r5, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r5, r6, [sp]\n\t" + "ldrd r7, r8, [sp, #8]\n\t" + "ldrd r9, r10, [sp, #16]\n\t" + "ldrd r11, lr, [sp, #24]\n\t" + "lsr r2, lr, #31\n\t" + "and lr, lr, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r6, r6, r2\n\t" + "mov r4, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r7, r7, r2\n\t" + "mov r4, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r8, r8, r2\n\t" + "mov r4, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r9, r9, r2\n\t" + "mov r4, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r10, r10, r2\n\t" + "mov r4, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r11, r11, r2\n\t" + "mov r4, #0\n\t" + "adcs lr, lr, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds lr, lr, r2\n\t" + "adc r2, r3, r4\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, lr, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and lr, lr, #0x7fffffff\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adc lr, lr, #0\n\t" + /* Reduce if top bit set */ + "asr r2, lr, #31\n\t" + "and r2, r2, r12\n\t" + "and lr, lr, #0x7fffffff\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adc lr, lr, #0\n\t" + /* Store */ + "strd r5, r6, [r0]\n\t" + "strd r7, r8, [r0, #8]\n\t" + "strd r9, r10, [r0, #16]\n\t" + "strd r11, lr, [r0, #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); +} + +void fe_mul121666(fe r, fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0\n\t" + /* Multiply by 121666 */ + "ldrd r2, r3, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + 
"ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "movw r4, #0xdb42\n\t" + "movt r4, #1\n\t" + "umull r2, r11, r2, r4\n\t" + "umull r3, r12, r3, r4\n\t" + "adds r3, r3, r11\n\t" + "adc r11, r12, #0\n\t" + "umull r5, r12, r5, r4\n\t" + "adds r5, r5, r11\n\t" + "adc r11, r12, #0\n\t" + "umull r6, r12, r6, r4\n\t" + "adds r6, r6, r11\n\t" + "adc r11, r12, #0\n\t" + "umull r7, r12, r7, r4\n\t" + "adds r7, r7, r11\n\t" + "adc r11, r12, #0\n\t" + "umull r8, r12, r8, r4\n\t" + "adds r8, r8, r11\n\t" + "adc r11, r12, #0\n\t" + "umull r9, r12, r9, r4\n\t" + "adds r9, r9, r11\n\t" + "adc r11, r12, #0\n\t" + "umull r10, r12, r10, r4\n\t" + "adds r10, r10, r11\n\t" + "adc r11, r12, #0\n\t" + "mov r4, #19\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, lsr #31\n\t" + "mul r11, r11, r4\n\t" + "and r10, r10, #0x7fffffff\n\t" + "adds r2, r2, r11\n\t" + "adcs r3, r3, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adc r10, r10, #0\n\t" + "strd r2, r3, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "add sp, sp, #0\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_sq2(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square * 2 */ + "ldr r8, [r1]\n\t" + "ldr r9, [r1, #4]\n\t" + "ldr r10, [r1, #8]\n\t" + "ldr r11, [r1, #12]\n\t" + "ldr r12, [r1, #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull r5, r6, r8, r8\n\t" + "str r5, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r8, r9\n\t" + "mov r7, #0\n\t" + "adds r6, r6, r2\n\t" + "adc r7, r7, r3\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r9, r9\n\t" + "adds r7, r7, r2\n\t" + "adc r5, r5, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r8, r10\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r9, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r10, r10\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr lr, [r1, #20]\n\t" + "umull r2, r3, r8, lr\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r9, r12\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs 
r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r10, r11\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r11, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r9, lr\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r8, lr\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr lr, [r1, #28]\n\t" + "umull r2, r3, r8, lr\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r9, lr\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr lr, [r1, #20]\n\t" + "umull r2, r3, r10, lr\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r11, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r11, lr\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r10, lr\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr lr, [r1, #28]\n\t" + "umull r2, r3, r9, lr\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #32]\n\t" + "ldr r8, [r1, #20]\n\t" + /* A[2] * A[7] = 9 */ + "umull r2, r3, r10, lr\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr lr, [r1, #24]\n\t" + "umull r2, r3, r11, lr\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #36]\n\t" + "mov r9, lr\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "mov r5, 
#0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr lr, [r1, #28]\n\t" + "umull r2, r3, r11, lr\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #40]\n\t" + "mov r10, lr\n\t" + /* A[4] * A[7] = 11 */ + "umull r2, r3, r12, r10\n\t" + "adds r7, r7, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r8, r9\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r7, r7, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r7, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r7, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r7, r7, #0\n\t" + "str r5, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r10, r10\n\t" + "adds r7, r7, r2\n\t" + "adc r5, r5, r3\n\t" + "str r7, [sp, #56]\n\t" + "str r5, [sp, #60]\n\t" + /* Double and Reduce */ + /* Load bottom half */ + "ldrd r5, r6, [sp]\n\t" + "ldrd r7, r8, [sp, #8]\n\t" + "ldrd r9, r10, [sp, #16]\n\t" + "ldrd r11, lr, [sp, #24]\n\t" + "lsr r2, lr, #30\n\t" + "lsl lr, lr, #1\n\t" + "orr lr, lr, r11, lsr #31\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, lsr #31\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "lsl r9, r9, #1\n\t" + "orr r9, r9, r8, lsr #31\n\t" + "lsl r8, r8, #1\n\t" + "orr r8, r8, r7, lsr #31\n\t" + "lsl r7, r7, #1\n\t" + "orr r7, r7, r6, lsr #31\n\t" + "lsl r6, r6, #1\n\t" + "orr r6, r6, r5, lsr #31\n\t" + "lsl r5, r5, #1\n\t" + "and lr, lr, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r6, r6, r2\n\t" + "mov r4, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r7, r7, r2\n\t" + "mov r4, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r8, r8, r2\n\t" + "mov r4, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r9, r9, r2\n\t" + "mov r4, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr 
r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r10, r10, r2\n\t" + "mov r4, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, r4\n\t" + "adds r11, r11, r2\n\t" + "mov r4, #0\n\t" + "adcs lr, lr, r3\n\t" + "adc r4, r4, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds lr, lr, r2\n\t" + "adc r2, r3, r4\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, lr, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and lr, lr, #0x7fffffff\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adc lr, lr, #0\n\t" + /* Reduce if top bit set */ + "asr r2, lr, #31\n\t" + "and r2, r2, r12\n\t" + "and lr, lr, #0x7fffffff\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adc lr, lr, #0\n\t" + /* Store */ + "strd r5, r6, [r0]\n\t" + "strd r7, r8, [r0, #8]\n\t" + "strd r9, r10, [r0, #16]\n\t" + "strd r11, lr, [r0, #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); +} + +void fe_invert(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x90\n\t" + /* Invert */ + "str %[r], [sp, #128]\n\t" + "str %[a], [sp, #132]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #132]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #132]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #64\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_invert1:\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert1\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_invert2:\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert2\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_invert3:\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert3\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #96\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_invert4:\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert4\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #0x31\n\t" + "\n" + "L_fe_invert5:\n\t" 
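+ /* 0x31 = 49 further squarings (50 with the one above); the loop counts 4, 9, 19, 10, 0x31, 0x63, 0x32, 5 form the fixed chain computing a^(p-2), i.e. the modular inverse. */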
+ "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert5\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_invert6:\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert6\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #96\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "mov r4, #0x32\n\t" + "\n" + "L_fe_invert7:\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert7\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_fe_invert8:\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert8\n\t" + "ldr r0, [sp, #128]\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #132]\n\t" + "ldr %[r], [sp, #128]\n\t" + "add sp, sp, #0x90\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +int curve25519(byte* r, byte* n, byte* a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0xc0\n\t" + "str %[r], [sp, #160]\n\t" + "str %[n], [sp, #164]\n\t" + "str %[a], [sp, #168]\n\t" + "mov %[n], #0\n\t" + "str %[n], [sp, #172]\n\t" + /* Set one */ + "mov lr, #1\n\t" + "mov r11, #0\n\t" + "strd lr, r11, [r0]\n\t" + "strd r11, r11, [r0, #8]\n\t" + "strd r11, r11, [r0, #16]\n\t" + "strd r11, r11, [r0, #24]\n\t" + /* Set zero */ + "mov r11, #0\n\t" + "strd r11, r11, [sp]\n\t" + "strd r11, r11, [sp, #8]\n\t" + "strd r11, r11, [sp, #16]\n\t" + "strd r11, r11, [sp, #24]\n\t" + /* Set one */ + "mov lr, #1\n\t" + "mov r11, #0\n\t" + "strd lr, r11, [sp, #32]\n\t" + "strd r11, r11, [sp, #40]\n\t" + "strd r11, r11, [sp, #48]\n\t" + "strd r11, r11, [sp, #56]\n\t" + /* Copy */ + "ldrd r5, r6, [r2]\n\t" + "ldrd r7, r8, [r2, #8]\n\t" + "strd r5, r6, [sp, #64]\n\t" + "strd r7, r8, [sp, #72]\n\t" + "ldrd r5, r6, [r2, #16]\n\t" + "ldrd r7, r8, [r2, #24]\n\t" + "strd r5, r6, [sp, #80]\n\t" + "strd r7, r8, [sp, #88]\n\t" + "mov %[n], #30\n\t" + "str %[n], [sp, #180]\n\t" + "mov %[a], #28\n\t" + "str %[a], [sp, #176]\n\t" + "\n" + "L_curve25519_words:\n\t" + "\n" + "L_curve25519_bits:\n\t" + "ldr %[n], [sp, #164]\n\t" + "ldr %[a], [r1, r2]\n\t" + "ldr %[n], [sp, #180]\n\t" + "lsr %[a], %[a], %[n]\n\t" + "and %[a], %[a], #1\n\t" + "str %[a], [sp, #184]\n\t" + "ldr %[n], [sp, #172]\n\t" + "eor %[n], %[n], %[a]\n\t" + "str %[n], [sp, #172]\n\t" + "ldr %[r], [sp, #160]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r5, r6, [r0]\n\t" + "ldrd r7, r8, [sp, #64]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, %[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [r0]\n\t" + "strd r7, r8, [sp, #64]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "ldrd r7, r8, [sp, #72]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, %[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [sp, #72]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "ldrd r7, r8, [sp, #80]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, 
%[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [r0, #16]\n\t" + "strd r7, r8, [sp, #80]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "ldrd r7, r8, [sp, #88]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, %[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [r0, #24]\n\t" + "strd r7, r8, [sp, #88]\n\t" + "ldr %[n], [sp, #172]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r5, r6, [sp]\n\t" + "ldrd r7, r8, [sp, #32]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, %[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [sp]\n\t" + "strd r7, r8, [sp, #32]\n\t" + "ldrd r5, r6, [sp, #8]\n\t" + "ldrd r7, r8, [sp, #40]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, %[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [sp, #8]\n\t" + "strd r7, r8, [sp, #40]\n\t" + "ldrd r5, r6, [sp, #16]\n\t" + "ldrd r7, r8, [sp, #48]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, %[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [sp, #16]\n\t" + "strd r7, r8, [sp, #48]\n\t" + "ldrd r5, r6, [sp, #24]\n\t" + "ldrd r7, r8, [sp, #56]\n\t" + "eor r9, r5, r7\n\t" + "eor r10, r6, r8\n\t" + "and r9, r9, %[n]\n\t" + "and r10, r10, %[n]\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r9\n\t" + "eor r8, r8, r10\n\t" + "strd r5, r6, [sp, #24]\n\t" + "strd r7, r8, [sp, #56]\n\t" + "ldr %[n], [sp, #184]\n\t" + "str %[n], [sp, #172]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r5, r6, [r0]\n\t" + "ldrd r7, r8, [sp]\n\t" + "adds r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [r0]\n\t" + /* Sub */ + "subs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #128]\n\t" + /* Add */ + "ldrd r5, r6, [r0, #8]\n\t" + "ldrd r7, r8, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [r0, #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #136]\n\t" + /* Add */ + "ldrd r5, r6, [r0, #16]\n\t" + "ldrd r7, r8, [sp, #16]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [r0, #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #144]\n\t" + /* Add */ + "ldrd r5, r6, [r0, #24]\n\t" + "ldrd r7, r8, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "adc r10, r6, r8\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "sbc lr, r6, r8\n\t" + "mov r3, #-19\n\t" + "asr %[a], r10, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r5, r6, [r0]\n\t" + "subs r5, r5, r3\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, 
r6, %[a]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [r0, #16]\n\t" + "sbcs r9, r9, %[a]\n\t" + "sbc r10, r10, r12\n\t" + "strd r9, r10, [r0, #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], lr, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r5, r6, [sp, #128]\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #128]\n\t" + "ldrd r5, r6, [sp, #136]\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #136]\n\t" + "ldrd r5, r6, [sp, #144]\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #144]\n\t" + "adcs r11, r11, %[a]\n\t" + "adc lr, lr, r12\n\t" + "strd r11, lr, [sp, #152]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r5, r6, [sp, #64]\n\t" + "ldrd r7, r8, [sp, #32]\n\t" + "adds r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [sp]\n\t" + /* Sub */ + "subs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #96]\n\t" + /* Add */ + "ldrd r5, r6, [sp, #72]\n\t" + "ldrd r7, r8, [sp, #40]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [sp, #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #104]\n\t" + /* Add */ + "ldrd r5, r6, [sp, #80]\n\t" + "ldrd r7, r8, [sp, #48]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [sp, #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #112]\n\t" + /* Add */ + "ldrd r5, r6, [sp, #88]\n\t" + "ldrd r7, r8, [sp, #56]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "adc r10, r6, r8\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "sbc lr, r6, r8\n\t" + "mov r3, #-19\n\t" + "asr %[a], r10, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r5, r6, [sp]\n\t" + "subs r5, r5, r3\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp]\n\t" + "ldrd r5, r6, [sp, #8]\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #8]\n\t" + "ldrd r5, r6, [sp, #16]\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #16]\n\t" + "sbcs r9, r9, %[a]\n\t" + "sbc r10, r10, r12\n\t" + "strd r9, r10, [sp, #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], lr, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r5, r6, [sp, #96]\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #96]\n\t" + "ldrd r5, r6, [sp, #104]\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #104]\n\t" + "ldrd r5, r6, [sp, #112]\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #112]\n\t" + "adcs r11, r11, %[a]\n\t" + "adc lr, lr, r12\n\t" + "strd r11, lr, [sp, #120]\n\t" + "ldr r2, [sp, #160]\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x20\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x80\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl 
fe_mul\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #160]\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r5, r6, [sp, #32]\n\t" + "ldrd r7, r8, [sp]\n\t" + "adds r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [sp, #64]\n\t" + /* Sub */ + "subs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp]\n\t" + /* Add */ + "ldrd r5, r6, [sp, #40]\n\t" + "ldrd r7, r8, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [sp, #72]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #8]\n\t" + /* Add */ + "ldrd r5, r6, [sp, #48]\n\t" + "ldrd r7, r8, [sp, #16]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "mov r3, #0\n\t" + "adcs r10, r6, r8\n\t" + "adc r3, r3, #0\n\t" + "strd r9, r10, [sp, #80]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "mov r12, #0\n\t" + "sbcs lr, r6, r8\n\t" + "adc r12, r12, #0\n\t" + "strd r11, lr, [sp, #16]\n\t" + /* Add */ + "ldrd r5, r6, [sp, #56]\n\t" + "ldrd r7, r8, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r9, r5, r7\n\t" + "adc r10, r6, r8\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r11, r5, r7\n\t" + "sbc lr, r6, r8\n\t" + "mov r3, #-19\n\t" + "asr %[a], r10, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r5, r6, [sp, #64]\n\t" + "subs r5, r5, r3\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #64]\n\t" + "ldrd r5, r6, [sp, #72]\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #72]\n\t" + "ldrd r5, r6, [sp, #80]\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #80]\n\t" + "sbcs r9, r9, %[a]\n\t" + "sbc r10, r10, r12\n\t" + "strd r9, r10, [sp, #88]\n\t" + "mov r3, #-19\n\t" + "asr %[a], lr, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r5, r6, [sp]\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp]\n\t" + "ldrd r5, r6, [sp, #8]\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #8]\n\t" + "ldrd r5, r6, [sp, #16]\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "strd r5, r6, [sp, #16]\n\t" + "adcs r11, r11, %[a]\n\t" + "adc lr, lr, r12\n\t" + "strd r11, lr, [sp, #24]\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + /* Sub */ + "ldrd r5, r6, [sp, #128]\n\t" + "ldrd r7, r8, [sp, #136]\n\t" + "ldrd r9, r10, [sp, #96]\n\t" + "ldrd r11, lr, [sp, #104]\n\t" + "subs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "sbcs lr, r8, lr\n\t" + "strd r9, r10, [sp, #128]\n\t" + "strd r11, lr, [sp, #136]\n\t" + "ldrd r5, r6, [sp, #144]\n\t" + "ldrd r7, r8, [sp, #152]\n\t" + "ldrd r9, r10, [sp, #112]\n\t" + "ldrd r11, lr, [sp, #120]\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "sbc lr, r8, lr\n\t" + "mov r3, #-19\n\t" + "asr %[a], lr, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r5, r6, [sp, #128]\n\t" + "ldrd r7, r8, [sp, #136]\n\t" + "adds r5, r5, 
r3\n\t" + "adcs r6, r6, %[a]\n\t" + "adcs r7, r7, %[a]\n\t" + "adcs r8, r8, %[a]\n\t" + "adcs r9, r9, %[a]\n\t" + "adcs r10, r10, %[a]\n\t" + "adcs r11, r11, %[a]\n\t" + "adc lr, lr, r12\n\t" + "strd r5, r6, [sp, #128]\n\t" + "strd r7, r8, [sp, #136]\n\t" + "strd r9, r10, [sp, #144]\n\t" + "strd r11, lr, [sp, #152]\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl fe_sq\n\t" + /* Multiply by 121666 */ + "ldrd r5, r6, [sp, #128]\n\t" + "ldrd r7, r8, [sp, #136]\n\t" + "ldrd r9, r10, [sp, #144]\n\t" + "ldrd r11, lr, [sp, #152]\n\t" + "movw r12, #0xdb42\n\t" + "movt r12, #1\n\t" + "umull r5, %[a], r5, r12\n\t" + "umull r6, r3, r6, r12\n\t" + "adds r6, r6, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r7, r3, r7, r12\n\t" + "adds r7, r7, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r8, r3, r8, r12\n\t" + "adds r8, r8, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r9, r3, r9, r12\n\t" + "adds r9, r9, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r10, r3, r10, r12\n\t" + "adds r10, r10, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r11, r3, r11, r12\n\t" + "adds r11, r11, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull lr, r3, lr, r12\n\t" + "adds lr, lr, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "mov r12, #19\n\t" + "lsl %[a], %[a], #1\n\t" + "orr %[a], %[a], lr, lsr #31\n\t" + "mul %[a], %[a], r12\n\t" + "and lr, lr, #0x7fffffff\n\t" + "adds r5, r5, %[a]\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adc lr, lr, #0\n\t" + "strd r5, r6, [sp, #32]\n\t" + "strd r7, r8, [sp, #40]\n\t" + "strd r9, r10, [sp, #48]\n\t" + "strd r11, lr, [sp, #56]\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq\n\t" + /* Add */ + "ldrd r5, r6, [sp, #96]\n\t" + "ldrd r7, r8, [sp, #104]\n\t" + "ldrd r9, r10, [sp, #32]\n\t" + "ldrd r11, lr, [sp, #40]\n\t" + "adds r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "adcs lr, r8, lr\n\t" + "strd r9, r10, [sp, #96]\n\t" + "strd r11, lr, [sp, #104]\n\t" + "ldrd r5, r6, [sp, #112]\n\t" + "ldrd r7, r8, [sp, #120]\n\t" + "ldrd r9, r10, [sp, #48]\n\t" + "ldrd r11, lr, [sp, #56]\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "adc lr, r8, lr\n\t" + "mov r3, #-19\n\t" + "asr %[a], lr, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r5, r6, [sp, #96]\n\t" + "ldrd r7, r8, [sp, #104]\n\t" + "subs r5, r5, r3\n\t" + "sbcs r6, r6, %[a]\n\t" + "sbcs r7, r7, %[a]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbcs r9, r9, %[a]\n\t" + "sbcs r10, r10, %[a]\n\t" + "sbcs r11, r11, %[a]\n\t" + "sbc lr, lr, r12\n\t" + "strd r5, r6, [sp, #96]\n\t" + "strd r7, r8, [sp, #104]\n\t" + "strd r9, r10, [sp, #112]\n\t" + "strd r11, lr, [sp, #120]\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #168]\n\t" + "add r0, sp, #0x20\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #176]\n\t" + "ldr %[n], [sp, #180]\n\t" + "subs %[n], %[n], #1\n\t" + "str %[n], [sp, #180]\n\t" + "bge L_curve25519_bits\n\t" + "mov %[n], #31\n\t" + "str %[n], [sp, #180]\n\t" + "subs %[a], %[a], #4\n\t" + "str %[a], [sp, #176]\n\t" + "bge L_curve25519_words\n\t" + /* Invert */ + "add r0, sp, #32\n\t" + "add r1, sp, #0\n\t" + "bl fe_sq\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, 
#0\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #96\n\t" + "bl fe_mul\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_curve25519_inv_1:\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_1\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #96\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_curve25519_inv_2:\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_2\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "add r0, sp, #128\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_curve25519_inv_3:\n\t" + "add r0, sp, #128\n\t" + "add r1, sp, #128\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_3\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #128\n\t" + "add r2, sp, #96\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_curve25519_inv_4:\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_4\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #96\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "mov r4, #0x31\n\t" + "\n" + "L_curve25519_inv_5:\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_5\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "add r0, sp, #128\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_curve25519_inv_6:\n\t" + "add r0, sp, #128\n\t" + "add r1, sp, #128\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_6\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #128\n\t" + "add r2, sp, #96\n\t" + "bl fe_mul\n\t" + "mov r4, #0x32\n\t" + "\n" + "L_curve25519_inv_7:\n\t" + "add r0, sp, #96\n\t" + "add r1, sp, #96\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_7\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #96\n\t" + "add r2, sp, #64\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_curve25519_inv_8:\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_8\n\t" + "add r0, sp, #0\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #160]\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + "mov r0, #0\n\t" + "add sp, sp, #0xc0\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); + return (uint32_t)(size_t)r; +} + +void fe_pow22523(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x70\n\t" + /* pow22523 */ + "str %[r], [sp, #96]\n\t" + "str %[a], [sp, #100]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #100]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl 
fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #100]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_pow22523_1:\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_1\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_pow22523_2:\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_2\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_pow22523_3:\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_3\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_pow22523_4:\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_4\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #0x31\n\t" + "\n" + "L_fe_pow22523_5:\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_5\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_pow22523_6:\n\t" + "add r0, sp, #64\n\t" + "add r1, sp, #64\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_6\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #64\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #0x32\n\t" + "\n" + "L_fe_pow22523_7:\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_7\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "mov r4, #2\n\t" + "\n" + "L_fe_pow22523_8:\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_8\n\t" + "ldr r0, [sp, #96]\n\t" + "mov r1, sp\n\t" + "ldr r2, [sp, #100]\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #100]\n\t" + "ldr %[r], [sp, #96]\n\t" + "add sp, sp, #0x70\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[px], [sp, #12]\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #24]\n\t" + "ldr r1, [sp, #20]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" 
(rz), [px] "+r" (px) + : + : "memory", "lr" + ); + (void)py; + (void)pz; + (void)pt; +} + +void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #20]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #24]\n\t" + "ldr r1, [sp, #20]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) + : + : "memory", "lr" + ); + (void)px; + (void)py; + (void)pz; + (void)pt; +} + +void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r1, [sp, #52]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #56]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #52]\n\t" + "ldr r2, [sp, #56]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r2]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1, #8]\n\t" + "ldrd r6, r7, [r2, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r2, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, 
#-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1, #24]\n\t" + "ldrd r6, r7, [r2, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r2, [sp, #4]\n\t" + /* Sub */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" + "adcs r8, r8, lr\n\t" + "adcs r9, r9, lr\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r1, [sp, #60]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq2\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + /* Sub */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "ldrd r8, r9, [r1]\n\t" + "ldrd r10, r11, [r1, #8]\n\t" + "subs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r0]\n\t" 
+ "ldrd r6, r7, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" + "adcs r8, r8, lr\n\t" + "adcs r9, r9, lr\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) + : + : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); + (void)px; + (void)py; + (void)pz; +} + +void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x20\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" + /* Sub */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" + "adcs r8, r8, lr\n\t" + "adcs r9, r9, lr\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r2, [sp, #88]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #92]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #80]\n\t" + "ldr r1, [sp, #84]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + 
"ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #76]\n\t" + /* Double */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, 
[r1]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r0, #8]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r0, #16]\n\t" + "ldrd r6, r7, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r0, #24]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "add sp, sp, #0x20\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) + : + : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); + (void)px; + (void)py; + (void)pz; + (void)pt; + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x20\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + 
"ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" + /* Sub */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" + "adcs r8, r8, lr\n\t" + "adcs r9, r9, lr\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r2, [sp, #92]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #88]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #80]\n\t" + "ldr r1, [sp, #84]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov 
r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #76]\n\t" + /* Double */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask 
the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "add sp, sp, #0x20\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) + : + : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); + (void)px; + (void)py; + (void)pz; + (void)pt; + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x60\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #136]\n\t" + "ldr r2, [sp, #132]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #136]\n\t" + "ldr r2, [sp, #132]\n\t" + /* Sub */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the 
modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" + "adcs r8, r8, lr\n\t" + "adcs r9, r9, lr\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r2, [sp, #156]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #160]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #144]\n\t" + "ldr r1, [sp, #152]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #148]\n\t" + "ldr r1, [sp, #140]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "add r0, sp, #16\n\t" + "ldr r1, [sp]\n\t" + /* Double */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd 
%[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r1]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "add sp, sp, #0x60\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] 
"+r" (rt) + : + : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); + (void)px; + (void)py; + (void)pz; + (void)pt; + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x60\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #136]\n\t" + "ldr r2, [sp, #132]\n\t" + /* Add */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #136]\n\t" + "ldr r2, [sp, #132]\n\t" + /* Sub */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" + "adcs r8, r8, lr\n\t" + "adcs r9, r9, lr\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r2, [sp, #160]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #156]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #144]\n\t" + "ldr r1, [sp, #152]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #148]\n\t" + "ldr r1, [sp, #140]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "add r0, sp, #16\n\t" + "ldr r1, [sp]\n\t" + /* Double */ + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, 
r10\n\t" + "adc r11, r11, r11\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, lr\n\t" + "sbcs r9, r9, lr\n\t" + "sbcs r10, r10, lr\n\t" + "sbc r11, r11, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0]\n\t" + /* Sub */ + "subs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + 
"strd r10, r11, [r1]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #8]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r8, r9, [r0, #16]\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "mov r4, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r4, r4, #0\n\t" + "strd r10, r11, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r4, r4, #-1\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r12, #-19\n\t" + "asr lr, r9, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r5, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], lr\n\t" + "sbcs r5, r5, lr\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, lr\n\t" + "sbc r9, r9, r4\n\t" + "strd r8, r9, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr lr, r11, #31\n\t" + /* Mask the modulus */ + "and r12, lr, r12\n\t" + "and r4, lr, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r5, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], lr\n\t" + "adcs r5, r5, lr\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, lr\n\t" + "adc r11, r11, r4\n\t" + "strd r10, r11, [r1, #24]\n\t" + "add sp, sp, #0x60\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) + : + : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" + ); + (void)px; + (void)py; + (void)pz; + (void)pt; + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +#endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index 6d384dde0..9426d9987 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -19,6 +19,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +#ifdef __aarch64__ .text .globl fe_init .type fe_init,@function @@ -50,8 +51,8 @@ fe_tobytes: adcs x6, x3, xzr adcs x6, x4, xzr adc x6, x5, xzr - lsr x6, x6, #63 - mul x6, x6, x7 + asr x6, x6, #63 + and x6, x6, x7 adds x2, x2, x6 adcs x3, x3, xzr adcs x4, x4, xzr @@ -224,8 +225,8 @@ fe_isnonzero: adcs x5, x2, xzr adcs x5, x3, xzr adc x5, x4, xzr - lsr x5, x5, #63 - mul x5, x5, x6 + asr x5, x5, #63 + and x5, x5, x6 adds x1, x1, x5 adcs x2, x2, xzr adcs x3, x3, xzr @@ -248,11 +249,9 @@ fe_isnegative: adcs x5, x2, xzr adcs x5, x3, xzr adc x5, x4, xzr - lsr x5, x5, #63 - mul x5, x5, x6 - ldr x1, [x0] - adds x1, x1, x5 and x0, x1, #1 + lsr x5, x5, 
#63 + eor x0, x0, x5 ret .size fe_isnegative,.-fe_isnegative .text @@ -276,8 +275,7 @@ fe_cmov_table: str x28, [x29, #104] sxtb x2, w2 sbfx x15, x2, #7, #1 - sxtb x16, w2 - eor x16, x16, x15 + eor x16, x2, x15 sub x16, x16, x15 mov x3, #1 mov x4, xzr @@ -444,8 +442,6 @@ fe_cmov_table: csel x12, x26, x12, eq csel x13, x27, x13, eq csel x14, x28, x14, eq - add x1, x1, #0x180 - sub x1, x1, #0x180 mov x17, #-19 mov x18, #-1 mov x19, #-1 @@ -499,105 +495,104 @@ fe_cmov_table: fe_mul: stp x29, x30, [sp, #-64]! add x29, sp, #0 - str x17, [x29, #16] - str x18, [x29, #24] - str x19, [x29, #32] - str x20, [x29, #40] - str x21, [x29, #48] - str x22, [x29, #56] + str x17, [x29, #24] + str x18, [x29, #32] + str x19, [x29, #40] + str x20, [x29, #48] + str x21, [x29, #56] # Multiply - ldp x15, x16, [x1] - ldp x17, x18, [x1, #16] - ldp x19, x20, [x2] - ldp x21, x22, [x2, #16] + ldp x14, x15, [x1] + ldp x16, x17, [x1, #16] + ldp x18, x19, [x2] + ldp x20, x21, [x2, #16] # A[0] * B[0] - mul x6, x15, x19 - umulh x7, x15, x19 + mul x6, x14, x18 + umulh x7, x14, x18 # A[0] * B[1] - mul x3, x15, x20 - umulh x8, x15, x20 + mul x3, x14, x19 + umulh x8, x14, x19 adds x7, x7, x3 adc x8, x8, xzr # A[1] * B[0] - mul x3, x16, x19 - umulh x4, x16, x19 + mul x3, x15, x18 + umulh x4, x15, x18 adds x7, x7, x3 adcs x8, x8, x4 adc x9, xzr, xzr # A[0] * B[2] - mul x3, x15, x21 - umulh x4, x15, x21 + mul x3, x14, x20 + umulh x4, x14, x20 adds x8, x8, x3 adc x9, x9, x4 # A[1] * B[1] - mul x3, x16, x20 - umulh x4, x16, x20 + mul x3, x15, x19 + umulh x4, x15, x19 adds x8, x8, x3 adcs x9, x9, x4 adc x10, xzr, xzr # A[2] * B[0] - mul x3, x17, x19 - umulh x4, x17, x19 + mul x3, x16, x18 + umulh x4, x16, x18 adds x8, x8, x3 adcs x9, x9, x4 adc x10, x10, xzr # A[0] * B[3] - mul x3, x15, x22 - umulh x4, x15, x22 + mul x3, x14, x21 + umulh x4, x14, x21 adds x9, x9, x3 adcs x10, x10, x4 adc x11, xzr, xzr # A[1] * B[2] - mul x3, x16, x21 - umulh x4, x16, x21 + mul x3, x15, x20 + umulh x4, x15, x20 adds x9, x9, x3 adcs x10, x10, x4 adc x11, x11, xzr # A[2] * B[1] - mul x3, x17, x20 - umulh x4, x17, x20 + mul x3, x16, x19 + umulh x4, x16, x19 adds x9, x9, x3 adcs x10, x10, x4 adc x11, x11, xzr # A[3] * B[0] - mul x3, x18, x19 - umulh x4, x18, x19 + mul x3, x17, x18 + umulh x4, x17, x18 adds x9, x9, x3 adcs x10, x10, x4 adc x11, x11, xzr # A[1] * B[3] - mul x3, x16, x22 - umulh x4, x16, x22 + mul x3, x15, x21 + umulh x4, x15, x21 adds x10, x10, x3 adcs x11, x11, x4 adc x12, xzr, xzr # A[2] * B[2] - mul x3, x17, x21 - umulh x4, x17, x21 + mul x3, x16, x20 + umulh x4, x16, x20 adds x10, x10, x3 adcs x11, x11, x4 adc x12, x12, xzr # A[3] * B[1] - mul x3, x18, x20 - umulh x4, x18, x20 + mul x3, x17, x19 + umulh x4, x17, x19 adds x10, x10, x3 adcs x11, x11, x4 adc x12, x12, xzr # A[2] * B[3] - mul x3, x17, x22 - umulh x4, x17, x22 + mul x3, x16, x21 + umulh x4, x16, x21 adds x11, x11, x3 adcs x12, x12, x4 adc x13, xzr, xzr # A[3] * B[2] - mul x3, x18, x21 - umulh x4, x18, x21 + mul x3, x17, x20 + umulh x4, x17, x20 adds x11, x11, x3 adcs x12, x12, x4 adc x13, x13, xzr # A[3] * B[3] - mul x3, x18, x22 - umulh x4, x18, x22 + mul x3, x17, x21 + umulh x4, x17, x21 adds x12, x12, x3 adc x13, x13, x4 # Reduce @@ -636,8 +631,8 @@ fe_mul: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 + asr x5, x9, #63 + and x5, x5, x3 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -646,12 +641,11 @@ fe_mul: # Store stp x6, x7, [x0] stp x8, x9, [x0, #16] - ldr x17, [x29, #16] - ldr x18, [x29, #24] - ldr 
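Editorial note: the fe_cmov_table change above drops a redundant second sxtb; the remaining sbfx/eor/sub sequence is the usual branch-free split of the signed window digit into its absolute value and a sign flag. A short C sketch under that assumption (abs_and_sign is an illustrative name, not a wolfSSL function):

    #include <stdint.h>

    /* Branch-free split of a signed window digit b (roughly in [-8, 8])
     * into |b| and a sign flag, matching the sbfx/eor/sub sequence. */
    static void abs_and_sign(signed char b, unsigned char *abs_b,
                             unsigned char *negative)
    {
        int mask = -(b < 0);                            /* 0 or -1, like sbfx #7, #1 */
        *abs_b    = (unsigned char)((b ^ mask) - mask); /* (b ^ m) - m == |b|        */
        *negative = (unsigned char)(mask & 1);          /* 1 when the digit < 0      */
    }

The sign flag then drives a constant-time conditional negation of the looked-up table entry, so neither the table index nor the sign leaks through branches.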
x19, [x29, #32] - ldr x20, [x29, #40] - ldr x21, [x29, #48] - ldr x22, [x29, #56] + ldr x17, [x29, #24] + ldr x18, [x29, #32] + ldr x19, [x29, #40] + ldr x20, [x29, #48] + ldr x21, [x29, #56] ldp x29, x30, [sp], #0x40 ret .size fe_mul,.-fe_mul @@ -660,118 +654,113 @@ fe_mul: .type fe_sq,@function .align 4 fe_sq: - stp x29, x30, [sp, #-32]! - add x29, sp, #0 - str x17, [x29, #24] # Square - ldp x14, x15, [x1] - ldp x16, x17, [x1, #16] + ldp x13, x14, [x1] + ldp x15, x16, [x1, #16] # A[0] * A[1] - mul x3, x14, x15 - umulh x4, x14, x15 + mul x6, x13, x14 + umulh x7, x13, x14 # A[0] * A[2] - mul x11, x14, x16 - umulh x5, x14, x16 - adds x4, x4, x11 - adc x5, x5, xzr - # A[0] * A[3] - mul x11, x14, x17 - umulh x6, x14, x17 - adds x5, x5, x11 - adc x6, x6, xzr - # A[1] * A[2] - mul x11, x15, x16 - umulh x12, x15, x16 - adds x5, x5, x11 - adcs x6, x6, x12 - adc x7, xzr, xzr - # A[1] * A[3] - mul x11, x15, x17 - umulh x12, x15, x17 - adds x6, x6, x11 - adc x7, x7, x12 - # A[2] * A[3] - mul x11, x16, x17 - umulh x8, x16, x17 - adds x7, x7, x11 + mul x2, x13, x15 + umulh x8, x13, x15 + adds x7, x7, x2 adc x8, x8, xzr + # A[0] * A[3] + mul x2, x13, x16 + umulh x9, x13, x16 + adds x8, x8, x2 + adc x9, x9, xzr + # A[1] * A[2] + mul x2, x14, x15 + umulh x3, x14, x15 + adds x8, x8, x2 + adcs x9, x9, x3 + adc x10, xzr, xzr + # A[1] * A[3] + mul x2, x14, x16 + umulh x3, x14, x16 + adds x9, x9, x2 + adc x10, x10, x3 + # A[2] * A[3] + mul x2, x15, x16 + umulh x11, x15, x16 + adds x10, x10, x2 + adc x11, x11, xzr # Double - adds x3, x3, x3 - adcs x4, x4, x4 - adcs x5, x5, x5 - adcs x6, x6, x6 + adds x6, x6, x6 adcs x7, x7, x7 adcs x8, x8, x8 - adc x9, xzr, xzr + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adc x12, xzr, xzr # A[0] * A[0] - mul x2, x14, x14 - umulh x10, x14, x14 + mul x5, x13, x13 + umulh x4, x13, x13 # A[1] * A[1] - mul x11, x15, x15 - umulh x12, x15, x15 - adds x3, x3, x10 - adcs x4, x4, x11 - adc x10, x12, xzr + mul x2, x14, x14 + umulh x3, x14, x14 + adds x6, x6, x4 + adcs x7, x7, x2 + adc x4, x3, xzr # A[2] * A[2] - mul x11, x16, x16 - umulh x12, x16, x16 - adds x5, x5, x10 - adcs x6, x6, x11 - adc x10, x12, xzr + mul x2, x15, x15 + umulh x3, x15, x15 + adds x8, x8, x4 + adcs x9, x9, x2 + adc x4, x3, xzr # A[3] * A[3] - mul x11, x17, x17 - umulh x12, x17, x17 - adds x7, x7, x10 - adcs x8, x8, x11 - adc x9, x9, x12 + mul x2, x16, x16 + umulh x3, x16, x16 + adds x10, x10, x4 + adcs x11, x11, x2 + adc x12, x12, x3 # Reduce # Move top half into t4-t7 and remove top bit from t3 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - extr x6, x6, x5, #63 - and x5, x5, #0x7fffffffffffffff + and x8, x8, #0x7fffffffffffffff # Multiply top half by 19 - mov x11, #19 - mul x12, x11, x6 - umulh x6, x11, x6 - adds x2, x2, x12 - mul x12, x11, x7 - umulh x7, x11, x7 - adcs x3, x3, x12 - mul x12, x11, x8 - umulh x8, x11, x8 - adcs x4, x4, x12 - mul x12, x11, x9 - umulh x13, x11, x9 - adcs x5, x5, x12 - adc x13, x13, xzr + mov x2, #19 + mul x3, x2, x9 + umulh x9, x2, x9 + adds x5, x5, x3 + mul x3, x2, x10 + umulh x10, x2, x10 + adcs x6, x6, x3 + mul x3, x2, x11 + umulh x11, x2, x11 + adcs x7, x7, x3 + mul x3, x2, x12 + umulh x4, x2, x12 + adcs x8, x8, x3 + adc x4, x4, xzr # Add remaining product results in - adds x3, x3, x6 - adcs x4, x4, x7 - adcs x5, x5, x8 - adc x13, x13, xzr + adds x6, x6, x9 + adcs x7, x7, x10 + adcs x8, x8, x11 + adc x4, x4, xzr # Overflow - extr x13, x13, x5, #63 - mul x13, x13, 
x11 - and x5, x5, #0x7fffffffffffffff - adds x2, x2, x13 - adcs x3, x3, xzr - adcs x4, x4, xzr - adc x5, x5, xzr + extr x4, x4, x8, #63 + mul x4, x4, x2 + and x8, x8, #0x7fffffffffffffff + adds x5, x5, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr # Reduce if top bit set - lsr x13, x5, #63 - mul x13, x13, x11 - and x5, x5, #0x7fffffffffffffff - adds x2, x2, x13 - adcs x3, x3, xzr - adcs x4, x4, xzr - adc x5, x5, xzr + asr x4, x8, #63 + and x4, x4, x2 + and x8, x8, #0x7fffffffffffffff + adds x5, x5, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr # Store - stp x2, x3, [x0] - stp x4, x5, [x0, #16] - ldr x17, [x29, #24] - ldp x29, x30, [sp], #32 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] ret .size fe_sq,.-fe_sq .text @@ -780,34 +769,34 @@ fe_sq: .align 4 fe_mul121666: # Multiply by 121666 - ldp x2, x3, [x1] - ldp x4, x5, [x1, #16] - mov x13, #0xdb42 - movk x13, #1, lsl 16 - mul x6, x2, x13 - umulh x7, x2, x13 - mul x11, x3, x13 - umulh x12, x3, x13 - adds x7, x7, x11 - adc x8, xzr, x12 - mul x11, x4, x13 - umulh x12, x4, x13 - adds x8, x8, x11 - adc x9, xzr, x12 - mul x11, x5, x13 - umulh x12, x5, x13 - adds x9, x9, x11 - adc x12, xzr, x12 - mov x13, #19 - extr x12, x12, x9, #63 - mul x12, x12, x13 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x12 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - stp x6, x7, [x0] - stp x8, x9, [x0, #16] + ldp x5, x6, [x1] + ldp x7, x8, [x1, #16] + mov x4, #0xdb42 + movk x4, #1, lsl 16 + mul x9, x5, x4 + umulh x10, x5, x4 + mul x2, x6, x4 + umulh x3, x6, x4 + adds x10, x10, x2 + adc x11, xzr, x3 + mul x2, x7, x4 + umulh x3, x7, x4 + adds x11, x11, x2 + adc x12, xzr, x3 + mul x2, x8, x4 + umulh x3, x8, x4 + adds x12, x12, x2 + adc x3, xzr, x3 + mov x4, #19 + extr x3, x3, x12, #63 + mul x3, x3, x4 + and x12, x12, #0x7fffffffffffffff + adds x9, x9, x3 + adcs x10, x10, xzr + adcs x11, x11, xzr + adc x12, x12, xzr + stp x9, x10, [x0] + stp x11, x12, [x0, #16] ret .size fe_mul121666,.-fe_mul121666 .text @@ -817,128 +806,126 @@ fe_mul121666: fe_sq2: stp x29, x30, [sp, #-32]! 
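Editorial note: fe_mul, fe_sq and fe_mul121666 all end with the same reduction: bits 255 and above of the product are congruent to 19 times their value modulo p = 2^255 - 19, so they are folded back into the low limbs (the extr / mul-by-19 steps). A hedged C sketch of the fe_mul121666 case, using the GCC/Clang unsigned __int128 extension in place of mul/umulh; the name fe_mul121666_ref is illustrative only:

    #include <stdint.h>

    /* Multiply a 4x64-bit field element by 121666 and partially reduce
     * mod p = 2^255 - 19 (the result, like the assembly's, may still be
     * one conditional subtraction away from canonical form). */
    static void fe_mul121666_ref(uint64_t r[4], const uint64_t a[4])
    {
        const uint64_t k = 121666;            /* 0x1db42 */
        unsigned __int128 t;
        uint64_t c, top;
        int i;

        c = 0;
        for (i = 0; i < 4; i++) {             /* schoolbook multiply by k */
            t = (unsigned __int128)a[i] * k + c;
            r[i] = (uint64_t)t;
            c = (uint64_t)(t >> 64);
        }
        /* Fold bits 255.. back in: they are worth 19x their value mod p
         * (the extr / mul #19 step in the assembly). */
        top  = (c << 1) | (r[3] >> 63);
        r[3] &= 0x7fffffffffffffffULL;
        t = (unsigned __int128)r[0] + (unsigned __int128)top * 19;
        r[0] = (uint64_t)t;
        c = (uint64_t)(t >> 64);
        for (i = 1; i < 4; i++) {
            r[i] += c;
            c = (r[i] < c);
        }
    }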
add x29, sp, #0 - str x17, [x29, #16] - str x18, [x29, #24] + str x17, [x29, #24] # Square * 2 - ldp x2, x3, [x1] - ldp x4, x5, [x1, #16] + ldp x5, x6, [x1] + ldp x7, x8, [x1, #16] # A[0] * A[1] - mul x7, x2, x3 - umulh x8, x2, x3 + mul x10, x5, x6 + umulh x11, x5, x6 # A[0] * A[2] - mul x11, x2, x4 - umulh x9, x2, x4 - adds x8, x8, x11 - adc x9, x9, xzr + mul x2, x5, x7 + umulh x12, x5, x7 + adds x11, x11, x2 + adc x12, x12, xzr # A[0] * A[3] - mul x11, x2, x5 - umulh x10, x2, x5 - adds x9, x9, x11 - adc x10, x10, xzr + mul x2, x5, x8 + umulh x13, x5, x8 + adds x12, x12, x2 + adc x13, x13, xzr # A[1] * A[2] - mul x11, x3, x4 - umulh x12, x3, x4 - adds x9, x9, x11 - adcs x10, x10, x12 + mul x2, x6, x7 + umulh x3, x6, x7 + adds x12, x12, x2 + adcs x13, x13, x3 adc x14, xzr, xzr # A[1] * A[3] - mul x11, x3, x5 - umulh x12, x3, x5 - adds x10, x10, x11 - adc x14, x14, x12 + mul x2, x6, x8 + umulh x3, x6, x8 + adds x13, x13, x2 + adc x14, x14, x3 # A[2] * A[3] - mul x11, x4, x5 - umulh x15, x4, x5 - adds x14, x14, x11 + mul x2, x7, x8 + umulh x15, x7, x8 + adds x14, x14, x2 adc x15, x15, xzr # Double - adds x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 + adds x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adc x16, xzr, xzr # A[0] * A[0] - mul x6, x2, x2 - umulh x17, x2, x2 + mul x9, x5, x5 + umulh x17, x5, x5 # A[1] * A[1] - mul x11, x3, x3 - umulh x12, x3, x3 - adds x7, x7, x17 - adcs x8, x8, x11 - adc x17, x12, xzr + mul x2, x6, x6 + umulh x3, x6, x6 + adds x10, x10, x17 + adcs x11, x11, x2 + adc x17, x3, xzr # A[2] * A[2] - mul x11, x4, x4 - umulh x12, x4, x4 - adds x9, x9, x17 - adcs x10, x10, x11 - adc x17, x12, xzr + mul x2, x7, x7 + umulh x3, x7, x7 + adds x12, x12, x17 + adcs x13, x13, x2 + adc x17, x3, xzr # A[3] * A[3] - mul x11, x5, x5 - umulh x12, x5, x5 + mul x2, x8, x8 + umulh x3, x8, x8 adds x14, x14, x17 - adcs x15, x15, x11 - adc x16, x16, x12 + adcs x15, x15, x2 + adc x16, x16, x3 # Double and Reduce - mov x11, #0x169 + mov x2, #0x169 # Move top half into t4-t7 and remove top bit from t3 lsr x17, x16, #61 extr x16, x16, x15, #62 extr x15, x15, x14, #62 - extr x14, x14, x10, #62 - extr x10, x10, x9, #62 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - lsl x6, x6, #1 - and x9, x9, #0x7fffffffffffffff + extr x14, x14, x13, #62 + extr x13, x13, x12, #62 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + lsl x9, x9, #1 + and x12, x12, #0x7fffffffffffffff # Two left, only one right and x16, x16, #0x7fffffffffffffff # Multiply top bits by 19*19 - mul x17, x17, x11 + mul x17, x17, x2 # Multiply top half by 19 - mov x11, #19 - mul x12, x11, x10 - umulh x10, x11, x10 - adds x6, x6, x12 - mul x12, x11, x14 - umulh x14, x11, x14 - adcs x7, x7, x12 - mul x12, x11, x15 - umulh x15, x11, x15 - adcs x8, x8, x12 - mul x12, x11, x16 - umulh x13, x11, x16 - adcs x9, x9, x12 - adc x13, x13, xzr + mov x2, #19 + mul x3, x2, x13 + umulh x13, x2, x13 + adds x9, x9, x3 + mul x3, x2, x14 + umulh x14, x2, x14 + adcs x10, x10, x3 + mul x3, x2, x15 + umulh x15, x2, x15 + adcs x11, x11, x3 + mul x3, x2, x16 + umulh x4, x2, x16 + adcs x12, x12, x3 + adc x4, x4, xzr # Add remaining product results in - adds x6, x6, x17 - adcs x7, x7, x10 - adcs x8, x8, x14 - adcs x9, x9, x15 - adc x13, x13, xzr + adds x9, x9, x17 + adcs x10, x10, x13 + adcs x11, x11, x14 + adcs x12, x12, x15 + adc x4, x4, xzr # Overflow - extr x13, x13, x9, #63 - mul x13, x13, x11 - and x9, x9, 
#0x7fffffffffffffff - adds x6, x6, x13 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr + extr x4, x4, x12, #63 + mul x4, x4, x2 + and x12, x12, #0x7fffffffffffffff + adds x9, x9, x4 + adcs x10, x10, xzr + adcs x11, x11, xzr + adc x12, x12, xzr # Reduce if top bit set - lsr x13, x9, #63 - mul x13, x13, x11 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x13 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr + asr x4, x12, #63 + and x4, x4, x2 + and x12, x12, #0x7fffffffffffffff + adds x9, x9, x4 + adcs x10, x10, xzr + adcs x11, x11, xzr + adc x12, x12, xzr # Store - stp x6, x7, [x0] - stp x8, x9, [x0, #16] - ldr x17, [x29, #16] - ldr x18, [x29, #24] + stp x9, x10, [x0] + stp x11, x12, [x0, #16] + ldr x17, [x29, #24] ldp x29, x30, [sp], #32 ret .size fe_sq2,.-fe_sq2 @@ -1063,8 +1050,6 @@ L_fe_invert8: ldr x0, [x29, #144] add x2, x29, #16 bl fe_mul - ldr x1, [x29, #152] - ldr x0, [x29, #144] ldr x20, [x29, #168] ldp x29, x30, [sp], #0xb0 ret @@ -1074,17 +1059,20 @@ L_fe_invert8: .type curve25519,@function .align 4 curve25519: - stp x29, x30, [sp, #-272]! + stp x29, x30, [sp, #-288]! add x29, sp, #0 - str x17, [x29, #200] - str x18, [x29, #208] - str x19, [x29, #216] - str x20, [x29, #224] - str x21, [x29, #232] - str x22, [x29, #240] - str x23, [x29, #248] - str x24, [x29, #256] - str x25, [x29, #264] + str x17, [x29, #192] + str x18, [x29, #200] + str x19, [x29, #208] + str x20, [x29, #216] + str x21, [x29, #224] + str x22, [x29, #232] + str x23, [x29, #240] + str x24, [x29, #248] + str x25, [x29, #256] + str x26, [x29, #264] + str x27, [x29, #272] + str x28, [x29, #280] mov x22, xzr str x0, [x29, #176] # Set one @@ -1113,1065 +1101,38 @@ L_curve25519_bits: eor x22, x22, x23 # Conditional Swap cmp x22, #1 - ldp x6, x7, [x0] - ldp x8, x9, [x0, #16] - ldp x10, x11, [x29, #80] - ldp x12, x13, [x29, #96] - csel x14, x6, x10, eq - csel x6, x10, x6, eq - csel x15, x7, x11, eq - csel x7, x11, x7, eq - csel x16, x8, x12, eq - csel x8, x12, x8, eq - csel x17, x9, x13, eq - csel x9, x13, x9, eq - stp x6, x7, [x0] - stp x8, x9, [x0, #16] - stp x14, x15, [x29, #80] - stp x16, x17, [x29, #96] - # Conditional Swap - cmp x22, #1 - ldp x6, x7, [x29, #16] - ldp x8, x9, [x29, #32] - ldp x10, x11, [x29, #48] - ldp x12, x13, [x29, #64] - csel x14, x6, x10, eq - csel x6, x10, x6, eq - csel x15, x7, x11, eq - csel x7, x11, x7, eq - csel x16, x8, x12, eq - csel x8, x12, x8, eq - csel x17, x9, x13, eq - csel x9, x13, x9, eq - stp x6, x7, [x29, #16] - stp x8, x9, [x29, #32] - stp x14, x15, [x29, #48] - stp x16, x17, [x29, #64] - mov x22, x23 - # Add - ldp x6, x7, [x0] - ldp x8, x9, [x0, #16] - ldp x10, x11, [x29, #16] - ldp x12, x13, [x29, #32] - adds x14, x6, x10 - adcs x15, x7, x11 - adcs x16, x8, x12 - adc x17, x9, x13 - mov x3, #-19 - asr x23, x17, #63 - # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x14, x14, x3 - sbcs x15, x15, x23 - sbcs x16, x16, x23 - sbc x17, x17, x4 - # Sub - subs x6, x6, x10 - sbcs x7, x7, x11 - sbcs x8, x8, x12 - sbcs x9, x9, x13 - mov x3, #-19 - csetm x23, cc - # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x6, x6, x3 - adcs x7, x7, x23 - adcs x8, x8, x23 - adc x9, x9, x4 - stp x14, x15, [x0] - stp x16, x17, [x0, #16] - stp x6, x7, [x29, #144] - stp x8, x9, [x29, #160] - # Add + ldp x10, x11, [x0] + ldp x12, x13, [x0, #16] ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] - ldp x10, x11, [x29, #48] - ldp x12, x13, [x29, #64] - adds x14, x6, x10 - adcs 
x15, x7, x11 - adcs x16, x8, x12 - adc x17, x9, x13 - mov x3, #-19 - asr x23, x17, #63 - # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x14, x14, x3 - sbcs x15, x15, x23 - sbcs x16, x16, x23 - sbc x17, x17, x4 - # Sub - subs x6, x6, x10 - sbcs x7, x7, x11 - sbcs x8, x8, x12 - sbcs x9, x9, x13 - mov x3, #-19 - csetm x23, cc - # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x6, x6, x3 - adcs x7, x7, x23 - adcs x8, x8, x23 - adc x9, x9, x4 - stp x14, x15, [x29, #16] - stp x16, x17, [x29, #32] - stp x6, x7, [x29, #112] - stp x8, x9, [x29, #128] - # Multiply - ldp x18, x19, [x29, #112] - ldp x20, x21, [x29, #128] - ldp x14, x15, [x0] - ldp x16, x17, [x0, #16] - # A[0] * B[0] - mul x6, x18, x14 - umulh x7, x18, x14 - # A[0] * B[1] - mul x3, x18, x15 - umulh x8, x18, x15 - adds x7, x7, x3 - adc x8, x8, xzr - # A[1] * B[0] - mul x3, x19, x14 - umulh x4, x19, x14 - adds x7, x7, x3 - adcs x8, x8, x4 - adc x9, xzr, xzr - # A[0] * B[2] - mul x3, x18, x16 - umulh x4, x18, x16 - adds x8, x8, x3 - adc x9, x9, x4 - # A[1] * B[1] - mul x3, x19, x15 - umulh x4, x19, x15 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, xzr, xzr - # A[2] * B[0] - mul x3, x20, x14 - umulh x4, x20, x14 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, x10, xzr - # A[0] * B[3] - mul x3, x18, x17 - umulh x4, x18, x17 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * B[2] - mul x3, x19, x16 - umulh x4, x19, x16 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[2] * B[1] - mul x3, x20, x15 - umulh x4, x20, x15 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[3] * B[0] - mul x3, x21, x14 - umulh x4, x21, x14 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[1] * B[3] - mul x3, x19, x17 - umulh x4, x19, x17 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, xzr, xzr - # A[2] * B[2] - mul x3, x20, x16 - umulh x4, x20, x16 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr - # A[3] * B[1] - mul x3, x21, x15 - umulh x4, x21, x15 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr - # A[2] * B[3] - mul x3, x20, x17 - umulh x4, x20, x17 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, xzr, xzr - # A[3] * B[2] - mul x3, x21, x16 - umulh x4, x21, x16 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, x13, xzr - # A[3] * B[3] - mul x3, x21, x17 - umulh x4, x21, x17 - adds x12, x12, x3 - adc x13, x13, x4 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x29, #48] - stp x8, x9, [x29, #64] - # Multiply + csel x14, x10, x6, eq + csel x10, x6, x10, eq + csel 
x15, x11, x7, eq + csel x11, x7, x11, eq + csel x16, x12, x8, eq + csel x12, x8, x12, eq + csel x17, x13, x9, eq + csel x13, x9, x13, eq + # Conditional Swap + cmp x22, #1 ldp x18, x19, [x29, #16] ldp x20, x21, [x29, #32] - ldp x14, x15, [x29, #144] - ldp x16, x17, [x29, #160] - # A[0] * B[0] - mul x6, x18, x14 - umulh x7, x18, x14 - # A[0] * B[1] - mul x3, x18, x15 - umulh x8, x18, x15 - adds x7, x7, x3 - adc x8, x8, xzr - # A[1] * B[0] - mul x3, x19, x14 - umulh x4, x19, x14 - adds x7, x7, x3 - adcs x8, x8, x4 - adc x9, xzr, xzr - # A[0] * B[2] - mul x3, x18, x16 - umulh x4, x18, x16 - adds x8, x8, x3 - adc x9, x9, x4 - # A[1] * B[1] - mul x3, x19, x15 - umulh x4, x19, x15 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, xzr, xzr - # A[2] * B[0] - mul x3, x20, x14 - umulh x4, x20, x14 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, x10, xzr - # A[0] * B[3] - mul x3, x18, x17 - umulh x4, x18, x17 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * B[2] - mul x3, x19, x16 - umulh x4, x19, x16 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[2] * B[1] - mul x3, x20, x15 - umulh x4, x20, x15 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[3] * B[0] - mul x3, x21, x14 - umulh x4, x21, x14 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[1] * B[3] - mul x3, x19, x17 - umulh x4, x19, x17 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, xzr, xzr - # A[2] * B[2] - mul x3, x20, x16 - umulh x4, x20, x16 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr - # A[3] * B[1] - mul x3, x21, x15 - umulh x4, x21, x15 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr - # A[2] * B[3] - mul x3, x20, x17 - umulh x4, x20, x17 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, xzr, xzr - # A[3] * B[2] - mul x3, x21, x16 - umulh x4, x21, x16 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, x13, xzr - # A[3] * B[3] - mul x3, x21, x17 - umulh x4, x21, x17 - adds x12, x12, x3 - adc x13, x13, x4 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x29, #16] - stp x8, x9, [x29, #32] - # Square - ldp x18, x19, [x29, #144] - ldp x20, x21, [x29, #160] - # A[0] * A[1] - mul x7, x18, x19 - umulh x8, x18, x19 - # A[0] * A[2] - mul x3, x18, x20 - umulh x9, x18, x20 - adds x8, x8, x3 - adc x9, x9, xzr - # A[0] * A[3] - mul x3, x18, x21 - umulh x10, x18, x21 - adds x9, x9, x3 - adc x10, x10, xzr - # A[1] * A[2] - mul x3, x19, x20 - umulh x4, x19, x20 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * A[3] - mul x3, x19, x21 - umulh x4, x19, x21 - adds x10, x10, x3 - adc x11, x11, x4 - # A[2] * A[3] - mul x3, x20, x21 - umulh x12, x20, x21 - 
adds x11, x11, x3 - adc x12, x12, xzr - # Double - adds x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adc x13, xzr, xzr - # A[0] * A[0] - mul x6, x18, x18 - umulh x23, x18, x18 - # A[1] * A[1] - mul x3, x19, x19 - umulh x4, x19, x19 - adds x7, x7, x23 - adcs x8, x8, x3 - adc x23, x4, xzr - # A[2] * A[2] - mul x3, x20, x20 - umulh x4, x20, x20 - adds x9, x9, x23 - adcs x10, x10, x3 - adc x23, x4, xzr - # A[3] * A[3] - mul x3, x21, x21 - umulh x4, x21, x21 - adds x11, x11, x23 - adcs x12, x12, x3 - adc x13, x13, x4 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x29, #112] - stp x8, x9, [x29, #128] - # Square - ldp x18, x19, [x0] - ldp x20, x21, [x0, #16] - # A[0] * A[1] - mul x7, x18, x19 - umulh x8, x18, x19 - # A[0] * A[2] - mul x3, x18, x20 - umulh x9, x18, x20 - adds x8, x8, x3 - adc x9, x9, xzr - # A[0] * A[3] - mul x3, x18, x21 - umulh x10, x18, x21 - adds x9, x9, x3 - adc x10, x10, xzr - # A[1] * A[2] - mul x3, x19, x20 - umulh x4, x19, x20 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * A[3] - mul x3, x19, x21 - umulh x4, x19, x21 - adds x10, x10, x3 - adc x11, x11, x4 - # A[2] * A[3] - mul x3, x20, x21 - umulh x12, x20, x21 - adds x11, x11, x3 - adc x12, x12, xzr - # Double - adds x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adc x13, xzr, xzr - # A[0] * A[0] - mul x6, x18, x18 - umulh x23, x18, x18 - # A[1] * A[1] - mul x3, x19, x19 - umulh x4, x19, x19 - adds x7, x7, x23 - adcs x8, x8, x3 - adc x23, x4, xzr - # A[2] * A[2] - mul x3, x20, x20 - umulh x4, x20, x20 - adds x9, x9, x23 - adcs x10, x10, x3 - adc x23, x4, xzr - # A[3] * A[3] - mul x3, x21, x21 - umulh x4, x21, x21 - adds x11, x11, x23 - adcs x12, x12, x3 - adc x13, x13, x4 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Reduce if top bit set - lsr x5, x9, #63 - mul x5, 
x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x29, #144] - stp x8, x9, [x29, #160] - # Add ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] - ldp x10, x11, [x29, #16] - ldp x12, x13, [x29, #32] - adds x14, x6, x10 - adcs x15, x7, x11 - adcs x16, x8, x12 - adc x17, x9, x13 - mov x3, #-19 - asr x23, x17, #63 - # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x14, x14, x3 - sbcs x15, x15, x23 - sbcs x16, x16, x23 - sbc x17, x17, x4 - # Sub - subs x6, x6, x10 - sbcs x7, x7, x11 - sbcs x8, x8, x12 - sbcs x9, x9, x13 - mov x3, #-19 - csetm x23, cc - # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x6, x6, x3 - adcs x7, x7, x23 - adcs x8, x8, x23 - adc x9, x9, x4 - stp x14, x15, [x29, #80] - stp x16, x17, [x29, #96] - stp x6, x7, [x29, #16] - stp x8, x9, [x29, #32] - # Multiply - ldp x18, x19, [x29, #144] - ldp x20, x21, [x29, #160] - ldp x14, x15, [x29, #112] - ldp x16, x17, [x29, #128] - # A[0] * B[0] - mul x6, x18, x14 - umulh x7, x18, x14 - # A[0] * B[1] - mul x3, x18, x15 - umulh x8, x18, x15 - adds x7, x7, x3 - adc x8, x8, xzr - # A[1] * B[0] - mul x3, x19, x14 - umulh x4, x19, x14 - adds x7, x7, x3 - adcs x8, x8, x4 - adc x9, xzr, xzr - # A[0] * B[2] - mul x3, x18, x16 - umulh x4, x18, x16 - adds x8, x8, x3 - adc x9, x9, x4 - # A[1] * B[1] - mul x3, x19, x15 - umulh x4, x19, x15 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, xzr, xzr - # A[2] * B[0] - mul x3, x20, x14 - umulh x4, x20, x14 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, x10, xzr - # A[0] * B[3] - mul x3, x18, x17 - umulh x4, x18, x17 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * B[2] - mul x3, x19, x16 - umulh x4, x19, x16 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[2] * B[1] - mul x3, x20, x15 - umulh x4, x20, x15 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[3] * B[0] - mul x3, x21, x14 - umulh x4, x21, x14 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[1] * B[3] - mul x3, x19, x17 - umulh x4, x19, x17 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, xzr, xzr - # A[2] * B[2] - mul x3, x20, x16 - umulh x4, x20, x16 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr - # A[3] * B[1] - mul x3, x21, x15 - umulh x4, x21, x15 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr - # A[2] * B[3] - mul x3, x20, x17 - umulh x4, x20, x17 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, xzr, xzr - # A[3] * B[2] - mul x3, x21, x16 - umulh x4, x21, x16 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, x13, xzr - # A[3] * B[3] - mul x3, x21, x17 - umulh x4, x21, x17 - adds x12, x12, x3 - adc x13, x13, x4 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, 
xzr - adc x9, x9, xzr - # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x0] - stp x8, x9, [x0, #16] - # Sub - ldp x6, x7, [x29, #144] - ldp x8, x9, [x29, #160] - ldp x10, x11, [x29, #112] - ldp x12, x13, [x29, #128] - subs x6, x6, x10 - sbcs x7, x7, x11 - sbcs x8, x8, x12 - sbcs x9, x9, x13 - mov x3, #-19 - csetm x23, cc - # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x6, x6, x3 - adcs x7, x7, x23 - adcs x8, x8, x23 - adc x9, x9, x4 - stp x6, x7, [x29, #144] - stp x8, x9, [x29, #160] - # Square - ldp x18, x19, [x29, #16] - ldp x20, x21, [x29, #32] - # A[0] * A[1] - mul x7, x18, x19 - umulh x8, x18, x19 - # A[0] * A[2] - mul x3, x18, x20 - umulh x9, x18, x20 - adds x8, x8, x3 - adc x9, x9, xzr - # A[0] * A[3] - mul x3, x18, x21 - umulh x10, x18, x21 - adds x9, x9, x3 - adc x10, x10, xzr - # A[1] * A[2] - mul x3, x19, x20 - umulh x4, x19, x20 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * A[3] - mul x3, x19, x21 - umulh x4, x19, x21 - adds x10, x10, x3 - adc x11, x11, x4 - # A[2] * A[3] - mul x3, x20, x21 - umulh x12, x20, x21 - adds x11, x11, x3 - adc x12, x12, xzr - # Double - adds x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adc x13, xzr, xzr - # A[0] * A[0] - mul x6, x18, x18 - umulh x23, x18, x18 - # A[1] * A[1] - mul x3, x19, x19 - umulh x4, x19, x19 - adds x7, x7, x23 - adcs x8, x8, x3 - adc x23, x4, xzr - # A[2] * A[2] - mul x3, x20, x20 - umulh x4, x20, x20 - adds x9, x9, x23 - adcs x10, x10, x3 - adc x23, x4, xzr - # A[3] * A[3] - mul x3, x21, x21 - umulh x4, x21, x21 - adds x11, x11, x23 - adcs x12, x12, x3 - adc x13, x13, x4 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x29, #16] - stp x8, x9, [x29, #32] - # Multiply by 121666 - ldp x18, x19, [x29, #144] - ldp x20, x21, [x29, #160] - mov x5, #0xdb42 - movk x5, #1, lsl 16 - mul x6, x18, x5 - umulh x7, x18, x5 - mul x3, x19, x5 - umulh x4, x19, x5 - adds x7, x7, x3 - adc x8, xzr, x4 - mul x3, x20, x5 - umulh x4, x20, x5 - adds x8, x8, x3 - adc x9, xzr, x4 - mul x3, x21, x5 - umulh x4, x21, x5 - adds x9, x9, x3 - adc x4, xzr, x4 - mov x5, #19 - extr x4, x4, x9, #63 - mul x4, x4, x5 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x4 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - stp x6, x7, [x29, #48] - stp x8, x9, [x29, #64] - # Square - ldp x18, x19, [x29, #80] - ldp x20, x21, [x29, #96] - # A[0] * A[1] - mul x7, x18, x19 - umulh x8, 
x18, x19 - # A[0] * A[2] - mul x3, x18, x20 - umulh x9, x18, x20 - adds x8, x8, x3 - adc x9, x9, xzr - # A[0] * A[3] - mul x3, x18, x21 - umulh x10, x18, x21 - adds x9, x9, x3 - adc x10, x10, xzr - # A[1] * A[2] - mul x3, x19, x20 - umulh x4, x19, x20 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * A[3] - mul x3, x19, x21 - umulh x4, x19, x21 - adds x10, x10, x3 - adc x11, x11, x4 - # A[2] * A[3] - mul x3, x20, x21 - umulh x12, x20, x21 - adds x11, x11, x3 - adc x12, x12, xzr - # Double - adds x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adc x13, xzr, xzr - # A[0] * A[0] - mul x6, x18, x18 - umulh x23, x18, x18 - # A[1] * A[1] - mul x3, x19, x19 - umulh x4, x19, x19 - adds x7, x7, x23 - adcs x8, x8, x3 - adc x23, x4, xzr - # A[2] * A[2] - mul x3, x20, x20 - umulh x4, x20, x20 - adds x9, x9, x23 - adcs x10, x10, x3 - adc x23, x4, xzr - # A[3] * A[3] - mul x3, x21, x21 - umulh x4, x21, x21 - adds x11, x11, x23 - adcs x12, x12, x3 - adc x13, x13, x4 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x29, #80] - stp x8, x9, [x29, #96] + csel x5, x18, x6, eq + csel x18, x6, x18, eq + csel x26, x19, x7, eq + csel x19, x7, x19, eq + csel x27, x20, x8, eq + csel x20, x8, x20, eq + csel x28, x21, x9, eq + csel x21, x9, x21, eq + mov x22, x23 # Add - ldp x6, x7, [x29, #112] - ldp x8, x9, [x29, #128] - ldp x10, x11, [x29, #48] - ldp x12, x13, [x29, #64] - adds x6, x6, x10 - adcs x7, x7, x11 - adcs x8, x8, x12 - adc x9, x9, x13 + adds x6, x10, x18 + adcs x7, x11, x19 + adcs x8, x12, x20 + adc x9, x13, x21 mov x3, #-19 asr x23, x9, #63 # Mask the modulus @@ -2182,129 +1143,650 @@ L_curve25519_bits: sbcs x7, x7, x23 sbcs x8, x8, x23 sbc x9, x9, x4 - stp x6, x7, [x29, #112] - stp x8, x9, [x29, #128] + # Sub + subs x18, x10, x18 + sbcs x19, x11, x19 + sbcs x20, x12, x20 + sbcs x21, x13, x21 + mov x3, #-19 + csetm x23, cc + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x18, x18, x3 + adcs x19, x19, x23 + adcs x20, x20, x23 + adc x21, x21, x4 + stp x18, x19, [x29, #144] + stp x20, x21, [x29, #160] + # Add + adds x10, x14, x5 + adcs x11, x15, x26 + adcs x12, x16, x27 + adc x13, x17, x28 + mov x3, #-19 + asr x23, x13, #63 + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x23 + sbcs x12, x12, x23 + sbc x13, x13, x4 + # Sub + subs x14, x14, x5 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + mov x3, #-19 + csetm x23, cc + # Mask the modulus + and x3, x23, x3 + and x4, 
x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x14, x14, x3 + adcs x15, x15, x23 + adcs x16, x16, x23 + adc x17, x17, x4 # Multiply - ldp x18, x19, [x2] - ldp x20, x21, [x2, #16] - ldp x14, x15, [x29, #16] - ldp x16, x17, [x29, #32] # A[0] * B[0] - mul x6, x18, x14 - umulh x7, x18, x14 + mul x18, x14, x6 + umulh x19, x14, x6 # A[0] * B[1] - mul x3, x18, x15 - umulh x8, x18, x15 + mul x3, x14, x7 + umulh x20, x14, x7 + adds x19, x19, x3 + adc x20, x20, xzr + # A[1] * B[0] + mul x3, x15, x6 + umulh x4, x15, x6 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x14, x8 + umulh x4, x14, x8 + adds x20, x20, x3 + adc x21, x21, x4 + # A[1] * B[1] + mul x3, x15, x7 + umulh x4, x15, x7 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x23, xzr, xzr + # A[2] * B[0] + mul x3, x16, x6 + umulh x4, x16, x6 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x23, x23, xzr + # A[0] * B[3] + mul x3, x14, x9 + umulh x4, x14, x9 + adds x21, x21, x3 + adcs x23, x23, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x8 + umulh x4, x15, x8 + adds x21, x21, x3 + adcs x23, x23, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x7 + umulh x4, x16, x7 + adds x21, x21, x3 + adcs x23, x23, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x6 + umulh x4, x17, x6 + adds x21, x21, x3 + adcs x23, x23, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x9 + umulh x4, x15, x9 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x8 + umulh x4, x16, x8 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x7 + umulh x4, x17, x7 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x9 + umulh x4, x16, x9 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x8 + umulh x4, x17, x8 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x9 + umulh x4, x17, x9 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x23, #63 + extr x23, x23, x21, #63 + and x21, x21, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x23 + umulh x23, x3, x23 + adds x18, x18, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x19, x19, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x20, x20, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x21, x21, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x19, x19, x23 + adcs x20, x20, x26 + adcs x21, x21, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x21, #63 + mul x5, x5, x3 + and x21, x21, #0x7fffffffffffffff + adds x18, x18, x5 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # Reduce if top bit set + asr x5, x21, #63 + and x5, x5, x3 + and x21, x21, #0x7fffffffffffffff + adds x18, x18, x5 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # Store + stp x18, x19, [x29, #112] + stp x20, x21, [x29, #128] + # Multiply + ldp x23, x26, [x29, #144] + ldp x27, x28, [x29, #160] + # A[0] * B[0] + mul x18, x10, x23 + umulh x19, x10, x23 + # A[0] * B[1] + mul x3, x10, x26 + umulh x20, x10, x26 + adds x19, x19, x3 + adc x20, x20, xzr + # A[1] * B[0] + mul x3, x11, x23 + umulh x4, x11, x23 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x10, x27 + umulh x4, x10, x27 + adds x20, x20, x3 + adc x21, x21, x4 + # A[1] * B[1] + mul x3, x11, x26 + umulh x4, x11, 
x26 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x14, xzr, xzr + # A[2] * B[0] + mul x3, x12, x23 + umulh x4, x12, x23 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x14, x14, xzr + # A[0] * B[3] + mul x3, x10, x28 + umulh x4, x10, x28 + adds x21, x21, x3 + adcs x14, x14, x4 + adc x15, xzr, xzr + # A[1] * B[2] + mul x3, x11, x27 + umulh x4, x11, x27 + adds x21, x21, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[2] * B[1] + mul x3, x12, x26 + umulh x4, x12, x26 + adds x21, x21, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[3] * B[0] + mul x3, x13, x23 + umulh x4, x13, x23 + adds x21, x21, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[1] * B[3] + mul x3, x11, x28 + umulh x4, x11, x28 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, xzr, xzr + # A[2] * B[2] + mul x3, x12, x27 + umulh x4, x12, x27 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, x16, xzr + # A[3] * B[1] + mul x3, x13, x26 + umulh x4, x13, x26 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, x16, xzr + # A[2] * B[3] + mul x3, x12, x28 + umulh x4, x12, x28 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, xzr, xzr + # A[3] * B[2] + mul x3, x13, x27 + umulh x4, x13, x27 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[3] * B[3] + mul x3, x13, x28 + umulh x4, x13, x28 + adds x16, x16, x3 + adc x17, x17, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + extr x15, x15, x14, #63 + extr x14, x14, x21, #63 + and x21, x21, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x14 + umulh x14, x3, x14 + adds x18, x18, x4 + mul x4, x3, x15 + umulh x15, x3, x15 + adcs x19, x19, x4 + mul x4, x3, x16 + umulh x16, x3, x16 + adcs x20, x20, x4 + mul x4, x3, x17 + umulh x5, x3, x17 + adcs x21, x21, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x19, x19, x14 + adcs x20, x20, x15 + adcs x21, x21, x16 + adc x5, x5, xzr + # Overflow + extr x5, x5, x21, #63 + mul x5, x5, x3 + and x21, x21, #0x7fffffffffffffff + adds x18, x18, x5 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # Reduce if top bit set + asr x5, x21, #63 + and x5, x5, x3 + and x21, x21, #0x7fffffffffffffff + adds x18, x18, x5 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # Store + # Square + # A[0] * A[1] + mul x11, x23, x26 + umulh x12, x23, x26 + # A[0] * A[2] + mul x3, x23, x27 + umulh x13, x23, x27 + adds x12, x12, x3 + adc x13, x13, xzr + # A[0] * A[3] + mul x3, x23, x28 + umulh x14, x23, x28 + adds x13, x13, x3 + adc x14, x14, xzr + # A[1] * A[2] + mul x3, x26, x27 + umulh x4, x26, x27 + adds x13, x13, x3 + adcs x14, x14, x4 + adc x15, xzr, xzr + # A[1] * A[3] + mul x3, x26, x28 + umulh x4, x26, x28 + adds x14, x14, x3 + adc x15, x15, x4 + # A[2] * A[3] + mul x3, x27, x28 + umulh x16, x27, x28 + adds x15, x15, x3 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + mul x10, x23, x23 + umulh x5, x23, x23 + # A[1] * A[1] + mul x3, x26, x26 + umulh x4, x26, x26 + adds x11, x11, x5 + adcs x12, x12, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x27, x27 + umulh x4, x27, x27 + adds x13, x13, x5 + adcs x14, x14, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x28, x28 + umulh x4, x28, x28 + adds x15, x15, x5 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + extr 
x15, x15, x14, #63 + extr x14, x14, x13, #63 + and x13, x13, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x14 + umulh x14, x3, x14 + adds x10, x10, x4 + mul x4, x3, x15 + umulh x15, x3, x15 + adcs x11, x11, x4 + mul x4, x3, x16 + umulh x16, x3, x16 + adcs x12, x12, x4 + mul x4, x3, x17 + umulh x5, x3, x17 + adcs x13, x13, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x11, x11, x14 + adcs x12, x12, x15 + adcs x13, x13, x16 + adc x5, x5, xzr + # Overflow + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Reduce if top bit set + asr x5, x13, #63 + and x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Store + # Square + # A[0] * A[1] + mul x15, x6, x7 + umulh x16, x6, x7 + # A[0] * A[2] + mul x3, x6, x8 + umulh x17, x6, x8 + adds x16, x16, x3 + adc x17, x17, xzr + # A[0] * A[3] + mul x3, x6, x9 + umulh x23, x6, x9 + adds x17, x17, x3 + adc x23, x23, xzr + # A[1] * A[2] + mul x3, x7, x8 + umulh x4, x7, x8 + adds x17, x17, x3 + adcs x23, x23, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x7, x9 + umulh x4, x7, x9 + adds x23, x23, x3 + adc x26, x26, x4 + # A[2] * A[3] + mul x3, x8, x9 + umulh x27, x8, x9 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x23, x23, x23 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x14, x6, x6 + umulh x5, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + umulh x4, x7, x7 + adds x15, x15, x5 + adcs x16, x16, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x8, x8 + umulh x4, x8, x8 + adds x17, x17, x5 + adcs x23, x23, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x9, x9 + umulh x4, x9, x9 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x23, #63 + extr x23, x23, x17, #63 + and x17, x17, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x23 + umulh x23, x3, x23 + adds x14, x14, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x15, x15, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x16, x16, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x17, x17, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x15, x15, x23 + adcs x16, x16, x26 + adcs x17, x17, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Reduce if top bit set + asr x5, x17, #63 + and x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Store + # Multiply + # A[0] * B[0] + mul x6, x14, x10 + umulh x7, x14, x10 + # A[0] * B[1] + mul x3, x14, x11 + umulh x8, x14, x11 adds x7, x7, x3 adc x8, x8, xzr # A[1] * B[0] - mul x3, x19, x14 - umulh x4, x19, x14 + mul x3, x15, x10 + umulh x4, x15, x10 adds x7, x7, x3 adcs x8, x8, x4 adc x9, xzr, xzr # A[0] * B[2] - mul x3, x18, x16 - umulh x4, x18, x16 + mul x3, x14, x12 + umulh x4, x14, x12 adds x8, x8, x3 adc x9, x9, x4 # A[1] * B[1] - mul x3, x19, x15 - umulh x4, x19, x15 + mul x3, x15, x11 + umulh x4, x15, x11 adds x8, x8, x3 adcs x9, x9, x4 - adc x10, xzr, xzr + adc x23, xzr, xzr # A[2] * B[0] - mul x3, x20, x14 - 
umulh x4, x20, x14 + mul x3, x16, x10 + umulh x4, x16, x10 adds x8, x8, x3 adcs x9, x9, x4 - adc x10, x10, xzr + adc x23, x23, xzr # A[0] * B[3] - mul x3, x18, x17 - umulh x4, x18, x17 + mul x3, x14, x13 + umulh x4, x14, x13 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr + adcs x23, x23, x4 + adc x26, xzr, xzr # A[1] * B[2] - mul x3, x19, x16 - umulh x4, x19, x16 + mul x3, x15, x12 + umulh x4, x15, x12 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + adcs x23, x23, x4 + adc x26, x26, xzr # A[2] * B[1] - mul x3, x20, x15 - umulh x4, x20, x15 + mul x3, x16, x11 + umulh x4, x16, x11 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + adcs x23, x23, x4 + adc x26, x26, xzr # A[3] * B[0] - mul x3, x21, x14 - umulh x4, x21, x14 + mul x3, x17, x10 + umulh x4, x17, x10 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + adcs x23, x23, x4 + adc x26, x26, xzr # A[1] * B[3] - mul x3, x19, x17 - umulh x4, x19, x17 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, xzr, xzr + mul x3, x15, x13 + umulh x4, x15, x13 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr # A[2] * B[2] - mul x3, x20, x16 - umulh x4, x20, x16 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr + mul x3, x16, x12 + umulh x4, x16, x12 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr # A[3] * B[1] - mul x3, x21, x15 - umulh x4, x21, x15 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr + mul x3, x17, x11 + umulh x4, x17, x11 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr # A[2] * B[3] - mul x3, x20, x17 - umulh x4, x20, x17 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, xzr, xzr + mul x3, x16, x13 + umulh x4, x16, x13 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr # A[3] * B[2] - mul x3, x21, x16 - umulh x4, x21, x16 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, x13, xzr + mul x3, x17, x12 + umulh x4, x17, x12 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr # A[3] * B[3] - mul x3, x21, x17 - umulh x4, x21, x17 - adds x12, x12, x3 - adc x13, x13, x4 + mul x3, x17, x13 + umulh x4, x17, x13 + adds x27, x27, x3 + adc x28, x28, x4 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x23, #63 + extr x23, x23, x9, #63 and x9, x9, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 + mul x4, x3, x23 + umulh x23, x3, x23 adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 + mul x4, x3, x26 + umulh x26, x3, x26 adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 + mul x4, x3, x27 + umulh x27, x3, x27 adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 + mul x4, x3, x28 + umulh x5, x3, x28 adcs x9, x9, x4 adc x5, x5, xzr # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 + adds x7, x7, x23 + adcs x8, x8, x26 + adcs x9, x9, x27 adc x5, x5, xzr # Overflow extr x5, x5, x9, #63 @@ -2315,137 +1797,188 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 + asr x5, x9, #63 + and x5, x5, x3 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr adcs x8, x8, xzr adc x9, x9, xzr # Store - stp x6, x7, [x29, #48] - stp x8, x9, [x29, #64] + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + # Sub + subs x14, x14, x10 + sbcs x15, x15, x11 + sbcs x16, x16, x12 + sbcs x17, x17, x13 + mov x3, #-19 + csetm x23, cc + 
# Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x14, x14, x3 + adcs x15, x15, x23 + adcs x16, x16, x23 + adc x17, x17, x4 + # Multiply by 121666 + mov x5, #0xdb42 + movk x5, #1, lsl 16 + mul x6, x14, x5 + umulh x7, x14, x5 + mul x3, x15, x5 + umulh x4, x15, x5 + adds x7, x7, x3 + adc x8, xzr, x4 + mul x3, x16, x5 + umulh x4, x16, x5 + adds x8, x8, x3 + adc x9, xzr, x4 + mul x3, x17, x5 + umulh x4, x17, x5 + adds x9, x9, x3 + adc x4, xzr, x4 + mov x5, #19 + extr x4, x4, x9, #63 + mul x4, x4, x5 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x4 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Add + adds x10, x10, x6 + adcs x11, x11, x7 + adcs x12, x12, x8 + adc x13, x13, x9 + mov x3, #-19 + asr x23, x13, #63 + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x23 + sbcs x12, x12, x23 + sbc x13, x13, x4 # Multiply - ldp x18, x19, [x29, #144] - ldp x20, x21, [x29, #160] - ldp x14, x15, [x29, #112] - ldp x16, x17, [x29, #128] # A[0] * B[0] - mul x6, x18, x14 - umulh x7, x18, x14 + mul x6, x14, x10 + umulh x7, x14, x10 # A[0] * B[1] - mul x3, x18, x15 - umulh x8, x18, x15 + mul x3, x14, x11 + umulh x8, x14, x11 adds x7, x7, x3 adc x8, x8, xzr # A[1] * B[0] - mul x3, x19, x14 - umulh x4, x19, x14 + mul x3, x15, x10 + umulh x4, x15, x10 adds x7, x7, x3 adcs x8, x8, x4 adc x9, xzr, xzr # A[0] * B[2] - mul x3, x18, x16 - umulh x4, x18, x16 + mul x3, x14, x12 + umulh x4, x14, x12 adds x8, x8, x3 adc x9, x9, x4 # A[1] * B[1] - mul x3, x19, x15 - umulh x4, x19, x15 + mul x3, x15, x11 + umulh x4, x15, x11 adds x8, x8, x3 adcs x9, x9, x4 - adc x10, xzr, xzr + adc x23, xzr, xzr # A[2] * B[0] - mul x3, x20, x14 - umulh x4, x20, x14 + mul x3, x16, x10 + umulh x4, x16, x10 adds x8, x8, x3 adcs x9, x9, x4 - adc x10, x10, xzr + adc x23, x23, xzr # A[0] * B[3] - mul x3, x18, x17 - umulh x4, x18, x17 + mul x3, x14, x13 + umulh x4, x14, x13 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr + adcs x23, x23, x4 + adc x26, xzr, xzr # A[1] * B[2] - mul x3, x19, x16 - umulh x4, x19, x16 + mul x3, x15, x12 + umulh x4, x15, x12 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + adcs x23, x23, x4 + adc x26, x26, xzr # A[2] * B[1] - mul x3, x20, x15 - umulh x4, x20, x15 + mul x3, x16, x11 + umulh x4, x16, x11 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + adcs x23, x23, x4 + adc x26, x26, xzr # A[3] * B[0] - mul x3, x21, x14 - umulh x4, x21, x14 + mul x3, x17, x10 + umulh x4, x17, x10 adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + adcs x23, x23, x4 + adc x26, x26, xzr # A[1] * B[3] - mul x3, x19, x17 - umulh x4, x19, x17 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, xzr, xzr + mul x3, x15, x13 + umulh x4, x15, x13 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr # A[2] * B[2] - mul x3, x20, x16 - umulh x4, x20, x16 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr + mul x3, x16, x12 + umulh x4, x16, x12 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr # A[3] * B[1] - mul x3, x21, x15 - umulh x4, x21, x15 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr + mul x3, x17, x11 + umulh x4, x17, x11 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr # A[2] * B[3] - mul x3, x20, x17 - umulh x4, x20, x17 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, xzr, xzr + mul x3, x16, x13 + umulh x4, x16, x13 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr # A[3] * B[2] - mul 
x3, x21, x16 - umulh x4, x21, x16 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, x13, xzr + mul x3, x17, x12 + umulh x4, x17, x12 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr # A[3] * B[3] - mul x3, x21, x17 - umulh x4, x21, x17 - adds x12, x12, x3 - adc x13, x13, x4 + mul x3, x17, x13 + umulh x4, x17, x13 + adds x27, x27, x3 + adc x28, x28, x4 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x23, #63 + extr x23, x23, x9, #63 and x9, x9, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 + mul x4, x3, x23 + umulh x23, x3, x23 adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 + mul x4, x3, x26 + umulh x26, x3, x26 adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 + mul x4, x3, x27 + umulh x27, x3, x27 adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 + mul x4, x3, x28 + umulh x5, x3, x28 adcs x9, x9, x4 adc x5, x5, xzr # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 + adds x7, x7, x23 + adcs x8, x8, x26 + adcs x9, x9, x27 adc x5, x5, xzr # Overflow extr x5, x5, x9, #63 @@ -2456,8 +1989,8 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 + asr x5, x9, #63 + and x5, x5, x3 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -2466,6 +1999,385 @@ L_curve25519_bits: # Store stp x6, x7, [x29, #16] stp x8, x9, [x29, #32] + # Add + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] + adds x10, x6, x18 + adcs x11, x7, x19 + adcs x12, x8, x20 + adc x13, x9, x21 + mov x3, #-19 + asr x23, x13, #63 + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x23 + sbcs x12, x12, x23 + sbc x13, x13, x4 + # Sub + subs x18, x6, x18 + sbcs x19, x7, x19 + sbcs x20, x8, x20 + sbcs x21, x9, x21 + mov x3, #-19 + csetm x23, cc + # Mask the modulus + and x3, x23, x3 + and x4, x23, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x18, x18, x3 + adcs x19, x19, x23 + adcs x20, x20, x23 + adc x21, x21, x4 + # Square + # A[0] * A[1] + mul x7, x10, x11 + umulh x8, x10, x11 + # A[0] * A[2] + mul x3, x10, x12 + umulh x9, x10, x12 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x10, x13 + umulh x23, x10, x13 + adds x9, x9, x3 + adc x23, x23, xzr + # A[1] * A[2] + mul x3, x11, x12 + umulh x4, x11, x12 + adds x9, x9, x3 + adcs x23, x23, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x11, x13 + umulh x4, x11, x13 + adds x23, x23, x3 + adc x26, x26, x4 + # A[2] * A[3] + mul x3, x12, x13 + umulh x27, x12, x13 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x23, x23, x23 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x6, x10, x10 + umulh x5, x10, x10 + # A[1] * A[1] + mul x3, x11, x11 + umulh x4, x11, x11 + adds x7, x7, x5 + adcs x8, x8, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x12, x12 + umulh x4, x12, x12 + adds x9, x9, x5 + adcs x23, x23, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x13, x13 + umulh x4, x13, x13 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x23, #63 + extr x23, x23, x9, 
#63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x23 + umulh x23, x3, x23 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x23 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + asr x5, x9, #63 + and x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + # Square + # A[0] * A[1] + mul x7, x18, x19 + umulh x8, x18, x19 + # A[0] * A[2] + mul x3, x18, x20 + umulh x9, x18, x20 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x18, x21 + umulh x23, x18, x21 + adds x9, x9, x3 + adc x23, x23, xzr + # A[1] * A[2] + mul x3, x19, x20 + umulh x4, x19, x20 + adds x9, x9, x3 + adcs x23, x23, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x19, x21 + umulh x4, x19, x21 + adds x23, x23, x3 + adc x26, x26, x4 + # A[2] * A[3] + mul x3, x20, x21 + umulh x27, x20, x21 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x23, x23, x23 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x6, x18, x18 + umulh x5, x18, x18 + # A[1] * A[1] + mul x3, x19, x19 + umulh x4, x19, x19 + adds x7, x7, x5 + adcs x8, x8, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x20, x20 + umulh x4, x20, x20 + adds x9, x9, x5 + adcs x23, x23, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x21, x21 + umulh x4, x21, x21 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x23, #63 + extr x23, x23, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x23 + umulh x23, x3, x23 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x23 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + asr x5, x9, #63 + and x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + # Multiply + ldp x14, x15, [x2] + ldp x16, x17, [x2, #16] + # A[0] * B[0] + mul x10, x14, x6 + umulh x11, x14, x6 + # A[0] * B[1] + mul x3, x14, x7 + umulh x12, x14, x7 + adds x11, x11, x3 + adc x12, x12, xzr + # A[1] * B[0] + mul x3, x15, x6 + umulh x4, x15, x6 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[0] * B[2] + mul x3, x14, x8 + umulh x4, x14, x8 + adds x12, x12, x3 + adc x13, x13, x4 + # A[1] * B[1] + mul x3, x15, x7 + umulh x4, x15, x7 + adds x12, x12, x3 + adcs x13, x13, x4 + adc x23, xzr, xzr + # A[2] * B[0] + mul x3, x16, x6 + umulh x4, x16, x6 + adds x12, x12, x3 + adcs x13, x13, x4 + adc x23, x23, xzr + # A[0] * B[3] + mul x3, x14, x9 + 
umulh x4, x14, x9 + adds x13, x13, x3 + adcs x23, x23, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x8 + umulh x4, x15, x8 + adds x13, x13, x3 + adcs x23, x23, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x7 + umulh x4, x16, x7 + adds x13, x13, x3 + adcs x23, x23, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x6 + umulh x4, x17, x6 + adds x13, x13, x3 + adcs x23, x23, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x9 + umulh x4, x15, x9 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x8 + umulh x4, x16, x8 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x7 + umulh x4, x17, x7 + adds x23, x23, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x9 + umulh x4, x16, x9 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x8 + umulh x4, x17, x8 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x9 + umulh x4, x17, x9 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x23, #63 + extr x23, x23, x13, #63 + and x13, x13, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x23 + umulh x23, x3, x23 + adds x10, x10, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x11, x11, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x12, x12, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x13, x13, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x11, x11, x23 + adcs x12, x12, x26 + adcs x13, x13, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Reduce if top bit set + asr x5, x13, #63 + and x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Store + stp x10, x11, [x29, #48] + stp x12, x13, [x29, #64] sub x25, x25, #1 cmp x25, #0 bge L_curve25519_bits @@ -2587,157 +2499,160 @@ L_curve25519_inv_8: bl fe_mul ldr x0, [x29, #176] # Multiply - ldp x18, x19, [x0] - ldp x20, x21, [x0, #16] - ldp x14, x15, [x29, #16] - ldp x16, x17, [x29, #32] + ldp x6, x7, [x0] + ldp x8, x9, [x0, #16] + ldp x10, x11, [x29, #16] + ldp x12, x13, [x29, #32] # A[0] * B[0] - mul x6, x18, x14 - umulh x7, x18, x14 + mul x14, x6, x10 + umulh x15, x6, x10 # A[0] * B[1] - mul x3, x18, x15 - umulh x8, x18, x15 - adds x7, x7, x3 - adc x8, x8, xzr + mul x3, x6, x11 + umulh x16, x6, x11 + adds x15, x15, x3 + adc x16, x16, xzr # A[1] * B[0] - mul x3, x19, x14 - umulh x4, x19, x14 - adds x7, x7, x3 - adcs x8, x8, x4 - adc x9, xzr, xzr + mul x3, x7, x10 + umulh x4, x7, x10 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, xzr, xzr # A[0] * B[2] - mul x3, x18, x16 - umulh x4, x18, x16 - adds x8, x8, x3 - adc x9, x9, x4 + mul x3, x6, x12 + umulh x4, x6, x12 + adds x16, x16, x3 + adc x17, x17, x4 # A[1] * B[1] - mul x3, x19, x15 - umulh x4, x19, x15 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, xzr, xzr + mul x3, x7, x11 + umulh x4, x7, x11 + adds x16, x16, x3 + adcs x17, x17, x4 + adc x18, xzr, xzr # A[2] * B[0] - mul x3, x20, x14 - umulh x4, x20, x14 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, x10, xzr + mul x3, x8, x10 + umulh x4, x8, x10 + adds x16, x16, x3 + adcs x17, x17, x4 + adc x18, x18, xzr # A[0] * B[3] - mul x3, x18, x17 - umulh x4, 
x18, x17 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr + mul x3, x6, x13 + umulh x4, x6, x13 + adds x17, x17, x3 + adcs x18, x18, x4 + adc x19, xzr, xzr # A[1] * B[2] - mul x3, x19, x16 - umulh x4, x19, x16 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + mul x3, x7, x12 + umulh x4, x7, x12 + adds x17, x17, x3 + adcs x18, x18, x4 + adc x19, x19, xzr # A[2] * B[1] - mul x3, x20, x15 - umulh x4, x20, x15 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + mul x3, x8, x11 + umulh x4, x8, x11 + adds x17, x17, x3 + adcs x18, x18, x4 + adc x19, x19, xzr # A[3] * B[0] - mul x3, x21, x14 - umulh x4, x21, x14 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr + mul x3, x9, x10 + umulh x4, x9, x10 + adds x17, x17, x3 + adcs x18, x18, x4 + adc x19, x19, xzr # A[1] * B[3] - mul x3, x19, x17 - umulh x4, x19, x17 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, xzr, xzr + mul x3, x7, x13 + umulh x4, x7, x13 + adds x18, x18, x3 + adcs x19, x19, x4 + adc x20, xzr, xzr # A[2] * B[2] - mul x3, x20, x16 - umulh x4, x20, x16 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr + mul x3, x8, x12 + umulh x4, x8, x12 + adds x18, x18, x3 + adcs x19, x19, x4 + adc x20, x20, xzr # A[3] * B[1] - mul x3, x21, x15 - umulh x4, x21, x15 - adds x10, x10, x3 - adcs x11, x11, x4 - adc x12, x12, xzr + mul x3, x9, x11 + umulh x4, x9, x11 + adds x18, x18, x3 + adcs x19, x19, x4 + adc x20, x20, xzr # A[2] * B[3] - mul x3, x20, x17 - umulh x4, x20, x17 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, xzr, xzr + mul x3, x8, x13 + umulh x4, x8, x13 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, xzr, xzr # A[3] * B[2] - mul x3, x21, x16 - umulh x4, x21, x16 - adds x11, x11, x3 - adcs x12, x12, x4 - adc x13, x13, xzr + mul x3, x9, x12 + umulh x4, x9, x12 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, x21, xzr # A[3] * B[3] - mul x3, x21, x17 - umulh x4, x21, x17 - adds x12, x12, x3 - adc x13, x13, x4 + mul x3, x9, x13 + umulh x4, x9, x13 + adds x20, x20, x3 + adc x21, x21, x4 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff + extr x21, x21, x20, #63 + extr x20, x20, x19, #63 + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + and x17, x17, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x10 - umulh x10, x3, x10 - adds x6, x6, x4 - mul x4, x3, x11 - umulh x11, x3, x11 - adcs x7, x7, x4 - mul x4, x3, x12 - umulh x12, x3, x12 - adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 + mul x4, x3, x18 + umulh x18, x3, x18 + adds x14, x14, x4 + mul x4, x3, x19 + umulh x19, x3, x19 + adcs x15, x15, x4 + mul x4, x3, x20 + umulh x20, x3, x20 + adcs x16, x16, x4 + mul x4, x3, x21 + umulh x5, x3, x21 + adcs x17, x17, x4 adc x5, x5, xzr # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 + adds x15, x15, x18 + adcs x16, x16, x19 + adcs x17, x17, x20 adc x5, x5, xzr # Overflow - extr x5, x5, x9, #63 + extr x5, x5, x17, #63 mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr # Reduce if top bit set - lsr x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr + asr x5, x17, #63 + and x5, x5, x3 
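
The # Reduce, # Overflow and # Reduce if top bit set sequences in these hunks all implement the same step: reducing a 512-bit product modulo p = 2^255 - 19 using 2^255 == 19 (mod p). Below is a minimal C sketch of that flow, assuming the product is already held in eight 64-bit limbs; the function name and the GCC/Clang unsigned __int128 extension are illustrative assumptions, not wolfSSL API.

    /* Sketch: reduce an eight-limb (512-bit) product p[0..7], least
     * significant limb first, to four limbs modulo 2^255 - 19.
     * Mirrors the extr / multiply-by-19 / masked-add steps above. */
    #include <stdint.h>
    typedef unsigned __int128 u128;            /* GCC/Clang extension */

    static void reduce_512_sketch(uint64_t r[4], const uint64_t p[8])
    {
        uint64_t lo[4], hi[4], over, add19;
        u128 c;
        int i;

        /* Split at bit 255: lo = p mod 2^255, hi = p >> 255 (the extr #63 chain). */
        lo[0] = p[0]; lo[1] = p[1]; lo[2] = p[2];
        lo[3] = p[3] & 0x7fffffffffffffffULL;
        hi[0] = (p[4] << 1) | (p[3] >> 63);
        hi[1] = (p[5] << 1) | (p[4] >> 63);
        hi[2] = (p[6] << 1) | (p[5] >> 63);
        hi[3] = (p[7] << 1) | (p[6] >> 63);

        /* lo += 19 * hi, since 2^255 == 19 (mod p); c ends up as the carry limb. */
        c = 0;
        for (i = 0; i < 4; i++) {
            c += (u128)lo[i] + (u128)19 * hi[i];
            lo[i] = (uint64_t)c;
            c >>= 64;
        }

        /* "Overflow": bits 255 and up of the sum are again worth 19 each. */
        over = ((uint64_t)c << 1) | (lo[3] >> 63);
        lo[3] &= 0x7fffffffffffffffULL;
        c = (u128)lo[0] + 19 * over;
        for (i = 1; i < 4; i++) {
            lo[i - 1] = (uint64_t)c;
            c = (c >> 64) + lo[i];
        }
        lo[3] = (uint64_t)c;

        /* "Reduce if top bit set": branch-free add of 0 or 19, as the
         * asr/and pair builds a 0-or-19 value without a branch. */
        add19 = (0 - (lo[3] >> 63)) & 19;
        lo[3] &= 0x7fffffffffffffffULL;
        c = (u128)lo[0] + add19;
        for (i = 1; i < 4; i++) {
            lo[i - 1] = (uint64_t)c;
            c = (c >> 64) + lo[i];
        }
        lo[3] = (uint64_t)c;

        for (i = 0; i < 4; i++)
            r[i] = lo[i];
    }

The branch-free 0-or-19 correction keeps this step constant-time with respect to the value being reduced.
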
+ and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr # Store - stp x6, x7, [x0] - stp x8, x9, [x0, #16] + stp x14, x15, [x0] + stp x16, x17, [x0, #16] mov x0, xzr - ldr x17, [x29, #200] - ldr x18, [x29, #208] - ldr x19, [x29, #216] - ldr x20, [x29, #224] - ldr x21, [x29, #232] - ldr x22, [x29, #240] - ldr x23, [x29, #248] - ldr x24, [x29, #256] - ldr x25, [x29, #264] - ldp x29, x30, [sp], #0x110 + ldr x17, [x29, #192] + ldr x18, [x29, #200] + ldr x19, [x29, #208] + ldr x20, [x29, #216] + ldr x21, [x29, #224] + ldr x22, [x29, #232] + ldr x23, [x29, #240] + ldr x24, [x29, #248] + ldr x25, [x29, #256] + ldr x26, [x29, #264] + ldr x27, [x29, #272] + ldr x28, [x29, #280] + ldp x29, x30, [sp], #0x120 ret .size curve25519,.-curve25519 .text @@ -2860,8 +2775,6 @@ L_fe_pow22523_8: ldr x0, [x29, #112] ldr x2, [x29, #120] bl fe_mul - ldr x1, [x29, #120] - ldr x0, [x29, #112] ldr x21, [x29, #136] ldp x29, x30, [sp], #0x90 ret @@ -2873,12 +2786,11 @@ L_fe_pow22523_8: fe_ge_to_p2: stp x29, x30, [sp, #-112]! add x29, sp, #0 - str x17, [x29, #64] - str x18, [x29, #72] - str x19, [x29, #80] - str x20, [x29, #88] - str x21, [x29, #96] - str x22, [x29, #104] + str x17, [x29, #72] + str x18, [x29, #80] + str x19, [x29, #88] + str x20, [x29, #96] + str x21, [x29, #104] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -2888,100 +2800,100 @@ fe_ge_to_p2: ldr x1, [x29, #32] ldr x2, [x29, #56] # Multiply - ldp x11, x16, [x1] - ldp x17, x18, [x1, #16] - ldp x19, x20, [x2] - ldp x21, x22, [x2, #16] + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x18, [x2, #16] # A[0] * B[0] - mul x3, x11, x19 - umulh x4, x11, x19 + mul x3, x11, x15 + umulh x4, x11, x15 # A[0] * B[1] - mul x12, x11, x20 - umulh x5, x11, x20 - adds x4, x4, x12 + mul x19, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x19 adc x5, x5, xzr # A[1] * B[0] - mul x12, x16, x19 - umulh x13, x16, x19 - adds x4, x4, x12 - adcs x5, x5, x13 + mul x19, x12, x15 + umulh x20, x12, x15 + adds x4, x4, x19 + adcs x5, x5, x20 adc x6, xzr, xzr # A[0] * B[2] - mul x12, x11, x21 - umulh x13, x11, x21 - adds x5, x5, x12 - adc x6, x6, x13 + mul x19, x11, x17 + umulh x20, x11, x17 + adds x5, x5, x19 + adc x6, x6, x20 # A[1] * B[1] - mul x12, x16, x20 - umulh x13, x16, x20 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x19, x12, x16 + umulh x20, x12, x16 + adds x5, x5, x19 + adcs x6, x6, x20 adc x7, xzr, xzr # A[2] * B[0] - mul x12, x17, x19 - umulh x13, x17, x19 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x19, x13, x15 + umulh x20, x13, x15 + adds x5, x5, x19 + adcs x6, x6, x20 adc x7, x7, xzr # A[0] * B[3] - mul x12, x11, x22 - umulh x13, x11, x22 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x11, x18 + umulh x20, x11, x18 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, xzr, xzr # A[1] * B[2] - mul x12, x16, x21 - umulh x13, x16, x21 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x12, x17 + umulh x20, x12, x17 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[2] * B[1] - mul x12, x17, x20 - umulh x13, x17, x20 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x13, x16 + umulh x20, x13, x16 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[3] * B[0] - mul x12, x18, x19 - umulh x13, x18, x19 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x14, x15 + umulh x20, x14, x15 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[1] * B[3] - mul x12, x16, x22 - umulh x13, x16, x22 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x12, x18 + umulh x20, x12, x18 + 
adds x7, x7, x19 + adcs x8, x8, x20 adc x9, xzr, xzr # A[2] * B[2] - mul x12, x17, x21 - umulh x13, x17, x21 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x13, x17 + umulh x20, x13, x17 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, x9, xzr # A[3] * B[1] - mul x12, x18, x20 - umulh x13, x18, x20 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x14, x16 + umulh x20, x14, x16 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, x9, xzr # A[2] * B[3] - mul x12, x17, x22 - umulh x13, x17, x22 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x19, x13, x18 + umulh x20, x13, x18 + adds x8, x8, x19 + adcs x9, x9, x20 adc x10, xzr, xzr # A[3] * B[2] - mul x12, x18, x21 - umulh x13, x18, x21 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x19, x14, x17 + umulh x20, x14, x17 + adds x8, x8, x19 + adcs x9, x9, x20 adc x10, x10, xzr # A[3] * B[3] - mul x12, x18, x22 - umulh x13, x18, x22 - adds x9, x9, x12 - adc x10, x10, x13 + mul x19, x14, x18 + umulh x20, x14, x18 + adds x9, x9, x19 + adc x10, x10, x20 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -2990,38 +2902,38 @@ fe_ge_to_p2: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x7 - umulh x7, x12, x7 - adds x3, x3, x13 - mul x13, x12, x8 - umulh x8, x12, x8 - adcs x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x14, x12, x10 - adcs x6, x6, x13 - adc x14, x14, xzr + mov x19, #19 + mul x20, x19, x7 + umulh x7, x19, x7 + adds x3, x3, x20 + mul x20, x19, x8 + umulh x8, x19, x8 + adcs x4, x4, x20 + mul x20, x19, x9 + umulh x9, x19, x9 + adcs x5, x5, x20 + mul x20, x19, x10 + umulh x21, x19, x10 + adcs x6, x6, x20 + adc x21, x21, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x14, x14, xzr + adc x21, x21, xzr # Overflow - extr x14, x14, x6, #63 - mul x14, x14, x12 + extr x21, x21, x6, #63 + mul x21, x21, x19 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x21 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - lsr x14, x6, #63 - mul x14, x14, x12 + asr x21, x6, #63 + and x21, x21, x19 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x21 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -3032,100 +2944,100 @@ fe_ge_to_p2: ldr x1, [x29, #40] ldr x2, [x29, #48] # Multiply - ldp x11, x16, [x1] - ldp x17, x18, [x1, #16] - ldp x19, x20, [x2] - ldp x21, x22, [x2, #16] + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x18, [x2, #16] # A[0] * B[0] - mul x3, x11, x19 - umulh x4, x11, x19 + mul x3, x11, x15 + umulh x4, x11, x15 # A[0] * B[1] - mul x12, x11, x20 - umulh x5, x11, x20 - adds x4, x4, x12 + mul x19, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x19 adc x5, x5, xzr # A[1] * B[0] - mul x12, x16, x19 - umulh x13, x16, x19 - adds x4, x4, x12 - adcs x5, x5, x13 + mul x19, x12, x15 + umulh x20, x12, x15 + adds x4, x4, x19 + adcs x5, x5, x20 adc x6, xzr, xzr # A[0] * B[2] - mul x12, x11, x21 - umulh x13, x11, x21 - adds x5, x5, x12 - adc x6, x6, x13 + mul x19, x11, x17 + umulh x20, x11, x17 + adds x5, x5, x19 + adc x6, x6, x20 # A[1] * B[1] - mul x12, x16, x20 - umulh x13, x16, x20 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x19, x12, x16 + umulh x20, x12, x16 + adds x5, x5, x19 + adcs x6, x6, x20 adc x7, xzr, xzr # A[2] * B[0] - mul x12, x17, x19 - umulh x13, x17, x19 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x19, x13, x15 + umulh x20, x13, x15 + adds x5, x5, x19 + adcs x6, x6, x20 adc 
x7, x7, xzr # A[0] * B[3] - mul x12, x11, x22 - umulh x13, x11, x22 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x11, x18 + umulh x20, x11, x18 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, xzr, xzr # A[1] * B[2] - mul x12, x16, x21 - umulh x13, x16, x21 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x12, x17 + umulh x20, x12, x17 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[2] * B[1] - mul x12, x17, x20 - umulh x13, x17, x20 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x13, x16 + umulh x20, x13, x16 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[3] * B[0] - mul x12, x18, x19 - umulh x13, x18, x19 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x14, x15 + umulh x20, x14, x15 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[1] * B[3] - mul x12, x16, x22 - umulh x13, x16, x22 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x12, x18 + umulh x20, x12, x18 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, xzr, xzr # A[2] * B[2] - mul x12, x17, x21 - umulh x13, x17, x21 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x13, x17 + umulh x20, x13, x17 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, x9, xzr # A[3] * B[1] - mul x12, x18, x20 - umulh x13, x18, x20 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x14, x16 + umulh x20, x14, x16 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, x9, xzr # A[2] * B[3] - mul x12, x17, x22 - umulh x13, x17, x22 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x19, x13, x18 + umulh x20, x13, x18 + adds x8, x8, x19 + adcs x9, x9, x20 adc x10, xzr, xzr # A[3] * B[2] - mul x12, x18, x21 - umulh x13, x18, x21 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x19, x14, x17 + umulh x20, x14, x17 + adds x8, x8, x19 + adcs x9, x9, x20 adc x10, x10, xzr # A[3] * B[3] - mul x12, x18, x22 - umulh x13, x18, x22 - adds x9, x9, x12 - adc x10, x10, x13 + mul x19, x14, x18 + umulh x20, x14, x18 + adds x9, x9, x19 + adc x10, x10, x20 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3134,38 +3046,38 @@ fe_ge_to_p2: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x7 - umulh x7, x12, x7 - adds x3, x3, x13 - mul x13, x12, x8 - umulh x8, x12, x8 - adcs x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x14, x12, x10 - adcs x6, x6, x13 - adc x14, x14, xzr + mov x19, #19 + mul x20, x19, x7 + umulh x7, x19, x7 + adds x3, x3, x20 + mul x20, x19, x8 + umulh x8, x19, x8 + adcs x4, x4, x20 + mul x20, x19, x9 + umulh x9, x19, x9 + adcs x5, x5, x20 + mul x20, x19, x10 + umulh x21, x19, x10 + adcs x6, x6, x20 + adc x21, x21, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x14, x14, xzr + adc x21, x21, xzr # Overflow - extr x14, x14, x6, #63 - mul x14, x14, x12 + extr x21, x21, x6, #63 + mul x21, x21, x19 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x21 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - lsr x14, x6, #63 - mul x14, x14, x12 + asr x21, x6, #63 + and x21, x21, x19 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x21 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -3173,102 +3085,100 @@ fe_ge_to_p2: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x0, [x29, #24] - ldr x1, [x29, #56] + ldr x2, [x29, #56] # Multiply - ldp x11, x16, [x2] - ldp x17, x18, [x2, #16] - ldp x19, x20, [x1] - ldp x21, x22, [x1, #16] + ldp x11, x12, [x2] + ldp x13, x14, [x2, #16] # A[0] * B[0] - mul x3, x11, x19 - umulh x4, x11, 
x19 + mul x3, x15, x11 + umulh x4, x15, x11 # A[0] * B[1] - mul x12, x11, x20 - umulh x5, x11, x20 - adds x4, x4, x12 + mul x19, x15, x12 + umulh x5, x15, x12 + adds x4, x4, x19 adc x5, x5, xzr # A[1] * B[0] - mul x12, x16, x19 - umulh x13, x16, x19 - adds x4, x4, x12 - adcs x5, x5, x13 + mul x19, x16, x11 + umulh x20, x16, x11 + adds x4, x4, x19 + adcs x5, x5, x20 adc x6, xzr, xzr # A[0] * B[2] - mul x12, x11, x21 - umulh x13, x11, x21 - adds x5, x5, x12 - adc x6, x6, x13 + mul x19, x15, x13 + umulh x20, x15, x13 + adds x5, x5, x19 + adc x6, x6, x20 # A[1] * B[1] - mul x12, x16, x20 - umulh x13, x16, x20 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x19, x16, x12 + umulh x20, x16, x12 + adds x5, x5, x19 + adcs x6, x6, x20 adc x7, xzr, xzr # A[2] * B[0] - mul x12, x17, x19 - umulh x13, x17, x19 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x19, x17, x11 + umulh x20, x17, x11 + adds x5, x5, x19 + adcs x6, x6, x20 adc x7, x7, xzr # A[0] * B[3] - mul x12, x11, x22 - umulh x13, x11, x22 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x15, x14 + umulh x20, x15, x14 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, xzr, xzr # A[1] * B[2] - mul x12, x16, x21 - umulh x13, x16, x21 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x16, x13 + umulh x20, x16, x13 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[2] * B[1] - mul x12, x17, x20 - umulh x13, x17, x20 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x17, x12 + umulh x20, x17, x12 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[3] * B[0] - mul x12, x18, x19 - umulh x13, x18, x19 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x19, x18, x11 + umulh x20, x18, x11 + adds x6, x6, x19 + adcs x7, x7, x20 adc x8, x8, xzr # A[1] * B[3] - mul x12, x16, x22 - umulh x13, x16, x22 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x16, x14 + umulh x20, x16, x14 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, xzr, xzr # A[2] * B[2] - mul x12, x17, x21 - umulh x13, x17, x21 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x17, x13 + umulh x20, x17, x13 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, x9, xzr # A[3] * B[1] - mul x12, x18, x20 - umulh x13, x18, x20 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x19, x18, x12 + umulh x20, x18, x12 + adds x7, x7, x19 + adcs x8, x8, x20 adc x9, x9, xzr # A[2] * B[3] - mul x12, x17, x22 - umulh x13, x17, x22 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x19, x17, x14 + umulh x20, x17, x14 + adds x8, x8, x19 + adcs x9, x9, x20 adc x10, xzr, xzr # A[3] * B[2] - mul x12, x18, x21 - umulh x13, x18, x21 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x19, x18, x13 + umulh x20, x18, x13 + adds x8, x8, x19 + adcs x9, x9, x20 adc x10, x10, xzr # A[3] * B[3] - mul x12, x18, x22 - umulh x13, x18, x22 - adds x9, x9, x12 - adc x10, x10, x13 + mul x19, x18, x14 + umulh x20, x18, x14 + adds x9, x9, x19 + adc x10, x10, x20 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3277,50 +3187,49 @@ fe_ge_to_p2: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x7 - umulh x7, x12, x7 - adds x3, x3, x13 - mul x13, x12, x8 - umulh x8, x12, x8 - adcs x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x14, x12, x10 - adcs x6, x6, x13 - adc x14, x14, xzr + mov x19, #19 + mul x20, x19, x7 + umulh x7, x19, x7 + adds x3, x3, x20 + mul x20, x19, x8 + umulh x8, x19, x8 + adcs x4, x4, x20 + mul x20, x19, x9 + umulh x9, x19, x9 + adcs x5, x5, x20 + mul x20, x19, x10 + umulh x21, x19, x10 + adcs x6, x6, x20 + 
adc x21, x21, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x14, x14, xzr + adc x21, x21, xzr # Overflow - extr x14, x14, x6, #63 - mul x14, x14, x12 + extr x21, x21, x6, #63 + mul x21, x21, x19 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x21 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - lsr x14, x6, #63 - mul x14, x14, x12 + asr x21, x6, #63 + and x21, x21, x19 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x21 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Store stp x3, x4, [x0] stp x5, x6, [x0, #16] - ldr x17, [x29, #64] - ldr x18, [x29, #72] - ldr x19, [x29, #80] - ldr x20, [x29, #88] - ldr x21, [x29, #96] - ldr x22, [x29, #104] + ldr x17, [x29, #72] + ldr x18, [x29, #80] + ldr x19, [x29, #88] + ldr x20, [x29, #96] + ldr x21, [x29, #104] ldp x29, x30, [sp], #0x70 ret .size fe_ge_to_p2,.-fe_ge_to_p2 @@ -3329,14 +3238,17 @@ fe_ge_to_p2: .type fe_ge_to_p3,@function .align 4 fe_ge_to_p3: - stp x29, x30, [sp, #-128]! + stp x29, x30, [sp, #-160]! add x29, sp, #0 - str x17, [x29, #80] - str x18, [x29, #88] - str x19, [x29, #96] - str x20, [x29, #104] - str x21, [x29, #112] - str x22, [x29, #120] + str x17, [x29, #88] + str x18, [x29, #96] + str x19, [x29, #104] + str x20, [x29, #112] + str x21, [x29, #120] + str x22, [x29, #128] + str x23, [x29, #136] + str x24, [x29, #144] + str x25, [x29, #152] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -3347,387 +3259,100 @@ fe_ge_to_p3: ldr x1, [x29, #40] ldr x2, [x29, #64] # Multiply - ldp x11, x16, [x1] - ldp x17, x18, [x1, #16] - ldp x19, x20, [x2] - ldp x21, x22, [x2, #16] - # A[0] * B[0] - mul x3, x11, x19 - umulh x4, x11, x19 - # A[0] * B[1] - mul x12, x11, x20 - umulh x5, x11, x20 - adds x4, x4, x12 - adc x5, x5, xzr - # A[1] * B[0] - mul x12, x16, x19 - umulh x13, x16, x19 - adds x4, x4, x12 - adcs x5, x5, x13 - adc x6, xzr, xzr - # A[0] * B[2] - mul x12, x11, x21 - umulh x13, x11, x21 - adds x5, x5, x12 - adc x6, x6, x13 - # A[1] * B[1] - mul x12, x16, x20 - umulh x13, x16, x20 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr - # A[2] * B[0] - mul x12, x17, x19 - umulh x13, x17, x19 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, x7, xzr - # A[0] * B[3] - mul x12, x11, x22 - umulh x13, x11, x22 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr - # A[1] * B[2] - mul x12, x16, x21 - umulh x13, x16, x21 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[2] * B[1] - mul x12, x17, x20 - umulh x13, x17, x20 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[3] * B[0] - mul x12, x18, x19 - umulh x13, x18, x19 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[1] * B[3] - mul x12, x16, x22 - umulh x13, x16, x22 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr - # A[2] * B[2] - mul x12, x17, x21 - umulh x13, x17, x21 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[3] * B[1] - mul x12, x18, x20 - umulh x13, x18, x20 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[2] * B[3] - mul x12, x17, x22 - umulh x13, x17, x22 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr - # A[3] * B[2] - mul x12, x18, x21 - umulh x13, x18, x21 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[3] * B[3] - mul x12, x18, x22 - umulh x13, x18, x22 - adds x9, x9, x12 - adc x10, x10, x13 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, 
#63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x7 - umulh x7, x12, x7 - adds x3, x3, x13 - mul x13, x12, x8 - umulh x8, x12, x8 - adcs x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x14, x12, x10 - adcs x6, x6, x13 - adc x14, x14, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x14, x14, xzr - # Overflow - extr x14, x14, x6, #63 - mul x14, x14, x12 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - lsr x14, x6, #63 - mul x14, x14, x12 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x0, [x29, #16] - ldr x1, [x29, #48] - ldr x2, [x29, #56] - # Multiply - ldp x11, x16, [x1] - ldp x17, x18, [x1, #16] - ldp x19, x20, [x2] - ldp x21, x22, [x2, #16] - # A[0] * B[0] - mul x3, x11, x19 - umulh x4, x11, x19 - # A[0] * B[1] - mul x12, x11, x20 - umulh x5, x11, x20 - adds x4, x4, x12 - adc x5, x5, xzr - # A[1] * B[0] - mul x12, x16, x19 - umulh x13, x16, x19 - adds x4, x4, x12 - adcs x5, x5, x13 - adc x6, xzr, xzr - # A[0] * B[2] - mul x12, x11, x21 - umulh x13, x11, x21 - adds x5, x5, x12 - adc x6, x6, x13 - # A[1] * B[1] - mul x12, x16, x20 - umulh x13, x16, x20 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr - # A[2] * B[0] - mul x12, x17, x19 - umulh x13, x17, x19 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, x7, xzr - # A[0] * B[3] - mul x12, x11, x22 - umulh x13, x11, x22 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr - # A[1] * B[2] - mul x12, x16, x21 - umulh x13, x16, x21 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[2] * B[1] - mul x12, x17, x20 - umulh x13, x17, x20 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[3] * B[0] - mul x12, x18, x19 - umulh x13, x18, x19 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[1] * B[3] - mul x12, x16, x22 - umulh x13, x16, x22 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr - # A[2] * B[2] - mul x12, x17, x21 - umulh x13, x17, x21 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[3] * B[1] - mul x12, x18, x20 - umulh x13, x18, x20 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[2] * B[3] - mul x12, x17, x22 - umulh x13, x17, x22 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr - # A[3] * B[2] - mul x12, x18, x21 - umulh x13, x18, x21 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[3] * B[3] - mul x12, x18, x22 - umulh x13, x18, x22 - adds x9, x9, x12 - adc x10, x10, x13 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x7 - umulh x7, x12, x7 - adds x3, x3, x13 - mul x13, x12, x8 - umulh x8, x12, x8 - adcs x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x14, x12, x10 - adcs x6, x6, x13 - adc x14, x14, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x14, x14, xzr - # Overflow - extr x14, x14, x6, #63 - mul x14, x14, x12 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - lsr x14, x6, #63 - mul 
x14, x14, x12 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x0, [x29, #24] - ldr x1, [x29, #64] - # Multiply - ldp x11, x16, [x2] + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] ldp x17, x18, [x2, #16] - ldp x19, x20, [x1] - ldp x21, x22, [x1, #16] # A[0] * B[0] - mul x3, x11, x19 - umulh x4, x11, x19 + mul x3, x11, x15 + umulh x4, x11, x15 # A[0] * B[1] - mul x12, x11, x20 - umulh x5, x11, x20 - adds x4, x4, x12 + mul x23, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x23 adc x5, x5, xzr # A[1] * B[0] - mul x12, x16, x19 - umulh x13, x16, x19 - adds x4, x4, x12 - adcs x5, x5, x13 + mul x23, x12, x15 + umulh x24, x12, x15 + adds x4, x4, x23 + adcs x5, x5, x24 adc x6, xzr, xzr # A[0] * B[2] - mul x12, x11, x21 - umulh x13, x11, x21 - adds x5, x5, x12 - adc x6, x6, x13 + mul x23, x11, x17 + umulh x24, x11, x17 + adds x5, x5, x23 + adc x6, x6, x24 # A[1] * B[1] - mul x12, x16, x20 - umulh x13, x16, x20 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x23, x12, x16 + umulh x24, x12, x16 + adds x5, x5, x23 + adcs x6, x6, x24 adc x7, xzr, xzr # A[2] * B[0] - mul x12, x17, x19 - umulh x13, x17, x19 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x23, x13, x15 + umulh x24, x13, x15 + adds x5, x5, x23 + adcs x6, x6, x24 adc x7, x7, xzr # A[0] * B[3] - mul x12, x11, x22 - umulh x13, x11, x22 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x11, x18 + umulh x24, x11, x18 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, xzr, xzr # A[1] * B[2] - mul x12, x16, x21 - umulh x13, x16, x21 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x12, x17 + umulh x24, x12, x17 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, x8, xzr # A[2] * B[1] - mul x12, x17, x20 - umulh x13, x17, x20 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x13, x16 + umulh x24, x13, x16 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, x8, xzr # A[3] * B[0] - mul x12, x18, x19 - umulh x13, x18, x19 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x14, x15 + umulh x24, x14, x15 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, x8, xzr # A[1] * B[3] - mul x12, x16, x22 - umulh x13, x16, x22 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x23, x12, x18 + umulh x24, x12, x18 + adds x7, x7, x23 + adcs x8, x8, x24 adc x9, xzr, xzr # A[2] * B[2] - mul x12, x17, x21 - umulh x13, x17, x21 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x23, x13, x17 + umulh x24, x13, x17 + adds x7, x7, x23 + adcs x8, x8, x24 adc x9, x9, xzr # A[3] * B[1] - mul x12, x18, x20 - umulh x13, x18, x20 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x23, x14, x16 + umulh x24, x14, x16 + adds x7, x7, x23 + adcs x8, x8, x24 adc x9, x9, xzr # A[2] * B[3] - mul x12, x17, x22 - umulh x13, x17, x22 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x23, x13, x18 + umulh x24, x13, x18 + adds x8, x8, x23 + adcs x9, x9, x24 adc x10, xzr, xzr # A[3] * B[2] - mul x12, x18, x21 - umulh x13, x18, x21 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x23, x14, x17 + umulh x24, x14, x17 + adds x8, x8, x23 + adcs x9, x9, x24 adc x10, x10, xzr # A[3] * B[3] - mul x12, x18, x22 - umulh x13, x18, x22 - adds x9, x9, x12 - adc x10, x10, x13 + mul x23, x14, x18 + umulh x24, x14, x18 + adds x9, x9, x23 + adc x10, x10, x24 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3736,38 +3361,38 @@ fe_ge_to_p3: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x7 - umulh x7, x12, x7 - adds x3, x3, x13 - mul 
x13, x12, x8 - umulh x8, x12, x8 - adcs x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x14, x12, x10 - adcs x6, x6, x13 - adc x14, x14, xzr + mov x23, #19 + mul x24, x23, x7 + umulh x7, x23, x7 + adds x3, x3, x24 + mul x24, x23, x8 + umulh x8, x23, x8 + adcs x4, x4, x24 + mul x24, x23, x9 + umulh x9, x23, x9 + adcs x5, x5, x24 + mul x24, x23, x10 + umulh x25, x23, x10 + adcs x6, x6, x24 + adc x25, x25, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x14, x14, xzr + adc x25, x25, xzr # Overflow - extr x14, x14, x6, #63 - mul x14, x14, x12 + extr x25, x25, x6, #63 + mul x25, x25, x23 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x25 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - lsr x14, x6, #63 - mul x14, x14, x12 + asr x25, x6, #63 + and x25, x25, x23 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x25 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -3775,103 +3400,100 @@ fe_ge_to_p3: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x0, [x29, #32] - ldr x1, [x29, #40] ldr x2, [x29, #48] # Multiply - ldp x11, x16, [x1] - ldp x17, x18, [x1, #16] ldp x19, x20, [x2] ldp x21, x22, [x2, #16] # A[0] * B[0] mul x3, x11, x19 umulh x4, x11, x19 # A[0] * B[1] - mul x12, x11, x20 + mul x23, x11, x20 umulh x5, x11, x20 - adds x4, x4, x12 + adds x4, x4, x23 adc x5, x5, xzr # A[1] * B[0] - mul x12, x16, x19 - umulh x13, x16, x19 - adds x4, x4, x12 - adcs x5, x5, x13 + mul x23, x12, x19 + umulh x24, x12, x19 + adds x4, x4, x23 + adcs x5, x5, x24 adc x6, xzr, xzr # A[0] * B[2] - mul x12, x11, x21 - umulh x13, x11, x21 - adds x5, x5, x12 - adc x6, x6, x13 + mul x23, x11, x21 + umulh x24, x11, x21 + adds x5, x5, x23 + adc x6, x6, x24 # A[1] * B[1] - mul x12, x16, x20 - umulh x13, x16, x20 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x23, x12, x20 + umulh x24, x12, x20 + adds x5, x5, x23 + adcs x6, x6, x24 adc x7, xzr, xzr # A[2] * B[0] - mul x12, x17, x19 - umulh x13, x17, x19 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x23, x13, x19 + umulh x24, x13, x19 + adds x5, x5, x23 + adcs x6, x6, x24 adc x7, x7, xzr # A[0] * B[3] - mul x12, x11, x22 - umulh x13, x11, x22 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x11, x22 + umulh x24, x11, x22 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, xzr, xzr # A[1] * B[2] - mul x12, x16, x21 - umulh x13, x16, x21 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x12, x21 + umulh x24, x12, x21 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, x8, xzr # A[2] * B[1] - mul x12, x17, x20 - umulh x13, x17, x20 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x13, x20 + umulh x24, x13, x20 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, x8, xzr # A[3] * B[0] - mul x12, x18, x19 - umulh x13, x18, x19 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x23, x14, x19 + umulh x24, x14, x19 + adds x6, x6, x23 + adcs x7, x7, x24 adc x8, x8, xzr # A[1] * B[3] - mul x12, x16, x22 - umulh x13, x16, x22 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x23, x12, x22 + umulh x24, x12, x22 + adds x7, x7, x23 + adcs x8, x8, x24 adc x9, xzr, xzr # A[2] * B[2] - mul x12, x17, x21 - umulh x13, x17, x21 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x23, x13, x21 + umulh x24, x13, x21 + adds x7, x7, x23 + adcs x8, x8, x24 adc x9, x9, xzr # A[3] * B[1] - mul x12, x18, x20 - umulh x13, x18, x20 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x23, x14, x20 + umulh x24, x14, x20 + adds x7, x7, x23 + adcs x8, x8, x24 adc x9, x9, xzr # A[2] * B[3] - mul x12, x17, x22 - 
umulh x13, x17, x22 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x23, x13, x22 + umulh x24, x13, x22 + adds x8, x8, x23 + adcs x9, x9, x24 adc x10, xzr, xzr # A[3] * B[2] - mul x12, x18, x21 - umulh x13, x18, x21 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x23, x14, x21 + umulh x24, x14, x21 + adds x8, x8, x23 + adcs x9, x9, x24 adc x10, x10, xzr # A[3] * B[3] - mul x12, x18, x22 - umulh x13, x18, x22 - adds x9, x9, x12 - adc x10, x10, x13 + mul x23, x14, x22 + umulh x24, x14, x22 + adds x9, x9, x23 + adc x10, x10, x24 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3880,51 +3502,333 @@ fe_ge_to_p3: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x7 - umulh x7, x12, x7 - adds x3, x3, x13 - mul x13, x12, x8 - umulh x8, x12, x8 - adcs x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x14, x12, x10 - adcs x6, x6, x13 - adc x14, x14, xzr + mov x23, #19 + mul x24, x23, x7 + umulh x7, x23, x7 + adds x3, x3, x24 + mul x24, x23, x8 + umulh x8, x23, x8 + adcs x4, x4, x24 + mul x24, x23, x9 + umulh x9, x23, x9 + adcs x5, x5, x24 + mul x24, x23, x10 + umulh x25, x23, x10 + adcs x6, x6, x24 + adc x25, x25, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x14, x14, xzr + adc x25, x25, xzr # Overflow - extr x14, x14, x6, #63 - mul x14, x14, x12 + extr x25, x25, x6, #63 + mul x25, x25, x23 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x25 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - lsr x14, x6, #63 - mul x14, x14, x12 + asr x25, x6, #63 + and x25, x25, x23 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x14 + adds x3, x3, x25 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Store stp x3, x4, [x0] stp x5, x6, [x0, #16] - ldr x17, [x29, #80] - ldr x18, [x29, #88] - ldr x19, [x29, #96] - ldr x20, [x29, #104] - ldr x21, [x29, #112] - ldr x22, [x29, #120] - ldp x29, x30, [sp], #0x80 + ldr x0, [x29, #16] + ldr x2, [x29, #56] + # Multiply + ldp x11, x12, [x2] + ldp x13, x14, [x2, #16] + # A[0] * B[0] + mul x3, x19, x11 + umulh x4, x19, x11 + # A[0] * B[1] + mul x23, x19, x12 + umulh x5, x19, x12 + adds x4, x4, x23 + adc x5, x5, xzr + # A[1] * B[0] + mul x23, x20, x11 + umulh x24, x20, x11 + adds x4, x4, x23 + adcs x5, x5, x24 + adc x6, xzr, xzr + # A[0] * B[2] + mul x23, x19, x13 + umulh x24, x19, x13 + adds x5, x5, x23 + adc x6, x6, x24 + # A[1] * B[1] + mul x23, x20, x12 + umulh x24, x20, x12 + adds x5, x5, x23 + adcs x6, x6, x24 + adc x7, xzr, xzr + # A[2] * B[0] + mul x23, x21, x11 + umulh x24, x21, x11 + adds x5, x5, x23 + adcs x6, x6, x24 + adc x7, x7, xzr + # A[0] * B[3] + mul x23, x19, x14 + umulh x24, x19, x14 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, xzr, xzr + # A[1] * B[2] + mul x23, x20, x13 + umulh x24, x20, x13 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, x8, xzr + # A[2] * B[1] + mul x23, x21, x12 + umulh x24, x21, x12 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, x8, xzr + # A[3] * B[0] + mul x23, x22, x11 + umulh x24, x22, x11 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, x8, xzr + # A[1] * B[3] + mul x23, x20, x14 + umulh x24, x20, x14 + adds x7, x7, x23 + adcs x8, x8, x24 + adc x9, xzr, xzr + # A[2] * B[2] + mul x23, x21, x13 + umulh x24, x21, x13 + adds x7, x7, x23 + adcs x8, x8, x24 + adc x9, x9, xzr + # A[3] * B[1] + mul x23, x22, x12 + umulh x24, x22, x12 + adds x7, x7, x23 + adcs x8, x8, x24 + adc x9, x9, xzr + # A[2] * B[3] + 
mul x23, x21, x14 + umulh x24, x21, x14 + adds x8, x8, x23 + adcs x9, x9, x24 + adc x10, xzr, xzr + # A[3] * B[2] + mul x23, x22, x13 + umulh x24, x22, x13 + adds x8, x8, x23 + adcs x9, x9, x24 + adc x10, x10, xzr + # A[3] * B[3] + mul x23, x22, x14 + umulh x24, x22, x14 + adds x9, x9, x23 + adc x10, x10, x24 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x23, #19 + mul x24, x23, x7 + umulh x7, x23, x7 + adds x3, x3, x24 + mul x24, x23, x8 + umulh x8, x23, x8 + adcs x4, x4, x24 + mul x24, x23, x9 + umulh x9, x23, x9 + adcs x5, x5, x24 + mul x24, x23, x10 + umulh x25, x23, x10 + adcs x6, x6, x24 + adc x25, x25, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x25, x25, xzr + # Overflow + extr x25, x25, x6, #63 + mul x25, x25, x23 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x25 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + asr x25, x6, #63 + and x25, x25, x23 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x25 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #24] + # Multiply + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x23, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x23 + adc x5, x5, xzr + # A[1] * B[0] + mul x23, x12, x15 + umulh x24, x12, x15 + adds x4, x4, x23 + adcs x5, x5, x24 + adc x6, xzr, xzr + # A[0] * B[2] + mul x23, x11, x17 + umulh x24, x11, x17 + adds x5, x5, x23 + adc x6, x6, x24 + # A[1] * B[1] + mul x23, x12, x16 + umulh x24, x12, x16 + adds x5, x5, x23 + adcs x6, x6, x24 + adc x7, xzr, xzr + # A[2] * B[0] + mul x23, x13, x15 + umulh x24, x13, x15 + adds x5, x5, x23 + adcs x6, x6, x24 + adc x7, x7, xzr + # A[0] * B[3] + mul x23, x11, x18 + umulh x24, x11, x18 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, xzr, xzr + # A[1] * B[2] + mul x23, x12, x17 + umulh x24, x12, x17 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, x8, xzr + # A[2] * B[1] + mul x23, x13, x16 + umulh x24, x13, x16 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, x8, xzr + # A[3] * B[0] + mul x23, x14, x15 + umulh x24, x14, x15 + adds x6, x6, x23 + adcs x7, x7, x24 + adc x8, x8, xzr + # A[1] * B[3] + mul x23, x12, x18 + umulh x24, x12, x18 + adds x7, x7, x23 + adcs x8, x8, x24 + adc x9, xzr, xzr + # A[2] * B[2] + mul x23, x13, x17 + umulh x24, x13, x17 + adds x7, x7, x23 + adcs x8, x8, x24 + adc x9, x9, xzr + # A[3] * B[1] + mul x23, x14, x16 + umulh x24, x14, x16 + adds x7, x7, x23 + adcs x8, x8, x24 + adc x9, x9, xzr + # A[2] * B[3] + mul x23, x13, x18 + umulh x24, x13, x18 + adds x8, x8, x23 + adcs x9, x9, x24 + adc x10, xzr, xzr + # A[3] * B[2] + mul x23, x14, x17 + umulh x24, x14, x17 + adds x8, x8, x23 + adcs x9, x9, x24 + adc x10, x10, xzr + # A[3] * B[3] + mul x23, x14, x18 + umulh x24, x14, x18 + adds x9, x9, x23 + adc x10, x10, x24 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x23, #19 + mul x24, x23, x7 + umulh x7, x23, x7 + adds x3, x3, x24 + mul x24, x23, x8 + umulh x8, x23, x8 + adcs x4, x4, x24 + mul x24, x23, x9 + umulh x9, x23, x9 + adcs x5, x5, x24 + mul x24, x23, x10 + umulh x25, x23, x10 + adcs x6, x6, x24 + adc x25, x25, xzr 
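
Each # Multiply block in these routines is the same fully unrolled schoolbook 4x4-limb product, one mul/umulh pair per A[i] * B[j], whose eight-limb result then feeds the reduction sketched earlier. A compact C model of that product, again illustrative rather than wolfSSL API, and again relying on the GCC/Clang unsigned __int128 extension:

    /* Sketch: 4x4 64-bit limb multiply, p = a * b as eight limbs.
     * Each a[i]*b[j] contributes its low half (mul) to limb i+j and its
     * high half (umulh) to limb i+j+1, as the unrolled assembly does. */
    #include <stdint.h>

    static void mul_4x4_sketch(uint64_t p[8], const uint64_t a[4],
                               const uint64_t b[4])
    {
        unsigned __int128 acc[8] = {0};     /* GCC/Clang extension */
        unsigned __int128 t, carry;
        int i, j, k;

        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                t = (unsigned __int128)a[i] * b[j];
                acc[i + j]     += (uint64_t)t;         /* mul   result */
                acc[i + j + 1] += (uint64_t)(t >> 64); /* umulh result */
            }
        }

        /* Resolve the deferred carries into 64-bit limbs. */
        carry = 0;
        for (k = 0; k < 8; k++) {
            carry += acc[k];
            p[k] = (uint64_t)carry;
            carry >>= 64;
        }
    }

Deferring the carries into 128-bit accumulators keeps the C model short; the assembly instead resolves each carry immediately with adds/adcs chains so that only 64-bit registers are needed.
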
+ # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x25, x25, xzr + # Overflow + extr x25, x25, x6, #63 + mul x25, x25, x23 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x25 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + asr x25, x6, #63 + and x25, x25, x23 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x25 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x17, [x29, #88] + ldr x18, [x29, #96] + ldr x19, [x29, #104] + ldr x20, [x29, #112] + ldr x21, [x29, #120] + ldr x22, [x29, #128] + ldr x23, [x29, #136] + ldr x24, [x29, #144] + ldr x25, [x29, #152] + ldp x29, x30, [sp], #0xa0 ret .size fe_ge_to_p3,.-fe_ge_to_p3 .text @@ -3932,7 +3836,7 @@ fe_ge_to_p3: .type fe_ge_dbl,@function .align 4 fe_ge_dbl: - stp x29, x30, [sp, #-144]! + stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] str x18, [x29, #96] @@ -3941,6 +3845,10 @@ fe_ge_dbl: str x21, [x29, #120] str x22, [x29, #128] str x23, [x29, #136] + str x24, [x29, #144] + str x25, [x29, #152] + str x26, [x29, #160] + str x27, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -3950,36 +3858,36 @@ fe_ge_dbl: str x6, [x29, #64] ldr x1, [x29, #48] # Square - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] # A[0] * A[1] - mul x5, x20, x21 - umulh x6, x20, x21 + mul x5, x12, x13 + umulh x6, x12, x13 # A[0] * A[2] - mul x12, x20, x22 - umulh x7, x20, x22 - adds x6, x6, x12 + mul x24, x12, x14 + umulh x7, x12, x14 + adds x6, x6, x24 adc x7, x7, xzr # A[0] * A[3] - mul x12, x20, x23 - umulh x8, x20, x23 - adds x7, x7, x12 + mul x24, x12, x15 + umulh x8, x12, x15 + adds x7, x7, x24 adc x8, x8, xzr # A[1] * A[2] - mul x12, x21, x22 - umulh x13, x21, x22 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x13, x14 + umulh x25, x13, x14 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[1] * A[3] - mul x12, x21, x23 - umulh x13, x21, x23 - adds x8, x8, x12 - adc x9, x9, x13 + mul x24, x13, x15 + umulh x25, x13, x15 + adds x8, x8, x24 + adc x9, x9, x25 # A[2] * A[3] - mul x12, x22, x23 - umulh x10, x22, x23 - adds x9, x9, x12 + mul x24, x14, x15 + umulh x10, x14, x15 + adds x9, x9, x24 adc x10, x10, xzr # Double adds x5, x5, x5 @@ -3990,26 +3898,26 @@ fe_ge_dbl: adcs x10, x10, x10 adc x11, xzr, xzr # A[0] * A[0] - mul x4, x20, x20 - umulh x15, x20, x20 + mul x4, x12, x12 + umulh x26, x12, x12 # A[1] * A[1] - mul x12, x21, x21 - umulh x13, x21, x21 - adds x5, x5, x15 - adcs x6, x6, x12 - adc x15, x13, xzr + mul x24, x13, x13 + umulh x25, x13, x13 + adds x5, x5, x26 + adcs x6, x6, x24 + adc x26, x25, xzr # A[2] * A[2] - mul x12, x22, x22 - umulh x13, x22, x22 - adds x7, x7, x15 - adcs x8, x8, x12 - adc x15, x13, xzr + mul x24, x14, x14 + umulh x25, x14, x14 + adds x7, x7, x26 + adcs x8, x8, x24 + adc x26, x25, xzr # A[3] * A[3] - mul x12, x23, x23 - umulh x13, x23, x23 - adds x9, x9, x15 - adcs x10, x10, x12 - adc x11, x11, x13 + mul x24, x15, x15 + umulh x25, x15, x15 + adds x9, x9, x26 + adcs x10, x10, x24 + adc x11, x11, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -4018,38 +3926,38 @@ fe_ge_dbl: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, 
x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x8 + umulh x8, x24, x8 + adds x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x10, x24, x10 + adcs x6, x6, x25 + mul x25, x24, x11 + umulh x26, x24, x11 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x14, x14, xzr + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -4057,338 +3965,323 @@ fe_ge_dbl: stp x4, x5, [x0] stp x6, x7, [x0, #16] ldr x0, [x29, #32] - ldr x2, [x29, #56] + ldr x1, [x29, #56] # Square - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] + ldp x20, x21, [x1] + ldp x22, x23, [x1, #16] # A[0] * A[1] - mul x5, x20, x21 - umulh x6, x20, x21 + mul x9, x20, x21 + umulh x10, x20, x21 # A[0] * A[2] - mul x12, x20, x22 - umulh x7, x20, x22 - adds x6, x6, x12 - adc x7, x7, xzr + mul x24, x20, x22 + umulh x11, x20, x22 + adds x10, x10, x24 + adc x11, x11, xzr # A[0] * A[3] - mul x12, x20, x23 - umulh x8, x20, x23 - adds x7, x7, x12 - adc x8, x8, xzr + mul x24, x20, x23 + umulh x16, x20, x23 + adds x11, x11, x24 + adc x16, x16, xzr # A[1] * A[2] - mul x12, x21, x22 - umulh x13, x21, x22 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x21, x22 + umulh x25, x21, x22 + adds x11, x11, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr # A[1] * A[3] - mul x12, x21, x23 - umulh x13, x21, x23 - adds x8, x8, x12 - adc x9, x9, x13 + mul x24, x21, x23 + umulh x25, x21, x23 + adds x16, x16, x24 + adc x17, x17, x25 # A[2] * A[3] - mul x12, x22, x23 - umulh x10, x22, x23 - adds x9, x9, x12 - adc x10, x10, xzr + mul x24, x22, x23 + umulh x18, x22, x23 + adds x17, x17, x24 + adc x18, x18, xzr # Double - adds x5, x5, x5 - adcs x6, x6, x6 - adcs x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 + adds x9, x9, x9 adcs x10, x10, x10 - adc x11, xzr, xzr + adcs x11, x11, x11 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x18, x18, x18 + adc x19, xzr, xzr # A[0] * A[0] - mul x4, x20, x20 - umulh x15, x20, x20 + mul x8, x20, x20 + umulh x26, x20, x20 # A[1] * A[1] - mul x12, x21, x21 - umulh x13, x21, x21 - adds x5, x5, x15 - adcs x6, x6, x12 - adc x15, x13, xzr + mul x24, x21, x21 + umulh x25, x21, x21 + adds x9, x9, x26 + adcs x10, x10, x24 + adc x26, x25, xzr # A[2] * A[2] - mul x12, x22, x22 - umulh x13, x22, x22 - adds x7, x7, x15 - adcs x8, x8, x12 - adc x15, x13, xzr + mul x24, x22, x22 + umulh x25, x22, x22 + adds x11, x11, x26 + adcs x16, x16, x24 + adc x26, x25, xzr # A[3] * A[3] - mul x12, x23, x23 - umulh x13, x23, x23 - adds x9, x9, x15 - adcs x10, x10, x12 - adc x11, x11, x13 + mul x24, x23, x23 + umulh x25, x23, x23 + adds x17, x17, x26 + adcs x18, x18, x24 + adc x19, x19, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x11, #63 + and x11, x11, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - 
mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x8, x8, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x9, x9, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x10, x10, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x11, x11, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + adds x9, x9, x16 + adcs x10, x10, x17 + adcs x11, x11, x18 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + extr x26, x26, x11, #63 + mul x26, x26, x24 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x26 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + asr x26, x11, #63 + and x26, x26, x24 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x26 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr # Store - stp x4, x5, [x0] - stp x6, x7, [x0, #16] + stp x8, x9, [x0] + stp x10, x11, [x0, #16] ldr x0, [x29, #24] # Add - ldp x4, x5, [x1] - ldp x6, x7, [x1, #16] - ldp x8, x9, [x2] - ldp x10, x11, [x2, #16] - adds x4, x4, x8 - adcs x5, x5, x9 - adcs x6, x6, x10 - adc x7, x7, x11 - mov x12, #-19 - asr x15, x7, #63 + adds x12, x12, x20 + adcs x13, x13, x21 + adcs x14, x14, x22 + adc x15, x15, x23 + mov x24, #-19 + asr x27, x15, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x12 - sbcs x5, x5, x15 - sbcs x6, x6, x15 - sbc x7, x7, x13 - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ldr x1, [x29, #40] + subs x12, x12, x24 + sbcs x13, x13, x27 + sbcs x14, x14, x27 + sbc x15, x15, x25 + ldr x0, [x29, #40] # Square - ldp x20, x21, [x0] - ldp x22, x23, [x0, #16] # A[0] * A[1] - mul x5, x20, x21 - umulh x6, x20, x21 + mul x17, x12, x13 + umulh x18, x12, x13 # A[0] * A[2] - mul x12, x20, x22 - umulh x7, x20, x22 - adds x6, x6, x12 - adc x7, x7, xzr + mul x24, x12, x14 + umulh x19, x12, x14 + adds x18, x18, x24 + adc x19, x19, xzr # A[0] * A[3] - mul x12, x20, x23 - umulh x8, x20, x23 - adds x7, x7, x12 - adc x8, x8, xzr + mul x24, x12, x15 + umulh x20, x12, x15 + adds x19, x19, x24 + adc x20, x20, xzr # A[1] * A[2] - mul x12, x21, x22 - umulh x13, x21, x22 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x13, x14 + umulh x25, x13, x14 + adds x19, x19, x24 + adcs x20, x20, x25 + adc x21, xzr, xzr # A[1] * A[3] - mul x12, x21, x23 - umulh x13, x21, x23 - adds x8, x8, x12 - adc x9, x9, x13 + mul x24, x13, x15 + umulh x25, x13, x15 + adds x20, x20, x24 + adc x21, x21, x25 # A[2] * A[3] - mul x12, x22, x23 - umulh x10, x22, x23 - adds x9, x9, x12 - adc x10, x10, xzr + mul x24, x14, x15 + umulh x22, x14, x15 + adds x21, x21, x24 + adc x22, x22, xzr # Double - adds x5, x5, x5 - adcs x6, x6, x6 - adcs x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adc x11, xzr, xzr + adds x17, x17, x17 + adcs x18, x18, x18 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adcs x22, 
x22, x22 + adc x23, xzr, xzr # A[0] * A[0] - mul x4, x20, x20 - umulh x15, x20, x20 + mul x16, x12, x12 + umulh x26, x12, x12 # A[1] * A[1] - mul x12, x21, x21 - umulh x13, x21, x21 - adds x5, x5, x15 - adcs x6, x6, x12 - adc x15, x13, xzr + mul x24, x13, x13 + umulh x25, x13, x13 + adds x17, x17, x26 + adcs x18, x18, x24 + adc x26, x25, xzr # A[2] * A[2] - mul x12, x22, x22 - umulh x13, x22, x22 - adds x7, x7, x15 - adcs x8, x8, x12 - adc x15, x13, xzr + mul x24, x14, x14 + umulh x25, x14, x14 + adds x19, x19, x26 + adcs x20, x20, x24 + adc x26, x25, xzr # A[3] * A[3] - mul x12, x23, x23 - umulh x13, x23, x23 - adds x9, x9, x15 - adcs x10, x10, x12 - adc x11, x11, x13 + mul x24, x15, x15 + umulh x25, x15, x15 + adds x21, x21, x26 + adcs x22, x22, x24 + adc x23, x23, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff + extr x23, x23, x22, #63 + extr x22, x22, x21, #63 + extr x21, x21, x20, #63 + extr x20, x20, x19, #63 + and x19, x19, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x20 + umulh x20, x24, x20 + adds x16, x16, x25 + mul x25, x24, x21 + umulh x21, x24, x21 + adcs x17, x17, x25 + mul x25, x24, x22 + umulh x22, x24, x22 + adcs x18, x18, x25 + mul x25, x24, x23 + umulh x26, x24, x23 + adcs x19, x19, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + adds x17, x17, x20 + adcs x18, x18, x21 + adcs x19, x19, x22 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + extr x26, x26, x19, #63 + mul x26, x26, x24 + and x19, x19, #0x7fffffffffffffff + adds x16, x16, x26 + adcs x17, x17, xzr + adcs x18, x18, xzr + adc x19, x19, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + asr x26, x19, #63 + and x26, x26, x24 + and x19, x19, #0x7fffffffffffffff + adds x16, x16, x26 + adcs x17, x17, xzr + adcs x18, x18, xzr + adc x19, x19, xzr # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x1, [x29, #32] - ldr x2, [x29, #16] - # Add - ldp x4, x5, [x1] - ldp x6, x7, [x1, #16] - ldp x8, x9, [x2] - ldp x10, x11, [x2, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 - # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 stp x16, x17, [x0] stp x18, x19, [x0, #16] - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x1, [x29, #40] - # Sub - ldp x4, x5, [x1] - ldp x6, x7, [x1, #16] - ldp x8, x9, [x0] - ldp x10, x11, [x0, #16] - subs 
x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + ldr x0, [x29, #24] + ldr x1, [x29, #32] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x24, #-19 + asr x27, x15, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x24 + sbcs x13, x13, x27 + sbcs x14, x14, x27 + sbc x15, x15, x25 + # Sub + subs x20, x8, x4 + sbcs x21, x9, x5 + sbcs x22, x10, x6 + sbcs x23, x11, x7 + mov x24, #-19 + csetm x27, cc + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x4, x5, [x2] - stp x6, x7, [x2, #16] - ldr x0, [x29, #64] + adds x20, x20, x24 + adcs x21, x21, x27 + adcs x22, x22, x27 + adc x23, x23, x25 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x20, x21, [x1] + stp x22, x23, [x1, #16] + ldr x0, [x29, #16] + # Sub + subs x16, x16, x12 + sbcs x17, x17, x13 + sbcs x18, x18, x14 + sbcs x19, x19, x15 + mov x24, #-19 + csetm x27, cc + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x16, x17, [x0] + stp x18, x19, [x0, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #64] # Square * 2 - ldp x20, x21, [x0] - ldp x22, x23, [x0, #16] + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] # A[0] * A[1] - mul x5, x20, x21 - umulh x6, x20, x21 + mul x5, x12, x13 + umulh x6, x12, x13 # A[0] * A[2] - mul x12, x20, x22 - umulh x7, x20, x22 - adds x6, x6, x12 + mul x24, x12, x14 + umulh x7, x12, x14 + adds x6, x6, x24 adc x7, x7, xzr # A[0] * A[3] - mul x12, x20, x23 - umulh x8, x20, x23 - adds x7, x7, x12 + mul x24, x12, x15 + umulh x8, x12, x15 + adds x7, x7, x24 adc x8, x8, xzr # A[1] * A[2] - mul x12, x21, x22 - umulh x13, x21, x22 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x13, x14 + umulh x25, x13, x14 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[1] * A[3] - mul x12, x21, x23 - umulh x13, x21, x23 - adds x8, x8, x12 - adc x9, x9, x13 + mul x24, x13, x15 + umulh x25, x13, x15 + adds x8, x8, x24 + adc x9, x9, x25 # A[2] * A[3] - mul x12, x22, x23 - umulh x10, x22, x23 - adds x9, x9, x12 + mul x24, x14, x15 + umulh x10, x14, x15 + adds x9, x9, x24 adc x10, x10, xzr # Double adds x5, x5, x5 @@ -4399,30 +4292,30 @@ fe_ge_dbl: adcs x10, x10, x10 adc x11, xzr, xzr # A[0] * A[0] - mul x4, x20, x20 - umulh x15, x20, x20 + mul x4, x12, x12 + umulh x27, x12, x12 # A[1] * A[1] - mul x12, x21, x21 - umulh x13, x21, x21 - adds x5, x5, x15 - adcs x6, x6, x12 - adc x15, x13, xzr + mul x24, x13, x13 + umulh x25, x13, x13 + adds x5, x5, x27 + adcs x6, x6, x24 + adc x27, x25, xzr # A[2] * A[2] - mul x12, x22, x22 - umulh x13, x22, x22 - adds x7, x7, x15 - adcs x8, x8, x12 - adc x15, x13, xzr + mul x24, x14, x14 + umulh x25, x14, x14 + adds x7, x7, x27 + adcs x8, x8, x24 + adc x27, x25, xzr # A[3] * A[3] - mul x12, x23, x23 - umulh x13, x23, x23 - adds x9, x9, x15 - adcs x10, x10, x12 - adc x11, x11, x13 + mul x24, x15, x15 + umulh x25, x15, x15 + adds x9, x9, x27 + adcs x10, x10, x24 + adc x11, x11, x25 # Double and Reduce - mov x12, #0x169 + mov x24, #0x169 # Move top half into t4-t7 and remove top bit from t3 - lsr x15, x11, #61 + lsr x27, x11, #61 extr x11, x11, x10, #62 extr x10, x10, x9, #62 extr x9, x9, x8, #62 @@ 
-4435,69 +4328,63 @@ fe_ge_dbl: # Two left, only one right and x11, x11, #0x7fffffffffffffff # Multiply top bits by 19*19 - mul x15, x15, x12 + mul x27, x27, x24 # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x8 + umulh x8, x24, x8 + adds x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x10, x24, x10 + adcs x6, x6, x25 + mul x25, x24, x11 + umulh x26, x24, x11 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in - adds x4, x4, x15 + adds x4, x4, x27 adcs x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x14, x14, xzr + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x0, [x29, #32] + ldr x0, [x29, #40] # Sub - ldp x4, x5, [x1] - ldp x6, x7, [x1, #16] - ldp x8, x9, [x0] - ldp x10, x11, [x0, #16] - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x4, x4, x20 + sbcs x5, x5, x21 + sbcs x6, x6, x22 + sbcs x7, x7, x23 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x4, x5, [x1] - stp x6, x7, [x1, #16] + adds x4, x4, x24 + adcs x5, x5, x27 + adcs x6, x6, x27 + adc x7, x7, x25 + stp x4, x5, [x0] + stp x6, x7, [x0, #16] ldr x17, [x29, #88] ldr x18, [x29, #96] ldr x19, [x29, #104] @@ -4505,7 +4392,11 @@ fe_ge_dbl: ldr x21, [x29, #120] ldr x22, [x29, #128] ldr x23, [x29, #136] - ldp x29, x30, [sp], #0x90 + ldr x24, [x29, #144] + ldr x25, [x29, #152] + ldr x26, [x29, #160] + ldr x27, [x29, #168] + ldp x29, x30, [sp], #0xb0 ret .size fe_ge_dbl,.-fe_ge_dbl .text @@ -4534,430 +4425,454 @@ fe_ge_madd: str x5, [x29, #56] str x6, [x29, #64] str x7, [x29, #72] - ldr x1, [x29, #24] ldr x2, [x29, #56] ldr x3, [x29, #48] # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x3] - ldp x10, x11, [x3, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x18, x19, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x18 + adc x7, x15, x19 + mov x24, #-19 + asr x27, x7, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x4, x4, x24 + sbcs x5, x5, x27 + sbcs x6, x6, x27 + sbc x7, x7, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x18 + sbcs x11, 
x15, x19 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x2, [x29, #32] - ldr x3, [x29, #184] + adds x8, x8, x24 + adcs x9, x9, x27 + adcs x10, x10, x27 + adc x11, x11, x25 + ldr x0, [x29, #32] + ldr x2, [x29, #184] # Multiply - ldp x20, x21, [x0] - ldp x22, x23, [x0, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x12, x4, x20 + umulh x13, x4, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr + mul x24, x4, x21 + umulh x14, x4, x21 + adds x13, x13, x24 + adc x14, x14, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr + mul x24, x5, x20 + umulh x25, x5, x20 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x4, x22 + umulh x25, x4, x22 + adds x14, x14, x24 + adc x15, x15, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr + mul x24, x5, x21 + umulh x25, x5, x21 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr + mul x24, x6, x20 + umulh x25, x6, x20 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, x16, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x4, x23 + umulh x25, x4, x23 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x5, x22 + umulh x25, x5, x22 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x6, x21 + umulh x25, x6, x21 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x7, x20 + umulh x25, x7, x20 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr + mul x24, x5, x23 + umulh x25, x5, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x6, x22 + umulh x25, x6, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x7, x21 + umulh x25, x7, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr + mul x24, x6, x23 + umulh x25, x6, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds 
x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr + mul x24, x7, x22 + umulh x25, x7, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x7, x23 + umulh x25, x7, x23 + adds x18, x18, x24 + adc x19, x19, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x12, x12, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x13, x13, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x14, x14, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x15, x15, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x18 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + extr x26, x26, x15, #63 + mul x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + asr x26, x15, #63 + and x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Store - stp x4, x5, [x2] - stp x6, x7, [x2, #16] - ldr x0, [x29, #192] + ldr x0, [x29, #24] + ldr x1, [x29, #192] # Multiply ldp x20, x21, [x1] ldp x22, x23, [x1, #16] - ldp x24, x25, [x0] - ldp x26, x27, [x0, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x4, x8, x20 + umulh x5, x8, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 + mul x24, x8, x21 + umulh x6, x8, x21 + adds x5, x5, x24 adc x6, x6, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x24, x9, x20 + umulh x25, x9, x20 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x8, x22 + umulh x25, x8, x22 + adds x6, x6, x24 + adc x7, x7, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr + mul x24, x9, x21 + umulh x25, x9, x21 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr + mul x24, x10, x20 + umulh x25, x10, x20 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, x16, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x8, x23 + umulh x25, x8, x23 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, xzr, 
xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x9, x22 + umulh x25, x9, x22 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x10, x21 + umulh x25, x10, x21 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x11, x20 + umulh x25, x11, x20 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr + mul x24, x9, x23 + umulh x25, x9, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x10, x22 + umulh x25, x10, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x11, x21 + umulh x25, x11, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr + mul x24, x10, x23 + umulh x25, x10, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr + mul x24, x11, x22 + umulh x25, x11, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x11, x23 + umulh x25, x11, x23 + adds x18, x18, x24 + adc x19, x19, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x4, x4, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x5, x5, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x6, x6, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x18 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, 
x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x24, #-19 + asr x27, x11, #63 + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x24 + sbcs x9, x9, x27 + sbcs x10, x10, x27 + sbc x11, x11, x25 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x18, x14, x6 + sbcs x19, x15, x7 + mov x24, #-19 + csetm x27, cc + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x18, x19, [x1, #16] ldr x0, [x29, #40] ldr x1, [x29, #176] ldr x3, [x29, #72] # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] + ldp x16, x17, [x1] + ldp x18, x19, [x1, #16] + ldp x20, x21, [x3] + ldp x22, x23, [x3, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x4, x16, x20 + umulh x5, x16, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 + mul x24, x16, x21 + umulh x6, x16, x21 + adds x5, x5, x24 adc x6, x6, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x24, x17, x20 + umulh x25, x17, x20 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x16, x22 + umulh x25, x16, x22 + adds x6, x6, x24 + adc x7, x7, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x17, x21 + umulh x25, x17, x21 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x18, x20 + umulh x25, x18, x20 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x16, x23 + umulh x25, x16, x23 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x17, x22 + umulh x25, x17, x22 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x18, x21 + umulh x25, x18, x21 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x19, x20 + umulh x25, x19, x20 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x17, x23 + umulh x25, x17, x23 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x18, x22 + umulh x25, x18, x22 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x19, x21 + umulh x25, x19, x21 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, x18, x23 + umulh x25, x18, x23 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, 
x19, x22 + umulh x25, x19, x22 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, x11, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x19, x23 + umulh x25, x19, x23 + adds x10, x10, x24 + adc x11, x11, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -4966,143 +4881,96 @@ fe_ge_madd: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x8 + umulh x8, x24, x8 + adds x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x10, x24, x10 + adcs x6, x6, x25 + mul x25, x24, x11 + umulh x26, x24, x11 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x14, x14, xzr + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ldr x0, [x29, #24] - ldr x1, [x29, #16] - # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x0] - ldp x10, x11, [x0, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 - # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x0, [x29, #64] + ldr x0, [x29, #32] + ldr x1, [x29, #64] # Double - ldp x4, x5, [x0] - ldp x6, x7, [x0, #16] - adds x4, x4, x4 - adcs x5, x5, x5 - adcs x6, x6, x6 - adc x7, x7, x7 - mov x12, #-19 - asr x15, x7, #63 + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 + mov x24, #-19 + asr x27, x11, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x12 - sbcs x5, x5, x15 - sbcs x6, x6, x15 - sbc x7, x7, x13 - stp x4, x5, [x2] - stp x6, x7, [x2, #16] - ldr x0, [x29, #40] + subs x8, x8, x24 + sbcs x9, x9, x27 + sbcs x10, x10, x27 + sbc x11, x11, x25 + ldr x1, [x29, #40] # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x0] - ldp x10, x11, [x0, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x24, #-19 + asr x27, 
x15, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x12, x12, x24 + sbcs x13, x13, x27 + sbcs x14, x14, x27 + sbc x15, x15, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x16, x8, x4 + sbcs x17, x9, x5 + sbcs x18, x10, x6 + sbcs x19, x11, x7 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x2] - stp x18, x19, [x2, #16] - stp x4, x5, [x0] - stp x6, x7, [x0, #16] + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x18, x19, [x1, #16] ldr x17, [x29, #88] ldr x18, [x29, #96] ldr x19, [x29, #104] @@ -5143,430 +5011,454 @@ fe_ge_msub: str x5, [x29, #56] str x6, [x29, #64] str x7, [x29, #72] - ldr x1, [x29, #24] ldr x2, [x29, #56] ldr x3, [x29, #48] # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x3] - ldp x10, x11, [x3, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x18, x19, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x18 + adc x7, x15, x19 + mov x24, #-19 + asr x27, x7, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x4, x4, x24 + sbcs x5, x5, x27 + sbcs x6, x6, x27 + sbc x7, x7, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x18 + sbcs x11, x15, x19 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x2, [x29, #32] - ldr x3, [x29, #192] + adds x8, x8, x24 + adcs x9, x9, x27 + adcs x10, x10, x27 + adc x11, x11, x25 + ldr x0, [x29, #32] + ldr x2, [x29, #192] # Multiply - ldp x20, x21, [x0] - ldp x22, x23, [x0, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x12, x4, x20 + umulh x13, x4, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr + mul x24, x4, x21 + umulh x14, x4, x21 + adds x13, x13, x24 + adc x14, x14, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr + mul x24, x5, x20 + umulh x25, x5, x20 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x4, x22 + umulh x25, x4, x22 + adds x14, x14, x24 + adc x15, x15, x25 # A[1] * B[1] - mul x12, 
x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr + mul x24, x5, x21 + umulh x25, x5, x21 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr + mul x24, x6, x20 + umulh x25, x6, x20 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, x16, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x4, x23 + umulh x25, x4, x23 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x5, x22 + umulh x25, x5, x22 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x6, x21 + umulh x25, x6, x21 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x7, x20 + umulh x25, x7, x20 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr + mul x24, x5, x23 + umulh x25, x5, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x6, x22 + umulh x25, x6, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x7, x21 + umulh x25, x7, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr + mul x24, x6, x23 + umulh x25, x6, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr + mul x24, x7, x22 + umulh x25, x7, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x7, x23 + umulh x25, x7, x23 + adds x18, x18, x24 + adc x19, x19, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x12, x12, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x13, x13, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x14, x14, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x15, x15, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + 
adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x18 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + extr x26, x26, x15, #63 + mul x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + asr x26, x15, #63 + and x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Store - stp x4, x5, [x2] - stp x6, x7, [x2, #16] - ldr x0, [x29, #184] + ldr x0, [x29, #24] + ldr x1, [x29, #184] # Multiply ldp x20, x21, [x1] ldp x22, x23, [x1, #16] - ldp x24, x25, [x0] - ldp x26, x27, [x0, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x4, x8, x20 + umulh x5, x8, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 + mul x24, x8, x21 + umulh x6, x8, x21 + adds x5, x5, x24 adc x6, x6, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x24, x9, x20 + umulh x25, x9, x20 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x8, x22 + umulh x25, x8, x22 + adds x6, x6, x24 + adc x7, x7, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr + mul x24, x9, x21 + umulh x25, x9, x21 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr + mul x24, x10, x20 + umulh x25, x10, x20 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, x16, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x8, x23 + umulh x25, x8, x23 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x9, x22 + umulh x25, x9, x22 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x10, x21 + umulh x25, x10, x21 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x11, x20 + umulh x25, x11, x20 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr + mul x24, x9, x23 + umulh x25, x9, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x10, x22 + umulh x25, x10, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x11, x21 + umulh x25, x11, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, 
x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr + mul x24, x10, x23 + umulh x25, x10, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr + mul x24, x11, x22 + umulh x25, x11, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x11, x23 + umulh x25, x11, x23 + adds x18, x18, x24 + adc x19, x19, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x4, x4, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x5, x5, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x6, x6, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x18 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x24, #-19 + asr x27, x11, #63 + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x24 + sbcs x9, x9, x27 + sbcs x10, x10, x27 + sbc x11, x11, x25 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x18, x14, x6 + sbcs x19, x15, x7 + mov x24, #-19 + csetm x27, cc + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x18, x19, [x1, #16] ldr x0, [x29, #40] ldr x1, [x29, #176] ldr x3, [x29, #72] # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] + ldp x16, x17, [x1] + ldp x18, x19, [x1, #16] + ldp x20, x21, [x3] + ldp x22, x23, [x3, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x4, x16, x20 + umulh x5, x16, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 + mul x24, x16, x21 + umulh x6, x16, x21 + adds x5, x5, x24 adc x6, x6, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x24, x17, x20 + umulh x25, x17, x20 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # 
A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x16, x22 + umulh x25, x16, x22 + adds x6, x6, x24 + adc x7, x7, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x17, x21 + umulh x25, x17, x21 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x18, x20 + umulh x25, x18, x20 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x16, x23 + umulh x25, x16, x23 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x17, x22 + umulh x25, x17, x22 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x18, x21 + umulh x25, x18, x21 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x19, x20 + umulh x25, x19, x20 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x17, x23 + umulh x25, x17, x23 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x18, x22 + umulh x25, x18, x22 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x19, x21 + umulh x25, x19, x21 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, x18, x23 + umulh x25, x18, x23 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, x19, x22 + umulh x25, x19, x22 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, x11, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x19, x23 + umulh x25, x19, x23 + adds x10, x10, x24 + adc x11, x11, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -5575,142 +5467,96 @@ fe_ge_msub: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x8 + umulh x8, x24, x8 + adds x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x10, x24, x10 + adcs x6, x6, x25 + mul x25, x24, x11 + umulh x26, x24, x11 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x14, x14, xzr + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr 
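(The "Reduce if top bit set" lines that follow show a change repeated throughout this patch: lsr #63 followed by a multiply by 19 becomes asr #63 followed by an and. The arithmetic shift smears bit 63 into an all-zero or all-one mask, and masking the constant 19 yields the same 0-or-19 adjustment without a mul. A minimal C sketch of the equivalence; the helper names are illustrative only.)

#include <stdint.h>
#include <assert.h>

/* Old form: logical shift gives 0 or 1, then multiply by 19. */
static uint64_t adjust_lsr_mul(uint64_t top)
{
    return (top >> 63) * 19;
}

/* New form: arithmetic shift gives 0 or all-ones, then mask with 19. */
static uint64_t adjust_asr_and(uint64_t top)
{
    uint64_t mask = 0 - (top >> 63);   /* 0 or ~0, like asr #63 */
    return mask & 19;
}

int main(void)
{
    assert(adjust_lsr_mul(0x7fffffffffffffffULL) == 0);
    assert(adjust_asr_and(0x7fffffffffffffffULL) == 0);
    assert(adjust_lsr_mul(0x8000000000000000ULL) == 19);
    assert(adjust_asr_and(0x8000000000000000ULL) == 19);
    return 0;
}

(Both forms stay branch-free; the new one simply trades the multiply for a cheaper logical operation.)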
# Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ldr x1, [x29, #24] - ldr x3, [x29, #16] - # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x1] - ldp x10, x11, [x1, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 - # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x1] - stp x18, x19, [x1, #16] - stp x4, x5, [x3] - stp x6, x7, [x3, #16] + ldr x0, [x29, #32] ldr x1, [x29, #64] # Double - ldp x4, x5, [x1] - ldp x6, x7, [x1, #16] - adds x4, x4, x4 - adcs x5, x5, x5 - adcs x6, x6, x6 - adc x7, x7, x7 - mov x12, #-19 - asr x15, x7, #63 + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 + mov x24, #-19 + asr x27, x11, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x12 - sbcs x5, x5, x15 - sbcs x6, x6, x15 - sbc x7, x7, x13 - stp x4, x5, [x2] - stp x6, x7, [x2, #16] + subs x8, x8, x24 + sbcs x9, x9, x27 + sbcs x10, x10, x27 + sbc x11, x11, x25 + ldr x1, [x29, #40] # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x0] - ldp x10, x11, [x0, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x24, #-19 + asr x27, x15, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x12, x12, x24 + sbcs x13, x13, x27 + sbcs x14, x14, x27 + sbc x15, x15, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x16, x8, x4 + sbcs x17, x9, x5 + sbcs x18, x10, x6 + sbcs x19, x11, x7 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x12, x13, [x1] + stp x14, x15, [x1, #16] stp x16, x17, [x0] stp x18, x19, [x0, #16] - stp x4, x5, [x2] - stp x6, x7, [x2, #16] ldr x17, [x29, #88] ldr x18, [x29, #96] ldr x19, [x29, #104] @@ -5730,19 +5576,19 @@ fe_ge_msub: .type fe_ge_add,@function .align 4 fe_ge_add: - stp x29, x30, [sp, #-208]! + stp x29, x30, [sp, #-176]! 
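(fe_ge_add, whose rewritten prologue begins here, reuses the Add/Sub pattern already seen in fe_ge_dbl, fe_ge_madd and fe_ge_msub above: a plain 256-bit add or subtract, followed by a branch-free correction in which asr or csetm turns the carry/borrow into a mask selecting either 0 or the modulus 2^255 - 19. A hedged C sketch of those two steps; fe_add_sketch and fe_sub_sketch are illustrative names, not wolfSSL's fe_add/fe_sub implementations.)

#include <stdint.h>

typedef unsigned __int128 uint128_t;

/* Add, then conditionally subtract p when bit 255 of the sum is set
 * (mirrors the asr + masked-modulus sequence above).  Sketch only. */
static void fe_add_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint128_t acc = 0, d;
    uint64_t t[4], p[4], mask, borrow;
    int i;

    for (i = 0; i < 4; i++) {              /* adds/adcs/adc chain */
        acc += (uint128_t)a[i] + b[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
    }
    mask = 0 - (t[3] >> 63);               /* asr #63: 0 or all-ones */
    p[0] = mask & 0xffffffffffffffedULL;   /* low limb of 2^255 - 19 */
    p[1] = mask;
    p[2] = mask;
    p[3] = mask & 0x7fffffffffffffffULL;
    borrow = 0;
    for (i = 0; i < 4; i++) {              /* subs/sbcs/sbc chain */
        d = (uint128_t)t[i] - p[i] - borrow;
        r[i]   = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
}

/* Subtract, then conditionally add p back when the subtract borrowed
 * (mirrors the csetm + masked-modulus sequence above).  Sketch only. */
static void fe_sub_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint128_t acc, d;
    uint64_t t[4], p[4], mask, borrow;
    int i;

    borrow = 0;
    for (i = 0; i < 4; i++) {              /* subs/sbcs chain */
        d = (uint128_t)a[i] - b[i] - borrow;
        t[i]   = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    mask = 0 - borrow;                     /* csetm: all-ones on borrow */
    p[0] = mask & 0xffffffffffffffedULL;
    p[1] = mask;
    p[2] = mask;
    p[3] = mask & 0x7fffffffffffffffULL;
    acc = 0;
    for (i = 0; i < 4; i++) {              /* adds/adcs/adc chain */
        acc += (uint128_t)t[i] + p[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
}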
add x29, sp, #0 - str x17, [x29, #120] - str x18, [x29, #128] - str x19, [x29, #136] - str x20, [x29, #144] - str x21, [x29, #152] - str x22, [x29, #160] - str x23, [x29, #168] - str x24, [x29, #176] - str x25, [x29, #184] - str x26, [x29, #192] - str x27, [x29, #200] + str x17, [x29, #88] + str x18, [x29, #96] + str x19, [x29, #104] + str x20, [x29, #112] + str x21, [x29, #120] + str x22, [x29, #128] + str x23, [x29, #136] + str x24, [x29, #144] + str x25, [x29, #152] + str x26, [x29, #160] + str x27, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -5751,573 +5597,454 @@ fe_ge_add: str x5, [x29, #56] str x6, [x29, #64] str x7, [x29, #72] - ldr x1, [x29, #24] ldr x2, [x29, #56] ldr x3, [x29, #48] # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x3] - ldp x10, x11, [x3, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x18, x19, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x18 + adc x7, x15, x19 + mov x24, #-19 + asr x27, x7, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x4, x4, x24 + sbcs x5, x5, x27 + sbcs x6, x6, x27 + sbc x7, x7, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x18 + sbcs x11, x15, x19 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x2, [x29, #32] - ldr x3, [x29, #224] - # Multiply - ldp x20, x21, [x0] - ldp x22, x23, [x0, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] - # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 - # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr - # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr - # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 - # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr - # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr - # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr - # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, 
xzr - # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr - # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr - # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr - # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - stp x4, x5, [x2] - stp x6, x7, [x2, #16] - ldr x2, [x29, #232] - # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] - ldp x24, x25, [x2] - ldp x26, x27, [x2, #16] - # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 - # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr - # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr - # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 - # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr - # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr - # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr - # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr - # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr - # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, 
x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr - # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x1, [x29, #40] - ldr x2, [x29, #216] - ldr x3, [x29, #72] + adds x8, x8, x24 + adcs x9, x9, x27 + adcs x10, x10, x27 + adc x11, x11, x25 + ldr x0, [x29, #32] + ldr x2, [x29, #192] # Multiply ldp x20, x21, [x2] ldp x22, x23, [x2, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x12, x4, x20 + umulh x13, x4, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr + mul x24, x4, x21 + umulh x14, x4, x21 + adds x13, x13, x24 + adc x14, x14, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr + mul x24, x5, x20 + umulh x25, x5, x20 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x4, x22 + umulh x25, x4, x22 + adds x14, x14, x24 + adc x15, x15, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr + mul x24, x5, x21 + umulh x25, x5, x21 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr + mul x24, x6, x20 + umulh x25, x6, x20 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, x16, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x4, x23 + umulh x25, x4, x23 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x5, x22 + umulh x25, x5, x22 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x6, x21 + umulh x25, x6, x21 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x7, x20 + umulh x25, x7, x20 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr + mul x24, x5, x23 + umulh x25, x5, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x6, x22 + umulh x25, x6, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x7, x21 + umulh x25, x7, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[2] * B[3] - mul x12, 
x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr + mul x24, x6, x23 + umulh x25, x6, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr + mul x24, x7, x22 + umulh x25, x7, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x7, x23 + umulh x25, x7, x23 + adds x18, x18, x24 + adc x19, x19, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x12, x12, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x13, x13, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x14, x14, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x15, x15, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x18 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + extr x26, x26, x15, #63 + mul x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + asr x26, x15, #63 + and x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x1, [x29, #64] - ldr x2, [x29, #208] + ldr x0, [x29, #24] + ldr x1, [x29, #200] # Multiply ldp x20, x21, [x1] ldp x22, x23, [x1, #16] - ldp x24, x25, [x2] - ldp x26, x27, [x2, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x4, x8, x20 + umulh x5, x8, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 + mul x24, x8, x21 + umulh x6, x8, x21 + adds x5, x5, x24 adc x6, x6, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x24, x9, x20 + umulh x25, x9, x20 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x8, x22 + umulh x25, x8, x22 + adds x6, x6, x24 + adc x7, x7, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x9, x21 + umulh x25, x9, x21 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, xzr, xzr + # A[2] * B[0] + mul x24, x10, x20 + umulh x25, x10, x20 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, x16, xzr + # A[0] * B[3] + mul x24, x8, x23 + umulh 
x25, x8, x23 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr + # A[1] * B[2] + mul x24, x9, x22 + umulh x25, x9, x22 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr + # A[2] * B[1] + mul x24, x10, x21 + umulh x25, x10, x21 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr + # A[3] * B[0] + mul x24, x11, x20 + umulh x25, x11, x20 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr + # A[1] * B[3] + mul x24, x9, x23 + umulh x25, x9, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr + # A[2] * B[2] + mul x24, x10, x22 + umulh x25, x10, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr + # A[3] * B[1] + mul x24, x11, x21 + umulh x25, x11, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr + # A[2] * B[3] + mul x24, x10, x23 + umulh x25, x10, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr + # A[3] * B[2] + mul x24, x11, x22 + umulh x25, x11, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr + # A[3] * B[3] + mul x24, x11, x23 + umulh x25, x11, x23 + adds x18, x18, x24 + adc x19, x19, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x4, x4, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x5, x5, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x6, x6, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x7, x7, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x18 + adc x26, x26, xzr + # Overflow + extr x26, x26, x7, #63 + mul x26, x26, x24 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x26 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + asr x26, x7, #63 + and x26, x26, x24 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x26 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x24, #-19 + asr x27, x11, #63 + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x24 + sbcs x9, x9, x27 + sbcs x10, x10, x27 + sbc x11, x11, x25 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x18, x14, x6 + sbcs x19, x15, x7 + mov x24, #-19 + csetm x27, cc + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x18, x19, [x1, #16] + ldr x0, [x29, #48] + ldr x1, [x29, #64] + ldr x2, [x29, #176] + # Multiply + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + ldp x16, x17, [x2] + ldp x18, x19, [x2, #16] + # A[0] * B[0] + mul x4, x12, x16 + umulh x5, x12, x16 + # A[0] * B[1] + mul x24, x12, x17 + umulh x6, x12, x17 + adds x5, x5, x24 + adc x6, x6, xzr + # A[1] * B[0] + mul x24, x13, x16 + umulh x25, x13, x16 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[0] * B[2] + mul x24, x12, x18 + umulh x25, x12, x18 + adds x6, x6, x24 + adc x7, x7, x25 + # A[1] * B[1] + mul x24, x13, x17 + umulh x25, x13, x17 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - 
adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x14, x16 + umulh x25, x14, x16 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x13, x18 + umulh x25, x13, x18 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x14, x17 + umulh x25, x14, x17 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x15, x16 + umulh x25, x15, x16 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x14, x18 + umulh x25, x14, x18 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x15, x17 + umulh x25, x15, x17 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, x15, x18 + umulh x25, x15, x18 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, x11, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x15, x19 + umulh x25, x15, x19 + adds x10, x10, x24 + adc x11, x11, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -6326,155 +6053,248 @@ fe_ge_add: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x8 + umulh x8, x24, x8 + adds x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x10, x24, x10 + adcs x6, x6, x25 + mul x25, x24, x11 + umulh x26, x24, x11 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x14, x14, xzr + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - add x1, x29, #80 + ldr x0, [x29, #48] # Double - ldp x4, x5, [x0] - ldp x6, x7, [x0, #16] adds x4, x4, x4 adcs x5, x5, x5 adcs x6, x6, x6 adc 
x7, x7, x7 - mov x12, #-19 - asr x15, x7, #63 + mov x24, #-19 + asr x27, x7, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x12 - sbcs x5, x5, x15 - sbcs x6, x6, x15 - sbc x7, x7, x13 - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x2, [x29, #24] - ldr x3, [x29, #32] - # Add - ldp x4, x5, [x3] - ldp x6, x7, [x3, #16] - ldp x8, x9, [x2] - ldp x10, x11, [x2, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 - # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x2] - stp x18, x19, [x2, #16] - stp x4, x5, [x0] - stp x6, x7, [x0, #16] + subs x4, x4, x24 + sbcs x5, x5, x27 + sbcs x6, x6, x27 + sbc x7, x7, x25 ldr x0, [x29, #40] + ldr x1, [x29, #184] + ldr x2, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x18, x19, [x1, #16] + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] + # A[0] * B[0] + mul x8, x16, x20 + umulh x9, x16, x20 + # A[0] * B[1] + mul x24, x16, x21 + umulh x10, x16, x21 + adds x9, x9, x24 + adc x10, x10, xzr + # A[1] * B[0] + mul x24, x17, x20 + umulh x25, x17, x20 + adds x9, x9, x24 + adcs x10, x10, x25 + adc x11, xzr, xzr + # A[0] * B[2] + mul x24, x16, x22 + umulh x25, x16, x22 + adds x10, x10, x24 + adc x11, x11, x25 + # A[1] * B[1] + mul x24, x17, x21 + umulh x25, x17, x21 + adds x10, x10, x24 + adcs x11, x11, x25 + adc x12, xzr, xzr + # A[2] * B[0] + mul x24, x18, x20 + umulh x25, x18, x20 + adds x10, x10, x24 + adcs x11, x11, x25 + adc x12, x12, xzr + # A[0] * B[3] + mul x24, x16, x23 + umulh x25, x16, x23 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, xzr, xzr + # A[1] * B[2] + mul x24, x17, x22 + umulh x25, x17, x22 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, x13, xzr + # A[2] * B[1] + mul x24, x18, x21 + umulh x25, x18, x21 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, x13, xzr + # A[3] * B[0] + mul x24, x19, x20 + umulh x25, x19, x20 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, x13, xzr + # A[1] * B[3] + mul x24, x17, x23 + umulh x25, x17, x23 + adds x12, x12, x24 + adcs x13, x13, x25 + adc x14, xzr, xzr + # A[2] * B[2] + mul x24, x18, x22 + umulh x25, x18, x22 + adds x12, x12, x24 + adcs x13, x13, x25 + adc x14, x14, xzr + # A[3] * B[1] + mul x24, x19, x21 + umulh x25, x19, x21 + adds x12, x12, x24 + adcs x13, x13, x25 + adc x14, x14, xzr + # A[2] * B[3] + mul x24, x18, x23 + umulh x25, x18, x23 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, xzr, xzr + # A[3] * B[2] + mul x24, x19, x22 + umulh x25, x19, x22 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, x15, xzr + # A[3] * B[3] + mul x24, x19, x23 + umulh x25, x19, x23 + adds x14, x14, x24 + adc x15, x15, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x12 + umulh x12, x24, x12 + adds x8, x8, x25 + mul x25, x24, x13 + 
umulh x13, x24, x13 + adcs x9, x9, x25 + mul x25, x24, x14 + umulh x14, x24, x14 + adcs x10, x10, x25 + mul x25, x24, x15 + umulh x26, x24, x15 + adcs x11, x11, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x9, x9, x12 + adcs x10, x10, x13 + adcs x11, x11, x14 + adc x26, x26, xzr + # Overflow + extr x26, x26, x11, #63 + mul x26, x26, x24 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x26 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + asr x26, x11, #63 + and x26, x26, x24 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x26 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Store + ldr x0, [x29, #32] + ldr x1, [x29, #40] # Add - ldp x4, x5, [x1] - ldp x6, x7, [x1, #16] - ldp x8, x9, [x0] - ldp x10, x11, [x0, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + adds x12, x4, x8 + adcs x13, x5, x9 + adcs x14, x6, x10 + adc x15, x7, x11 + mov x24, #-19 + asr x27, x15, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x12, x12, x24 + sbcs x13, x13, x27 + sbcs x14, x14, x27 + sbc x15, x15, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x16, x4, x8 + sbcs x17, x5, x9 + sbcs x18, x6, x10 + sbcs x19, x7, x11 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x3] - stp x18, x19, [x3, #16] - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ldr x17, [x29, #120] - ldr x18, [x29, #128] - ldr x19, [x29, #136] - ldr x20, [x29, #144] - ldr x21, [x29, #152] - ldr x22, [x29, #160] - ldr x23, [x29, #168] - ldr x24, [x29, #176] - ldr x25, [x29, #184] - ldr x26, [x29, #192] - ldr x27, [x29, #200] - ldp x29, x30, [sp], #0xd0 + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x18, x19, [x1, #16] + ldr x17, [x29, #88] + ldr x18, [x29, #96] + ldr x19, [x29, #104] + ldr x20, [x29, #112] + ldr x21, [x29, #120] + ldr x22, [x29, #128] + ldr x23, [x29, #136] + ldr x24, [x29, #144] + ldr x25, [x29, #152] + ldr x26, [x29, #160] + ldr x27, [x29, #168] + ldp x29, x30, [sp], #0xb0 ret .size fe_ge_add,.-fe_ge_add .text @@ -6482,19 +6302,19 @@ fe_ge_add: .type fe_ge_sub,@function .align 4 fe_ge_sub: - stp x29, x30, [sp, #-208]! + stp x29, x30, [sp, #-176]! 
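Both fe_ge_add and fe_ge_sub open with the pattern seen throughout these hunks: a plain 256-bit add (or subtract) of two field elements, then "Mask the modulus" and a single conditional subtraction (or addition) of p keyed off the sign bit of the top limb. A small C sketch of the addition case, not from the patch, with illustrative names and the same assumptions as the sketch above (little-endian 4x64-bit limbs, operands below 2^255, `unsigned __int128` available):

#include <stdint.h>

/* r = a + b followed by one conditional subtraction of p = 2^255 - 19,
 * mirroring the asm; the result may not be fully reduced, as in the asm. */
static void fe_add_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    unsigned __int128 t = 0;
    uint64_t m, borrow = 0;
    int i;

    for (i = 0; i < 4; i++) {                 /* 256-bit add */
        t += (unsigned __int128)a[i] + b[i];
        r[i] = (uint64_t)t;
        t >>= 64;
    }

    m = (uint64_t)((int64_t)r[3] >> 63);      /* "Mask the modulus" */
    const uint64_t p[4] = { m & (uint64_t)-19, m, m, m & 0x7fffffffffffffffULL };

    for (i = 0; i < 4; i++) {                 /* "Sub modulus (if overflow)" */
        unsigned __int128 d = (unsigned __int128)r[i] - p[i] - borrow;
        r[i]   = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
}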
add x29, sp, #0 - str x17, [x29, #120] - str x18, [x29, #128] - str x19, [x29, #136] - str x20, [x29, #144] - str x21, [x29, #152] - str x22, [x29, #160] - str x23, [x29, #168] - str x24, [x29, #176] - str x25, [x29, #184] - str x26, [x29, #192] - str x27, [x29, #200] + str x17, [x29, #88] + str x18, [x29, #96] + str x19, [x29, #104] + str x20, [x29, #112] + str x21, [x29, #120] + str x22, [x29, #128] + str x23, [x29, #136] + str x24, [x29, #144] + str x25, [x29, #152] + str x26, [x29, #160] + str x27, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -6503,573 +6323,454 @@ fe_ge_sub: str x5, [x29, #56] str x6, [x29, #64] str x7, [x29, #72] - ldr x1, [x29, #24] ldr x2, [x29, #56] ldr x3, [x29, #48] # Add - ldp x4, x5, [x2] - ldp x6, x7, [x2, #16] - ldp x8, x9, [x3] - ldp x10, x11, [x3, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x18, x19, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x18 + adc x7, x15, x19 + mov x24, #-19 + asr x27, x7, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x4, x4, x24 + sbcs x5, x5, x27 + sbcs x6, x6, x27 + sbc x7, x7, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x18 + sbcs x11, x15, x19 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x2, [x29, #32] - ldr x3, [x29, #232] - # Multiply - ldp x20, x21, [x0] - ldp x22, x23, [x0, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] - # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 - # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr - # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr - # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 - # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr - # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr - # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr - # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, 
xzr - # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr - # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr - # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr - # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - stp x4, x5, [x2] - stp x6, x7, [x2, #16] - ldr x2, [x29, #224] - # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] - ldp x24, x25, [x2] - ldp x26, x27, [x2, #16] - # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 - # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr - # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr - # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 - # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr - # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr - # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr - # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr - # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr - # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr - # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr - # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr - # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, 
x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr - # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x1, [x29, #40] - ldr x2, [x29, #216] - ldr x3, [x29, #72] + adds x8, x8, x24 + adcs x9, x9, x27 + adcs x10, x10, x27 + adc x11, x11, x25 + ldr x0, [x29, #32] + ldr x2, [x29, #200] # Multiply ldp x20, x21, [x2] ldp x22, x23, [x2, #16] - ldp x24, x25, [x3] - ldp x26, x27, [x3, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x12, x4, x20 + umulh x13, x4, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 - adc x6, x6, xzr + mul x24, x4, x21 + umulh x14, x4, x21 + adds x13, x13, x24 + adc x14, x14, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 - adc x7, xzr, xzr + mul x24, x5, x20 + umulh x25, x5, x20 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x4, x22 + umulh x25, x4, x22 + adds x14, x14, x24 + adc x15, x15, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, xzr, xzr + mul x24, x5, x21 + umulh x25, x5, x21 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - adds x6, x6, x12 - adcs x7, x7, x13 - adc x8, x8, xzr + mul x24, x6, x20 + umulh x25, x6, x20 + adds x14, x14, x24 + adcs x15, x15, x25 + adc x16, x16, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, xzr, xzr + mul x24, x4, x23 + umulh x25, x4, x23 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x5, x22 + umulh x25, x5, x22 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x6, x21 + umulh x25, x6, x21 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 - adc x9, x9, xzr + mul x24, x7, x20 + umulh x25, x7, x20 + adds x15, x15, x24 + adcs x16, x16, x25 + adc x17, x17, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, xzr, xzr + mul x24, x5, x23 + umulh x25, x5, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x6, x22 + umulh x25, x6, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 - adc x10, x10, xzr + mul x24, x7, x21 + umulh x25, x7, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr # A[2] * B[3] - mul x12, 
x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, xzr, xzr + mul x24, x6, x23 + umulh x25, x6, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 - adc x11, x11, xzr + mul x24, x7, x22 + umulh x25, x7, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x7, x23 + umulh x25, x7, x23 + adds x18, x18, x24 + adc x19, x19, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x12, x12, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x13, x13, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x14, x14, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x15, x15, x25 + adc x26, x26, xzr # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x14, x14, xzr + adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x18 + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + extr x26, x26, x15, #63 + mul x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + asr x26, x15, #63 + and x26, x26, x24 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x26 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr # Store - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x1, [x29, #64] - ldr x2, [x29, #208] + ldr x0, [x29, #24] + ldr x1, [x29, #192] # Multiply ldp x20, x21, [x1] ldp x22, x23, [x1, #16] - ldp x24, x25, [x2] - ldp x26, x27, [x2, #16] # A[0] * B[0] - mul x4, x20, x24 - umulh x5, x20, x24 + mul x4, x8, x20 + umulh x5, x8, x20 # A[0] * B[1] - mul x12, x20, x25 - umulh x6, x20, x25 - adds x5, x5, x12 + mul x24, x8, x21 + umulh x6, x8, x21 + adds x5, x5, x24 adc x6, x6, xzr # A[1] * B[0] - mul x12, x21, x24 - umulh x13, x21, x24 - adds x5, x5, x12 - adcs x6, x6, x13 + mul x24, x9, x20 + umulh x25, x9, x20 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[0] * B[2] - mul x12, x20, x26 - umulh x13, x20, x26 - adds x6, x6, x12 - adc x7, x7, x13 + mul x24, x8, x22 + umulh x25, x8, x22 + adds x6, x6, x24 + adc x7, x7, x25 # A[1] * B[1] - mul x12, x21, x25 - umulh x13, x21, x25 - adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x9, x21 + umulh x25, x9, x21 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, xzr, xzr + # A[2] * B[0] + mul x24, x10, x20 + umulh x25, x10, x20 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x16, x16, xzr + # A[0] * B[3] + mul x24, x8, x23 + umulh 
x25, x8, x23 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, xzr, xzr + # A[1] * B[2] + mul x24, x9, x22 + umulh x25, x9, x22 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr + # A[2] * B[1] + mul x24, x10, x21 + umulh x25, x10, x21 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr + # A[3] * B[0] + mul x24, x11, x20 + umulh x25, x11, x20 + adds x7, x7, x24 + adcs x16, x16, x25 + adc x17, x17, xzr + # A[1] * B[3] + mul x24, x9, x23 + umulh x25, x9, x23 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, xzr, xzr + # A[2] * B[2] + mul x24, x10, x22 + umulh x25, x10, x22 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr + # A[3] * B[1] + mul x24, x11, x21 + umulh x25, x11, x21 + adds x16, x16, x24 + adcs x17, x17, x25 + adc x18, x18, xzr + # A[2] * B[3] + mul x24, x10, x23 + umulh x25, x10, x23 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, xzr, xzr + # A[3] * B[2] + mul x24, x11, x22 + umulh x25, x11, x22 + adds x17, x17, x24 + adcs x18, x18, x25 + adc x19, x19, xzr + # A[3] * B[3] + mul x24, x11, x23 + umulh x25, x11, x23 + adds x18, x18, x24 + adc x19, x19, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x19, x19, x18, #63 + extr x18, x18, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x16 + umulh x16, x24, x16 + adds x4, x4, x25 + mul x25, x24, x17 + umulh x17, x24, x17 + adcs x5, x5, x25 + mul x25, x24, x18 + umulh x18, x24, x18 + adcs x6, x6, x25 + mul x25, x24, x19 + umulh x26, x24, x19 + adcs x7, x7, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x18 + adc x26, x26, xzr + # Overflow + extr x26, x26, x7, #63 + mul x26, x26, x24 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x26 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + asr x26, x7, #63 + and x26, x26, x24 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x26 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x24, #-19 + asr x27, x11, #63 + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x24 + sbcs x9, x9, x27 + sbcs x10, x10, x27 + sbc x11, x11, x25 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x18, x14, x6 + sbcs x19, x15, x7 + mov x24, #-19 + csetm x27, cc + # Mask the modulus + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x18, x19, [x1, #16] + ldr x0, [x29, #48] + ldr x1, [x29, #64] + ldr x2, [x29, #176] + # Multiply + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + ldp x16, x17, [x2] + ldp x18, x19, [x2, #16] + # A[0] * B[0] + mul x4, x12, x16 + umulh x5, x12, x16 + # A[0] * B[1] + mul x24, x12, x17 + umulh x6, x12, x17 + adds x5, x5, x24 + adc x6, x6, xzr + # A[1] * B[0] + mul x24, x13, x16 + umulh x25, x13, x16 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[0] * B[2] + mul x24, x12, x18 + umulh x25, x12, x18 + adds x6, x6, x24 + adc x7, x7, x25 + # A[1] * B[1] + mul x24, x13, x17 + umulh x25, x13, x17 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[2] * B[0] - mul x12, x22, x24 - umulh x13, x22, x24 - 
adds x6, x6, x12 - adcs x7, x7, x13 + mul x24, x14, x16 + umulh x25, x14, x16 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[0] * B[3] - mul x12, x20, x27 - umulh x13, x20, x27 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[1] * B[2] - mul x12, x21, x26 - umulh x13, x21, x26 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x13, x18 + umulh x25, x13, x18 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[1] - mul x12, x22, x25 - umulh x13, x22, x25 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x14, x17 + umulh x25, x14, x17 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[0] - mul x12, x23, x24 - umulh x13, x23, x24 - adds x7, x7, x12 - adcs x8, x8, x13 + mul x24, x15, x16 + umulh x25, x15, x16 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[1] * B[3] - mul x12, x21, x27 - umulh x13, x21, x27 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[2] * B[2] - mul x12, x22, x26 - umulh x13, x22, x26 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x14, x18 + umulh x25, x14, x18 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[1] - mul x12, x23, x25 - umulh x13, x23, x25 - adds x8, x8, x12 - adcs x9, x9, x13 + mul x24, x15, x17 + umulh x25, x15, x17 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[2] * B[3] - mul x12, x22, x27 - umulh x13, x22, x27 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, xzr, xzr # A[3] * B[2] - mul x12, x23, x26 - umulh x13, x23, x26 - adds x9, x9, x12 - adcs x10, x10, x13 + mul x24, x15, x18 + umulh x25, x15, x18 + adds x9, x9, x24 + adcs x10, x10, x25 adc x11, x11, xzr # A[3] * B[3] - mul x12, x23, x27 - umulh x13, x23, x27 - adds x10, x10, x12 - adc x11, x11, x13 + mul x24, x15, x19 + umulh x25, x15, x19 + adds x10, x10, x24 + adc x11, x11, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -7078,154 +6779,248 @@ fe_ge_sub: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x12, #19 - mul x13, x12, x8 - umulh x8, x12, x8 - adds x4, x4, x13 - mul x13, x12, x9 - umulh x9, x12, x9 - adcs x5, x5, x13 - mul x13, x12, x10 - umulh x10, x12, x10 - adcs x6, x6, x13 - mul x13, x12, x11 - umulh x14, x12, x11 - adcs x7, x7, x13 - adc x14, x14, xzr + mov x24, #19 + mul x25, x24, x8 + umulh x8, x24, x8 + adds x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x10, x24, x10 + adcs x6, x6, x25 + mul x25, x24, x11 + umulh x26, x24, x11 + adcs x7, x7, x25 + adc x26, x26, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x14, x14, xzr + adc x26, x26, xzr # Overflow - extr x14, x14, x7, #63 - mul x14, x14, x12 + extr x26, x26, x7, #63 + mul x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - lsr x14, x7, #63 - mul x14, x14, x12 + asr x26, x7, #63 + and x26, x26, x24 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x14 + adds x4, x4, x26 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - add x1, x29, #80 + ldr x0, [x29, #48] # Double - ldp x4, x5, [x0] - ldp x6, x7, [x0, #16] adds x4, x4, x4 adcs x5, x5, x5 adcs x6, x6, x6 adc 
x7, x7, x7 - mov x12, #-19 - asr x15, x7, #63 + mov x24, #-19 + asr x27, x7, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x12 - sbcs x5, x5, x15 - sbcs x6, x6, x15 - sbc x7, x7, x13 - stp x4, x5, [x1] - stp x6, x7, [x1, #16] - ldr x2, [x29, #24] - ldr x3, [x29, #32] - # Add - ldp x4, x5, [x3] - ldp x6, x7, [x3, #16] - ldp x8, x9, [x2] - ldp x10, x11, [x2, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 - # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc - # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x2] - stp x18, x19, [x2, #16] - stp x4, x5, [x0] - stp x6, x7, [x0, #16] + subs x4, x4, x24 + sbcs x5, x5, x27 + sbcs x6, x6, x27 + sbc x7, x7, x25 ldr x0, [x29, #40] + ldr x1, [x29, #184] + ldr x2, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x18, x19, [x1, #16] + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] + # A[0] * B[0] + mul x8, x16, x20 + umulh x9, x16, x20 + # A[0] * B[1] + mul x24, x16, x21 + umulh x10, x16, x21 + adds x9, x9, x24 + adc x10, x10, xzr + # A[1] * B[0] + mul x24, x17, x20 + umulh x25, x17, x20 + adds x9, x9, x24 + adcs x10, x10, x25 + adc x11, xzr, xzr + # A[0] * B[2] + mul x24, x16, x22 + umulh x25, x16, x22 + adds x10, x10, x24 + adc x11, x11, x25 + # A[1] * B[1] + mul x24, x17, x21 + umulh x25, x17, x21 + adds x10, x10, x24 + adcs x11, x11, x25 + adc x12, xzr, xzr + # A[2] * B[0] + mul x24, x18, x20 + umulh x25, x18, x20 + adds x10, x10, x24 + adcs x11, x11, x25 + adc x12, x12, xzr + # A[0] * B[3] + mul x24, x16, x23 + umulh x25, x16, x23 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, xzr, xzr + # A[1] * B[2] + mul x24, x17, x22 + umulh x25, x17, x22 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, x13, xzr + # A[2] * B[1] + mul x24, x18, x21 + umulh x25, x18, x21 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, x13, xzr + # A[3] * B[0] + mul x24, x19, x20 + umulh x25, x19, x20 + adds x11, x11, x24 + adcs x12, x12, x25 + adc x13, x13, xzr + # A[1] * B[3] + mul x24, x17, x23 + umulh x25, x17, x23 + adds x12, x12, x24 + adcs x13, x13, x25 + adc x14, xzr, xzr + # A[2] * B[2] + mul x24, x18, x22 + umulh x25, x18, x22 + adds x12, x12, x24 + adcs x13, x13, x25 + adc x14, x14, xzr + # A[3] * B[1] + mul x24, x19, x21 + umulh x25, x19, x21 + adds x12, x12, x24 + adcs x13, x13, x25 + adc x14, x14, xzr + # A[2] * B[3] + mul x24, x18, x23 + umulh x25, x18, x23 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, xzr, xzr + # A[3] * B[2] + mul x24, x19, x22 + umulh x25, x19, x22 + adds x13, x13, x24 + adcs x14, x14, x25 + adc x15, x15, xzr + # A[3] * B[3] + mul x24, x19, x23 + umulh x25, x19, x23 + adds x14, x14, x24 + adc x15, x15, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x12 + umulh x12, x24, x12 + adds x8, x8, x25 + mul x25, x24, x13 + 
umulh x13, x24, x13 + adcs x9, x9, x25 + mul x25, x24, x14 + umulh x14, x24, x14 + adcs x10, x10, x25 + mul x25, x24, x15 + umulh x26, x24, x15 + adcs x11, x11, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x9, x9, x12 + adcs x10, x10, x13 + adcs x11, x11, x14 + adc x26, x26, xzr + # Overflow + extr x26, x26, x11, #63 + mul x26, x26, x24 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x26 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + asr x26, x11, #63 + and x26, x26, x24 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x26 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Store + ldr x0, [x29, #40] + ldr x1, [x29, #32] # Add - ldp x4, x5, [x1] - ldp x6, x7, [x1, #16] - ldp x8, x9, [x0] - ldp x10, x11, [x0, #16] - adds x16, x4, x8 - adcs x17, x5, x9 - adcs x18, x6, x10 - adc x19, x7, x11 - mov x12, #-19 - asr x15, x19, #63 + adds x12, x4, x8 + adcs x13, x5, x9 + adcs x14, x6, x10 + adc x15, x7, x11 + mov x24, #-19 + asr x27, x15, #63 # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x16, x16, x12 - sbcs x17, x17, x15 - sbcs x18, x18, x15 - sbc x19, x19, x13 + subs x12, x12, x24 + sbcs x13, x13, x27 + sbcs x14, x14, x27 + sbc x15, x15, x25 # Sub - subs x4, x4, x8 - sbcs x5, x5, x9 - sbcs x6, x6, x10 - sbcs x7, x7, x11 - mov x12, #-19 - csetm x15, cc + subs x16, x4, x8 + sbcs x17, x5, x9 + sbcs x18, x6, x10 + sbcs x19, x7, x11 + mov x24, #-19 + csetm x27, cc # Mask the modulus - and x12, x15, x12 - and x13, x15, #0x7fffffffffffffff + and x24, x27, x24 + and x25, x27, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x12 - adcs x5, x5, x15 - adcs x6, x6, x15 - adc x7, x7, x13 - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - stp x4, x5, [x3] - stp x6, x7, [x3, #16] - ldr x17, [x29, #120] - ldr x18, [x29, #128] - ldr x19, [x29, #136] - ldr x20, [x29, #144] - ldr x21, [x29, #152] - ldr x22, [x29, #160] - ldr x23, [x29, #168] - ldr x24, [x29, #176] - ldr x25, [x29, #184] - ldr x26, [x29, #192] - ldr x27, [x29, #200] - ldp x29, x30, [sp], #0xd0 + adds x16, x16, x24 + adcs x17, x17, x27 + adcs x18, x18, x27 + adc x19, x19, x25 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x18, x19, [x1, #16] + ldr x17, [x29, #88] + ldr x18, [x29, #96] + ldr x19, [x29, #104] + ldr x20, [x29, #112] + ldr x21, [x29, #120] + ldr x22, [x29, #128] + ldr x23, [x29, #136] + ldr x24, [x29, #144] + ldr x25, [x29, #152] + ldr x26, [x29, #160] + ldr x27, [x29, #168] + ldp x29, x30, [sp], #0xb0 ret .size fe_ge_sub,.-fe_ge_sub +#endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c index 6ac00546e..2d0b0642c 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519.c @@ -19,6 +19,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +#ifdef __aarch64__ #ifdef HAVE_CONFIG_H #include #endif @@ -69,8 +70,8 @@ void fe_tobytes(unsigned char* out, const fe n) "adcs x6, x3, xzr\n\t" "adcs x6, x4, xzr\n\t" "adc x6, x5, xzr\n\t" - "lsr x6, x6, #63\n\t" - "mul x6, x6, x7\n\t" + "asr x6, x6, #63\n\t" + "and x6, x6, x7\n\t" "adds x2, x2, x6\n\t" "adcs x3, x3, xzr\n\t" "adcs x4, x4, xzr\n\t" @@ -288,8 +289,8 @@ int fe_isnonzero(const fe a) "adcs x5, x2, xzr\n\t" "adcs x5, x3, xzr\n\t" "adc x5, x4, xzr\n\t" - "lsr x5, x5, #63\n\t" - "mul x5, x5, x6\n\t" + "asr x5, 
x5, #63\n\t" + "and x5, x5, x6\n\t" "adds x1, x1, x5\n\t" "adcs x2, x2, xzr\n\t" "adcs x3, x3, xzr\n\t" @@ -318,11 +319,9 @@ int fe_isnegative(const fe a) "adcs x5, x2, xzr\n\t" "adcs x5, x3, xzr\n\t" "adc x5, x4, xzr\n\t" - "lsr x5, x5, #63\n\t" - "mul x5, x5, x6\n\t" - "ldr x1, [x0]\n\t" - "adds x1, x1, x5\n\t" "and %[a], x1, #1\n\t" + "lsr x5, x5, #63\n\t" + "eor %[a], %[a], x5\n\t" "ldp x29, x30, [sp], #16\n\t" : [a] "+r" (a) : @@ -338,8 +337,7 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "add x29, sp, #0\n\t" "sxtb %[b], w2\n\t" "sbfx x15, %[b], #7, #1\n\t" - "sxtb x16, w2\n\t" - "eor x16, x16, x15\n\t" + "eor x16, %[b], x15\n\t" "sub x16, x16, x15\n\t" "mov x3, #1\n\t" "mov x4, xzr\n\t" @@ -506,8 +504,6 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x12, x26, x12, eq\n\t" "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" - "add %[base], %[base], #0x180\n\t" - "sub %[base], %[base], #0x180\n\t" "mov x17, #-19\n\t" "mov x18, #-1\n\t" "mov x19, #-1\n\t" @@ -552,98 +548,98 @@ void fe_mul(fe r, const fe a, const fe b) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Multiply */ - "ldp x15, x16, [x1]\n\t" - "ldp x17, x18, [x1, #16]\n\t" - "ldp x19, x20, [x2]\n\t" - "ldp x21, x22, [x2, #16]\n\t" + "ldp x14, x15, [x1]\n\t" + "ldp x16, x17, [x1, #16]\n\t" + "ldp x18, x19, [x2]\n\t" + "ldp x20, x21, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x6, x15, x19\n\t" - "umulh x7, x15, x19\n\t" + "mul x6, x14, x18\n\t" + "umulh x7, x14, x18\n\t" /* A[0] * B[1] */ - "mul x3, x15, x20\n\t" - "umulh x8, x15, x20\n\t" + "mul x3, x14, x19\n\t" + "umulh x8, x14, x19\n\t" "adds x7, x7, x3\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[0] */ - "mul x3, x16, x19\n\t" - "umulh x4, x16, x19\n\t" + "mul x3, x15, x18\n\t" + "umulh x4, x15, x18\n\t" "adds x7, x7, x3\n\t" "adcs x8, x8, x4\n\t" "adc x9, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x3, x15, x21\n\t" - "umulh x4, x15, x21\n\t" + "mul x3, x14, x20\n\t" + "umulh x4, x14, x20\n\t" "adds x8, x8, x3\n\t" "adc x9, x9, x4\n\t" /* A[1] * B[1] */ - "mul x3, x16, x20\n\t" - "umulh x4, x16, x20\n\t" + "mul x3, x15, x19\n\t" + "umulh x4, x15, x19\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x3, x17, x19\n\t" - "umulh x4, x17, x19\n\t" + "mul x3, x16, x18\n\t" + "umulh x4, x16, x18\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" "adc x10, x10, xzr\n\t" /* A[0] * B[3] */ - "mul x3, x15, x22\n\t" - "umulh x4, x15, x22\n\t" + "mul x3, x14, x21\n\t" + "umulh x4, x14, x21\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x3, x16, x21\n\t" - "umulh x4, x16, x21\n\t" + "mul x3, x15, x20\n\t" + "umulh x4, x15, x20\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, x11, xzr\n\t" /* A[2] * B[1] */ - "mul x3, x17, x20\n\t" - "umulh x4, x17, x20\n\t" + "mul x3, x16, x19\n\t" + "umulh x4, x16, x19\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[0] */ - "mul x3, x18, x19\n\t" - "umulh x4, x18, x19\n\t" + "mul x3, x17, x18\n\t" + "umulh x4, x17, x18\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, x11, xzr\n\t" /* A[1] * B[3] */ - "mul x3, x16, x22\n\t" - "umulh x4, x16, x22\n\t" + "mul x3, x15, x21\n\t" + "umulh x4, x15, x21\n\t" "adds x10, x10, x3\n\t" "adcs x11, x11, x4\n\t" "adc x12, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x3, x17, x21\n\t" - "umulh x4, x17, x21\n\t" + "mul x3, x16, x20\n\t" + "umulh x4, x16, x20\n\t" "adds x10, x10, x3\n\t" "adcs x11, x11, x4\n\t" "adc x12, x12, xzr\n\t" /* 
A[3] * B[1] */ - "mul x3, x18, x20\n\t" - "umulh x4, x18, x20\n\t" + "mul x3, x17, x19\n\t" + "umulh x4, x17, x19\n\t" "adds x10, x10, x3\n\t" "adcs x11, x11, x4\n\t" "adc x12, x12, xzr\n\t" /* A[2] * B[3] */ - "mul x3, x17, x22\n\t" - "umulh x4, x17, x22\n\t" + "mul x3, x16, x21\n\t" + "umulh x4, x16, x21\n\t" "adds x11, x11, x3\n\t" "adcs x12, x12, x4\n\t" "adc x13, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x3, x18, x21\n\t" - "umulh x4, x18, x21\n\t" + "mul x3, x17, x20\n\t" + "umulh x4, x17, x20\n\t" "adds x11, x11, x3\n\t" "adcs x12, x12, x4\n\t" "adc x13, x13, xzr\n\t" /* A[3] * B[3] */ - "mul x3, x18, x22\n\t" - "umulh x4, x18, x22\n\t" + "mul x3, x17, x21\n\t" + "umulh x4, x17, x21\n\t" "adds x12, x12, x3\n\t" "adc x13, x13, x4\n\t" /* Reduce */ @@ -682,8 +678,8 @@ void fe_mul(fe r, const fe a, const fe b) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" + "asr x5, x9, #63\n\t" + "and x5, x5, x3\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -695,7 +691,7 @@ void fe_mul(fe r, const fe a, const fe b) "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21" ); } @@ -705,116 +701,116 @@ void fe_sq(fe r, const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Square */ - "ldp x14, x15, [x1]\n\t" - "ldp x16, x17, [x1, #16]\n\t" + "ldp x13, x14, [x1]\n\t" + "ldp x15, x16, [x1, #16]\n\t" /* A[0] * A[1] */ - "mul x3, x14, x15\n\t" - "umulh x4, x14, x15\n\t" + "mul x6, x13, x14\n\t" + "umulh x7, x13, x14\n\t" /* A[0] * A[2] */ - "mul x11, x14, x16\n\t" - "umulh x5, x14, x16\n\t" - "adds x4, x4, x11\n\t" - "adc x5, x5, xzr\n\t" - /* A[0] * A[3] */ - "mul x11, x14, x17\n\t" - "umulh x6, x14, x17\n\t" - "adds x5, x5, x11\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * A[2] */ - "mul x11, x15, x16\n\t" - "umulh x12, x15, x16\n\t" - "adds x5, x5, x11\n\t" - "adcs x6, x6, x12\n\t" - "adc x7, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x11, x15, x17\n\t" - "umulh x12, x15, x17\n\t" - "adds x6, x6, x11\n\t" - "adc x7, x7, x12\n\t" - /* A[2] * A[3] */ - "mul x11, x16, x17\n\t" - "umulh x8, x16, x17\n\t" - "adds x7, x7, x11\n\t" + "mul x2, x13, x15\n\t" + "umulh x8, x13, x15\n\t" + "adds x7, x7, x2\n\t" "adc x8, x8, xzr\n\t" + /* A[0] * A[3] */ + "mul x2, x13, x16\n\t" + "umulh x9, x13, x16\n\t" + "adds x8, x8, x2\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * A[2] */ + "mul x2, x14, x15\n\t" + "umulh x3, x14, x15\n\t" + "adds x8, x8, x2\n\t" + "adcs x9, x9, x3\n\t" + "adc x10, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x2, x14, x16\n\t" + "umulh x3, x14, x16\n\t" + "adds x9, x9, x2\n\t" + "adc x10, x10, x3\n\t" + /* A[2] * A[3] */ + "mul x2, x15, x16\n\t" + "umulh x11, x15, x16\n\t" + "adds x10, x10, x2\n\t" + "adc x11, x11, xzr\n\t" /* Double */ - "adds x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" + "adds x6, x6, x6\n\t" "adcs x7, x7, x7\n\t" "adcs x8, x8, x8\n\t" - "adc x9, xzr, xzr\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adc x12, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x2, x14, x14\n\t" - "umulh x10, x14, x14\n\t" + "mul x5, x13, x13\n\t" + "umulh x4, x13, x13\n\t" /* A[1] * A[1] */ - "mul x11, x15, x15\n\t" 
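Every "Multiply" and "Square" block in this file ends with the same "Reduce" sequence: split the 512-bit product at bit 255, multiply the high part by 19 (again using 2^255 ≡ 19 mod p) and add it back, then fold the remaining top bits twice more. A C sketch of that reduction, not from the patch, with illustrative names; it assumes both operands were below 2^255, so the top product limb leaves room for the `extr`-style shift, and uses `unsigned __int128` as before:

#include <stdint.h>

/* Reduce a 512-bit product h (8x64-bit limbs, little-endian) mod p = 2^255 - 19. */
static void fe_reduce_sketch(uint64_t r[4], const uint64_t h[8])
{
    uint64_t lo[4], hi[4], carry, top, m;
    unsigned __int128 t;
    int i;

    /* "Move top half into t4-t7 and remove top bit from t3" */
    hi[0] = (h[3] >> 63) | (h[4] << 1);
    hi[1] = (h[4] >> 63) | (h[5] << 1);
    hi[2] = (h[5] >> 63) | (h[6] << 1);
    hi[3] = (h[6] >> 63) | (h[7] << 1);
    lo[0] = h[0]; lo[1] = h[1]; lo[2] = h[2];
    lo[3] = h[3] & 0x7fffffffffffffffULL;

    /* "Multiply top half by 19" and "Add remaining product results in" */
    t = 0;
    for (i = 0; i < 4; i++) {
        t += (unsigned __int128)hi[i] * 19 + lo[i];
        r[i] = (uint64_t)t;
        t >>= 64;
    }
    carry = (uint64_t)t;                      /* small: a handful of bits */

    /* "Overflow": bits at and above 2^255 (including the carry limb) times 19 */
    top  = (carry << 1) | (r[3] >> 63);
    r[3] &= 0x7fffffffffffffffULL;
    t = (unsigned __int128)r[0] + (unsigned __int128)top * 19;
    r[0] = (uint64_t)t;
    t = (t >> 64) + r[1]; r[1] = (uint64_t)t;
    t = (t >> 64) + r[2]; r[2] = (uint64_t)t;
    r[3] += (uint64_t)(t >> 64);

    /* "Reduce if top bit set": one final fold of at most +19 */
    m = (uint64_t)((int64_t)r[3] >> 63);
    r[3] &= 0x7fffffffffffffffULL;
    t = (unsigned __int128)r[0] + (m & 19);
    r[0] = (uint64_t)t;
    t = (t >> 64) + r[1]; r[1] = (uint64_t)t;
    t = (t >> 64) + r[2]; r[2] = (uint64_t)t;
    r[3] += (uint64_t)(t >> 64);
}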
- "umulh x12, x15, x15\n\t" - "adds x3, x3, x10\n\t" - "adcs x4, x4, x11\n\t" - "adc x10, x12, xzr\n\t" + "mul x2, x14, x14\n\t" + "umulh x3, x14, x14\n\t" + "adds x6, x6, x4\n\t" + "adcs x7, x7, x2\n\t" + "adc x4, x3, xzr\n\t" /* A[2] * A[2] */ - "mul x11, x16, x16\n\t" - "umulh x12, x16, x16\n\t" - "adds x5, x5, x10\n\t" - "adcs x6, x6, x11\n\t" - "adc x10, x12, xzr\n\t" + "mul x2, x15, x15\n\t" + "umulh x3, x15, x15\n\t" + "adds x8, x8, x4\n\t" + "adcs x9, x9, x2\n\t" + "adc x4, x3, xzr\n\t" /* A[3] * A[3] */ - "mul x11, x17, x17\n\t" - "umulh x12, x17, x17\n\t" - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adc x9, x9, x12\n\t" + "mul x2, x16, x16\n\t" + "umulh x3, x16, x16\n\t" + "adds x10, x10, x4\n\t" + "adcs x11, x11, x2\n\t" + "adc x12, x12, x3\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "extr x6, x6, x5, #63\n\t" - "and x5, x5, #0x7fffffffffffffff\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x11, #19\n\t" - "mul x12, x11, x6\n\t" - "umulh x6, x11, x6\n\t" - "adds x2, x2, x12\n\t" - "mul x12, x11, x7\n\t" - "umulh x7, x11, x7\n\t" - "adcs x3, x3, x12\n\t" - "mul x12, x11, x8\n\t" - "umulh x8, x11, x8\n\t" - "adcs x4, x4, x12\n\t" - "mul x12, x11, x9\n\t" - "umulh x13, x11, x9\n\t" - "adcs x5, x5, x12\n\t" - "adc x13, x13, xzr\n\t" + "mov x2, #19\n\t" + "mul x3, x2, x9\n\t" + "umulh x9, x2, x9\n\t" + "adds x5, x5, x3\n\t" + "mul x3, x2, x10\n\t" + "umulh x10, x2, x10\n\t" + "adcs x6, x6, x3\n\t" + "mul x3, x2, x11\n\t" + "umulh x11, x2, x11\n\t" + "adcs x7, x7, x3\n\t" + "mul x3, x2, x12\n\t" + "umulh x4, x2, x12\n\t" + "adcs x8, x8, x3\n\t" + "adc x4, x4, xzr\n\t" /* Add remaining product results in */ - "adds x3, x3, x6\n\t" - "adcs x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adc x13, x13, xzr\n\t" + "adds x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adc x4, x4, xzr\n\t" /* Overflow */ - "extr x13, x13, x5, #63\n\t" - "mul x13, x13, x11\n\t" - "and x5, x5, #0x7fffffffffffffff\n\t" - "adds x2, x2, x13\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adc x5, x5, xzr\n\t" + "extr x4, x4, x8, #63\n\t" + "mul x4, x4, x2\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" + "adds x5, x5, x4\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, x8, xzr\n\t" /* Reduce if top bit set */ - "lsr x13, x5, #63\n\t" - "mul x13, x13, x11\n\t" - "and x5, x5, #0x7fffffffffffffff\n\t" - "adds x2, x2, x13\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adc x5, x5, xzr\n\t" + "asr x4, x8, #63\n\t" + "and x4, x4, x2\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" + "adds x5, x5, x4\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, x8, xzr\n\t" /* Store */ - "stp x2, x3, [x0]\n\t" - "stp x4, x5, [x0, #16]\n\t" + "stp x5, x6, [x0]\n\t" + "stp x7, x8, [x0, #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "x11", "x12", "x13", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16" ); } @@ -824,38 +820,38 @@ void fe_mul121666(fe r, fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Multiply by 121666 */ - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" - "mov x13, #0xdb42\n\t" - "movk x13, #1, lsl 16\n\t" - 
"mul x6, x2, x13\n\t" - "umulh x7, x2, x13\n\t" - "mul x11, x3, x13\n\t" - "umulh x12, x3, x13\n\t" - "adds x7, x7, x11\n\t" - "adc x8, xzr, x12\n\t" - "mul x11, x4, x13\n\t" - "umulh x12, x4, x13\n\t" - "adds x8, x8, x11\n\t" - "adc x9, xzr, x12\n\t" - "mul x11, x5, x13\n\t" - "umulh x12, x5, x13\n\t" - "adds x9, x9, x11\n\t" - "adc x12, xzr, x12\n\t" - "mov x13, #19\n\t" - "extr x12, x12, x9, #63\n\t" - "mul x12, x12, x13\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "ldp x5, x6, [x1]\n\t" + "ldp x7, x8, [x1, #16]\n\t" + "mov x4, #0xdb42\n\t" + "movk x4, #1, lsl 16\n\t" + "mul x9, x5, x4\n\t" + "umulh x10, x5, x4\n\t" + "mul x2, x6, x4\n\t" + "umulh x3, x6, x4\n\t" + "adds x10, x10, x2\n\t" + "adc x11, xzr, x3\n\t" + "mul x2, x7, x4\n\t" + "umulh x3, x7, x4\n\t" + "adds x11, x11, x2\n\t" + "adc x12, xzr, x3\n\t" + "mul x2, x8, x4\n\t" + "umulh x3, x8, x4\n\t" + "adds x12, x12, x2\n\t" + "adc x3, xzr, x3\n\t" + "mov x4, #19\n\t" + "extr x3, x3, x12, #63\n\t" + "mul x3, x3, x4\n\t" + "and x12, x12, #0x7fffffffffffffff\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, xzr\n\t" + "adcs x11, x11, xzr\n\t" + "adc x12, x12, xzr\n\t" + "stp x9, x10, [x0]\n\t" + "stp x11, x12, [x0, #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "x11", "x12", "x13", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12" ); } @@ -865,127 +861,127 @@ void fe_sq2(fe r, const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Square * 2 */ - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" + "ldp x5, x6, [x1]\n\t" + "ldp x7, x8, [x1, #16]\n\t" /* A[0] * A[1] */ - "mul x7, x2, x3\n\t" - "umulh x8, x2, x3\n\t" + "mul x10, x5, x6\n\t" + "umulh x11, x5, x6\n\t" /* A[0] * A[2] */ - "mul x11, x2, x4\n\t" - "umulh x9, x2, x4\n\t" - "adds x8, x8, x11\n\t" - "adc x9, x9, xzr\n\t" + "mul x2, x5, x7\n\t" + "umulh x12, x5, x7\n\t" + "adds x11, x11, x2\n\t" + "adc x12, x12, xzr\n\t" /* A[0] * A[3] */ - "mul x11, x2, x5\n\t" - "umulh x10, x2, x5\n\t" - "adds x9, x9, x11\n\t" - "adc x10, x10, xzr\n\t" + "mul x2, x5, x8\n\t" + "umulh x13, x5, x8\n\t" + "adds x12, x12, x2\n\t" + "adc x13, x13, xzr\n\t" /* A[1] * A[2] */ - "mul x11, x3, x4\n\t" - "umulh x12, x3, x4\n\t" - "adds x9, x9, x11\n\t" - "adcs x10, x10, x12\n\t" + "mul x2, x6, x7\n\t" + "umulh x3, x6, x7\n\t" + "adds x12, x12, x2\n\t" + "adcs x13, x13, x3\n\t" "adc x14, xzr, xzr\n\t" /* A[1] * A[3] */ - "mul x11, x3, x5\n\t" - "umulh x12, x3, x5\n\t" - "adds x10, x10, x11\n\t" - "adc x14, x14, x12\n\t" + "mul x2, x6, x8\n\t" + "umulh x3, x6, x8\n\t" + "adds x13, x13, x2\n\t" + "adc x14, x14, x3\n\t" /* A[2] * A[3] */ - "mul x11, x4, x5\n\t" - "umulh x15, x4, x5\n\t" - "adds x14, x14, x11\n\t" + "mul x2, x7, x8\n\t" + "umulh x15, x7, x8\n\t" + "adds x14, x14, x2\n\t" "adc x15, x15, xzr\n\t" /* Double */ - "adds x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" + "adds x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" "adcs x14, x14, x14\n\t" "adcs x15, x15, x15\n\t" "adc x16, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x6, x2, x2\n\t" - "umulh x17, x2, x2\n\t" + "mul x9, x5, x5\n\t" + "umulh x17, x5, x5\n\t" /* A[1] * A[1] */ - "mul x11, x3, x3\n\t" - "umulh x12, x3, x3\n\t" - "adds x7, x7, x17\n\t" - "adcs x8, 
x8, x11\n\t" - "adc x17, x12, xzr\n\t" + "mul x2, x6, x6\n\t" + "umulh x3, x6, x6\n\t" + "adds x10, x10, x17\n\t" + "adcs x11, x11, x2\n\t" + "adc x17, x3, xzr\n\t" /* A[2] * A[2] */ - "mul x11, x4, x4\n\t" - "umulh x12, x4, x4\n\t" - "adds x9, x9, x17\n\t" - "adcs x10, x10, x11\n\t" - "adc x17, x12, xzr\n\t" + "mul x2, x7, x7\n\t" + "umulh x3, x7, x7\n\t" + "adds x12, x12, x17\n\t" + "adcs x13, x13, x2\n\t" + "adc x17, x3, xzr\n\t" /* A[3] * A[3] */ - "mul x11, x5, x5\n\t" - "umulh x12, x5, x5\n\t" + "mul x2, x8, x8\n\t" + "umulh x3, x8, x8\n\t" "adds x14, x14, x17\n\t" - "adcs x15, x15, x11\n\t" - "adc x16, x16, x12\n\t" + "adcs x15, x15, x2\n\t" + "adc x16, x16, x3\n\t" /* Double and Reduce */ - "mov x11, #0x169\n\t" + "mov x2, #0x169\n\t" /* Move top half into t4-t7 and remove top bit from t3 */ "lsr x17, x16, #61\n\t" "extr x16, x16, x15, #62\n\t" "extr x15, x15, x14, #62\n\t" - "extr x14, x14, x10, #62\n\t" - "extr x10, x10, x9, #62\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "lsl x6, x6, #1\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" + "extr x14, x14, x13, #62\n\t" + "extr x13, x13, x12, #62\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "lsl x9, x9, #1\n\t" + "and x12, x12, #0x7fffffffffffffff\n\t" /* Two left, only one right */ "and x16, x16, #0x7fffffffffffffff\n\t" /* Multiply top bits by 19*19 */ - "mul x17, x17, x11\n\t" + "mul x17, x17, x2\n\t" /* Multiply top half by 19 */ - "mov x11, #19\n\t" - "mul x12, x11, x10\n\t" - "umulh x10, x11, x10\n\t" - "adds x6, x6, x12\n\t" - "mul x12, x11, x14\n\t" - "umulh x14, x11, x14\n\t" - "adcs x7, x7, x12\n\t" - "mul x12, x11, x15\n\t" - "umulh x15, x11, x15\n\t" - "adcs x8, x8, x12\n\t" - "mul x12, x11, x16\n\t" - "umulh x13, x11, x16\n\t" - "adcs x9, x9, x12\n\t" - "adc x13, x13, xzr\n\t" + "mov x2, #19\n\t" + "mul x3, x2, x13\n\t" + "umulh x13, x2, x13\n\t" + "adds x9, x9, x3\n\t" + "mul x3, x2, x14\n\t" + "umulh x14, x2, x14\n\t" + "adcs x10, x10, x3\n\t" + "mul x3, x2, x15\n\t" + "umulh x15, x2, x15\n\t" + "adcs x11, x11, x3\n\t" + "mul x3, x2, x16\n\t" + "umulh x4, x2, x16\n\t" + "adcs x12, x12, x3\n\t" + "adc x4, x4, xzr\n\t" /* Add remaining product results in */ - "adds x6, x6, x17\n\t" - "adcs x7, x7, x10\n\t" - "adcs x8, x8, x14\n\t" - "adcs x9, x9, x15\n\t" - "adc x13, x13, xzr\n\t" + "adds x9, x9, x17\n\t" + "adcs x10, x10, x13\n\t" + "adcs x11, x11, x14\n\t" + "adcs x12, x12, x15\n\t" + "adc x4, x4, xzr\n\t" /* Overflow */ - "extr x13, x13, x9, #63\n\t" - "mul x13, x13, x11\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x13\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" + "extr x4, x4, x12, #63\n\t" + "mul x4, x4, x2\n\t" + "and x12, x12, #0x7fffffffffffffff\n\t" + "adds x9, x9, x4\n\t" + "adcs x10, x10, xzr\n\t" + "adcs x11, x11, xzr\n\t" + "adc x12, x12, xzr\n\t" /* Reduce if top bit set */ - "lsr x13, x9, #63\n\t" - "mul x13, x13, x11\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x13\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" + "asr x4, x12, #63\n\t" + "and x4, x4, x2\n\t" + "and x12, x12, #0x7fffffffffffffff\n\t" + "adds x9, x9, x4\n\t" + "adcs x10, x10, xzr\n\t" + "adcs x11, x11, xzr\n\t" + "adc x12, x12, xzr\n\t" /* Store */ - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x9, x10, [x0]\n\t" + "stp x11, x12, [x0, #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "x11", 
"x12", "x13", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x18" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" ); } @@ -1115,8 +1111,6 @@ void fe_invert(fe r, const fe a) "ldr x0, [x29, #144]\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" - "ldr %[a], [x29, #152]\n\t" - "ldr %[r], [x29, #144]\n\t" "ldp x29, x30, [sp], #0xa0\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -1159,1065 +1153,38 @@ int curve25519(byte* r, byte* n, byte* a) "eor x22, x22, x23\n\t" /* Conditional Swap */ "cmp x22, #1\n\t" - "ldp x6, x7, [x0]\n\t" - "ldp x8, x9, [x0, #16]\n\t" - "ldp x10, x11, [x29, #80]\n\t" - "ldp x12, x13, [x29, #96]\n\t" - "csel x14, x6, x10, eq\n\t" - "csel x6, x10, x6, eq\n\t" - "csel x15, x7, x11, eq\n\t" - "csel x7, x11, x7, eq\n\t" - "csel x16, x8, x12, eq\n\t" - "csel x8, x12, x8, eq\n\t" - "csel x17, x9, x13, eq\n\t" - "csel x9, x13, x9, eq\n\t" - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" - "stp x14, x15, [x29, #80]\n\t" - "stp x16, x17, [x29, #96]\n\t" - /* Conditional Swap */ - "cmp x22, #1\n\t" - "ldp x6, x7, [x29, #16]\n\t" - "ldp x8, x9, [x29, #32]\n\t" - "ldp x10, x11, [x29, #48]\n\t" - "ldp x12, x13, [x29, #64]\n\t" - "csel x14, x6, x10, eq\n\t" - "csel x6, x10, x6, eq\n\t" - "csel x15, x7, x11, eq\n\t" - "csel x7, x11, x7, eq\n\t" - "csel x16, x8, x12, eq\n\t" - "csel x8, x12, x8, eq\n\t" - "csel x17, x9, x13, eq\n\t" - "csel x9, x13, x9, eq\n\t" - "stp x6, x7, [x29, #16]\n\t" - "stp x8, x9, [x29, #32]\n\t" - "stp x14, x15, [x29, #48]\n\t" - "stp x16, x17, [x29, #64]\n\t" - "mov x22, x23\n\t" - /* Add */ - "ldp x6, x7, [x0]\n\t" - "ldp x8, x9, [x0, #16]\n\t" - "ldp x10, x11, [x29, #16]\n\t" - "ldp x12, x13, [x29, #32]\n\t" - "adds x14, x6, x10\n\t" - "adcs x15, x7, x11\n\t" - "adcs x16, x8, x12\n\t" - "adc x17, x9, x13\n\t" - "mov x3, #-19\n\t" - "asr x23, x17, #63\n\t" - /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x14, x14, x3\n\t" - "sbcs x15, x15, x23\n\t" - "sbcs x16, x16, x23\n\t" - "sbc x17, x17, x4\n\t" - /* Sub */ - "subs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "sbcs x8, x8, x12\n\t" - "sbcs x9, x9, x13\n\t" - "mov x3, #-19\n\t" - "csetm x23, cc\n\t" - /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x6, x6, x3\n\t" - "adcs x7, x7, x23\n\t" - "adcs x8, x8, x23\n\t" - "adc x9, x9, x4\n\t" - "stp x14, x15, [x0]\n\t" - "stp x16, x17, [x0, #16]\n\t" - "stp x6, x7, [x29, #144]\n\t" - "stp x8, x9, [x29, #160]\n\t" - /* Add */ + "ldp x10, x11, [x0]\n\t" + "ldp x12, x13, [x0, #16]\n\t" "ldp x6, x7, [x29, #80]\n\t" "ldp x8, x9, [x29, #96]\n\t" - "ldp x10, x11, [x29, #48]\n\t" - "ldp x12, x13, [x29, #64]\n\t" - "adds x14, x6, x10\n\t" - "adcs x15, x7, x11\n\t" - "adcs x16, x8, x12\n\t" - "adc x17, x9, x13\n\t" - "mov x3, #-19\n\t" - "asr x23, x17, #63\n\t" - /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x14, x14, x3\n\t" - "sbcs x15, x15, x23\n\t" - "sbcs x16, x16, x23\n\t" - "sbc x17, x17, x4\n\t" - /* Sub */ - "subs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "sbcs x8, x8, x12\n\t" - "sbcs x9, x9, x13\n\t" - "mov x3, #-19\n\t" - "csetm x23, cc\n\t" - /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x6, x6, x3\n\t" - "adcs x7, x7, x23\n\t" - 
"adcs x8, x8, x23\n\t" - "adc x9, x9, x4\n\t" - "stp x14, x15, [x29, #16]\n\t" - "stp x16, x17, [x29, #32]\n\t" - "stp x6, x7, [x29, #112]\n\t" - "stp x8, x9, [x29, #128]\n\t" - /* Multiply */ - "ldp x18, x19, [x29, #112]\n\t" - "ldp x20, x21, [x29, #128]\n\t" - "ldp x14, x15, [x0]\n\t" - "ldp x16, x17, [x0, #16]\n\t" - /* A[0] * B[0] */ - "mul x6, x18, x14\n\t" - "umulh x7, x18, x14\n\t" - /* A[0] * B[1] */ - "mul x3, x18, x15\n\t" - "umulh x8, x18, x15\n\t" - "adds x7, x7, x3\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[0] */ - "mul x3, x19, x14\n\t" - "umulh x4, x19, x14\n\t" - "adds x7, x7, x3\n\t" - "adcs x8, x8, x4\n\t" - "adc x9, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x18, x16\n\t" - "umulh x4, x18, x16\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x19, x15\n\t" - "umulh x4, x19, x15\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x20, x14\n\t" - "umulh x4, x20, x14\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, x10, xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x18, x17\n\t" - "umulh x4, x18, x17\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x19, x16\n\t" - "umulh x4, x19, x16\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[2] * B[1] */ - "mul x3, x20, x15\n\t" - "umulh x4, x20, x15\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x21, x14\n\t" - "umulh x4, x21, x14\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x19, x17\n\t" - "umulh x4, x19, x17\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x20, x16\n\t" - "umulh x4, x20, x16\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" - /* A[3] * B[1] */ - "mul x3, x21, x15\n\t" - "umulh x4, x21, x15\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" - /* A[2] * B[3] */ - "mul x3, x20, x17\n\t" - "umulh x4, x20, x17\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x3, x21, x16\n\t" - "umulh x4, x21, x16\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, x13, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x21, x17\n\t" - "umulh x4, x21, x17\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - 
"mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "stp x6, x7, [x29, #48]\n\t" - "stp x8, x9, [x29, #64]\n\t" - /* Multiply */ + "csel x14, x10, x6, eq\n\t" + "csel x10, x6, x10, eq\n\t" + "csel x15, x11, x7, eq\n\t" + "csel x11, x7, x11, eq\n\t" + "csel x16, x12, x8, eq\n\t" + "csel x12, x8, x12, eq\n\t" + "csel x17, x13, x9, eq\n\t" + "csel x13, x9, x13, eq\n\t" + /* Conditional Swap */ + "cmp x22, #1\n\t" "ldp x18, x19, [x29, #16]\n\t" "ldp x20, x21, [x29, #32]\n\t" - "ldp x14, x15, [x29, #144]\n\t" - "ldp x16, x17, [x29, #160]\n\t" - /* A[0] * B[0] */ - "mul x6, x18, x14\n\t" - "umulh x7, x18, x14\n\t" - /* A[0] * B[1] */ - "mul x3, x18, x15\n\t" - "umulh x8, x18, x15\n\t" - "adds x7, x7, x3\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[0] */ - "mul x3, x19, x14\n\t" - "umulh x4, x19, x14\n\t" - "adds x7, x7, x3\n\t" - "adcs x8, x8, x4\n\t" - "adc x9, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x18, x16\n\t" - "umulh x4, x18, x16\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x19, x15\n\t" - "umulh x4, x19, x15\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x20, x14\n\t" - "umulh x4, x20, x14\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, x10, xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x18, x17\n\t" - "umulh x4, x18, x17\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x19, x16\n\t" - "umulh x4, x19, x16\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[2] * B[1] */ - "mul x3, x20, x15\n\t" - "umulh x4, x20, x15\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x21, x14\n\t" - "umulh x4, x21, x14\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x19, x17\n\t" - "umulh x4, x19, x17\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x20, x16\n\t" - "umulh x4, x20, x16\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" - /* A[3] * B[1] */ - "mul x3, x21, x15\n\t" - "umulh x4, x21, x15\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" - /* A[2] * B[3] */ - "mul x3, x20, x17\n\t" - "umulh x4, x20, x17\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x3, x21, x16\n\t" - "umulh x4, x21, x16\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, x13, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x21, x17\n\t" - "umulh x4, x21, x17\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, 
x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "stp x6, x7, [x29, #16]\n\t" - "stp x8, x9, [x29, #32]\n\t" - /* Square */ - "ldp x18, x19, [x29, #144]\n\t" - "ldp x20, x21, [x29, #160]\n\t" - /* A[0] * A[1] */ - "mul x7, x18, x19\n\t" - "umulh x8, x18, x19\n\t" - /* A[0] * A[2] */ - "mul x3, x18, x20\n\t" - "umulh x9, x18, x20\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, x18, x21\n\t" - "umulh x10, x18, x21\n\t" - "adds x9, x9, x3\n\t" - "adc x10, x10, xzr\n\t" - /* A[1] * A[2] */ - "mul x3, x19, x20\n\t" - "umulh x4, x19, x20\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x19, x21\n\t" - "umulh x4, x19, x21\n\t" - "adds x10, x10, x3\n\t" - "adc x11, x11, x4\n\t" - /* A[2] * A[3] */ - "mul x3, x20, x21\n\t" - "umulh x12, x20, x21\n\t" - "adds x11, x11, x3\n\t" - "adc x12, x12, xzr\n\t" - /* Double */ - "adds x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adc x13, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x6, x18, x18\n\t" - "umulh x23, x18, x18\n\t" - /* A[1] * A[1] */ - "mul x3, x19, x19\n\t" - "umulh x4, x19, x19\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[2] * A[2] */ - "mul x3, x20, x20\n\t" - "umulh x4, x20, x20\n\t" - "adds x9, x9, x23\n\t" - "adcs x10, x10, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[3] * A[3] */ - "mul x3, x21, x21\n\t" - "umulh x4, x21, x21\n\t" - "adds x11, x11, x23\n\t" - "adcs x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "stp x6, x7, [x29, #112]\n\t" - "stp x8, x9, [x29, #128]\n\t" - /* Square */ - "ldp x18, x19, [x0]\n\t" - "ldp x20, x21, [x0, #16]\n\t" - /* A[0] * A[1] */ - "mul x7, x18, x19\n\t" - "umulh x8, x18, x19\n\t" - /* A[0] * A[2] */ - "mul x3, x18, x20\n\t" - "umulh x9, x18, 
x20\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, x18, x21\n\t" - "umulh x10, x18, x21\n\t" - "adds x9, x9, x3\n\t" - "adc x10, x10, xzr\n\t" - /* A[1] * A[2] */ - "mul x3, x19, x20\n\t" - "umulh x4, x19, x20\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x19, x21\n\t" - "umulh x4, x19, x21\n\t" - "adds x10, x10, x3\n\t" - "adc x11, x11, x4\n\t" - /* A[2] * A[3] */ - "mul x3, x20, x21\n\t" - "umulh x12, x20, x21\n\t" - "adds x11, x11, x3\n\t" - "adc x12, x12, xzr\n\t" - /* Double */ - "adds x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adc x13, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x6, x18, x18\n\t" - "umulh x23, x18, x18\n\t" - /* A[1] * A[1] */ - "mul x3, x19, x19\n\t" - "umulh x4, x19, x19\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[2] * A[2] */ - "mul x3, x20, x20\n\t" - "umulh x4, x20, x20\n\t" - "adds x9, x9, x23\n\t" - "adcs x10, x10, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[3] * A[3] */ - "mul x3, x21, x21\n\t" - "umulh x4, x21, x21\n\t" - "adds x11, x11, x23\n\t" - "adcs x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "stp x6, x7, [x29, #144]\n\t" - "stp x8, x9, [x29, #160]\n\t" - /* Add */ "ldp x6, x7, [x29, #48]\n\t" "ldp x8, x9, [x29, #64]\n\t" - "ldp x10, x11, [x29, #16]\n\t" - "ldp x12, x13, [x29, #32]\n\t" - "adds x14, x6, x10\n\t" - "adcs x15, x7, x11\n\t" - "adcs x16, x8, x12\n\t" - "adc x17, x9, x13\n\t" - "mov x3, #-19\n\t" - "asr x23, x17, #63\n\t" - /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x14, x14, x3\n\t" - "sbcs x15, x15, x23\n\t" - "sbcs x16, x16, x23\n\t" - "sbc x17, x17, x4\n\t" - /* Sub */ - "subs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "sbcs x8, x8, x12\n\t" - "sbcs x9, x9, x13\n\t" - "mov x3, #-19\n\t" - "csetm x23, cc\n\t" - /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x6, x6, x3\n\t" - "adcs x7, x7, x23\n\t" - "adcs x8, x8, x23\n\t" - "adc x9, x9, x4\n\t" - "stp x14, x15, [x29, #80]\n\t" - "stp x16, x17, [x29, #96]\n\t" - "stp x6, x7, [x29, #16]\n\t" - "stp x8, x9, [x29, 
#32]\n\t" - /* Multiply */ - "ldp x18, x19, [x29, #144]\n\t" - "ldp x20, x21, [x29, #160]\n\t" - "ldp x14, x15, [x29, #112]\n\t" - "ldp x16, x17, [x29, #128]\n\t" - /* A[0] * B[0] */ - "mul x6, x18, x14\n\t" - "umulh x7, x18, x14\n\t" - /* A[0] * B[1] */ - "mul x3, x18, x15\n\t" - "umulh x8, x18, x15\n\t" - "adds x7, x7, x3\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[0] */ - "mul x3, x19, x14\n\t" - "umulh x4, x19, x14\n\t" - "adds x7, x7, x3\n\t" - "adcs x8, x8, x4\n\t" - "adc x9, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x18, x16\n\t" - "umulh x4, x18, x16\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x19, x15\n\t" - "umulh x4, x19, x15\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x20, x14\n\t" - "umulh x4, x20, x14\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, x10, xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x18, x17\n\t" - "umulh x4, x18, x17\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x19, x16\n\t" - "umulh x4, x19, x16\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[2] * B[1] */ - "mul x3, x20, x15\n\t" - "umulh x4, x20, x15\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x21, x14\n\t" - "umulh x4, x21, x14\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x19, x17\n\t" - "umulh x4, x19, x17\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x20, x16\n\t" - "umulh x4, x20, x16\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" - /* A[3] * B[1] */ - "mul x3, x21, x15\n\t" - "umulh x4, x21, x15\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" - /* A[2] * B[3] */ - "mul x3, x20, x17\n\t" - "umulh x4, x20, x17\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x3, x21, x16\n\t" - "umulh x4, x21, x16\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, x13, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x21, x17\n\t" - "umulh x4, x21, x17\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" 
- /* Store */ - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" - /* Sub */ - "ldp x6, x7, [x29, #144]\n\t" - "ldp x8, x9, [x29, #160]\n\t" - "ldp x10, x11, [x29, #112]\n\t" - "ldp x12, x13, [x29, #128]\n\t" - "subs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "sbcs x8, x8, x12\n\t" - "sbcs x9, x9, x13\n\t" - "mov x3, #-19\n\t" - "csetm x23, cc\n\t" - /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x6, x6, x3\n\t" - "adcs x7, x7, x23\n\t" - "adcs x8, x8, x23\n\t" - "adc x9, x9, x4\n\t" - "stp x6, x7, [x29, #144]\n\t" - "stp x8, x9, [x29, #160]\n\t" - /* Square */ - "ldp x18, x19, [x29, #16]\n\t" - "ldp x20, x21, [x29, #32]\n\t" - /* A[0] * A[1] */ - "mul x7, x18, x19\n\t" - "umulh x8, x18, x19\n\t" - /* A[0] * A[2] */ - "mul x3, x18, x20\n\t" - "umulh x9, x18, x20\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, x18, x21\n\t" - "umulh x10, x18, x21\n\t" - "adds x9, x9, x3\n\t" - "adc x10, x10, xzr\n\t" - /* A[1] * A[2] */ - "mul x3, x19, x20\n\t" - "umulh x4, x19, x20\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x19, x21\n\t" - "umulh x4, x19, x21\n\t" - "adds x10, x10, x3\n\t" - "adc x11, x11, x4\n\t" - /* A[2] * A[3] */ - "mul x3, x20, x21\n\t" - "umulh x12, x20, x21\n\t" - "adds x11, x11, x3\n\t" - "adc x12, x12, xzr\n\t" - /* Double */ - "adds x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adc x13, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x6, x18, x18\n\t" - "umulh x23, x18, x18\n\t" - /* A[1] * A[1] */ - "mul x3, x19, x19\n\t" - "umulh x4, x19, x19\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[2] * A[2] */ - "mul x3, x20, x20\n\t" - "umulh x4, x20, x20\n\t" - "adds x9, x9, x23\n\t" - "adcs x10, x10, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[3] * A[3] */ - "mul x3, x21, x21\n\t" - "umulh x4, x21, x21\n\t" - "adds x11, x11, x23\n\t" - "adcs x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "stp x6, x7, [x29, #16]\n\t" - "stp x8, x9, [x29, #32]\n\t" - /* Multiply by 121666 */ - "ldp x18, x19, [x29, #144]\n\t" - "ldp x20, x21, [x29, #160]\n\t" - "mov x5, #0xdb42\n\t" - "movk x5, #1, lsl 16\n\t" - "mul 
x6, x18, x5\n\t" - "umulh x7, x18, x5\n\t" - "mul x3, x19, x5\n\t" - "umulh x4, x19, x5\n\t" - "adds x7, x7, x3\n\t" - "adc x8, xzr, x4\n\t" - "mul x3, x20, x5\n\t" - "umulh x4, x20, x5\n\t" - "adds x8, x8, x3\n\t" - "adc x9, xzr, x4\n\t" - "mul x3, x21, x5\n\t" - "umulh x4, x21, x5\n\t" - "adds x9, x9, x3\n\t" - "adc x4, xzr, x4\n\t" - "mov x5, #19\n\t" - "extr x4, x4, x9, #63\n\t" - "mul x4, x4, x5\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x4\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - "stp x6, x7, [x29, #48]\n\t" - "stp x8, x9, [x29, #64]\n\t" - /* Square */ - "ldp x18, x19, [x29, #80]\n\t" - "ldp x20, x21, [x29, #96]\n\t" - /* A[0] * A[1] */ - "mul x7, x18, x19\n\t" - "umulh x8, x18, x19\n\t" - /* A[0] * A[2] */ - "mul x3, x18, x20\n\t" - "umulh x9, x18, x20\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, x18, x21\n\t" - "umulh x10, x18, x21\n\t" - "adds x9, x9, x3\n\t" - "adc x10, x10, xzr\n\t" - /* A[1] * A[2] */ - "mul x3, x19, x20\n\t" - "umulh x4, x19, x20\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x19, x21\n\t" - "umulh x4, x19, x21\n\t" - "adds x10, x10, x3\n\t" - "adc x11, x11, x4\n\t" - /* A[2] * A[3] */ - "mul x3, x20, x21\n\t" - "umulh x12, x20, x21\n\t" - "adds x11, x11, x3\n\t" - "adc x12, x12, xzr\n\t" - /* Double */ - "adds x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adc x13, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x6, x18, x18\n\t" - "umulh x23, x18, x18\n\t" - /* A[1] * A[1] */ - "mul x3, x19, x19\n\t" - "umulh x4, x19, x19\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[2] * A[2] */ - "mul x3, x20, x20\n\t" - "umulh x4, x20, x20\n\t" - "adds x9, x9, x23\n\t" - "adcs x10, x10, x3\n\t" - "adc x23, x4, xzr\n\t" - /* A[3] * A[3] */ - "mul x3, x21, x21\n\t" - "umulh x4, x21, x21\n\t" - "adds x11, x11, x23\n\t" - "adcs x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "stp x6, x7, [x29, #80]\n\t" - "stp x8, x9, [x29, #96]\n\t" + "csel x5, x18, x6, eq\n\t" + "csel x18, x6, x18, eq\n\t" + "csel x26, x19, x7, eq\n\t" + "csel x19, x7, x19, eq\n\t" + "csel x27, x20, x8, eq\n\t" + "csel x20, x8, x20, eq\n\t" + "csel 
x28, x21, x9, eq\n\t" + "csel x21, x9, x21, eq\n\t" + "mov x22, x23\n\t" /* Add */ - "ldp x6, x7, [x29, #112]\n\t" - "ldp x8, x9, [x29, #128]\n\t" - "ldp x10, x11, [x29, #48]\n\t" - "ldp x12, x13, [x29, #64]\n\t" - "adds x6, x6, x10\n\t" - "adcs x7, x7, x11\n\t" - "adcs x8, x8, x12\n\t" - "adc x9, x9, x13\n\t" + "adds x6, x10, x18\n\t" + "adcs x7, x11, x19\n\t" + "adcs x8, x12, x20\n\t" + "adc x9, x13, x21\n\t" "mov x3, #-19\n\t" "asr x23, x9, #63\n\t" /* Mask the modulus */ @@ -2228,129 +1195,650 @@ int curve25519(byte* r, byte* n, byte* a) "sbcs x7, x7, x23\n\t" "sbcs x8, x8, x23\n\t" "sbc x9, x9, x4\n\t" - "stp x6, x7, [x29, #112]\n\t" - "stp x8, x9, [x29, #128]\n\t" + /* Sub */ + "subs x18, x10, x18\n\t" + "sbcs x19, x11, x19\n\t" + "sbcs x20, x12, x20\n\t" + "sbcs x21, x13, x21\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x18, x18, x3\n\t" + "adcs x19, x19, x23\n\t" + "adcs x20, x20, x23\n\t" + "adc x21, x21, x4\n\t" + "stp x18, x19, [x29, #144]\n\t" + "stp x20, x21, [x29, #160]\n\t" + /* Add */ + "adds x10, x14, x5\n\t" + "adcs x11, x15, x26\n\t" + "adcs x12, x16, x27\n\t" + "adc x13, x17, x28\n\t" + "mov x3, #-19\n\t" + "asr x23, x13, #63\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x10, x10, x3\n\t" + "sbcs x11, x11, x23\n\t" + "sbcs x12, x12, x23\n\t" + "sbc x13, x13, x4\n\t" + /* Sub */ + "subs x14, x14, x5\n\t" + "sbcs x15, x15, x26\n\t" + "sbcs x16, x16, x27\n\t" + "sbcs x17, x17, x28\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x14, x14, x3\n\t" + "adcs x15, x15, x23\n\t" + "adcs x16, x16, x23\n\t" + "adc x17, x17, x4\n\t" /* Multiply */ - "ldp x18, x19, [x2]\n\t" - "ldp x20, x21, [x2, #16]\n\t" - "ldp x14, x15, [x29, #16]\n\t" - "ldp x16, x17, [x29, #32]\n\t" /* A[0] * B[0] */ - "mul x6, x18, x14\n\t" - "umulh x7, x18, x14\n\t" + "mul x18, x14, x6\n\t" + "umulh x19, x14, x6\n\t" /* A[0] * B[1] */ - "mul x3, x18, x15\n\t" - "umulh x8, x18, x15\n\t" + "mul x3, x14, x7\n\t" + "umulh x20, x14, x7\n\t" + "adds x19, x19, x3\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x6\n\t" + "umulh x4, x15, x6\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x8\n\t" + "umulh x4, x14, x8\n\t" + "adds x20, x20, x3\n\t" + "adc x21, x21, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x7\n\t" + "umulh x4, x15, x7\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x23, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x16, x6\n\t" + "umulh x4, x16, x6\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x23, x23, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x9\n\t" + "umulh x4, x14, x9\n\t" + "adds x21, x21, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x8\n\t" + "umulh x4, x15, x8\n\t" + "adds x21, x21, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x7\n\t" + "umulh x4, x16, x7\n\t" + "adds x21, x21, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x6\n\t" + "umulh x4, x17, x6\n\t" + "adds x21, x21, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x15, 
x9\n\t" + "umulh x4, x15, x9\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x8\n\t" + "umulh x4, x16, x8\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x7\n\t" + "umulh x4, x17, x7\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x9\n\t" + "umulh x4, x16, x9\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x8\n\t" + "umulh x4, x17, x8\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x9\n\t" + "umulh x4, x17, x9\n\t" + "adds x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, x23, #63\n\t" + "extr x23, x23, x21, #63\n\t" + "and x21, x21, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x23\n\t" + "umulh x23, x3, x23\n\t" + "adds x18, x18, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x19, x19, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x20, x20, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x21, x21, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x19, x19, x23\n\t" + "adcs x20, x20, x26\n\t" + "adcs x21, x21, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x21, #63\n\t" + "mul x5, x5, x3\n\t" + "and x21, x21, #0x7fffffffffffffff\n\t" + "adds x18, x18, x5\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* Reduce if top bit set */ + "asr x5, x21, #63\n\t" + "and x5, x5, x3\n\t" + "and x21, x21, #0x7fffffffffffffff\n\t" + "adds x18, x18, x5\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* Store */ + "stp x18, x19, [x29, #112]\n\t" + "stp x20, x21, [x29, #128]\n\t" + /* Multiply */ + "ldp x23, x26, [x29, #144]\n\t" + "ldp x27, x28, [x29, #160]\n\t" + /* A[0] * B[0] */ + "mul x18, x10, x23\n\t" + "umulh x19, x10, x23\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x26\n\t" + "umulh x20, x10, x26\n\t" + "adds x19, x19, x3\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x23\n\t" + "umulh x4, x11, x23\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x27\n\t" + "umulh x4, x10, x27\n\t" + "adds x20, x20, x3\n\t" + "adc x21, x21, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x26\n\t" + "umulh x4, x11, x26\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x14, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x12, x23\n\t" + "umulh x4, x12, x23\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x14, x14, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x28\n\t" + "umulh x4, x10, x28\n\t" + "adds x21, x21, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x27\n\t" + "umulh x4, x11, x27\n\t" + "adds x21, x21, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, x15, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x26\n\t" + "umulh x4, x12, x26\n\t" + "adds x21, x21, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, x15, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x23\n\t" + "umulh x4, x13, x23\n\t" + "adds x21, x21, x3\n\t" + "adcs x14, x14, 
x4\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x11, x28\n\t" + "umulh x4, x11, x28\n\t" + "adds x14, x14, x3\n\t" + "adcs x15, x15, x4\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x27\n\t" + "umulh x4, x12, x27\n\t" + "adds x14, x14, x3\n\t" + "adcs x15, x15, x4\n\t" + "adc x16, x16, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x26\n\t" + "umulh x4, x13, x26\n\t" + "adds x14, x14, x3\n\t" + "adcs x15, x15, x4\n\t" + "adc x16, x16, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x28\n\t" + "umulh x4, x12, x28\n\t" + "adds x15, x15, x3\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x27\n\t" + "umulh x4, x13, x27\n\t" + "adds x15, x15, x3\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x28\n\t" + "umulh x4, x13, x28\n\t" + "adds x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x21, #63\n\t" + "and x21, x21, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x14\n\t" + "umulh x14, x3, x14\n\t" + "adds x18, x18, x4\n\t" + "mul x4, x3, x15\n\t" + "umulh x15, x3, x15\n\t" + "adcs x19, x19, x4\n\t" + "mul x4, x3, x16\n\t" + "umulh x16, x3, x16\n\t" + "adcs x20, x20, x4\n\t" + "mul x4, x3, x17\n\t" + "umulh x5, x3, x17\n\t" + "adcs x21, x21, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x19, x19, x14\n\t" + "adcs x20, x20, x15\n\t" + "adcs x21, x21, x16\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x21, #63\n\t" + "mul x5, x5, x3\n\t" + "and x21, x21, #0x7fffffffffffffff\n\t" + "adds x18, x18, x5\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* Reduce if top bit set */ + "asr x5, x21, #63\n\t" + "and x5, x5, x3\n\t" + "and x21, x21, #0x7fffffffffffffff\n\t" + "adds x18, x18, x5\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* Store */ + /* Square */ + /* A[0] * A[1] */ + "mul x11, x23, x26\n\t" + "umulh x12, x23, x26\n\t" + /* A[0] * A[2] */ + "mul x3, x23, x27\n\t" + "umulh x13, x23, x27\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x23, x28\n\t" + "umulh x14, x23, x28\n\t" + "adds x13, x13, x3\n\t" + "adc x14, x14, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x26, x27\n\t" + "umulh x4, x26, x27\n\t" + "adds x13, x13, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x26, x28\n\t" + "umulh x4, x26, x28\n\t" + "adds x14, x14, x3\n\t" + "adc x15, x15, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x27, x28\n\t" + "umulh x16, x27, x28\n\t" + "adds x15, x15, x3\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x10, x23, x23\n\t" + "umulh x5, x23, x23\n\t" + /* A[1] * A[1] */ + "mul x3, x26, x26\n\t" + "umulh x4, x26, x26\n\t" + "adds x11, x11, x5\n\t" + "adcs x12, x12, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x27, x27\n\t" + "umulh x4, x27, x27\n\t" + "adds x13, x13, x5\n\t" + "adcs x14, x14, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x28, x28\n\t" + "umulh x4, x28, x28\n\t" + "adds x15, x15, x5\n\t" + "adcs 
x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x13, #63\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x14\n\t" + "umulh x14, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "mul x4, x3, x15\n\t" + "umulh x15, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "mul x4, x3, x16\n\t" + "umulh x16, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "mul x4, x3, x17\n\t" + "umulh x5, x3, x17\n\t" + "adcs x13, x13, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x11, x11, x14\n\t" + "adcs x12, x12, x15\n\t" + "adcs x13, x13, x16\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Reduce if top bit set */ + "asr x5, x13, #63\n\t" + "and x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Store */ + /* Square */ + /* A[0] * A[1] */ + "mul x15, x6, x7\n\t" + "umulh x16, x6, x7\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "umulh x17, x6, x8\n\t" + "adds x16, x16, x3\n\t" + "adc x17, x17, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x6, x9\n\t" + "umulh x23, x6, x9\n\t" + "adds x17, x17, x3\n\t" + "adc x23, x23, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "umulh x4, x7, x8\n\t" + "adds x17, x17, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "umulh x4, x7, x9\n\t" + "adds x23, x23, x3\n\t" + "adc x26, x26, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "umulh x27, x8, x9\n\t" + "adds x26, x26, x3\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x23, x23, x23\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x14, x6, x6\n\t" + "umulh x5, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "umulh x4, x7, x7\n\t" + "adds x15, x15, x5\n\t" + "adcs x16, x16, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "umulh x4, x8, x8\n\t" + "adds x17, x17, x5\n\t" + "adcs x23, x23, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "umulh x4, x9, x9\n\t" + "adds x26, x26, x5\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, x23, #63\n\t" + "extr x23, x23, x17, #63\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x23\n\t" + "umulh x23, x3, x23\n\t" + "adds x14, x14, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x15, x15, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x16, x16, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x17, x17, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x15, x15, x23\n\t" + "adcs x16, x16, x26\n\t" + "adcs x17, x17, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, 
#0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + /* Reduce if top bit set */ + "asr x5, x17, #63\n\t" + "and x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + /* Store */ + /* Multiply */ + /* A[0] * B[0] */ + "mul x6, x14, x10\n\t" + "umulh x7, x14, x10\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x11\n\t" + "umulh x8, x14, x11\n\t" "adds x7, x7, x3\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[0] */ - "mul x3, x19, x14\n\t" - "umulh x4, x19, x14\n\t" + "mul x3, x15, x10\n\t" + "umulh x4, x15, x10\n\t" "adds x7, x7, x3\n\t" "adcs x8, x8, x4\n\t" "adc x9, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x3, x18, x16\n\t" - "umulh x4, x18, x16\n\t" + "mul x3, x14, x12\n\t" + "umulh x4, x14, x12\n\t" "adds x8, x8, x3\n\t" "adc x9, x9, x4\n\t" /* A[1] * B[1] */ - "mul x3, x19, x15\n\t" - "umulh x4, x19, x15\n\t" + "mul x3, x15, x11\n\t" + "umulh x4, x15, x11\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x10, xzr, xzr\n\t" + "adc x23, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x3, x20, x14\n\t" - "umulh x4, x20, x14\n\t" + "mul x3, x16, x10\n\t" + "umulh x4, x16, x10\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x10, x10, xzr\n\t" + "adc x23, x23, xzr\n\t" /* A[0] * B[3] */ - "mul x3, x18, x17\n\t" - "umulh x4, x18, x17\n\t" + "mul x3, x14, x13\n\t" + "umulh x4, x14, x13\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x3, x19, x16\n\t" - "umulh x4, x19, x16\n\t" + "mul x3, x15, x12\n\t" + "umulh x4, x15, x12\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" /* A[2] * B[1] */ - "mul x3, x20, x15\n\t" - "umulh x4, x20, x15\n\t" + "mul x3, x16, x11\n\t" + "umulh x4, x16, x11\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" /* A[3] * B[0] */ - "mul x3, x21, x14\n\t" - "umulh x4, x21, x14\n\t" + "mul x3, x17, x10\n\t" + "umulh x4, x17, x10\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" /* A[1] * B[3] */ - "mul x3, x19, x17\n\t" - "umulh x4, x19, x17\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, xzr, xzr\n\t" + "mul x3, x15, x13\n\t" + "umulh x4, x15, x13\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x3, x20, x16\n\t" - "umulh x4, x20, x16\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" + "mul x3, x16, x12\n\t" + "umulh x4, x16, x12\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" /* A[3] * B[1] */ - "mul x3, x21, x15\n\t" - "umulh x4, x21, x15\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" + "mul x3, x17, x11\n\t" + "umulh x4, x17, x11\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" /* A[2] * B[3] */ - "mul x3, x20, x17\n\t" - "umulh x4, x20, x17\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" + "mul x3, x16, x13\n\t" + "umulh x4, x16, x13\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x3, x21, x16\n\t" - "umulh x4, x21, x16\n\t" - "adds x11, 
x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, x13, xzr\n\t" + "mul x3, x17, x12\n\t" + "umulh x4, x17, x12\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" /* A[3] * B[3] */ - "mul x3, x21, x17\n\t" - "umulh x4, x21, x17\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" + "mul x3, x17, x13\n\t" + "umulh x4, x17, x13\n\t" + "adds x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, x23, #63\n\t" + "extr x23, x23, x9, #63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" + "mul x4, x3, x23\n\t" + "umulh x23, x3, x23\n\t" "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" "adcs x9, x9, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" /* Overflow */ "extr x5, x5, x9, #63\n\t" @@ -2361,137 +1849,188 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" + "asr x5, x9, #63\n\t" + "and x5, x5, x3\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Store */ - "stp x6, x7, [x29, #48]\n\t" - "stp x8, x9, [x29, #64]\n\t" + "stp x6, x7, [x0]\n\t" + "stp x8, x9, [x0, #16]\n\t" + /* Sub */ + "subs x14, x14, x10\n\t" + "sbcs x15, x15, x11\n\t" + "sbcs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x14, x14, x3\n\t" + "adcs x15, x15, x23\n\t" + "adcs x16, x16, x23\n\t" + "adc x17, x17, x4\n\t" + /* Multiply by 121666 */ + "mov x5, #0xdb42\n\t" + "movk x5, #1, lsl 16\n\t" + "mul x6, x14, x5\n\t" + "umulh x7, x14, x5\n\t" + "mul x3, x15, x5\n\t" + "umulh x4, x15, x5\n\t" + "adds x7, x7, x3\n\t" + "adc x8, xzr, x4\n\t" + "mul x3, x16, x5\n\t" + "umulh x4, x16, x5\n\t" + "adds x8, x8, x3\n\t" + "adc x9, xzr, x4\n\t" + "mul x3, x17, x5\n\t" + "umulh x4, x17, x5\n\t" + "adds x9, x9, x3\n\t" + "adc x4, xzr, x4\n\t" + "mov x5, #19\n\t" + "extr x4, x4, x9, #63\n\t" + "mul x4, x4, x5\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x4\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Add */ + "adds x10, x10, x6\n\t" + "adcs x11, x11, x7\n\t" + "adcs x12, x12, x8\n\t" + "adc x13, x13, x9\n\t" + "mov x3, #-19\n\t" + "asr x23, x13, #63\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x10, x10, x3\n\t" + "sbcs x11, x11, x23\n\t" + "sbcs x12, x12, x23\n\t" + "sbc x13, x13, x4\n\t" /* Multiply */ - "ldp x18, x19, [x29, 
#144]\n\t" - "ldp x20, x21, [x29, #160]\n\t" - "ldp x14, x15, [x29, #112]\n\t" - "ldp x16, x17, [x29, #128]\n\t" /* A[0] * B[0] */ - "mul x6, x18, x14\n\t" - "umulh x7, x18, x14\n\t" + "mul x6, x14, x10\n\t" + "umulh x7, x14, x10\n\t" /* A[0] * B[1] */ - "mul x3, x18, x15\n\t" - "umulh x8, x18, x15\n\t" + "mul x3, x14, x11\n\t" + "umulh x8, x14, x11\n\t" "adds x7, x7, x3\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[0] */ - "mul x3, x19, x14\n\t" - "umulh x4, x19, x14\n\t" + "mul x3, x15, x10\n\t" + "umulh x4, x15, x10\n\t" "adds x7, x7, x3\n\t" "adcs x8, x8, x4\n\t" "adc x9, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x3, x18, x16\n\t" - "umulh x4, x18, x16\n\t" + "mul x3, x14, x12\n\t" + "umulh x4, x14, x12\n\t" "adds x8, x8, x3\n\t" "adc x9, x9, x4\n\t" /* A[1] * B[1] */ - "mul x3, x19, x15\n\t" - "umulh x4, x19, x15\n\t" + "mul x3, x15, x11\n\t" + "umulh x4, x15, x11\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x10, xzr, xzr\n\t" + "adc x23, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x3, x20, x14\n\t" - "umulh x4, x20, x14\n\t" + "mul x3, x16, x10\n\t" + "umulh x4, x16, x10\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x10, x10, xzr\n\t" + "adc x23, x23, xzr\n\t" /* A[0] * B[3] */ - "mul x3, x18, x17\n\t" - "umulh x4, x18, x17\n\t" + "mul x3, x14, x13\n\t" + "umulh x4, x14, x13\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x3, x19, x16\n\t" - "umulh x4, x19, x16\n\t" + "mul x3, x15, x12\n\t" + "umulh x4, x15, x12\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" /* A[2] * B[1] */ - "mul x3, x20, x15\n\t" - "umulh x4, x20, x15\n\t" + "mul x3, x16, x11\n\t" + "umulh x4, x16, x11\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" /* A[3] * B[0] */ - "mul x3, x21, x14\n\t" - "umulh x4, x21, x14\n\t" + "mul x3, x17, x10\n\t" + "umulh x4, x17, x10\n\t" "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" /* A[1] * B[3] */ - "mul x3, x19, x17\n\t" - "umulh x4, x19, x17\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, xzr, xzr\n\t" + "mul x3, x15, x13\n\t" + "umulh x4, x15, x13\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x3, x20, x16\n\t" - "umulh x4, x20, x16\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" + "mul x3, x16, x12\n\t" + "umulh x4, x16, x12\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" /* A[3] * B[1] */ - "mul x3, x21, x15\n\t" - "umulh x4, x21, x15\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" + "mul x3, x17, x11\n\t" + "umulh x4, x17, x11\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" /* A[2] * B[3] */ - "mul x3, x20, x17\n\t" - "umulh x4, x20, x17\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" + "mul x3, x16, x13\n\t" + "umulh x4, x16, x13\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x3, x21, x16\n\t" - "umulh x4, x21, x16\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, x13, xzr\n\t" + "mul x3, x17, x12\n\t" + "umulh x4, x17, x12\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc 
x28, x28, xzr\n\t" /* A[3] * B[3] */ - "mul x3, x21, x17\n\t" - "umulh x4, x21, x17\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" + "mul x3, x17, x13\n\t" + "umulh x4, x17, x13\n\t" + "adds x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, x23, #63\n\t" + "extr x23, x23, x9, #63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" + "mul x4, x3, x23\n\t" + "umulh x23, x3, x23\n\t" "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" "adcs x9, x9, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" /* Overflow */ "extr x5, x5, x9, #63\n\t" @@ -2502,8 +2041,8 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" + "asr x5, x9, #63\n\t" + "and x5, x5, x3\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -2512,6 +2051,385 @@ int curve25519(byte* r, byte* n, byte* a) /* Store */ "stp x6, x7, [x29, #16]\n\t" "stp x8, x9, [x29, #32]\n\t" + /* Add */ + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "adds x10, x6, x18\n\t" + "adcs x11, x7, x19\n\t" + "adcs x12, x8, x20\n\t" + "adc x13, x9, x21\n\t" + "mov x3, #-19\n\t" + "asr x23, x13, #63\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x10, x10, x3\n\t" + "sbcs x11, x11, x23\n\t" + "sbcs x12, x12, x23\n\t" + "sbc x13, x13, x4\n\t" + /* Sub */ + "subs x18, x6, x18\n\t" + "sbcs x19, x7, x19\n\t" + "sbcs x20, x8, x20\n\t" + "sbcs x21, x9, x21\n\t" + "mov x3, #-19\n\t" + "csetm x23, cc\n\t" + /* Mask the modulus */ + "and x3, x23, x3\n\t" + "and x4, x23, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x18, x18, x3\n\t" + "adcs x19, x19, x23\n\t" + "adcs x20, x20, x23\n\t" + "adc x21, x21, x4\n\t" + /* Square */ + /* A[0] * A[1] */ + "mul x7, x10, x11\n\t" + "umulh x8, x10, x11\n\t" + /* A[0] * A[2] */ + "mul x3, x10, x12\n\t" + "umulh x9, x10, x12\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x10, x13\n\t" + "umulh x23, x10, x13\n\t" + "adds x9, x9, x3\n\t" + "adc x23, x23, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x11, x12\n\t" + "umulh x4, x11, x12\n\t" + "adds x9, x9, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x11, x13\n\t" + "umulh x4, x11, x13\n\t" + "adds x23, x23, x3\n\t" + "adc x26, x26, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x12, x13\n\t" + "umulh x27, x12, x13\n\t" + "adds x26, x26, x3\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x23, x23, 
x23\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x10, x10\n\t" + "umulh x5, x10, x10\n\t" + /* A[1] * A[1] */ + "mul x3, x11, x11\n\t" + "umulh x4, x11, x11\n\t" + "adds x7, x7, x5\n\t" + "adcs x8, x8, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x12, x12\n\t" + "umulh x4, x12, x12\n\t" + "adds x9, x9, x5\n\t" + "adcs x23, x23, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x13, x13\n\t" + "umulh x4, x13, x13\n\t" + "adds x26, x26, x5\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, x23, #63\n\t" + "extr x23, x23, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x23\n\t" + "umulh x23, x3, x23\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x23\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "asr x5, x9, #63\n\t" + "and x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + /* Square */ + /* A[0] * A[1] */ + "mul x7, x18, x19\n\t" + "umulh x8, x18, x19\n\t" + /* A[0] * A[2] */ + "mul x3, x18, x20\n\t" + "umulh x9, x18, x20\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x18, x21\n\t" + "umulh x23, x18, x21\n\t" + "adds x9, x9, x3\n\t" + "adc x23, x23, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x19, x20\n\t" + "umulh x4, x19, x20\n\t" + "adds x9, x9, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x19, x21\n\t" + "umulh x4, x19, x21\n\t" + "adds x23, x23, x3\n\t" + "adc x26, x26, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x20, x21\n\t" + "umulh x27, x20, x21\n\t" + "adds x26, x26, x3\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x23, x23, x23\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x18, x18\n\t" + "umulh x5, x18, x18\n\t" + /* A[1] * A[1] */ + "mul x3, x19, x19\n\t" + "umulh x4, x19, x19\n\t" + "adds x7, x7, x5\n\t" + "adcs x8, x8, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x20, x20\n\t" + "umulh x4, x20, x20\n\t" + "adds x9, x9, x5\n\t" + "adcs x23, x23, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x21, x21\n\t" + "umulh x4, x21, x21\n\t" + "adds x26, x26, x5\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, x23, #63\n\t" + "extr x23, x23, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 
19 */ + "mov x3, #19\n\t" + "mul x4, x3, x23\n\t" + "umulh x23, x3, x23\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x23\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "asr x5, x9, #63\n\t" + "and x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + /* Multiply */ + "ldp x14, x15, [x2]\n\t" + "ldp x16, x17, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x10, x14, x6\n\t" + "umulh x11, x14, x6\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x7\n\t" + "umulh x12, x14, x7\n\t" + "adds x11, x11, x3\n\t" + "adc x12, x12, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x6\n\t" + "umulh x4, x15, x6\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x8\n\t" + "umulh x4, x14, x8\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x7\n\t" + "umulh x4, x15, x7\n\t" + "adds x12, x12, x3\n\t" + "adcs x13, x13, x4\n\t" + "adc x23, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x16, x6\n\t" + "umulh x4, x16, x6\n\t" + "adds x12, x12, x3\n\t" + "adcs x13, x13, x4\n\t" + "adc x23, x23, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x9\n\t" + "umulh x4, x14, x9\n\t" + "adds x13, x13, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x8\n\t" + "umulh x4, x15, x8\n\t" + "adds x13, x13, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x7\n\t" + "umulh x4, x16, x7\n\t" + "adds x13, x13, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x6\n\t" + "umulh x4, x17, x6\n\t" + "adds x13, x13, x3\n\t" + "adcs x23, x23, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x15, x9\n\t" + "umulh x4, x15, x9\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x8\n\t" + "umulh x4, x16, x8\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x7\n\t" + "umulh x4, x17, x7\n\t" + "adds x23, x23, x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x9\n\t" + "umulh x4, x16, x9\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x8\n\t" + "umulh x4, x17, x8\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x9\n\t" + "umulh x4, x17, x9\n\t" + "adds x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, x23, #63\n\t" + "extr x23, x23, x13, #63\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x23\n\t" + "umulh x23, x3, 
x23\n\t" + "adds x10, x10, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x11, x11, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x12, x12, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x13, x13, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x11, x11, x23\n\t" + "adcs x12, x12, x26\n\t" + "adcs x13, x13, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Reduce if top bit set */ + "asr x5, x13, #63\n\t" + "and x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Store */ + "stp x10, x11, [x29, #48]\n\t" + "stp x12, x13, [x29, #64]\n\t" "sub x25, x25, #1\n\t" "cmp x25, #0\n\t" "bge L_curve25519_bits\n\t" @@ -2641,151 +2559,151 @@ int curve25519(byte* r, byte* n, byte* a) "bl fe_mul\n\t" "ldr %[r], [x29, #176]\n\t" /* Multiply */ - "ldp x18, x19, [x0]\n\t" - "ldp x20, x21, [x0, #16]\n\t" - "ldp x14, x15, [x29, #16]\n\t" - "ldp x16, x17, [x29, #32]\n\t" + "ldp x6, x7, [x0]\n\t" + "ldp x8, x9, [x0, #16]\n\t" + "ldp x10, x11, [x29, #16]\n\t" + "ldp x12, x13, [x29, #32]\n\t" /* A[0] * B[0] */ - "mul x6, x18, x14\n\t" - "umulh x7, x18, x14\n\t" + "mul x14, x6, x10\n\t" + "umulh x15, x6, x10\n\t" /* A[0] * B[1] */ - "mul x3, x18, x15\n\t" - "umulh x8, x18, x15\n\t" - "adds x7, x7, x3\n\t" - "adc x8, x8, xzr\n\t" + "mul x3, x6, x11\n\t" + "umulh x16, x6, x11\n\t" + "adds x15, x15, x3\n\t" + "adc x16, x16, xzr\n\t" /* A[1] * B[0] */ - "mul x3, x19, x14\n\t" - "umulh x4, x19, x14\n\t" - "adds x7, x7, x3\n\t" - "adcs x8, x8, x4\n\t" - "adc x9, xzr, xzr\n\t" + "mul x3, x7, x10\n\t" + "umulh x4, x7, x10\n\t" + "adds x15, x15, x3\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x3, x18, x16\n\t" - "umulh x4, x18, x16\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, x4\n\t" + "mul x3, x6, x12\n\t" + "umulh x4, x6, x12\n\t" + "adds x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" /* A[1] * B[1] */ - "mul x3, x19, x15\n\t" - "umulh x4, x19, x15\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, xzr, xzr\n\t" + "mul x3, x7, x11\n\t" + "umulh x4, x7, x11\n\t" + "adds x16, x16, x3\n\t" + "adcs x17, x17, x4\n\t" + "adc x18, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x3, x20, x14\n\t" - "umulh x4, x20, x14\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, x10, xzr\n\t" + "mul x3, x8, x10\n\t" + "umulh x4, x8, x10\n\t" + "adds x16, x16, x3\n\t" + "adcs x17, x17, x4\n\t" + "adc x18, x18, xzr\n\t" /* A[0] * B[3] */ - "mul x3, x18, x17\n\t" - "umulh x4, x18, x17\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" + "mul x3, x6, x13\n\t" + "umulh x4, x6, x13\n\t" + "adds x17, x17, x3\n\t" + "adcs x18, x18, x4\n\t" + "adc x19, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x3, x19, x16\n\t" - "umulh x4, x19, x16\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "mul x3, x7, x12\n\t" + "umulh x4, x7, x12\n\t" + "adds x17, x17, x3\n\t" + "adcs x18, x18, x4\n\t" + "adc x19, x19, xzr\n\t" /* A[2] * B[1] */ - "mul x3, x20, x15\n\t" - "umulh x4, x20, x15\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "mul x3, x8, x11\n\t" + "umulh x4, x8, x11\n\t" + "adds x17, x17, x3\n\t" + "adcs 
x18, x18, x4\n\t" + "adc x19, x19, xzr\n\t" /* A[3] * B[0] */ - "mul x3, x21, x14\n\t" - "umulh x4, x21, x14\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" + "mul x3, x9, x10\n\t" + "umulh x4, x9, x10\n\t" + "adds x17, x17, x3\n\t" + "adcs x18, x18, x4\n\t" + "adc x19, x19, xzr\n\t" /* A[1] * B[3] */ - "mul x3, x19, x17\n\t" - "umulh x4, x19, x17\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, xzr, xzr\n\t" + "mul x3, x7, x13\n\t" + "umulh x4, x7, x13\n\t" + "adds x18, x18, x3\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x3, x20, x16\n\t" - "umulh x4, x20, x16\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" + "mul x3, x8, x12\n\t" + "umulh x4, x8, x12\n\t" + "adds x18, x18, x3\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[1] */ - "mul x3, x21, x15\n\t" - "umulh x4, x21, x15\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" - "adc x12, x12, xzr\n\t" + "mul x3, x9, x11\n\t" + "umulh x4, x9, x11\n\t" + "adds x18, x18, x3\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" /* A[2] * B[3] */ - "mul x3, x20, x17\n\t" - "umulh x4, x20, x17\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" + "mul x3, x8, x13\n\t" + "umulh x4, x8, x13\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x3, x21, x16\n\t" - "umulh x4, x21, x16\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adc x13, x13, xzr\n\t" + "mul x3, x9, x12\n\t" + "umulh x4, x9, x12\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" /* A[3] * B[3] */ - "mul x3, x21, x17\n\t" - "umulh x4, x21, x17\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" + "mul x3, x9, x13\n\t" + "umulh x4, x9, x13\n\t" + "adds x20, x20, x3\n\t" + "adc x21, x21, x4\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" + "extr x21, x21, x20, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" + "mul x4, x3, x18\n\t" + "umulh x18, x3, x18\n\t" + "adds x14, x14, x4\n\t" + "mul x4, x3, x19\n\t" + "umulh x19, x3, x19\n\t" + "adcs x15, x15, x4\n\t" + "mul x4, x3, x20\n\t" + "umulh x20, x3, x20\n\t" + "adcs x16, x16, x4\n\t" + "mul x4, x3, x21\n\t" + "umulh x5, x3, x21\n\t" + "adcs x17, x17, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" + "adds x15, x15, x18\n\t" + "adcs x16, x16, x19\n\t" + "adcs x17, x17, x20\n\t" "adc x5, x5, xzr\n\t" /* Overflow */ - "extr x5, x5, x9, #63\n\t" + "extr x5, x5, x17, #63\n\t" "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + 
"adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" /* Reduce if top bit set */ - "lsr x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" + "asr x5, x17, #63\n\t" + "and x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" /* Store */ - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" "mov x0, xzr\n\t" "ldp x29, x30, [sp], #0xc0\n\t" : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); return (uint32_t)(size_t)r; } @@ -2915,8 +2833,6 @@ void fe_pow22523(fe r, const fe a) "ldr x0, [x29, #112]\n\t" "ldr x2, [x29, #120]\n\t" "bl fe_mul\n\t" - "ldr %[a], [x29, #120]\n\t" - "ldr %[r], [x29, #112]\n\t" "ldp x29, x30, [sp], #0x80\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -2938,100 +2854,100 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "ldr x1, [x29, #32]\n\t" "ldr x2, [x29, #56]\n\t" /* Multiply */ - "ldp x11, x16, [x1]\n\t" - "ldp x17, x18, [x1, #16]\n\t" - "ldp x19, x20, [x2]\n\t" - "ldp x21, x22, [x2, #16]\n\t" + "ldp x11, x12, [x1]\n\t" + "ldp x13, x14, [x1, #16]\n\t" + "ldp x15, x16, [x2]\n\t" + "ldp x17, x18, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x3, x11, x19\n\t" - "umulh x4, x11, x19\n\t" + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" /* A[0] * B[1] */ - "mul x12, x11, x20\n\t" - "umulh x5, x11, x20\n\t" - "adds x4, x4, x12\n\t" + "mul x19, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x19\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x16, x19\n\t" - "umulh x13, x16, x19\n\t" - "adds x4, x4, x12\n\t" - "adcs x5, x5, x13\n\t" + "mul x19, x12, x15\n\t" + "umulh x20, x12, x15\n\t" + "adds x4, x4, x19\n\t" + "adcs x5, x5, x20\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x11, x21\n\t" - "umulh x13, x11, x21\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, x13\n\t" + "mul x19, x11, x17\n\t" + "umulh x20, x11, x17\n\t" + "adds x5, x5, x19\n\t" + "adc x6, x6, x20\n\t" /* A[1] * B[1] */ - "mul x12, x16, x20\n\t" - "umulh x13, x16, x20\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x19, x12, x16\n\t" + "umulh x20, x12, x16\n\t" + "adds x5, x5, x19\n\t" + "adcs x6, x6, x20\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x17, x19\n\t" - "umulh x13, x17, x19\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x19, x13, x15\n\t" + "umulh x20, x13, x15\n\t" + "adds x5, x5, x19\n\t" + "adcs x6, x6, x20\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x11, x22\n\t" - "umulh x13, x11, x22\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x11, x18\n\t" + "umulh x20, x11, x18\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x16, x21\n\t" - "umulh x13, x16, x21\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x12, x17\n\t" + "umulh x20, x12, x17\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x17, x20\n\t" 
- "umulh x13, x17, x20\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x13, x16\n\t" + "umulh x20, x13, x16\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x18, x19\n\t" - "umulh x13, x18, x19\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x14, x15\n\t" + "umulh x20, x14, x15\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x16, x22\n\t" - "umulh x13, x16, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x12, x18\n\t" + "umulh x20, x12, x18\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x17, x21\n\t" - "umulh x13, x17, x21\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x13, x17\n\t" + "umulh x20, x13, x17\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x18, x20\n\t" - "umulh x13, x18, x20\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x14, x16\n\t" + "umulh x20, x14, x16\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x17, x22\n\t" - "umulh x13, x17, x22\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x19, x13, x18\n\t" + "umulh x20, x13, x18\n\t" + "adds x8, x8, x19\n\t" + "adcs x9, x9, x20\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x18, x21\n\t" - "umulh x13, x18, x21\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x19, x14, x17\n\t" + "umulh x20, x14, x17\n\t" + "adds x8, x8, x19\n\t" + "adcs x9, x9, x20\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x18, x22\n\t" - "umulh x13, x18, x22\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, x13\n\t" + "mul x19, x14, x18\n\t" + "umulh x20, x14, x18\n\t" + "adds x9, x9, x19\n\t" + "adc x10, x10, x20\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3040,38 +2956,38 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x7\n\t" - "umulh x7, x12, x7\n\t" - "adds x3, x3, x13\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adcs x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x14, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x19, #19\n\t" + "mul x20, x19, x7\n\t" + "umulh x7, x19, x7\n\t" + "adds x3, x3, x20\n\t" + "mul x20, x19, x8\n\t" + "umulh x8, x19, x8\n\t" + "adcs x4, x4, x20\n\t" + "mul x20, x19, x9\n\t" + "umulh x9, x19, x9\n\t" + "adcs x5, x5, x20\n\t" + "mul x20, x19, x10\n\t" + "umulh x21, x19, x10\n\t" + "adcs x6, x6, x20\n\t" + "adc x21, x21, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x14, x14, xzr\n\t" + "adc x21, x21, xzr\n\t" /* Overflow */ - "extr x14, x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x21, x21, x6, #63\n\t" + "mul x21, x21, x19\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x21, x6, #63\n\t" + "and x21, x21, x19\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, 
x3, x14\n\t" + "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3082,100 +2998,100 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "ldr x1, [x29, #40]\n\t" "ldr x2, [x29, #48]\n\t" /* Multiply */ - "ldp x11, x16, [x1]\n\t" - "ldp x17, x18, [x1, #16]\n\t" - "ldp x19, x20, [x2]\n\t" - "ldp x21, x22, [x2, #16]\n\t" + "ldp x11, x12, [x1]\n\t" + "ldp x13, x14, [x1, #16]\n\t" + "ldp x15, x16, [x2]\n\t" + "ldp x17, x18, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x3, x11, x19\n\t" - "umulh x4, x11, x19\n\t" + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" /* A[0] * B[1] */ - "mul x12, x11, x20\n\t" - "umulh x5, x11, x20\n\t" - "adds x4, x4, x12\n\t" + "mul x19, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x19\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x16, x19\n\t" - "umulh x13, x16, x19\n\t" - "adds x4, x4, x12\n\t" - "adcs x5, x5, x13\n\t" + "mul x19, x12, x15\n\t" + "umulh x20, x12, x15\n\t" + "adds x4, x4, x19\n\t" + "adcs x5, x5, x20\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x11, x21\n\t" - "umulh x13, x11, x21\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, x13\n\t" + "mul x19, x11, x17\n\t" + "umulh x20, x11, x17\n\t" + "adds x5, x5, x19\n\t" + "adc x6, x6, x20\n\t" /* A[1] * B[1] */ - "mul x12, x16, x20\n\t" - "umulh x13, x16, x20\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x19, x12, x16\n\t" + "umulh x20, x12, x16\n\t" + "adds x5, x5, x19\n\t" + "adcs x6, x6, x20\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x17, x19\n\t" - "umulh x13, x17, x19\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x19, x13, x15\n\t" + "umulh x20, x13, x15\n\t" + "adds x5, x5, x19\n\t" + "adcs x6, x6, x20\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x11, x22\n\t" - "umulh x13, x11, x22\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x11, x18\n\t" + "umulh x20, x11, x18\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x16, x21\n\t" - "umulh x13, x16, x21\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x12, x17\n\t" + "umulh x20, x12, x17\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x17, x20\n\t" - "umulh x13, x17, x20\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x13, x16\n\t" + "umulh x20, x13, x16\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x18, x19\n\t" - "umulh x13, x18, x19\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x14, x15\n\t" + "umulh x20, x14, x15\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x16, x22\n\t" - "umulh x13, x16, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x12, x18\n\t" + "umulh x20, x12, x18\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x17, x21\n\t" - "umulh x13, x17, x21\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x13, x17\n\t" + "umulh x20, x13, x17\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x18, x20\n\t" - "umulh x13, x18, x20\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x14, x16\n\t" + "umulh x20, x14, x16\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] 
*/ - "mul x12, x17, x22\n\t" - "umulh x13, x17, x22\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x19, x13, x18\n\t" + "umulh x20, x13, x18\n\t" + "adds x8, x8, x19\n\t" + "adcs x9, x9, x20\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x18, x21\n\t" - "umulh x13, x18, x21\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x19, x14, x17\n\t" + "umulh x20, x14, x17\n\t" + "adds x8, x8, x19\n\t" + "adcs x9, x9, x20\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x18, x22\n\t" - "umulh x13, x18, x22\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, x13\n\t" + "mul x19, x14, x18\n\t" + "umulh x20, x14, x18\n\t" + "adds x9, x9, x19\n\t" + "adc x10, x10, x20\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3184,38 +3100,38 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x7\n\t" - "umulh x7, x12, x7\n\t" - "adds x3, x3, x13\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adcs x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x14, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x19, #19\n\t" + "mul x20, x19, x7\n\t" + "umulh x7, x19, x7\n\t" + "adds x3, x3, x20\n\t" + "mul x20, x19, x8\n\t" + "umulh x8, x19, x8\n\t" + "adcs x4, x4, x20\n\t" + "mul x20, x19, x9\n\t" + "umulh x9, x19, x9\n\t" + "adcs x5, x5, x20\n\t" + "mul x20, x19, x10\n\t" + "umulh x21, x19, x10\n\t" + "adcs x6, x6, x20\n\t" + "adc x21, x21, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x14, x14, xzr\n\t" + "adc x21, x21, xzr\n\t" /* Overflow */ - "extr x14, x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x21, x21, x6, #63\n\t" + "mul x21, x21, x19\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x21, x6, #63\n\t" + "and x21, x21, x19\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3223,102 +3139,100 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "stp x3, x4, [x0]\n\t" "stp x5, x6, [x0, #16]\n\t" "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #56]\n\t" + "ldr x2, [x29, #56]\n\t" /* Multiply */ - "ldp x11, x16, [x2]\n\t" - "ldp x17, x18, [x2, #16]\n\t" - "ldp x19, x20, [x1]\n\t" - "ldp x21, x22, [x1, #16]\n\t" + "ldp x11, x12, [x2]\n\t" + "ldp x13, x14, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x3, x11, x19\n\t" - "umulh x4, x11, x19\n\t" + "mul x3, x15, x11\n\t" + "umulh x4, x15, x11\n\t" /* A[0] * B[1] */ - "mul x12, x11, x20\n\t" - "umulh x5, x11, x20\n\t" - "adds x4, x4, x12\n\t" + "mul x19, x15, x12\n\t" + "umulh x5, x15, x12\n\t" + "adds x4, x4, x19\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x16, x19\n\t" - "umulh x13, x16, x19\n\t" - "adds x4, x4, x12\n\t" - "adcs x5, x5, x13\n\t" + "mul x19, x16, x11\n\t" + "umulh x20, x16, x11\n\t" + "adds x4, x4, x19\n\t" + "adcs x5, x5, x20\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x11, x21\n\t" - "umulh x13, x11, x21\n\t" - "adds x5, x5, x12\n\t" - "adc x6, 
x6, x13\n\t" + "mul x19, x15, x13\n\t" + "umulh x20, x15, x13\n\t" + "adds x5, x5, x19\n\t" + "adc x6, x6, x20\n\t" /* A[1] * B[1] */ - "mul x12, x16, x20\n\t" - "umulh x13, x16, x20\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x19, x16, x12\n\t" + "umulh x20, x16, x12\n\t" + "adds x5, x5, x19\n\t" + "adcs x6, x6, x20\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x17, x19\n\t" - "umulh x13, x17, x19\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x19, x17, x11\n\t" + "umulh x20, x17, x11\n\t" + "adds x5, x5, x19\n\t" + "adcs x6, x6, x20\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x11, x22\n\t" - "umulh x13, x11, x22\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x15, x14\n\t" + "umulh x20, x15, x14\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x16, x21\n\t" - "umulh x13, x16, x21\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x16, x13\n\t" + "umulh x20, x16, x13\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x17, x20\n\t" - "umulh x13, x17, x20\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x17, x12\n\t" + "umulh x20, x17, x12\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x18, x19\n\t" - "umulh x13, x18, x19\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x19, x18, x11\n\t" + "umulh x20, x18, x11\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x16, x22\n\t" - "umulh x13, x16, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x16, x14\n\t" + "umulh x20, x16, x14\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x17, x21\n\t" - "umulh x13, x17, x21\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x17, x13\n\t" + "umulh x20, x17, x13\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x18, x20\n\t" - "umulh x13, x18, x20\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x19, x18, x12\n\t" + "umulh x20, x18, x12\n\t" + "adds x7, x7, x19\n\t" + "adcs x8, x8, x20\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x17, x22\n\t" - "umulh x13, x17, x22\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x19, x17, x14\n\t" + "umulh x20, x17, x14\n\t" + "adds x8, x8, x19\n\t" + "adcs x9, x9, x20\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x18, x21\n\t" - "umulh x13, x18, x21\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x19, x18, x13\n\t" + "umulh x20, x18, x13\n\t" + "adds x8, x8, x19\n\t" + "adcs x9, x9, x20\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x18, x22\n\t" - "umulh x13, x18, x22\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, x13\n\t" + "mul x19, x18, x14\n\t" + "umulh x20, x18, x14\n\t" + "adds x9, x9, x19\n\t" + "adc x10, x10, x20\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3327,38 +3241,38 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x7\n\t" - "umulh x7, x12, x7\n\t" - "adds x3, x3, x13\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adcs x4, x4, x13\n\t" - 
"mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x14, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x19, #19\n\t" + "mul x20, x19, x7\n\t" + "umulh x7, x19, x7\n\t" + "adds x3, x3, x20\n\t" + "mul x20, x19, x8\n\t" + "umulh x8, x19, x8\n\t" + "adcs x4, x4, x20\n\t" + "mul x20, x19, x9\n\t" + "umulh x9, x19, x9\n\t" + "adcs x5, x5, x20\n\t" + "mul x20, x19, x10\n\t" + "umulh x21, x19, x10\n\t" + "adcs x6, x6, x20\n\t" + "adc x21, x21, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x14, x14, xzr\n\t" + "adc x21, x21, xzr\n\t" /* Overflow */ - "extr x14, x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x21, x21, x6, #63\n\t" + "mul x21, x21, x19\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x21, x6, #63\n\t" + "and x21, x21, x19\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3368,7 +3282,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "ldp x29, x30, [sp], #0x40\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x12", "x13", "x14", "x15", "x7", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21" ); } @@ -3387,387 +3301,100 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "ldr x1, [x29, #40]\n\t" "ldr x2, [x29, #64]\n\t" /* Multiply */ - "ldp x11, x16, [x1]\n\t" - "ldp x17, x18, [x1, #16]\n\t" - "ldp x19, x20, [x2]\n\t" - "ldp x21, x22, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x11, x19\n\t" - "umulh x4, x11, x19\n\t" - /* A[0] * B[1] */ - "mul x12, x11, x20\n\t" - "umulh x5, x11, x20\n\t" - "adds x4, x4, x12\n\t" - "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x12, x16, x19\n\t" - "umulh x13, x16, x19\n\t" - "adds x4, x4, x12\n\t" - "adcs x5, x5, x13\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x12, x11, x21\n\t" - "umulh x13, x11, x21\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, x13\n\t" - /* A[1] * B[1] */ - "mul x12, x16, x20\n\t" - "umulh x13, x16, x20\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x12, x17, x19\n\t" - "umulh x13, x17, x19\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x12, x11, x22\n\t" - "umulh x13, x11, x22\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x12, x16, x21\n\t" - "umulh x13, x16, x21\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x12, x17, x20\n\t" - "umulh x13, x17, x20\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x12, x18, x19\n\t" - "umulh x13, x18, x19\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x12, x16, x22\n\t" - "umulh x13, x16, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" - /* 
A[2] * B[2] */ - "mul x12, x17, x21\n\t" - "umulh x13, x17, x21\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x12, x18, x20\n\t" - "umulh x13, x18, x20\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x12, x17, x22\n\t" - "umulh x13, x17, x22\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x12, x18, x21\n\t" - "umulh x13, x18, x21\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x12, x18, x22\n\t" - "umulh x13, x18, x22\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, x13\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x7\n\t" - "umulh x7, x12, x7\n\t" - "adds x3, x3, x13\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adcs x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x14, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "adc x14, x14, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x14, x14, xzr\n\t" - /* Overflow */ - "extr x14, x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "lsr x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldr x0, [x29, #16]\n\t" - "ldr x1, [x29, #48]\n\t" - "ldr x2, [x29, #56]\n\t" - /* Multiply */ - "ldp x11, x16, [x1]\n\t" - "ldp x17, x18, [x1, #16]\n\t" - "ldp x19, x20, [x2]\n\t" - "ldp x21, x22, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x11, x19\n\t" - "umulh x4, x11, x19\n\t" - /* A[0] * B[1] */ - "mul x12, x11, x20\n\t" - "umulh x5, x11, x20\n\t" - "adds x4, x4, x12\n\t" - "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x12, x16, x19\n\t" - "umulh x13, x16, x19\n\t" - "adds x4, x4, x12\n\t" - "adcs x5, x5, x13\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x12, x11, x21\n\t" - "umulh x13, x11, x21\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, x13\n\t" - /* A[1] * B[1] */ - "mul x12, x16, x20\n\t" - "umulh x13, x16, x20\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x12, x17, x19\n\t" - "umulh x13, x17, x19\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x12, x11, x22\n\t" - "umulh x13, x11, x22\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x12, x16, x21\n\t" - "umulh x13, x16, x21\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x12, x17, x20\n\t" - "umulh x13, x17, x20\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x12, x18, x19\n\t" - "umulh x13, x18, x19\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* 
A[1] * B[3] */ - "mul x12, x16, x22\n\t" - "umulh x13, x16, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x12, x17, x21\n\t" - "umulh x13, x17, x21\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x12, x18, x20\n\t" - "umulh x13, x18, x20\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x12, x17, x22\n\t" - "umulh x13, x17, x22\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x12, x18, x21\n\t" - "umulh x13, x18, x21\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x12, x18, x22\n\t" - "umulh x13, x18, x22\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, x13\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x7\n\t" - "umulh x7, x12, x7\n\t" - "adds x3, x3, x13\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adcs x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x14, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "adc x14, x14, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x14, x14, xzr\n\t" - /* Overflow */ - "extr x14, x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "lsr x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #64]\n\t" - /* Multiply */ - "ldp x11, x16, [x2]\n\t" + "ldp x11, x12, [x1]\n\t" + "ldp x13, x14, [x1, #16]\n\t" + "ldp x15, x16, [x2]\n\t" "ldp x17, x18, [x2, #16]\n\t" - "ldp x19, x20, [x1]\n\t" - "ldp x21, x22, [x1, #16]\n\t" /* A[0] * B[0] */ - "mul x3, x11, x19\n\t" - "umulh x4, x11, x19\n\t" + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" /* A[0] * B[1] */ - "mul x12, x11, x20\n\t" - "umulh x5, x11, x20\n\t" - "adds x4, x4, x12\n\t" + "mul x23, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x23\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x16, x19\n\t" - "umulh x13, x16, x19\n\t" - "adds x4, x4, x12\n\t" - "adcs x5, x5, x13\n\t" + "mul x23, x12, x15\n\t" + "umulh x24, x12, x15\n\t" + "adds x4, x4, x23\n\t" + "adcs x5, x5, x24\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x11, x21\n\t" - "umulh x13, x11, x21\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, x13\n\t" + "mul x23, x11, x17\n\t" + "umulh x24, x11, x17\n\t" + "adds x5, x5, x23\n\t" + "adc x6, x6, x24\n\t" /* A[1] * B[1] */ - "mul x12, x16, x20\n\t" - "umulh x13, x16, x20\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x23, x12, x16\n\t" + "umulh x24, x12, x16\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x17, x19\n\t" - "umulh x13, x17, x19\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, 
x13\n\t" + "mul x23, x13, x15\n\t" + "umulh x24, x13, x15\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x11, x22\n\t" - "umulh x13, x11, x22\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x11, x18\n\t" + "umulh x24, x11, x18\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x16, x21\n\t" - "umulh x13, x16, x21\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x12, x17\n\t" + "umulh x24, x12, x17\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x17, x20\n\t" - "umulh x13, x17, x20\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x13, x16\n\t" + "umulh x24, x13, x16\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x18, x19\n\t" - "umulh x13, x18, x19\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x14, x15\n\t" + "umulh x24, x14, x15\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x16, x22\n\t" - "umulh x13, x16, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x23, x12, x18\n\t" + "umulh x24, x12, x18\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x17, x21\n\t" - "umulh x13, x17, x21\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x23, x13, x17\n\t" + "umulh x24, x13, x17\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x18, x20\n\t" - "umulh x13, x18, x20\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x23, x14, x16\n\t" + "umulh x24, x14, x16\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x17, x22\n\t" - "umulh x13, x17, x22\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x23, x13, x18\n\t" + "umulh x24, x13, x18\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x18, x21\n\t" - "umulh x13, x18, x21\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x23, x14, x17\n\t" + "umulh x24, x14, x17\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x18, x22\n\t" - "umulh x13, x18, x22\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, x13\n\t" + "mul x23, x14, x18\n\t" + "umulh x24, x14, x18\n\t" + "adds x9, x9, x23\n\t" + "adc x10, x10, x24\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3776,38 +3403,38 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x7\n\t" - "umulh x7, x12, x7\n\t" - "adds x3, x3, x13\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adcs x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x14, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x23, #19\n\t" + "mul x24, x23, x7\n\t" + "umulh x7, x23, x7\n\t" + "adds x3, x3, x24\n\t" + "mul x24, x23, x8\n\t" + "umulh x8, x23, x8\n\t" + "adcs x4, x4, x24\n\t" + "mul x24, x23, x9\n\t" + "umulh x9, x23, x9\n\t" + "adcs x5, x5, x24\n\t" + "mul x24, x23, x10\n\t" + "umulh x25, x23, 
x10\n\t" + "adcs x6, x6, x24\n\t" + "adc x25, x25, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x14, x14, xzr\n\t" + "adc x25, x25, xzr\n\t" /* Overflow */ - "extr x14, x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x25, x25, x6, #63\n\t" + "mul x25, x25, x23\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x25, x6, #63\n\t" + "and x25, x25, x23\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3815,103 +3442,100 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "stp x3, x4, [x0]\n\t" "stp x5, x6, [x0, #16]\n\t" "ldr x0, [x29, #32]\n\t" - "ldr x1, [x29, #40]\n\t" "ldr x2, [x29, #48]\n\t" /* Multiply */ - "ldp x11, x16, [x1]\n\t" - "ldp x17, x18, [x1, #16]\n\t" "ldp x19, x20, [x2]\n\t" "ldp x21, x22, [x2, #16]\n\t" /* A[0] * B[0] */ "mul x3, x11, x19\n\t" "umulh x4, x11, x19\n\t" /* A[0] * B[1] */ - "mul x12, x11, x20\n\t" + "mul x23, x11, x20\n\t" "umulh x5, x11, x20\n\t" - "adds x4, x4, x12\n\t" + "adds x4, x4, x23\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x16, x19\n\t" - "umulh x13, x16, x19\n\t" - "adds x4, x4, x12\n\t" - "adcs x5, x5, x13\n\t" + "mul x23, x12, x19\n\t" + "umulh x24, x12, x19\n\t" + "adds x4, x4, x23\n\t" + "adcs x5, x5, x24\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x11, x21\n\t" - "umulh x13, x11, x21\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, x13\n\t" + "mul x23, x11, x21\n\t" + "umulh x24, x11, x21\n\t" + "adds x5, x5, x23\n\t" + "adc x6, x6, x24\n\t" /* A[1] * B[1] */ - "mul x12, x16, x20\n\t" - "umulh x13, x16, x20\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x23, x12, x20\n\t" + "umulh x24, x12, x20\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x17, x19\n\t" - "umulh x13, x17, x19\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x23, x13, x19\n\t" + "umulh x24, x13, x19\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x11, x22\n\t" - "umulh x13, x11, x22\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x11, x22\n\t" + "umulh x24, x11, x22\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x16, x21\n\t" - "umulh x13, x16, x21\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x12, x21\n\t" + "umulh x24, x12, x21\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x17, x20\n\t" - "umulh x13, x17, x20\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x13, x20\n\t" + "umulh x24, x13, x20\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x18, x19\n\t" - "umulh x13, x18, x19\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x23, x14, x19\n\t" + "umulh x24, x14, x19\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x16, x22\n\t" - "umulh x13, x16, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x23, x12, x22\n\t" + "umulh x24, x12, x22\n\t" + "adds x7, x7, x23\n\t" + 
"adcs x8, x8, x24\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x17, x21\n\t" - "umulh x13, x17, x21\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x23, x13, x21\n\t" + "umulh x24, x13, x21\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x18, x20\n\t" - "umulh x13, x18, x20\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x23, x14, x20\n\t" + "umulh x24, x14, x20\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x17, x22\n\t" - "umulh x13, x17, x22\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x23, x13, x22\n\t" + "umulh x24, x13, x22\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x18, x21\n\t" - "umulh x13, x18, x21\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x23, x14, x21\n\t" + "umulh x24, x14, x21\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x18, x22\n\t" - "umulh x13, x18, x22\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, x13\n\t" + "mul x23, x14, x22\n\t" + "umulh x24, x14, x22\n\t" + "adds x9, x9, x23\n\t" + "adc x10, x10, x24\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3920,38 +3544,317 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x7\n\t" - "umulh x7, x12, x7\n\t" - "adds x3, x3, x13\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adcs x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x14, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x23, #19\n\t" + "mul x24, x23, x7\n\t" + "umulh x7, x23, x7\n\t" + "adds x3, x3, x24\n\t" + "mul x24, x23, x8\n\t" + "umulh x8, x23, x8\n\t" + "adcs x4, x4, x24\n\t" + "mul x24, x23, x9\n\t" + "umulh x9, x23, x9\n\t" + "adcs x5, x5, x24\n\t" + "mul x24, x23, x10\n\t" + "umulh x25, x23, x10\n\t" + "adcs x6, x6, x24\n\t" + "adc x25, x25, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x14, x14, xzr\n\t" + "adc x25, x25, xzr\n\t" /* Overflow */ - "extr x14, x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x25, x25, x6, #63\n\t" + "mul x25, x25, x23\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x6, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x25, x6, #63\n\t" + "and x25, x25, x23\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x14\n\t" + "adds x3, x3, x25\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + "ldr x2, [x29, #56]\n\t" + /* Multiply */ + "ldp x11, x12, [x2]\n\t" + "ldp x13, x14, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x19, x11\n\t" + "umulh x4, x19, x11\n\t" + /* A[0] * B[1] */ + "mul x23, x19, x12\n\t" + "umulh x5, x19, x12\n\t" + "adds x4, x4, x23\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x23, x20, x11\n\t" + "umulh x24, x20, x11\n\t" + "adds x4, x4, x23\n\t" + "adcs x5, x5, x24\n\t" + "adc x6, xzr, xzr\n\t" 
+ /* A[0] * B[2] */ + "mul x23, x19, x13\n\t" + "umulh x24, x19, x13\n\t" + "adds x5, x5, x23\n\t" + "adc x6, x6, x24\n\t" + /* A[1] * B[1] */ + "mul x23, x20, x12\n\t" + "umulh x24, x20, x12\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x23, x21, x11\n\t" + "umulh x24, x21, x11\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x23, x19, x14\n\t" + "umulh x24, x19, x14\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x23, x20, x13\n\t" + "umulh x24, x20, x13\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x23, x21, x12\n\t" + "umulh x24, x21, x12\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x23, x22, x11\n\t" + "umulh x24, x22, x11\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x23, x20, x14\n\t" + "umulh x24, x20, x14\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x23, x21, x13\n\t" + "umulh x24, x21, x13\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x23, x22, x12\n\t" + "umulh x24, x22, x12\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x23, x21, x14\n\t" + "umulh x24, x21, x14\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x23, x22, x13\n\t" + "umulh x24, x22, x13\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x23, x22, x14\n\t" + "umulh x24, x22, x14\n\t" + "adds x9, x9, x23\n\t" + "adc x10, x10, x24\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x23, #19\n\t" + "mul x24, x23, x7\n\t" + "umulh x7, x23, x7\n\t" + "adds x3, x3, x24\n\t" + "mul x24, x23, x8\n\t" + "umulh x8, x23, x8\n\t" + "adcs x4, x4, x24\n\t" + "mul x24, x23, x9\n\t" + "umulh x9, x23, x9\n\t" + "adcs x5, x5, x24\n\t" + "mul x24, x23, x10\n\t" + "umulh x25, x23, x10\n\t" + "adcs x6, x6, x24\n\t" + "adc x25, x25, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x25, x25, xzr\n\t" + /* Overflow */ + "extr x25, x25, x6, #63\n\t" + "mul x25, x25, x23\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x25\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "asr x25, x6, #63\n\t" + "and x25, x25, x23\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x25\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" + /* A[0] * B[1] */ + "mul x23, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x23\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x23, x12, x15\n\t" + "umulh x24, x12, x15\n\t" + "adds x4, x4, x23\n\t" + "adcs x5, x5, x24\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + 
"mul x23, x11, x17\n\t" + "umulh x24, x11, x17\n\t" + "adds x5, x5, x23\n\t" + "adc x6, x6, x24\n\t" + /* A[1] * B[1] */ + "mul x23, x12, x16\n\t" + "umulh x24, x12, x16\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x23, x13, x15\n\t" + "umulh x24, x13, x15\n\t" + "adds x5, x5, x23\n\t" + "adcs x6, x6, x24\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x23, x11, x18\n\t" + "umulh x24, x11, x18\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x23, x12, x17\n\t" + "umulh x24, x12, x17\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x23, x13, x16\n\t" + "umulh x24, x13, x16\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x23, x14, x15\n\t" + "umulh x24, x14, x15\n\t" + "adds x6, x6, x23\n\t" + "adcs x7, x7, x24\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x23, x12, x18\n\t" + "umulh x24, x12, x18\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x23, x13, x17\n\t" + "umulh x24, x13, x17\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x23, x14, x16\n\t" + "umulh x24, x14, x16\n\t" + "adds x7, x7, x23\n\t" + "adcs x8, x8, x24\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x23, x13, x18\n\t" + "umulh x24, x13, x18\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x23, x14, x17\n\t" + "umulh x24, x14, x17\n\t" + "adds x8, x8, x23\n\t" + "adcs x9, x9, x24\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x23, x14, x18\n\t" + "umulh x24, x14, x18\n\t" + "adds x9, x9, x23\n\t" + "adc x10, x10, x24\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x23, #19\n\t" + "mul x24, x23, x7\n\t" + "umulh x7, x23, x7\n\t" + "adds x3, x3, x24\n\t" + "mul x24, x23, x8\n\t" + "umulh x8, x23, x8\n\t" + "adcs x4, x4, x24\n\t" + "mul x24, x23, x9\n\t" + "umulh x9, x23, x9\n\t" + "adcs x5, x5, x24\n\t" + "mul x24, x23, x10\n\t" + "umulh x25, x23, x10\n\t" + "adcs x6, x6, x24\n\t" + "adc x25, x25, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x25, x25, xzr\n\t" + /* Overflow */ + "extr x25, x25, x6, #63\n\t" + "mul x25, x25, x23\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x25\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "asr x25, x6, #63\n\t" + "and x25, x25, x23\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3961,7 +3864,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", 
"x24", "x25" ); } @@ -3979,36 +3882,36 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "str %[pz], [x29, #64]\n\t" "ldr x1, [x29, #48]\n\t" /* Square */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" /* A[0] * A[1] */ - "mul x5, x20, x21\n\t" - "umulh x6, x20, x21\n\t" + "mul x5, x12, x13\n\t" + "umulh x6, x12, x13\n\t" /* A[0] * A[2] */ - "mul x12, x20, x22\n\t" - "umulh x7, x20, x22\n\t" - "adds x6, x6, x12\n\t" + "mul x24, x12, x14\n\t" + "umulh x7, x12, x14\n\t" + "adds x6, x6, x24\n\t" "adc x7, x7, xzr\n\t" /* A[0] * A[3] */ - "mul x12, x20, x23\n\t" - "umulh x8, x20, x23\n\t" - "adds x7, x7, x12\n\t" + "mul x24, x12, x15\n\t" + "umulh x8, x12, x15\n\t" + "adds x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[1] * A[2] */ - "mul x12, x21, x22\n\t" - "umulh x13, x21, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x13, x14\n\t" + "umulh x25, x13, x14\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * A[3] */ - "mul x12, x21, x23\n\t" - "umulh x13, x21, x23\n\t" - "adds x8, x8, x12\n\t" - "adc x9, x9, x13\n\t" + "mul x24, x13, x15\n\t" + "umulh x25, x13, x15\n\t" + "adds x8, x8, x24\n\t" + "adc x9, x9, x25\n\t" /* A[2] * A[3] */ - "mul x12, x22, x23\n\t" - "umulh x10, x22, x23\n\t" - "adds x9, x9, x12\n\t" + "mul x24, x14, x15\n\t" + "umulh x10, x14, x15\n\t" + "adds x9, x9, x24\n\t" "adc x10, x10, xzr\n\t" /* Double */ "adds x5, x5, x5\n\t" @@ -4019,26 +3922,26 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, x10\n\t" "adc x11, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x4, x20, x20\n\t" - "umulh x15, x20, x20\n\t" + "mul x4, x12, x12\n\t" + "umulh x26, x12, x12\n\t" /* A[1] * A[1] */ - "mul x12, x21, x21\n\t" - "umulh x13, x21, x21\n\t" - "adds x5, x5, x15\n\t" - "adcs x6, x6, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x13, x13\n\t" + "umulh x25, x13, x13\n\t" + "adds x5, x5, x26\n\t" + "adcs x6, x6, x24\n\t" + "adc x26, x25, xzr\n\t" /* A[2] * A[2] */ - "mul x12, x22, x22\n\t" - "umulh x13, x22, x22\n\t" - "adds x7, x7, x15\n\t" - "adcs x8, x8, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x14, x14\n\t" + "umulh x25, x14, x14\n\t" + "adds x7, x7, x26\n\t" + "adcs x8, x8, x24\n\t" + "adc x26, x25, xzr\n\t" /* A[3] * A[3] */ - "mul x12, x23, x23\n\t" - "umulh x13, x23, x23\n\t" - "adds x9, x9, x15\n\t" - "adcs x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x15, x15\n\t" + "umulh x25, x15, x15\n\t" + "adds x9, x9, x26\n\t" + "adcs x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -4047,38 +3950,38 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x10, x24, x10\n\t" + "adcs x6, x6, x25\n\t" 
+ "mul x25, x24, x11\n\t" + "umulh x26, x24, x11\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -4086,338 +3989,323 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "stp x4, x5, [x0]\n\t" "stp x6, x7, [x0, #16]\n\t" "ldr x0, [x29, #32]\n\t" - "ldr x2, [x29, #56]\n\t" + "ldr x1, [x29, #56]\n\t" /* Square */ - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" + "ldp x20, x21, [x1]\n\t" + "ldp x22, x23, [x1, #16]\n\t" /* A[0] * A[1] */ - "mul x5, x20, x21\n\t" - "umulh x6, x20, x21\n\t" + "mul x9, x20, x21\n\t" + "umulh x10, x20, x21\n\t" /* A[0] * A[2] */ - "mul x12, x20, x22\n\t" - "umulh x7, x20, x22\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, xzr\n\t" + "mul x24, x20, x22\n\t" + "umulh x11, x20, x22\n\t" + "adds x10, x10, x24\n\t" + "adc x11, x11, xzr\n\t" /* A[0] * A[3] */ - "mul x12, x20, x23\n\t" - "umulh x8, x20, x23\n\t" - "adds x7, x7, x12\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x20, x23\n\t" + "umulh x16, x20, x23\n\t" + "adds x11, x11, x24\n\t" + "adc x16, x16, xzr\n\t" /* A[1] * A[2] */ - "mul x12, x21, x22\n\t" - "umulh x13, x21, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x21, x22\n\t" + "umulh x25, x21, x22\n\t" + "adds x11, x11, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" /* A[1] * A[3] */ - "mul x12, x21, x23\n\t" - "umulh x13, x21, x23\n\t" - "adds x8, x8, x12\n\t" - "adc x9, x9, x13\n\t" + "mul x24, x21, x23\n\t" + "umulh x25, x21, x23\n\t" + "adds x16, x16, x24\n\t" + "adc x17, x17, x25\n\t" /* A[2] * A[3] */ - "mul x12, x22, x23\n\t" - "umulh x10, x22, x23\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x22, x23\n\t" + "umulh x18, x22, x23\n\t" + "adds x17, x17, x24\n\t" + "adc x18, x18, xzr\n\t" /* Double */ - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" + "adds x9, x9, x9\n\t" "adcs x10, x10, x10\n\t" - "adc x11, xzr, xzr\n\t" + "adcs x11, x11, x11\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x18, x18, x18\n\t" + "adc x19, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x4, x20, x20\n\t" - "umulh x15, x20, x20\n\t" + "mul x8, x20, x20\n\t" + "umulh x26, x20, x20\n\t" /* A[1] * A[1] */ - "mul x12, x21, x21\n\t" - "umulh x13, x21, x21\n\t" - "adds x5, x5, x15\n\t" - "adcs x6, x6, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x21, x21\n\t" + "umulh x25, x21, x21\n\t" + "adds x9, x9, x26\n\t" + "adcs x10, x10, x24\n\t" + "adc x26, x25, xzr\n\t" /* A[2] * A[2] */ - "mul x12, x22, x22\n\t" - "umulh x13, x22, x22\n\t" - "adds x7, x7, x15\n\t" - "adcs x8, x8, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x22, x22\n\t" + "umulh x25, x22, x22\n\t" + "adds x11, x11, x26\n\t" + "adcs x16, x16, x24\n\t" + "adc x26, x25, xzr\n\t" /* A[3] 
* A[3] */ - "mul x12, x23, x23\n\t" - "umulh x13, x23, x23\n\t" - "adds x9, x9, x15\n\t" - "adcs x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x23, x23\n\t" + "umulh x25, x23, x23\n\t" + "adds x17, x17, x26\n\t" + "adcs x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x11, #63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x8, x8, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x9, x9, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x10, x10, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x11, x11, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x9, x9, x16\n\t" + "adcs x10, x10, x17\n\t" + "adcs x11, x11, x18\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "extr x26, x26, x11, #63\n\t" + "mul x26, x26, x24\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "asr x26, x11, #63\n\t" + "and x26, x26, x24\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" /* Store */ - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" "ldr x0, [x29, #24]\n\t" /* Add */ - "ldp x4, x5, [x1]\n\t" - "ldp x6, x7, [x1, #16]\n\t" - "ldp x8, x9, [x2]\n\t" - "ldp x10, x11, [x2, #16]\n\t" - "adds x4, x4, x8\n\t" - "adcs x5, x5, x9\n\t" - "adcs x6, x6, x10\n\t" - "adc x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x7, #63\n\t" + "adds x12, x12, x20\n\t" + "adcs x13, x13, x21\n\t" + "adcs x14, x14, x22\n\t" + "adc x15, x15, x23\n\t" + "mov x24, #-19\n\t" + "asr x27, x15, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x12\n\t" - "sbcs x5, x5, x15\n\t" - "sbcs x6, x6, x15\n\t" - "sbc x7, x7, x13\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldr x1, [x29, #40]\n\t" + "subs x12, x12, x24\n\t" + "sbcs x13, x13, x27\n\t" + "sbcs x14, x14, x27\n\t" + "sbc x15, x15, x25\n\t" + 
"ldr x0, [x29, #40]\n\t" /* Square */ - "ldp x20, x21, [x0]\n\t" - "ldp x22, x23, [x0, #16]\n\t" /* A[0] * A[1] */ - "mul x5, x20, x21\n\t" - "umulh x6, x20, x21\n\t" + "mul x17, x12, x13\n\t" + "umulh x18, x12, x13\n\t" /* A[0] * A[2] */ - "mul x12, x20, x22\n\t" - "umulh x7, x20, x22\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, xzr\n\t" + "mul x24, x12, x14\n\t" + "umulh x19, x12, x14\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, xzr\n\t" /* A[0] * A[3] */ - "mul x12, x20, x23\n\t" - "umulh x8, x20, x23\n\t" - "adds x7, x7, x12\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x12, x15\n\t" + "umulh x20, x12, x15\n\t" + "adds x19, x19, x24\n\t" + "adc x20, x20, xzr\n\t" /* A[1] * A[2] */ - "mul x12, x21, x22\n\t" - "umulh x13, x21, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x13, x14\n\t" + "umulh x25, x13, x14\n\t" + "adds x19, x19, x24\n\t" + "adcs x20, x20, x25\n\t" + "adc x21, xzr, xzr\n\t" /* A[1] * A[3] */ - "mul x12, x21, x23\n\t" - "umulh x13, x21, x23\n\t" - "adds x8, x8, x12\n\t" - "adc x9, x9, x13\n\t" + "mul x24, x13, x15\n\t" + "umulh x25, x13, x15\n\t" + "adds x20, x20, x24\n\t" + "adc x21, x21, x25\n\t" /* A[2] * A[3] */ - "mul x12, x22, x23\n\t" - "umulh x10, x22, x23\n\t" - "adds x9, x9, x12\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x14, x15\n\t" + "umulh x22, x14, x15\n\t" + "adds x21, x21, x24\n\t" + "adc x22, x22, xzr\n\t" /* Double */ - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adc x11, xzr, xzr\n\t" + "adds x17, x17, x17\n\t" + "adcs x18, x18, x18\n\t" + "adcs x19, x19, x19\n\t" + "adcs x20, x20, x20\n\t" + "adcs x21, x21, x21\n\t" + "adcs x22, x22, x22\n\t" + "adc x23, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x4, x20, x20\n\t" - "umulh x15, x20, x20\n\t" + "mul x16, x12, x12\n\t" + "umulh x26, x12, x12\n\t" /* A[1] * A[1] */ - "mul x12, x21, x21\n\t" - "umulh x13, x21, x21\n\t" - "adds x5, x5, x15\n\t" - "adcs x6, x6, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x13, x13\n\t" + "umulh x25, x13, x13\n\t" + "adds x17, x17, x26\n\t" + "adcs x18, x18, x24\n\t" + "adc x26, x25, xzr\n\t" /* A[2] * A[2] */ - "mul x12, x22, x22\n\t" - "umulh x13, x22, x22\n\t" - "adds x7, x7, x15\n\t" - "adcs x8, x8, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x14, x14\n\t" + "umulh x25, x14, x14\n\t" + "adds x19, x19, x26\n\t" + "adcs x20, x20, x24\n\t" + "adc x26, x25, xzr\n\t" /* A[3] * A[3] */ - "mul x12, x23, x23\n\t" - "umulh x13, x23, x23\n\t" - "adds x9, x9, x15\n\t" - "adcs x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x15, x15\n\t" + "umulh x25, x15, x15\n\t" + "adds x21, x21, x26\n\t" + "adcs x22, x22, x24\n\t" + "adc x23, x23, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + "extr x23, x23, x22, #63\n\t" + "extr x22, x22, x21, #63\n\t" + "extr x21, x21, x20, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "and x19, x19, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + 
"mov x24, #19\n\t" + "mul x25, x24, x20\n\t" + "umulh x20, x24, x20\n\t" + "adds x16, x16, x25\n\t" + "mul x25, x24, x21\n\t" + "umulh x21, x24, x21\n\t" + "adcs x17, x17, x25\n\t" + "mul x25, x24, x22\n\t" + "umulh x22, x24, x22\n\t" + "adcs x18, x18, x25\n\t" + "mul x25, x24, x23\n\t" + "umulh x26, x24, x23\n\t" + "adcs x19, x19, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x17, x17, x20\n\t" + "adcs x18, x18, x21\n\t" + "adcs x19, x19, x22\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "extr x26, x26, x19, #63\n\t" + "mul x26, x26, x24\n\t" + "and x19, x19, #0x7fffffffffffffff\n\t" + "adds x16, x16, x26\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x18, x18, xzr\n\t" + "adc x19, x19, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "asr x26, x19, #63\n\t" + "and x26, x26, x24\n\t" + "and x19, x19, #0x7fffffffffffffff\n\t" + "adds x16, x16, x26\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x18, x18, xzr\n\t" + "adc x19, x19, xzr\n\t" /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x1, [x29, #32]\n\t" - "ldr x2, [x29, #16]\n\t" - /* Add */ - "ldp x4, x5, [x1]\n\t" - "ldp x6, x7, [x1, #16]\n\t" - "ldp x8, x9, [x2]\n\t" - "ldp x10, x11, [x2, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" - /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" "stp x16, x17, [x0]\n\t" "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x1, [x29, #40]\n\t" - /* Sub */ - "ldp x4, x5, [x1]\n\t" - "ldp x6, x7, [x1, #16]\n\t" - "ldp x8, x9, [x0]\n\t" - "ldp x10, x11, [x0, #16]\n\t" - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #32]\n\t" + /* Add */ + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adc x15, x11, x7\n\t" + "mov x24, #-19\n\t" + "asr x27, x15, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x24\n\t" + "sbcs x13, x13, x27\n\t" + "sbcs x14, x14, x27\n\t" + "sbc x15, x15, x25\n\t" + /* Sub */ + "subs x20, x8, x4\n\t" + "sbcs x21, x9, x5\n\t" + "sbcs x22, x10, x6\n\t" + "sbcs x23, x11, x7\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" + /* Mask the 
modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" - "ldr x0, [x29, #64]\n\t" + "adds x20, x20, x24\n\t" + "adcs x21, x21, x27\n\t" + "adcs x22, x22, x27\n\t" + "adc x23, x23, x25\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x20, x21, [x1]\n\t" + "stp x22, x23, [x1, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + /* Sub */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "sbcs x18, x18, x14\n\t" + "sbcs x19, x19, x15\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x16, x17, [x0]\n\t" + "stp x18, x19, [x0, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #64]\n\t" /* Square * 2 */ - "ldp x20, x21, [x0]\n\t" - "ldp x22, x23, [x0, #16]\n\t" + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" /* A[0] * A[1] */ - "mul x5, x20, x21\n\t" - "umulh x6, x20, x21\n\t" + "mul x5, x12, x13\n\t" + "umulh x6, x12, x13\n\t" /* A[0] * A[2] */ - "mul x12, x20, x22\n\t" - "umulh x7, x20, x22\n\t" - "adds x6, x6, x12\n\t" + "mul x24, x12, x14\n\t" + "umulh x7, x12, x14\n\t" + "adds x6, x6, x24\n\t" "adc x7, x7, xzr\n\t" /* A[0] * A[3] */ - "mul x12, x20, x23\n\t" - "umulh x8, x20, x23\n\t" - "adds x7, x7, x12\n\t" + "mul x24, x12, x15\n\t" + "umulh x8, x12, x15\n\t" + "adds x7, x7, x24\n\t" "adc x8, x8, xzr\n\t" /* A[1] * A[2] */ - "mul x12, x21, x22\n\t" - "umulh x13, x21, x22\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x13, x14\n\t" + "umulh x25, x13, x14\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * A[3] */ - "mul x12, x21, x23\n\t" - "umulh x13, x21, x23\n\t" - "adds x8, x8, x12\n\t" - "adc x9, x9, x13\n\t" + "mul x24, x13, x15\n\t" + "umulh x25, x13, x15\n\t" + "adds x8, x8, x24\n\t" + "adc x9, x9, x25\n\t" /* A[2] * A[3] */ - "mul x12, x22, x23\n\t" - "umulh x10, x22, x23\n\t" - "adds x9, x9, x12\n\t" + "mul x24, x14, x15\n\t" + "umulh x10, x14, x15\n\t" + "adds x9, x9, x24\n\t" "adc x10, x10, xzr\n\t" /* Double */ "adds x5, x5, x5\n\t" @@ -4428,30 +4316,30 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, x10\n\t" "adc x11, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x4, x20, x20\n\t" - "umulh x15, x20, x20\n\t" + "mul x4, x12, x12\n\t" + "umulh x27, x12, x12\n\t" /* A[1] * A[1] */ - "mul x12, x21, x21\n\t" - "umulh x13, x21, x21\n\t" - "adds x5, x5, x15\n\t" - "adcs x6, x6, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x13, x13\n\t" + "umulh x25, x13, x13\n\t" + "adds x5, x5, x27\n\t" + "adcs x6, x6, x24\n\t" + "adc x27, x25, xzr\n\t" /* A[2] * A[2] */ - "mul x12, x22, x22\n\t" - "umulh x13, x22, x22\n\t" - "adds x7, x7, x15\n\t" - "adcs x8, x8, x12\n\t" - "adc x15, x13, xzr\n\t" + "mul x24, x14, x14\n\t" + "umulh x25, x14, x14\n\t" + "adds x7, x7, x27\n\t" + "adcs x8, x8, x24\n\t" + "adc x27, x25, xzr\n\t" /* A[3] * A[3] */ - "mul x12, x23, x23\n\t" - "umulh x13, x23, x23\n\t" - "adds x9, x9, x15\n\t" - "adcs x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x15, x15\n\t" + "umulh x25, x15, x15\n\t" + "adds x9, x9, x27\n\t" + "adcs x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" /* Double and Reduce */ 
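/* Illustrative note on the "Square * 2" block that follows: the off-diagonal
 * products were doubled above because squaring counts a[i]*a[j] twice for
 * i != j, while the extra overall factor of two is folded into the reduction
 * step (note the 62-bit shifts here versus 63 in the plain squarings).  The
 * constant 0x169 below is 361 = 19 * 19: since 2^255 == 19 (mod p), bits that
 * land two wraps up reduce by 19^2, which is what the "Multiply top bits by
 * 19*19" comment refers to, before the usual multiply-by-19 fold of the top
 * half.
 */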
- "mov x12, #0x169\n\t" + "mov x24, #0x169\n\t" /* Move top half into t4-t7 and remove top bit from t3 */ - "lsr x15, x11, #61\n\t" + "lsr x27, x11, #61\n\t" "extr x11, x11, x10, #62\n\t" "extr x10, x10, x9, #62\n\t" "extr x9, x9, x8, #62\n\t" @@ -4464,73 +4352,67 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz /* Two left, only one right */ "and x11, x11, #0x7fffffffffffffff\n\t" /* Multiply top bits by 19*19 */ - "mul x15, x15, x12\n\t" + "mul x27, x27, x24\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x10, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x11\n\t" + "umulh x26, x24, x11\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x4, x4, x15\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x0, [x29, #32]\n\t" + "ldr x0, [x29, #40]\n\t" /* Sub */ - "ldp x4, x5, [x1]\n\t" - "ldp x6, x7, [x1, #16]\n\t" - "ldp x8, x9, [x0]\n\t" - "ldp x10, x11, [x0, #16]\n\t" - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x4, x4, x20\n\t" + "sbcs x5, x5, x21\n\t" + "sbcs x6, x6, x22\n\t" + "sbcs x7, x7, x23\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x27\n\t" + "adcs x6, x6, x27\n\t" + "adc x7, x7, x25\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) : - : "memory", "x12", "x13", "x14", "x15", "x7", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23" + : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } @@ -4547,430 +4429,454 @@ void fe_ge_madd(fe rx, fe ry, fe 
rz, fe rt, const fe px, const fe py, const fe p "str %[py], [x29, #56]\n\t" "str %[pz], [x29, #64]\n\t" "str %[pt], [x29, #72]\n\t" - "ldr x1, [x29, #24]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x3]\n\t" - "ldp x10, x11, [x3, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "ldp x12, x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x18, x19, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x18\n\t" + "adc x7, x15, x19\n\t" + "mov x24, #-19\n\t" + "asr x27, x7, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x4, x4, x24\n\t" + "sbcs x5, x5, x27\n\t" + "sbcs x6, x6, x27\n\t" + "sbc x7, x7, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x8, x12, x16\n\t" + "sbcs x9, x13, x17\n\t" + "sbcs x10, x14, x18\n\t" + "sbcs x11, x15, x19\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x2, [x29, #32]\n\t" - "ldr x3, [x29, #168]\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x27\n\t" + "adcs x10, x10, x27\n\t" + "adc x11, x11, x25\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #168]\n\t" /* Multiply */ - "ldp x20, x21, [x0]\n\t" - "ldp x22, x23, [x0, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, [x3, #16]\n\t" + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x12, x4, x20\n\t" + "umulh x13, x4, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" + "mul x24, x4, x21\n\t" + "umulh x14, x4, x21\n\t" + "adds x13, x13, x24\n\t" + "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" + "mul x24, x5, x20\n\t" + "umulh x25, x5, x20\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x4, x22\n\t" + "umulh x25, x4, x22\n\t" + "adds x14, x14, x24\n\t" + "adc x15, x15, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" + "mul x24, x5, x21\n\t" + "umulh x25, x5, x21\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x6, 
x20\n\t" + "umulh x25, x6, x20\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x4, x23\n\t" + "umulh x25, x4, x23\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x5, x22\n\t" + "umulh x25, x5, x22\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x6, x21\n\t" + "umulh x25, x6, x21\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x7, x20\n\t" + "umulh x25, x7, x20\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" + "mul x24, x5, x23\n\t" + "umulh x25, x5, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x6, x22\n\t" + "umulh x25, x6, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x7, x21\n\t" + "umulh x25, x7, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" + "mul x24, x6, x23\n\t" + "umulh x25, x6, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" + "mul x24, x7, x22\n\t" + "umulh x25, x7, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x7, x23\n\t" + "umulh x25, x7, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul 
x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x12, x12, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x13, x13, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x14, x14, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x15, x15, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x18\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "extr x26, x26, x15, #63\n\t" + "mul x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "asr x26, x15, #63\n\t" + "and x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Store */ - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" - "ldr x0, [x29, #176]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #176]\n\t" /* Multiply */ "ldp x20, x21, [x1]\n\t" "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x0]\n\t" - "ldp x26, x27, [x0, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x4, x8, x20\n\t" + "umulh x5, x8, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" + "mul x24, x8, x21\n\t" + "umulh x6, x8, x21\n\t" + "adds x5, x5, x24\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x24, x9, x20\n\t" + "umulh x25, x9, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x8, x22\n\t" + "umulh x25, x8, x22\n\t" + "adds x6, x6, x24\n\t" + "adc x7, x7, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" + "mul x24, x9, x21\n\t" + "umulh x25, x9, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x10, x20\n\t" + "umulh x25, x10, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x8, x23\n\t" + "umulh x25, x8, x23\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, 
x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x9, x22\n\t" + "umulh x25, x9, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x10, x21\n\t" + "umulh x25, x10, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x11, x20\n\t" + "umulh x25, x11, x20\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" + "mul x24, x9, x23\n\t" + "umulh x25, x9, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x10, x22\n\t" + "umulh x25, x10, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x11, x21\n\t" + "umulh x25, x11, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" + "mul x24, x10, x23\n\t" + "umulh x25, x10, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" + "mul x24, x11, x22\n\t" + "umulh x25, x11, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x11, x23\n\t" + "umulh x25, x11, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, 
x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x18\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x24, #-19\n\t" + "asr x27, x11, #63\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x24\n\t" + "sbcs x9, x9, x27\n\t" + "sbcs x10, x10, x27\n\t" + "sbc x11, x11, x25\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x18, x14, x6\n\t" + "sbcs x19, x15, x7\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" "ldr x0, [x29, #40]\n\t" "ldr x1, [x29, #160]\n\t" "ldr x3, [x29, #72]\n\t" /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, [x3, #16]\n\t" + "ldp x16, x17, [x1]\n\t" + "ldp x18, x19, [x1, #16]\n\t" + "ldp x20, x21, [x3]\n\t" + "ldp x22, x23, [x3, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x4, x16, x20\n\t" + "umulh x5, x16, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" + "mul x24, x16, x21\n\t" + "umulh x6, x16, x21\n\t" + "adds x5, x5, x24\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x24, x17, x20\n\t" + "umulh x25, x17, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x16, x22\n\t" + "umulh x25, x16, x22\n\t" + "adds x6, x6, x24\n\t" + "adc x7, x7, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x17, x21\n\t" + "umulh x25, x17, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x18, x20\n\t" + "umulh x25, x18, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x16, x23\n\t" + "umulh x25, x16, x23\n\t" + "adds 
x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x17, x22\n\t" + "umulh x25, x17, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x18, x21\n\t" + "umulh x25, x18, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x19, x20\n\t" + "umulh x25, x19, x20\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x17, x23\n\t" + "umulh x25, x17, x23\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x18, x22\n\t" + "umulh x25, x18, x22\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x19, x21\n\t" + "umulh x25, x19, x21\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x18, x23\n\t" + "umulh x25, x18, x23\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x19, x22\n\t" + "umulh x25, x19, x22\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x19, x23\n\t" + "umulh x25, x19, x23\n\t" + "adds x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -4979,147 +4885,100 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x10, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x11\n\t" + "umulh x26, x24, x11\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "extr 
x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #16]\n\t" - /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x0]\n\t" - "ldp x10, x11, [x0, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" - /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x0, [x29, #64]\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #64]\n\t" /* Double */ - "ldp x4, x5, [x0]\n\t" - "ldp x6, x7, [x0, #16]\n\t" - "adds x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "mov x12, #-19\n\t" - "asr x15, x7, #63\n\t" + "ldp x8, x9, [x1]\n\t" + "ldp x10, x11, [x1, #16]\n\t" + "adds x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, x11, x11\n\t" + "mov x24, #-19\n\t" + "asr x27, x11, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x12\n\t" - "sbcs x5, x5, x15\n\t" - "sbcs x6, x6, x15\n\t" - "sbc x7, x7, x13\n\t" - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" - "ldr x0, [x29, #40]\n\t" + "subs x8, x8, x24\n\t" + "sbcs x9, x9, x27\n\t" + "sbcs x10, x10, x27\n\t" + "sbc x11, x11, x25\n\t" + "ldr x1, [x29, #40]\n\t" /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x0]\n\t" - "ldp x10, x11, [x0, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adc x15, x11, x7\n\t" + "mov x24, #-19\n\t" + "asr x27, x15, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x12, x12, x24\n\t" + "sbcs x13, x13, x27\n\t" + "sbcs x14, x14, x27\n\t" + "sbc x15, x15, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov 
x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x16, x8, x4\n\t" + "sbcs x17, x9, x5\n\t" + "sbcs x18, x10, x6\n\t" + "sbcs x19, x11, x7\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x2]\n\t" - "stp x18, x19, [x2, #16]\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); (void)qxy2d; (void)qyplusx; @@ -5139,430 +4998,454 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "str %[py], [x29, #56]\n\t" "str %[pz], [x29, #64]\n\t" "str %[pt], [x29, #72]\n\t" - "ldr x1, [x29, #24]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x3]\n\t" - "ldp x10, x11, [x3, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "ldp x12, x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x18, x19, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x18\n\t" + "adc x7, x15, x19\n\t" + "mov x24, #-19\n\t" + "asr x27, x7, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x4, x4, x24\n\t" + "sbcs x5, x5, x27\n\t" + "sbcs x6, x6, x27\n\t" + "sbc x7, x7, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x8, x12, x16\n\t" + "sbcs x9, x13, x17\n\t" + "sbcs x10, x14, x18\n\t" + "sbcs x11, x15, x19\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x2, [x29, #32]\n\t" - "ldr x3, [x29, #176]\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x27\n\t" + "adcs x10, x10, x27\n\t" + "adc x11, x11, x25\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #176]\n\t" /* Multiply */ - "ldp x20, x21, [x0]\n\t" - "ldp x22, x23, [x0, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, 
[x3, #16]\n\t" + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x12, x4, x20\n\t" + "umulh x13, x4, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" + "mul x24, x4, x21\n\t" + "umulh x14, x4, x21\n\t" + "adds x13, x13, x24\n\t" + "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" + "mul x24, x5, x20\n\t" + "umulh x25, x5, x20\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x4, x22\n\t" + "umulh x25, x4, x22\n\t" + "adds x14, x14, x24\n\t" + "adc x15, x15, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" + "mul x24, x5, x21\n\t" + "umulh x25, x5, x21\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x6, x20\n\t" + "umulh x25, x6, x20\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x4, x23\n\t" + "umulh x25, x4, x23\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x5, x22\n\t" + "umulh x25, x5, x22\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x6, x21\n\t" + "umulh x25, x6, x21\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x7, x20\n\t" + "umulh x25, x7, x20\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" + "mul x24, x5, x23\n\t" + "umulh x25, x5, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x6, x22\n\t" + "umulh x25, x6, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x7, x21\n\t" + "umulh x25, x7, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, 
x13\n\t" - "adc x11, xzr, xzr\n\t" + "mul x24, x6, x23\n\t" + "umulh x25, x6, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" + "mul x24, x7, x22\n\t" + "umulh x25, x7, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x7, x23\n\t" + "umulh x25, x7, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x12, x12, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x13, x13, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x14, x14, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x15, x15, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x18\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "extr x26, x26, x15, #63\n\t" + "mul x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "asr x26, x15, #63\n\t" + "and x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Store */ - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" - "ldr x0, [x29, #168]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #168]\n\t" /* Multiply */ "ldp x20, x21, [x1]\n\t" "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x0]\n\t" - "ldp x26, x27, [x0, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x4, x8, x20\n\t" + "umulh x5, x8, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" + "mul x24, x8, x21\n\t" + "umulh x6, x8, x21\n\t" + "adds x5, x5, x24\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, 
x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x24, x9, x20\n\t" + "umulh x25, x9, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x8, x22\n\t" + "umulh x25, x8, x22\n\t" + "adds x6, x6, x24\n\t" + "adc x7, x7, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" + "mul x24, x9, x21\n\t" + "umulh x25, x9, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x10, x20\n\t" + "umulh x25, x10, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x8, x23\n\t" + "umulh x25, x8, x23\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x9, x22\n\t" + "umulh x25, x9, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x10, x21\n\t" + "umulh x25, x10, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x11, x20\n\t" + "umulh x25, x11, x20\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" + "mul x24, x9, x23\n\t" + "umulh x25, x9, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x10, x22\n\t" + "umulh x25, x10, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x11, x21\n\t" + "umulh x25, x11, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" + "mul x24, x10, x23\n\t" + "umulh x25, x10, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" + "mul x24, x11, x22\n\t" + "umulh x25, x11, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - 
"umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x11, x23\n\t" + "umulh x25, x11, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x18\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x24, #-19\n\t" + "asr x27, x11, #63\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x24\n\t" + "sbcs x9, x9, x27\n\t" + "sbcs x10, x10, x27\n\t" + "sbc x11, x11, x25\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x18, x14, x6\n\t" + "sbcs x19, x15, x7\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" "ldr x0, [x29, #40]\n\t" "ldr x1, [x29, #160]\n\t" "ldr x3, [x29, #72]\n\t" /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, [x3, #16]\n\t" + "ldp x16, x17, [x1]\n\t" + "ldp x18, x19, [x1, #16]\n\t" + "ldp x20, x21, [x3]\n\t" + "ldp x22, x23, [x3, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x4, x16, x20\n\t" + 
"umulh x5, x16, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" + "mul x24, x16, x21\n\t" + "umulh x6, x16, x21\n\t" + "adds x5, x5, x24\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x24, x17, x20\n\t" + "umulh x25, x17, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x16, x22\n\t" + "umulh x25, x16, x22\n\t" + "adds x6, x6, x24\n\t" + "adc x7, x7, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x17, x21\n\t" + "umulh x25, x17, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x18, x20\n\t" + "umulh x25, x18, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x16, x23\n\t" + "umulh x25, x16, x23\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x17, x22\n\t" + "umulh x25, x17, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x18, x21\n\t" + "umulh x25, x18, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x19, x20\n\t" + "umulh x25, x19, x20\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x17, x23\n\t" + "umulh x25, x17, x23\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x18, x22\n\t" + "umulh x25, x18, x22\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x19, x21\n\t" + "umulh x25, x19, x21\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x18, x23\n\t" + "umulh x25, x18, x23\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x19, x22\n\t" + "umulh x25, x19, x22\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, 
x11, x13\n\t" + "mul x24, x19, x23\n\t" + "umulh x25, x19, x23\n\t" + "adds x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -5571,146 +5454,100 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x10, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x11\n\t" + "umulh x26, x24, x11\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldr x1, [x29, #24]\n\t" - "ldr x3, [x29, #16]\n\t" - /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x1]\n\t" - "ldp x10, x11, [x1, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" - /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" - "stp x4, x5, [x3]\n\t" - "stp x6, x7, [x3, #16]\n\t" + "ldr x0, [x29, #32]\n\t" "ldr x1, [x29, #64]\n\t" /* Double */ - "ldp x4, x5, [x1]\n\t" - "ldp x6, x7, [x1, #16]\n\t" - "adds x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "mov x12, #-19\n\t" - "asr x15, x7, #63\n\t" + "ldp x8, x9, [x1]\n\t" + "ldp x10, x11, [x1, #16]\n\t" + "adds x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, x11, x11\n\t" + "mov x24, #-19\n\t" + "asr x27, x11, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + 
"and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x12\n\t" - "sbcs x5, x5, x15\n\t" - "sbcs x6, x6, x15\n\t" - "sbc x7, x7, x13\n\t" - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" + "subs x8, x8, x24\n\t" + "sbcs x9, x9, x27\n\t" + "sbcs x10, x10, x27\n\t" + "sbc x11, x11, x25\n\t" + "ldr x1, [x29, #40]\n\t" /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x0]\n\t" - "ldp x10, x11, [x0, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adc x15, x11, x7\n\t" + "mov x24, #-19\n\t" + "asr x27, x15, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x12, x12, x24\n\t" + "sbcs x13, x13, x27\n\t" + "sbcs x14, x14, x27\n\t" + "sbc x15, x15, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x16, x8, x4\n\t" + "sbcs x17, x9, x5\n\t" + "sbcs x18, x10, x6\n\t" + "sbcs x19, x11, x7\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" "stp x16, x17, [x0]\n\t" "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); (void)qxy2d; (void)qyplusx; @@ -5720,7 +5557,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) { __asm__ __volatile__ ( - "stp x29, x30, [sp, #-112]!\n\t" + "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" "str %[rx], [x29, #16]\n\t" "str %[ry], [x29, #24]\n\t" @@ -5730,573 +5567,454 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "str %[py], [x29, #56]\n\t" "str %[pz], [x29, #64]\n\t" "str %[pt], [x29, #72]\n\t" - "ldr x1, [x29, #24]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x3]\n\t" - "ldp x10, x11, [x3, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "ldp x12, 
x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x18, x19, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x18\n\t" + "adc x7, x15, x19\n\t" + "mov x24, #-19\n\t" + "asr x27, x7, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x4, x4, x24\n\t" + "sbcs x5, x5, x27\n\t" + "sbcs x6, x6, x27\n\t" + "sbc x7, x7, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x8, x12, x16\n\t" + "sbcs x9, x13, x17\n\t" + "sbcs x10, x14, x18\n\t" + "sbcs x11, x15, x19\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x2, [x29, #32]\n\t" - "ldr x3, [x29, #208]\n\t" - /* Multiply */ - "ldp x20, x21, [x0]\n\t" - "ldp x22, x23, [x0, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, [x3, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" - /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" - /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" 
- "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" - /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" - "ldr x2, [x29, #216]\n\t" - /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x2]\n\t" - "ldp x26, x27, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" - /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" - /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, 
x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" - /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x1, [x29, #40]\n\t" - "ldr x2, [x29, #200]\n\t" - "ldr x3, [x29, #72]\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x27\n\t" + "adcs x10, x10, x27\n\t" + "adc x11, x11, x25\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #176]\n\t" /* Multiply */ "ldp x20, x21, [x2]\n\t" "ldp x22, x23, [x2, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, [x3, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x12, x4, x20\n\t" + "umulh x13, x4, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" + "mul x24, x4, x21\n\t" + "umulh x14, x4, x21\n\t" + "adds x13, x13, x24\n\t" + "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" + "mul x24, x5, x20\n\t" + "umulh x25, x5, x20\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x4, x22\n\t" + "umulh x25, x4, x22\n\t" + "adds x14, x14, x24\n\t" + "adc x15, x15, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" + "mul x24, x5, x21\n\t" + "umulh x25, x5, x21\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x6, x20\n\t" + "umulh x25, x6, x20\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x4, 
x23\n\t" + "umulh x25, x4, x23\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x5, x22\n\t" + "umulh x25, x5, x22\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x6, x21\n\t" + "umulh x25, x6, x21\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x7, x20\n\t" + "umulh x25, x7, x20\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" + "mul x24, x5, x23\n\t" + "umulh x25, x5, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x6, x22\n\t" + "umulh x25, x6, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x7, x21\n\t" + "umulh x25, x7, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" + "mul x24, x6, x23\n\t" + "umulh x25, x6, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" + "mul x24, x7, x22\n\t" + "umulh x25, x7, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x7, x23\n\t" + "umulh x25, x7, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x12, x12, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x13, x13, 
x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x14, x14, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x15, x15, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x18\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "extr x26, x26, x15, #63\n\t" + "mul x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "asr x26, x15, #63\n\t" + "and x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x1, [x29, #64]\n\t" - "ldr x2, [x29, #192]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #184]\n\t" /* Multiply */ "ldp x20, x21, [x1]\n\t" "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x2]\n\t" - "ldp x26, x27, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x4, x8, x20\n\t" + "umulh x5, x8, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" + "mul x24, x8, x21\n\t" + "umulh x6, x8, x21\n\t" + "adds x5, x5, x24\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x24, x9, x20\n\t" + "umulh x25, x9, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x8, x22\n\t" + "umulh x25, x8, x22\n\t" + "adds x6, x6, x24\n\t" + "adc x7, x7, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x9, x21\n\t" + "umulh x25, x9, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x10, x20\n\t" + "umulh x25, x10, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x8, x23\n\t" + "umulh x25, x8, x23\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x24, x9, x22\n\t" + "umulh x25, x9, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x10, x21\n\t" + "umulh x25, x10, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x11, x20\n\t" + "umulh x25, x11, x20\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x9, x23\n\t" + "umulh x25, x9, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul 
x24, x10, x22\n\t" + "umulh x25, x10, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x11, x21\n\t" + "umulh x25, x11, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x10, x23\n\t" + "umulh x25, x10, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x11, x22\n\t" + "umulh x25, x11, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[3] */ + "mul x24, x11, x23\n\t" + "umulh x25, x11, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x18\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x24, #-19\n\t" + "asr x27, x11, #63\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x24\n\t" + "sbcs x9, x9, x27\n\t" + "sbcs x10, x10, x27\n\t" + "sbc x11, x11, x25\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x18, x14, x6\n\t" + "sbcs x19, x15, x7\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" + "ldr x0, [x29, #48]\n\t" + "ldr x1, [x29, #64]\n\t" + "ldr x2, [x29, #160]\n\t" + /* Multiply */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + "ldp x16, x17, [x2]\n\t" + "ldp x18, x19, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x12, x16\n\t" + "umulh x5, x12, x16\n\t" + /* A[0] * B[1] */ + "mul x24, x12, x17\n\t" + "umulh x6, x12, x17\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x13, x16\n\t" + "umulh x25, x13, x16\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x24, x12, x18\n\t" + "umulh x25, x12, x18\n\t" + "adds x6, x6, 
x24\n\t" + "adc x7, x7, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x13, x17\n\t" + "umulh x25, x13, x17\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x14, x16\n\t" + "umulh x25, x14, x16\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x12, x19\n\t" + "umulh x25, x12, x19\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x13, x18\n\t" + "umulh x25, x13, x18\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x14, x17\n\t" + "umulh x25, x14, x17\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x15, x16\n\t" + "umulh x25, x15, x16\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x13, x19\n\t" + "umulh x25, x13, x19\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x14, x18\n\t" + "umulh x25, x14, x18\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x15, x17\n\t" + "umulh x25, x15, x17\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x14, x19\n\t" + "umulh x25, x14, x19\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x15, x18\n\t" + "umulh x25, x15, x18\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x15, x19\n\t" + "umulh x25, x15, x19\n\t" + "adds x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -6305,147 +6023,240 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc 
x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x10, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x11\n\t" + "umulh x26, x24, x11\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "add x1, x29, #80\n\t" + "ldr x0, [x29, #48]\n\t" /* Double */ - "ldp x4, x5, [x0]\n\t" - "ldp x6, x7, [x0, #16]\n\t" "adds x4, x4, x4\n\t" "adcs x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" "adc x7, x7, x7\n\t" - "mov x12, #-19\n\t" - "asr x15, x7, #63\n\t" + "mov x24, #-19\n\t" + "asr x27, x7, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x12\n\t" - "sbcs x5, x5, x15\n\t" - "sbcs x6, x6, x15\n\t" - "sbc x7, x7, x13\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x2, [x29, #24]\n\t" - "ldr x3, [x29, #32]\n\t" - /* Add */ - "ldp x4, x5, [x3]\n\t" - "ldp x6, x7, [x3, #16]\n\t" - "ldp x8, x9, [x2]\n\t" - "ldp x10, x11, [x2, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" - /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x2]\n\t" - "stp x18, x19, [x2, #16]\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" + "subs x4, x4, x24\n\t" + "sbcs x5, x5, x27\n\t" + "sbcs x6, x6, x27\n\t" + "sbc x7, x7, x25\n\t" "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #168]\n\t" + "ldr x2, [x29, #72]\n\t" + /* Multiply */ + "ldp x16, x17, [x1]\n\t" + "ldp x18, x19, [x1, #16]\n\t" + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x8, x16, x20\n\t" + "umulh x9, x16, x20\n\t" + /* A[0] * B[1] */ + "mul x24, x16, x21\n\t" + "umulh x10, x16, x21\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x17, x20\n\t" + "umulh x25, x17, x20\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * B[2] 
*/ + "mul x24, x16, x22\n\t" + "umulh x25, x16, x22\n\t" + "adds x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x17, x21\n\t" + "umulh x25, x17, x21\n\t" + "adds x10, x10, x24\n\t" + "adcs x11, x11, x25\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x18, x20\n\t" + "umulh x25, x18, x20\n\t" + "adds x10, x10, x24\n\t" + "adcs x11, x11, x25\n\t" + "adc x12, x12, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x16, x23\n\t" + "umulh x25, x16, x23\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x24, x17, x22\n\t" + "umulh x25, x17, x22\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, x13, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x18, x21\n\t" + "umulh x25, x18, x21\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x19, x20\n\t" + "umulh x25, x19, x20\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, x13, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x17, x23\n\t" + "umulh x25, x17, x23\n\t" + "adds x12, x12, x24\n\t" + "adcs x13, x13, x25\n\t" + "adc x14, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x24, x18, x22\n\t" + "umulh x25, x18, x22\n\t" + "adds x12, x12, x24\n\t" + "adcs x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x19, x21\n\t" + "umulh x25, x19, x21\n\t" + "adds x12, x12, x24\n\t" + "adcs x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x18, x23\n\t" + "umulh x25, x18, x23\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x19, x22\n\t" + "umulh x25, x19, x22\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, x15, xzr\n\t" + /* A[3] * B[3] */ + "mul x24, x19, x23\n\t" + "umulh x25, x19, x23\n\t" + "adds x14, x14, x24\n\t" + "adc x15, x15, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x13, #63\n\t" + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x12\n\t" + "umulh x12, x24, x12\n\t" + "adds x8, x8, x25\n\t" + "mul x25, x24, x13\n\t" + "umulh x13, x24, x13\n\t" + "adcs x9, x9, x25\n\t" + "mul x25, x24, x14\n\t" + "umulh x14, x24, x14\n\t" + "adcs x10, x10, x25\n\t" + "mul x25, x24, x15\n\t" + "umulh x26, x24, x15\n\t" + "adcs x11, x11, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adcs x11, x11, x14\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x11, #63\n\t" + "mul x26, x26, x24\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce if top bit set */ + "asr x26, x11, #63\n\t" + "and x26, x26, x24\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Store */ + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #40]\n\t" /* Add */ - "ldp x4, x5, [x1]\n\t" - "ldp x6, x7, [x1, #16]\n\t" - "ldp x8, x9, [x0]\n\t" - "ldp x10, x11, [x0, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "adds x12, x4, x8\n\t" + "adcs x13, x5, 
x9\n\t" + "adcs x14, x6, x10\n\t" + "adc x15, x7, x11\n\t" + "mov x24, #-19\n\t" + "asr x27, x15, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x12, x12, x24\n\t" + "sbcs x13, x13, x27\n\t" + "sbcs x14, x14, x27\n\t" + "sbc x15, x15, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x16, x4, x8\n\t" + "sbcs x17, x5, x9\n\t" + "sbcs x18, x6, x10\n\t" + "sbcs x19, x7, x11\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x3]\n\t" - "stp x18, x19, [x3, #16]\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldp x29, x30, [sp], #0x70\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); (void)qz; (void)qt2d; @@ -6456,7 +6267,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) { __asm__ __volatile__ ( - "stp x29, x30, [sp, #-112]!\n\t" + "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" "str %[rx], [x29, #16]\n\t" "str %[ry], [x29, #24]\n\t" @@ -6466,573 +6277,454 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "str %[py], [x29, #56]\n\t" "str %[pz], [x29, #64]\n\t" "str %[pt], [x29, #72]\n\t" - "ldr x1, [x29, #24]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ - "ldp x4, x5, [x2]\n\t" - "ldp x6, x7, [x2, #16]\n\t" - "ldp x8, x9, [x3]\n\t" - "ldp x10, x11, [x3, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "ldp x12, x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x18, x19, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x18\n\t" + "adc x7, x15, x19\n\t" + "mov x24, #-19\n\t" + "asr x27, x7, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x4, x4, x24\n\t" + "sbcs x5, x5, 
x27\n\t" + "sbcs x6, x6, x27\n\t" + "sbc x7, x7, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x8, x12, x16\n\t" + "sbcs x9, x13, x17\n\t" + "sbcs x10, x14, x18\n\t" + "sbcs x11, x15, x19\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x2, [x29, #32]\n\t" - "ldr x3, [x29, #216]\n\t" - /* Multiply */ - "ldp x20, x21, [x0]\n\t" - "ldp x22, x23, [x0, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, [x3, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" - /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" - /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, 
x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" - /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "stp x4, x5, [x2]\n\t" - "stp x6, x7, [x2, #16]\n\t" - "ldr x2, [x29, #208]\n\t" - /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x2]\n\t" - "ldp x26, x27, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" - /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" - /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* 
Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" - /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x1, [x29, #40]\n\t" - "ldr x2, [x29, #200]\n\t" - "ldr x3, [x29, #72]\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x27\n\t" + "adcs x10, x10, x27\n\t" + "adc x11, x11, x25\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #184]\n\t" /* Multiply */ "ldp x20, x21, [x2]\n\t" "ldp x22, x23, [x2, #16]\n\t" - "ldp x24, x25, [x3]\n\t" - "ldp x26, x27, [x3, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x12, x4, x20\n\t" + "umulh x13, x4, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" - "adc x6, x6, xzr\n\t" + "mul x24, x4, x21\n\t" + "umulh x14, x4, x21\n\t" + "adds x13, x13, x24\n\t" + "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "adc x7, xzr, xzr\n\t" + "mul x24, x5, x20\n\t" + "umulh x25, x5, x20\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x4, x22\n\t" + "umulh x25, x4, x22\n\t" + "adds x14, x14, x24\n\t" + "adc x15, x15, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, xzr, xzr\n\t" + "mul x24, x5, x21\n\t" + "umulh x25, x5, x21\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adc x8, x8, xzr\n\t" + "mul x24, x6, x20\n\t" + "umulh x25, x6, x20\n\t" + "adds x14, x14, x24\n\t" + "adcs x15, x15, x25\n\t" + "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, xzr, xzr\n\t" + "mul x24, x4, x23\n\t" + "umulh x25, x4, x23\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x5, x22\n\t" + "umulh x25, x5, x22\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x6, x21\n\t" + "umulh x25, x6, 
x21\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" - "adc x9, x9, xzr\n\t" + "mul x24, x7, x20\n\t" + "umulh x25, x7, x20\n\t" + "adds x15, x15, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, xzr, xzr\n\t" + "mul x24, x5, x23\n\t" + "umulh x25, x5, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x6, x22\n\t" + "umulh x25, x6, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" - "adc x10, x10, xzr\n\t" + "mul x24, x7, x21\n\t" + "umulh x25, x7, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, xzr, xzr\n\t" + "mul x24, x6, x23\n\t" + "umulh x25, x6, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adc x11, x11, xzr\n\t" + "mul x24, x7, x22\n\t" + "umulh x25, x7, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x7, x23\n\t" + "umulh x25, x7, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x12, x12, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x13, x13, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x14, x14, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x15, x15, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x18\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, 
x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "extr x26, x26, x15, #63\n\t" + "mul x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "asr x26, x15, #63\n\t" + "and x26, x26, x24\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x26\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Store */ - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x1, [x29, #64]\n\t" - "ldr x2, [x29, #192]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #176]\n\t" /* Multiply */ "ldp x20, x21, [x1]\n\t" "ldp x22, x23, [x1, #16]\n\t" - "ldp x24, x25, [x2]\n\t" - "ldp x26, x27, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x20, x24\n\t" - "umulh x5, x20, x24\n\t" + "mul x4, x8, x20\n\t" + "umulh x5, x8, x20\n\t" /* A[0] * B[1] */ - "mul x12, x20, x25\n\t" - "umulh x6, x20, x25\n\t" - "adds x5, x5, x12\n\t" + "mul x24, x8, x21\n\t" + "umulh x6, x8, x21\n\t" + "adds x5, x5, x24\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x12, x21, x24\n\t" - "umulh x13, x21, x24\n\t" - "adds x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" + "mul x24, x9, x20\n\t" + "umulh x25, x9, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x12, x20, x26\n\t" - "umulh x13, x20, x26\n\t" - "adds x6, x6, x12\n\t" - "adc x7, x7, x13\n\t" + "mul x24, x8, x22\n\t" + "umulh x25, x8, x22\n\t" + "adds x6, x6, x24\n\t" + "adc x7, x7, x25\n\t" /* A[1] * B[1] */ - "mul x12, x21, x25\n\t" - "umulh x13, x21, x25\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x9, x21\n\t" + "umulh x25, x9, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x10, x20\n\t" + "umulh x25, x10, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x8, x23\n\t" + "umulh x25, x8, x23\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x24, x9, x22\n\t" + "umulh x25, x9, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x10, x21\n\t" + "umulh x25, x10, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x11, x20\n\t" + "umulh x25, x11, x20\n\t" + "adds x7, x7, x24\n\t" + "adcs x16, x16, x25\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x9, x23\n\t" + "umulh x25, x9, x23\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x24, x10, x22\n\t" + "umulh x25, x10, x22\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x11, x21\n\t" + "umulh x25, x11, x21\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x25\n\t" + "adc x18, x18, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x10, x23\n\t" + "umulh x25, x10, x23\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x11, x22\n\t" + "umulh x25, x11, x22\n\t" + "adds x17, x17, x24\n\t" + "adcs x18, x18, x25\n\t" + "adc x19, x19, xzr\n\t" + /* 
A[3] * B[3] */ + "mul x24, x11, x23\n\t" + "umulh x25, x11, x23\n\t" + "adds x18, x18, x24\n\t" + "adc x19, x19, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x19, x19, x18, #63\n\t" + "extr x18, x18, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x16\n\t" + "umulh x16, x24, x16\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x17\n\t" + "umulh x17, x24, x17\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x18\n\t" + "umulh x18, x24, x18\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x19\n\t" + "umulh x26, x24, x19\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x18\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x24, #-19\n\t" + "asr x27, x11, #63\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x24\n\t" + "sbcs x9, x9, x27\n\t" + "sbcs x10, x10, x27\n\t" + "sbc x11, x11, x25\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x18, x14, x6\n\t" + "sbcs x19, x15, x7\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" + /* Mask the modulus */ + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" + "ldr x0, [x29, #48]\n\t" + "ldr x1, [x29, #64]\n\t" + "ldr x2, [x29, #160]\n\t" + /* Multiply */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + "ldp x16, x17, [x2]\n\t" + "ldp x18, x19, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x12, x16\n\t" + "umulh x5, x12, x16\n\t" + /* A[0] * B[1] */ + "mul x24, x12, x17\n\t" + "umulh x6, x12, x17\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x13, x16\n\t" + "umulh x25, x13, x16\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x24, x12, x18\n\t" + "umulh x25, x12, x18\n\t" + "adds x6, x6, x24\n\t" + "adc x7, x7, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x13, x17\n\t" + "umulh x25, x13, x17\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x12, x22, x24\n\t" - "umulh x13, x22, x24\n\t" - "adds x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" + "mul x24, x14, x16\n\t" + "umulh x25, x14, x16\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x12, x20, x27\n\t" - "umulh x13, x20, x27\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x12, x19\n\t" + "umulh x25, x12, 
x19\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x12, x21, x26\n\t" - "umulh x13, x21, x26\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x13, x18\n\t" + "umulh x25, x13, x18\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x12, x22, x25\n\t" - "umulh x13, x22, x25\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x14, x17\n\t" + "umulh x25, x14, x17\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x12, x23, x24\n\t" - "umulh x13, x23, x24\n\t" - "adds x7, x7, x12\n\t" - "adcs x8, x8, x13\n\t" + "mul x24, x15, x16\n\t" + "umulh x25, x15, x16\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x12, x21, x27\n\t" - "umulh x13, x21, x27\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x13, x19\n\t" + "umulh x25, x13, x19\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x12, x22, x26\n\t" - "umulh x13, x22, x26\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x14, x18\n\t" + "umulh x25, x14, x18\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x12, x23, x25\n\t" - "umulh x13, x23, x25\n\t" - "adds x8, x8, x12\n\t" - "adcs x9, x9, x13\n\t" + "mul x24, x15, x17\n\t" + "umulh x25, x15, x17\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x12, x22, x27\n\t" - "umulh x13, x22, x27\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x14, x19\n\t" + "umulh x25, x14, x19\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x12, x23, x26\n\t" - "umulh x13, x23, x26\n\t" - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" + "mul x24, x15, x18\n\t" + "umulh x25, x15, x18\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] */ - "mul x12, x23, x27\n\t" - "umulh x13, x23, x27\n\t" - "adds x10, x10, x12\n\t" - "adc x11, x11, x13\n\t" + "mul x24, x15, x19\n\t" + "umulh x25, x15, x19\n\t" + "adds x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -7041,147 +6733,240 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x12, #19\n\t" - "mul x13, x12, x8\n\t" - "umulh x8, x12, x8\n\t" - "adds x4, x4, x13\n\t" - "mul x13, x12, x9\n\t" - "umulh x9, x12, x9\n\t" - "adcs x5, x5, x13\n\t" - "mul x13, x12, x10\n\t" - "umulh x10, x12, x10\n\t" - "adcs x6, x6, x13\n\t" - "mul x13, x12, x11\n\t" - "umulh x14, x12, x11\n\t" - "adcs x7, x7, x13\n\t" - "adc x14, x14, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adds x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x10, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "mul x25, x24, x11\n\t" + "umulh x26, x24, x11\n\t" + "adcs x7, x7, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x14, x14, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x14, x14, x7, #63\n\t" - "mul x14, x14, 
x12\n\t" + "extr x26, x26, x7, #63\n\t" + "mul x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "lsr x14, x7, #63\n\t" - "mul x14, x14, x12\n\t" + "asr x26, x7, #63\n\t" + "and x26, x26, x24\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x14\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "add x1, x29, #80\n\t" + "ldr x0, [x29, #48]\n\t" /* Double */ - "ldp x4, x5, [x0]\n\t" - "ldp x6, x7, [x0, #16]\n\t" "adds x4, x4, x4\n\t" "adcs x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" "adc x7, x7, x7\n\t" - "mov x12, #-19\n\t" - "asr x15, x7, #63\n\t" + "mov x24, #-19\n\t" + "asr x27, x7, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x12\n\t" - "sbcs x5, x5, x15\n\t" - "sbcs x6, x6, x15\n\t" - "sbc x7, x7, x13\n\t" - "stp x4, x5, [x1]\n\t" - "stp x6, x7, [x1, #16]\n\t" - "ldr x2, [x29, #24]\n\t" - "ldr x3, [x29, #32]\n\t" - /* Add */ - "ldp x4, x5, [x3]\n\t" - "ldp x6, x7, [x3, #16]\n\t" - "ldp x8, x9, [x2]\n\t" - "ldp x10, x11, [x2, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" - /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" - /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x2]\n\t" - "stp x18, x19, [x2, #16]\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" + "subs x4, x4, x24\n\t" + "sbcs x5, x5, x27\n\t" + "sbcs x6, x6, x27\n\t" + "sbc x7, x7, x25\n\t" "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #168]\n\t" + "ldr x2, [x29, #72]\n\t" + /* Multiply */ + "ldp x16, x17, [x1]\n\t" + "ldp x18, x19, [x1, #16]\n\t" + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x8, x16, x20\n\t" + "umulh x9, x16, x20\n\t" + /* A[0] * B[1] */ + "mul x24, x16, x21\n\t" + "umulh x10, x16, x21\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x17, x20\n\t" + "umulh x25, x17, x20\n\t" + "adds x9, x9, x24\n\t" + "adcs x10, x10, x25\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x24, x16, x22\n\t" + "umulh x25, x16, x22\n\t" + "adds x10, x10, x24\n\t" + "adc x11, x11, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x17, x21\n\t" + "umulh x25, x17, x21\n\t" + "adds x10, x10, x24\n\t" + "adcs x11, x11, x25\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x18, x20\n\t" + "umulh x25, x18, x20\n\t" + "adds x10, x10, x24\n\t" + "adcs x11, x11, x25\n\t" + "adc x12, x12, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x16, x23\n\t" + "umulh x25, x16, x23\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, xzr, xzr\n\t" + /* A[1] * B[2] */ + 
"mul x24, x17, x22\n\t" + "umulh x25, x17, x22\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, x13, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x18, x21\n\t" + "umulh x25, x18, x21\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x19, x20\n\t" + "umulh x25, x19, x20\n\t" + "adds x11, x11, x24\n\t" + "adcs x12, x12, x25\n\t" + "adc x13, x13, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x17, x23\n\t" + "umulh x25, x17, x23\n\t" + "adds x12, x12, x24\n\t" + "adcs x13, x13, x25\n\t" + "adc x14, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x24, x18, x22\n\t" + "umulh x25, x18, x22\n\t" + "adds x12, x12, x24\n\t" + "adcs x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x19, x21\n\t" + "umulh x25, x19, x21\n\t" + "adds x12, x12, x24\n\t" + "adcs x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x18, x23\n\t" + "umulh x25, x18, x23\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x19, x22\n\t" + "umulh x25, x19, x22\n\t" + "adds x13, x13, x24\n\t" + "adcs x14, x14, x25\n\t" + "adc x15, x15, xzr\n\t" + /* A[3] * B[3] */ + "mul x24, x19, x23\n\t" + "umulh x25, x19, x23\n\t" + "adds x14, x14, x24\n\t" + "adc x15, x15, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x13, #63\n\t" + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x12\n\t" + "umulh x12, x24, x12\n\t" + "adds x8, x8, x25\n\t" + "mul x25, x24, x13\n\t" + "umulh x13, x24, x13\n\t" + "adcs x9, x9, x25\n\t" + "mul x25, x24, x14\n\t" + "umulh x14, x24, x14\n\t" + "adcs x10, x10, x25\n\t" + "mul x25, x24, x15\n\t" + "umulh x26, x24, x15\n\t" + "adcs x11, x11, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adcs x11, x11, x14\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x11, #63\n\t" + "mul x26, x26, x24\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce if top bit set */ + "asr x26, x11, #63\n\t" + "and x26, x26, x24\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Store */ + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #32]\n\t" /* Add */ - "ldp x4, x5, [x1]\n\t" - "ldp x6, x7, [x1, #16]\n\t" - "ldp x8, x9, [x0]\n\t" - "ldp x10, x11, [x0, #16]\n\t" - "adds x16, x4, x8\n\t" - "adcs x17, x5, x9\n\t" - "adcs x18, x6, x10\n\t" - "adc x19, x7, x11\n\t" - "mov x12, #-19\n\t" - "asr x15, x19, #63\n\t" + "adds x12, x4, x8\n\t" + "adcs x13, x5, x9\n\t" + "adcs x14, x6, x10\n\t" + "adc x15, x7, x11\n\t" + "mov x24, #-19\n\t" + "asr x27, x15, #63\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x15\n\t" - "sbcs x18, x18, x15\n\t" - "sbc x19, x19, x13\n\t" + "subs x12, x12, x24\n\t" + "sbcs x13, x13, x27\n\t" + "sbcs x14, x14, x27\n\t" + "sbc x15, x15, x25\n\t" /* Sub */ - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - 
"sbcs x7, x7, x11\n\t" - "mov x12, #-19\n\t" - "csetm x15, cc\n\t" + "subs x16, x4, x8\n\t" + "sbcs x17, x5, x9\n\t" + "sbcs x18, x6, x10\n\t" + "sbcs x19, x7, x11\n\t" + "mov x24, #-19\n\t" + "csetm x27, cc\n\t" /* Mask the modulus */ - "and x12, x15, x12\n\t" - "and x13, x15, #0x7fffffffffffffff\n\t" + "and x24, x27, x24\n\t" + "and x25, x27, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x12\n\t" - "adcs x5, x5, x15\n\t" - "adcs x6, x6, x15\n\t" - "adc x7, x7, x13\n\t" - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "stp x4, x5, [x3]\n\t" - "stp x6, x7, [x3, #16]\n\t" - "ldp x29, x30, [sp], #0x70\n\t" + "adds x16, x16, x24\n\t" + "adcs x17, x17, x27\n\t" + "adcs x18, x18, x27\n\t" + "adc x19, x19, x25\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x18, x19, [x1, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x12", "x13", "x14", "x15", "x8", "x9", "x10", "x11", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); (void)qz; (void)qt2d; @@ -7189,3 +6974,4 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz (void)qyminusx; } +#endif /* __aarch64__ */ diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h index e9cef000c..61d17bb3f 100644 --- a/wolfssl/wolfcrypt/fe_operations.h +++ b/wolfssl/wolfcrypt/fe_operations.h @@ -39,8 +39,17 @@ #define CURVED25519_128BIT #endif -#if defined(CURVED25519_X64) || defined(WOLFSSL_ARMASM) +#if defined(CURVED25519_X64) #define CURVED25519_ASM_64BIT + #define CURVED25519_ASM +#endif +#if defined(WOLFSSL_ARMASM) + #ifdef __aarch64__ + #define CURVED25519_ASM_64BIT + #else + #define CURVED25519_ASM_32BIT + #endif + #define CURVED25519_ASM #endif /* @@ -78,6 +87,8 @@ WOLFSSL_LOCAL int curve25519(byte * q, byte * n, byte * p); #ifdef CURVED25519_ASM_64BIT typedef int64_t fe[4]; +#elif defined(CURVED25519_ASM_32BIT) + typedef int32_t fe[8]; #elif defined(CURVED25519_128BIT) typedef int64_t fe[5]; #else @@ -112,7 +123,7 @@ WOLFSSL_LOCAL void fe_pow22523(fe,const fe); WOLFSSL_LOCAL uint64_t load_3(const unsigned char *in); WOLFSSL_LOCAL uint64_t load_4(const unsigned char *in); -#ifdef CURVED25519_ASM_64BIT +#ifdef CURVED25519_ASM WOLFSSL_LOCAL void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt); WOLFSSL_LOCAL void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, @@ -136,7 +147,7 @@ WOLFSSL_LOCAL void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe qt2d, const fe qyplusx, const fe qyminusx); WOLFSSL_LOCAL void fe_cmov_table(fe* r, fe* base, signed char b); -#endif /* CURVED25519_ASM_64BIT */ +#endif /* CURVED25519_ASM */ #endif /* !CURVE25519_SMALL || !ED25519_SMALL */ /* Use less memory and only 32bit types or less, but is slower diff --git a/wolfssl/wolfcrypt/ge_operations.h b/wolfssl/wolfcrypt/ge_operations.h index fe2ebdfca..69de2fc8c 100644 --- a/wolfssl/wolfcrypt/ge_operations.h +++ b/wolfssl/wolfcrypt/ge_operations.h @@ -49,6 +49,8 @@ Representations: typedef byte ge[F25519_SIZE]; #elif defined(CURVED25519_ASM_64BIT) typedef int64_t ge[4]; +#elif defined(CURVED25519_ASM_32BIT) + typedef int32_t ge[8]; #elif defined(CURVED25519_128BIT) 
typedef int64_t ge[5]; #else
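
Note (illustrative, not part of the patch): the AArch64 "Reduce" sequences above (extr/mul/umulh with the constant 19) all implement the same fold: since p = 2^255 - 19, we have 2^255 ≡ 19 (mod p), so the bits of a 512-bit product above position 255 can be multiplied by 19 and added back into the low 255 bits. The portable C sketch below shows that fold under stated assumptions; the helper name fe_reduce_sketch and the use of unsigned __int128 are inventions for this example, not wolfSSL API.

#include <stdint.h>

/* Illustrative only. Reduce the 512-bit product t[0..7] modulo
 * p = 2^255 - 19 into four 64-bit limbs r[0..3], mirroring the
 * "Reduce" steps of the assembly above. */
static void fe_reduce_sketch(uint64_t r[4], const uint64_t t[8])
{
    unsigned __int128 c;
    uint64_t h[4], top;
    int i;

    /* High half: t >> 255, i.e. t[4..7] shifted left one bit with the
     * top bit of t[3] shifted in (the asm uses extr for this). */
    h[0] = (t[4] << 1) | (t[3] >> 63);
    h[1] = (t[5] << 1) | (t[4] >> 63);
    h[2] = (t[6] << 1) | (t[5] >> 63);
    h[3] = (t[7] << 1) | (t[6] >> 63);

    /* Low half: t mod 2^255. */
    r[0] = t[0];
    r[1] = t[1];
    r[2] = t[2];
    r[3] = t[3] & 0x7fffffffffffffffULL;

    /* "Multiply top half by 19" and fold it into the low half. */
    c = 0;
    for (i = 0; i < 4; i++) {
        c += (unsigned __int128)19 * h[i] + r[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }

    /* "Overflow": fold the carry plus the bit at position 255 once more. */
    top = ((uint64_t)c << 1) | (r[3] >> 63);
    r[3] &= 0x7fffffffffffffffULL;
    c = (unsigned __int128)19 * top + r[0];
    r[0] = (uint64_t)c; c >>= 64;
    c += r[1]; r[1] = (uint64_t)c; c >>= 64;
    c += r[2]; r[2] = (uint64_t)c; c >>= 64;
    r[3] += (uint64_t)c;

    /* "Reduce if top bit set": one final conditional fold of 19. */
    top = r[3] >> 63;
    r[3] &= 0x7fffffffffffffffULL;
    c = (unsigned __int128)19 * top + r[0];
    r[0] = (uint64_t)c; c >>= 64;
    c += r[1]; r[1] = (uint64_t)c; c >>= 64;
    c += r[2]; r[2] = (uint64_t)c; c >>= 64;
    r[3] += (uint64_t)c;

    /* The result fits in four limbs and is congruent to t mod p; the
     * canonical value below p is produced later, when serializing. */
}

The assembly reaches the same result without a 128-bit accumulator by keeping all eight product limbs in registers and chaining adds/adcs/adc, which is why the patch rewrites the register allocation (x8-x19 for limbs, x24-x27 for scratch) rather than spilling intermediates to the stack.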