diff --git a/tests/api.c b/tests/api.c
index a46feb756..f53c50ba4 100644
--- a/tests/api.c
+++ b/tests/api.c
@@ -12494,6 +12494,7 @@ static int test_wc_Sha256Update(void)
 #ifndef NO_SHA256
     wc_Sha256 sha256;
     byte hash[WC_SHA256_DIGEST_SIZE];
+    byte hash_unaligned[WC_SHA256_DIGEST_SIZE+1];
     testVector a, b, c;
 
     ExpectIntEQ(wc_InitSha256(&sha256), 0);
@@ -12517,6 +12518,11 @@ static int test_wc_Sha256Update(void)
     ExpectIntEQ(wc_Sha256Final(&sha256, hash), 0);
     ExpectIntEQ(XMEMCMP(hash, a.output, WC_SHA256_DIGEST_SIZE), 0);
 
+    /* Unaligned check. */
+    ExpectIntEQ(wc_Sha256Update(&sha256, (byte*)a.input+1, (word32)a.inLen-1),
+        0);
+    ExpectIntEQ(wc_Sha256Final(&sha256, hash_unaligned + 1), 0);
+
     /* Try passing in bad values */
     b.input = NULL;
     b.inLen = 0;
@@ -12721,6 +12727,7 @@ static int test_wc_Sha512Update(void)
 #ifdef WOLFSSL_SHA512
     wc_Sha512 sha512;
     byte hash[WC_SHA512_DIGEST_SIZE];
+    byte hash_unaligned[WC_SHA512_DIGEST_SIZE + 1];
     testVector a, b, c;
 
     ExpectIntEQ(wc_InitSha512(&sha512), 0);
@@ -12747,6 +12754,11 @@ static int test_wc_Sha512Update(void)
     ExpectIntEQ(XMEMCMP(hash, a.output, WC_SHA512_DIGEST_SIZE), 0);
 
+    /* Unaligned check. */
+    ExpectIntEQ(wc_Sha512Update(&sha512, (byte*)a.input+1, (word32)a.inLen-1),
+        0);
+    ExpectIntEQ(wc_Sha512Final(&sha512, hash_unaligned+1), 0);
+
     /* Try passing in bad values */
     b.input = NULL;
     b.inLen = 0;
@@ -20091,7 +20103,8 @@ static int test_wc_ed25519_make_key(void)
 #if defined(HAVE_ED25519) && defined(HAVE_ED25519_MAKE_KEY)
     ed25519_key key;
     WC_RNG rng;
-    unsigned char pubkey[ED25519_PUB_KEY_SIZE];
+    unsigned char pubkey[ED25519_PUB_KEY_SIZE+1];
+    int pubkey_sz = ED25519_PUB_KEY_SIZE;
 
     XMEMSET(&key, 0, sizeof(ed25519_key));
     XMEMSET(&rng, 0, sizeof(WC_RNG));
@@ -20099,7 +20112,9 @@ static int test_wc_ed25519_make_key(void)
     ExpectIntEQ(wc_ed25519_init(&key), 0);
     ExpectIntEQ(wc_InitRng(&rng), 0);
 
-    ExpectIntEQ(wc_ed25519_make_public(&key, pubkey, sizeof(pubkey)),
+    ExpectIntEQ(wc_ed25519_make_public(&key, pubkey, pubkey_sz),
+        ECC_PRIV_KEY_E);
+    ExpectIntEQ(wc_ed25519_make_public(&key, pubkey+1, pubkey_sz),
         ECC_PRIV_KEY_E);
     ExpectIntEQ(wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &key), 0);
@@ -20149,10 +20164,10 @@ static int test_wc_ed25519_sign_msg(void)
     WC_RNG rng;
     ed25519_key key;
     byte msg[] = "Everybody gets Friday off.\n";
-    byte sig[ED25519_SIG_SIZE];
+    byte sig[ED25519_SIG_SIZE+1];
     word32 msglen = sizeof(msg);
-    word32 siglen = sizeof(sig);
-    word32 badSigLen = sizeof(sig) - 1;
+    word32 siglen = ED25519_SIG_SIZE;
+    word32 badSigLen = ED25519_SIG_SIZE - 1;
 #ifdef HAVE_ED25519_VERIFY
     int verify_ok = 0; /*1 = Verify success.*/
 #endif
@@ -20160,7 +20175,7 @@ static int test_wc_ed25519_sign_msg(void)
     /* Initialize stack variables. */
     XMEMSET(&key, 0, sizeof(ed25519_key));
     XMEMSET(&rng, 0, sizeof(WC_RNG));
-    XMEMSET(sig, 0, siglen);
+    XMEMSET(sig, 0, sizeof(sig));
 
     /* Initialize key. */
     ExpectIntEQ(wc_ed25519_init(&key), 0);
@@ -20169,6 +20184,8 @@ static int test_wc_ed25519_sign_msg(void)
     ExpectIntEQ(wc_ed25519_sign_msg(msg, msglen, sig, &siglen, &key), 0);
     ExpectIntEQ(siglen, ED25519_SIG_SIZE);
+    ExpectIntEQ(wc_ed25519_sign_msg(msg, msglen, sig+1, &siglen, &key), 0);
+    ExpectIntEQ(siglen, ED25519_SIG_SIZE);
 
     /* Test bad args. */
     ExpectIntEQ(wc_ed25519_sign_msg(NULL, msglen, sig, &siglen, &key),
@@ -20185,24 +20202,24 @@
     badSigLen -= 1;
 
 #ifdef HAVE_ED25519_VERIFY
-    ExpectIntEQ(wc_ed25519_verify_msg(sig, siglen, msg, msglen, &verify_ok,
+    ExpectIntEQ(wc_ed25519_verify_msg(sig+1, siglen, msg, msglen, &verify_ok,
         &key), 0);
     ExpectIntEQ(verify_ok, 1);
 
     /* Test bad args. */
-    ExpectIntEQ(wc_ed25519_verify_msg(sig, siglen - 1, msg, msglen, &verify_ok,
-        &key), BAD_FUNC_ARG);
-    ExpectIntEQ(wc_ed25519_verify_msg(sig, siglen + 1, msg, msglen, &verify_ok,
-        &key), BAD_FUNC_ARG);
+    ExpectIntEQ(wc_ed25519_verify_msg(sig+1, siglen - 1, msg, msglen,
+        &verify_ok, &key), BAD_FUNC_ARG);
+    ExpectIntEQ(wc_ed25519_verify_msg(sig+1, siglen + 1, msg, msglen,
+        &verify_ok, &key), BAD_FUNC_ARG);
     ExpectIntEQ(wc_ed25519_verify_msg(NULL, siglen, msg, msglen, &verify_ok,
         &key), BAD_FUNC_ARG);
-    ExpectIntEQ(wc_ed25519_verify_msg(sig, siglen, NULL, msglen, &verify_ok,
+    ExpectIntEQ(wc_ed25519_verify_msg(sig+1, siglen, NULL, msglen, &verify_ok,
         &key), BAD_FUNC_ARG);
-    ExpectIntEQ(wc_ed25519_verify_msg(sig, siglen, msg, msglen, NULL, &key),
+    ExpectIntEQ(wc_ed25519_verify_msg(sig+1, siglen, msg, msglen, NULL, &key),
         BAD_FUNC_ARG);
-    ExpectIntEQ(wc_ed25519_verify_msg(sig, siglen, msg, msglen, &verify_ok,
+    ExpectIntEQ(wc_ed25519_verify_msg(sig+1, siglen, msg, msglen, &verify_ok,
         NULL), BAD_FUNC_ARG);
-    ExpectIntEQ(wc_ed25519_verify_msg(sig, badSigLen, msg, msglen, &verify_ok,
+    ExpectIntEQ(wc_ed25519_verify_msg(sig+1, badSigLen, msg, msglen, &verify_ok,
         &key), BAD_FUNC_ARG);
 #endif
 
     /* Verify. */
diff --git a/wolfcrypt/src/asn.c b/wolfcrypt/src/asn.c
index 483a659f2..e10b60b10 100644
--- a/wolfcrypt/src/asn.c
+++ b/wolfcrypt/src/asn.c
@@ -33810,7 +33810,7 @@ int wc_Ed25519PrivateKeyDecode(const byte* input, word32* inOutIdx,
     ed25519_key* key, word32 inSz)
 {
     int ret;
-    byte privKey[ED25519_KEY_SIZE], pubKey[ED25519_PUB_KEY_SIZE];
+    byte privKey[ED25519_KEY_SIZE], pubKey[2*ED25519_PUB_KEY_SIZE+1];
     word32 privKeyLen = (word32)sizeof(privKey);
     word32 pubKeyLen = (word32)sizeof(pubKey);
 
@@ -33836,7 +33836,7 @@ int wc_Ed25519PublicKeyDecode(const byte* input, word32* inOutIdx,
     ed25519_key* key, word32 inSz)
 {
     int ret;
-    byte pubKey[ED25519_PUB_KEY_SIZE];
+    byte pubKey[2*ED25519_PUB_KEY_SIZE+1];
     word32 pubKeyLen = (word32)sizeof(pubKey);
 
     if (input == NULL || inOutIdx == NULL || key == NULL || inSz == 0) {
@@ -34127,7 +34127,7 @@ int wc_Ed448PublicKeyDecode(const byte* input, word32* inOutIdx,
     ed448_key* key, word32 inSz)
 {
     int ret;
-    byte pubKey[ED448_PUB_KEY_SIZE];
+    byte pubKey[2 * ED448_PUB_KEY_SIZE + 1];
     word32 pubKeyLen = (word32)sizeof(pubKey);
 
     if (input == NULL || inOutIdx == NULL || key == NULL || inSz == 0) {
diff --git a/wolfcrypt/src/ed25519.c b/wolfcrypt/src/ed25519.c
index aa82590d4..6dfb7a0f2 100644
--- a/wolfcrypt/src/ed25519.c
+++ b/wolfcrypt/src/ed25519.c
@@ -187,7 +187,7 @@ int wc_ed25519_make_public(ed25519_key* key, unsigned char* pubKey,
     word32 pubKeySz)
 {
     int ret = 0;
-    byte az[ED25519_PRV_KEY_SIZE];
+    ALIGN16 byte az[ED25519_PRV_KEY_SIZE];
 #if !defined(FREESCALE_LTC_ECC)
     ge_p3 A;
 #endif
@@ -296,14 +296,14 @@ int wc_ed25519_sign_msg_ex(const byte* in, word32 inLen, byte* out,
     ret = se050_ed25519_sign_msg(in, inLen, out, outLen, key);
 #else
 #ifdef FREESCALE_LTC_ECC
-    byte tempBuf[ED25519_PRV_KEY_SIZE];
+    ALIGN16 byte tempBuf[ED25519_PRV_KEY_SIZE];
     ltc_pkha_ecc_point_t ltcPoint = {0};
 #else
     ge_p3 R;
 #endif
-    byte nonce[WC_SHA512_DIGEST_SIZE];
-    byte hram[WC_SHA512_DIGEST_SIZE];
-    byte az[ED25519_PRV_KEY_SIZE];
+    ALIGN16 byte nonce[WC_SHA512_DIGEST_SIZE];
+    ALIGN16 byte hram[WC_SHA512_DIGEST_SIZE];
+    ALIGN16 byte az[ED25519_PRV_KEY_SIZE];
 
     /* sanity check on arguments */
     if (in == NULL || out == NULL || outLen == NULL || key == NULL ||
@@ -617,8 +617,8 @@ static int ed25519_verify_msg_final_with_sha(const byte* sig, word32 sigLen,
     int* res, ed25519_key* key,
     wc_Sha512 *sha)
 {
-    byte rcheck[ED25519_KEY_SIZE];
-    byte h[WC_SHA512_DIGEST_SIZE];
+    ALIGN16 byte rcheck[ED25519_KEY_SIZE];
+    ALIGN16 byte h[WC_SHA512_DIGEST_SIZE];
 #ifndef FREESCALE_LTC_ECC
     ge_p3 A;
     ge_p2 R;
@@ -1239,7 +1239,7 @@ int wc_ed25519_check_key(ed25519_key* key)
 {
     int ret = 0;
 #ifdef HAVE_ED25519_MAKE_KEY
-    unsigned char pubKey[ED25519_PUB_KEY_SIZE];
+    ALIGN16 unsigned char pubKey[ED25519_PUB_KEY_SIZE];
 
     if (!key->pubKeySet)
         ret = PUBLIC_KEY_E;
diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c
index 995e1c59c..57a838cda 100644
--- a/wolfcrypt/src/ge_operations.c
+++ b/wolfcrypt/src/ge_operations.c
@@ -920,13 +920,13 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
 int ge_compress_key(byte* out, const byte* xIn, const byte* yIn, word32 keySz)
 {
     ge_p2 g;
-    byte bArray[ED25519_KEY_SIZE];
-    byte x[ED25519_KEY_SIZE];
-    byte y[ED25519_KEY_SIZE];
+    ALIGN16 byte bArray[ED25519_KEY_SIZE];
+    ALIGN16 byte x[ED25519_PUB_KEY_SIZE];
+    ALIGN16 byte y[ED25519_PUB_KEY_SIZE];
     word32 i;
 
-    XMEMCPY(x, xIn, ED25519_KEY_SIZE);
-    XMEMCPY(y, yIn, ED25519_KEY_SIZE);
+    XMEMCPY(x, xIn, ED25519_PUB_KEY_SIZE);
+    XMEMCPY(y, yIn, ED25519_PUB_KEY_SIZE);
     fe_frombytes(g.X, x);
     fe_frombytes(g.Y, y);
     fe_1(g.Z);
diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
index 365d0c342..cb238f8e5 100644
--- a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
@@ -1447,10 +1447,10 @@ L_AES_CTR_encrypt_loop_block_256:
     ldr r9, [lr, #4]
     ldr r10, [lr, #8]
     ldr r11, [lr, #12]
-    eor r4, r8
-    eor r5, r9
-    eor r6, r10
-    eor r7, r11
+    eor r4, r4, r8
+    eor r5, r5, r9
+    eor r6, r6, r10
+    eor r7, r7, r11
     ldr r8, [sp, #4]
     str r4, [r1]
     str r5, [r1, #4]
@@ -1489,10 +1489,10 @@ L_AES_CTR_encrypt_loop_block_192:
     ldr r9, [lr, #4]
     ldr r10, [lr, #8]
     ldr r11, [lr, #12]
-    eor r4, r8
-    eor r5, r9
-    eor r6, r10
-    eor r7, r11
+    eor r4, r4, r8
+    eor r5, r5, r9
+    eor r6, r6, r10
+    eor r7, r7, r11
     ldr r8, [sp, #4]
     str r4, [r1]
     str r5, [r1, #4]
@@ -1531,10 +1531,10 @@ L_AES_CTR_encrypt_loop_block_128:
     ldr r9, [lr, #4]
     ldr r10, [lr, #8]
    ldr r11, [lr, #12]
-    eor r4, r8
-    eor r5, r9
-    eor r6, r10
-    eor r7, r11
+    eor r4, r4, r8
+    eor r5, r5, r9
+    eor r6, r6, r10
+    eor r7, r7, r11
     ldr r8, [sp, #4]
     str r4, [r1]
     str r5, [r1, #4]
@@ -3172,10 +3172,10 @@ L_AES_GCM_encrypt_loop_block_256:
     ldr r9, [lr, #4]
     ldr r10, [lr, #8]
     ldr r11, [lr, #12]
-    eor r4, r8
-    eor r5, r9
-    eor r6, r10
-    eor r7, r11
+    eor r4, r4, r8
+    eor r5, r5, r9
+    eor r6, r6, r10
+    eor r7, r7, r11
     ldr r8, [sp, #4]
     str r4, [r1]
     str r5, [r1, #4]
@@ -3211,10 +3211,10 @@ L_AES_GCM_encrypt_loop_block_192:
     ldr r9, [lr, #4]
     ldr r10, [lr, #8]
     ldr r11, [lr, #12]
-    eor r4, r8
-    eor r5, r9
-    eor r6, r10
-    eor r7, r11
+    eor r4, r4, r8
+    eor r5, r5, r9
+    eor r6, r6, r10
+    eor r7, r7, r11
     ldr r8, [sp, #4]
     str r4, [r1]
     str r5, [r1, #4]
@@ -3250,10 +3250,10 @@ L_AES_GCM_encrypt_loop_block_128:
     ldr r9, [lr, #4]
     ldr r10, [lr, #8]
     ldr r11, [lr, #12]
-    eor r4, r8
-    eor r5, r9
-    eor r6, r10
-    eor r7, r11
+    eor r4, r4, r8
+    eor r5, r5, r9
+    eor r6, r6, r10
+    eor r7, r7, r11
     ldr r8, [sp, #4]
     str r4, [r1]
     str r5, [r1, #4]
diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
index cfa532a58..a2da62598 100644
--- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
@@ -1110,10 +1110,10 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
         "ldr r9, [lr, #4]\n\t"
         "ldr r10, [lr, #8]\n\t"
         "ldr r11, [lr, #12]\n\t"
-        "eor r4, r8\n\t"
-        "eor r5, r9\n\t"
-        "eor r6, r10\n\t"
-        "eor r7, r11\n\t"
+        "eor r4, r4, r8\n\t"
+        "eor r5, r5, r9\n\t"
+        "eor r6, r6, r10\n\t"
+        "eor r7, r7, r11\n\t"
         "ldr r8, [sp, #4]\n\t"
         "str r4, [%[out]]\n\t"
         "str r5, [%[out], #4]\n\t"
@@ -1154,10 +1154,10 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
         "ldr r9, [lr, #4]\n\t"
         "ldr r10, [lr, #8]\n\t"
         "ldr r11, [lr, #12]\n\t"
-        "eor r4, r8\n\t"
-        "eor r5, r9\n\t"
-        "eor r6, r10\n\t"
-        "eor r7, r11\n\t"
+        "eor r4, r4, r8\n\t"
+        "eor r5, r5, r9\n\t"
+        "eor r6, r6, r10\n\t"
+        "eor r7, r7, r11\n\t"
         "ldr r8, [sp, #4]\n\t"
         "str r4, [%[out]]\n\t"
         "str r5, [%[out], #4]\n\t"
@@ -1198,10 +1198,10 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
         "ldr r9, [lr, #4]\n\t"
         "ldr r10, [lr, #8]\n\t"
         "ldr r11, [lr, #12]\n\t"
-        "eor r4, r8\n\t"
-        "eor r5, r9\n\t"
-        "eor r6, r10\n\t"
-        "eor r7, r11\n\t"
+        "eor r4, r4, r8\n\t"
+        "eor r5, r5, r9\n\t"
+        "eor r6, r6, r10\n\t"
+        "eor r7, r7, r11\n\t"
         "ldr r8, [sp, #4]\n\t"
         "str r4, [%[out]]\n\t"
         "str r5, [%[out], #4]\n\t"
@@ -2651,10 +2651,10 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
         "ldr r9, [lr, #4]\n\t"
         "ldr r10, [lr, #8]\n\t"
         "ldr r11, [lr, #12]\n\t"
-        "eor r4, r8\n\t"
-        "eor r5, r9\n\t"
-        "eor r6, r10\n\t"
-        "eor r7, r11\n\t"
+        "eor r4, r4, r8\n\t"
+        "eor r5, r5, r9\n\t"
+        "eor r6, r6, r10\n\t"
+        "eor r7, r7, r11\n\t"
         "ldr r8, [sp, #4]\n\t"
         "str r4, [%[out]]\n\t"
         "str r5, [%[out], #4]\n\t"
@@ -2692,10 +2692,10 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
         "ldr r9, [lr, #4]\n\t"
         "ldr r10, [lr, #8]\n\t"
         "ldr r11, [lr, #12]\n\t"
-        "eor r4, r8\n\t"
-        "eor r5, r9\n\t"
-        "eor r6, r10\n\t"
-        "eor r7, r11\n\t"
+        "eor r4, r4, r8\n\t"
+        "eor r5, r5, r9\n\t"
+        "eor r6, r6, r10\n\t"
+        "eor r7, r7, r11\n\t"
         "ldr r8, [sp, #4]\n\t"
         "str r4, [%[out]]\n\t"
         "str r5, [%[out], #4]\n\t"
@@ -2733,10 +2733,10 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
         "ldr r9, [lr, #4]\n\t"
         "ldr r10, [lr, #8]\n\t"
         "ldr r11, [lr, #12]\n\t"
-        "eor r4, r8\n\t"
-        "eor r5, r9\n\t"
-        "eor r6, r10\n\t"
-        "eor r7, r11\n\t"
+        "eor r4, r4, r8\n\t"
+        "eor r5, r5, r9\n\t"
+        "eor r6, r6, r10\n\t"
+        "eor r7, r7, r11\n\t"
         "ldr r8, [sp, #4]\n\t"
         "str r4, [%[out]]\n\t"
         "str r5, [%[out], #4]\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S
index 1f83fcc62..45be9a90e 100644
--- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S
+++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S
@@ -32,10 +32,6 @@
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && defined(__arm__)
 #ifndef WOLFSSL_ARMASM_INLINE
-/* Based on work by: Emil Lenngren
- * https://github.com/pornin/X25519-Cortex-M4
- */
-
 #if defined(HAVE_CURVE25519) || defined(HAVE_ED25519)
 #if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL)
@@ -348,9 +344,23 @@ fe_add:
     .type fe_frombytes, %function
 fe_frombytes:
     push {r4, r5, r6, r7, r8, r9, lr}
-    ldm r1, {r2, r3, r4, r5, r6, r7, r8, r9}
+    ldr r2, [r1]
+    ldr r3, [r1, #4]
+    ldr r4, [r1, #8]
+    ldr r5, [r1, #12]
+    ldr r6, [r1, #16]
+    ldr r7, [r1, #20]
+    ldr r8, [r1, #24]
+    ldr r9, [r1, #28]
     bfc r9, #31, #1
-    stm r0, {r2, r3, r4, r5, r6, r7, r8, r9}
+    str r2, [r0]
+    str r3, [r0, #4]
+    str r4, [r0, #8]
+    str r5, [r0, #12]
+    str r6, [r0, #16]
+    str r7, [r0, #20]
+    str r8, [r0, #24]
+    str r9, [r0, #28]
     pop {r4, r5, r6, r7, r8, r9, pc}
     .size fe_frombytes,.-fe_frombytes
     .text
@@ -379,7 +389,14 @@ fe_tobytes:
     adcs r8, r8, #0
     adc r9, r9, #0
     bfc r9, #31, #1
-    stm r0, {r2, r3, r4, r5, r6, r7, r8, r9}
+    str r2, [r0]
+    str r3, [r0, #4]
+    str r4, [r0, #8]
+    str r5, [r0, #12]
+    str r6, [r0, #16]
+    str r7, [r0, #20]
+    str r8, [r0, #24]
+    str r9, [r0, #28]
     pop {r4, r5, r6, r7, r8, r9, pc}
     .size fe_tobytes,.-fe_tobytes
     .text
@@ -387,69 +404,36 @@ fe_tobytes:
     .globl fe_1
     .type fe_1, %function
 fe_1:
+    push {r4, r5, r6, r7, r8, r9, lr}
     # Set one
     mov r2, #1
     mov r3, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0]
-    str r3, [r0, #4]
-#else
-    strd r2, r3, [r0]
-#endif
-    mov r2, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0, #8]
-    str r3, [r0, #12]
-#else
-    strd r2, r3, [r0, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0, #16]
-    str r3, [r0, #20]
-#else
-    strd r2, r3, [r0, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0, #24]
-    str r3, [r0, #28]
-#else
-    strd r2, r3, [r0, #24]
-#endif
-    bx lr
+    mov r4, #0
+    mov r5, #0
+    mov r6, #0
+    mov r7, #0
+    mov r8, #0
+    mov r9, #0
+    stm r0, {r2, r3, r4, r5, r6, r7, r8, r9}
+    pop {r4, r5, r6, r7, r8, r9, pc}
     .size fe_1,.-fe_1
     .text
     .align 4
     .globl fe_0
     .type fe_0, %function
 fe_0:
+    push {r4, r5, r6, r7, r8, r9, lr}
     # Set zero
     mov r2, #0
     mov r3, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0]
-    str r3, [r0, #4]
-#else
-    strd r2, r3, [r0]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0, #8]
-    str r3, [r0, #12]
-#else
-    strd r2, r3, [r0, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0, #16]
-    str r3, [r0, #20]
-#else
-    strd r2, r3, [r0, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r2, [r0, #24]
-    str r3, [r0, #28]
-#else
-    strd r2, r3, [r0, #24]
-#endif
-    bx lr
+    mov r4, #0
+    mov r5, #0
+    mov r6, #0
+    mov r7, #0
+    mov r8, #0
+    mov r9, #0
+    stm r0, {r2, r3, r4, r5, r6, r7, r8, r9}
+    pop {r4, r5, r6, r7, r8, r9, pc}
     .size fe_0,.-fe_0
     .text
     .align 4
@@ -588,6 +572,7 @@ fe_isnegative:
     eor r0, r0, r1
     pop {r4, r5, pc}
     .size fe_isnegative,.-fe_isnegative
+#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN)
 #ifndef WC_NO_CACHE_RESISTANT
     .text
     .align 4
@@ -2394,6 +2379,7 @@ fe_cmov_table:
     pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     .size fe_cmov_table,.-fe_cmov_table
 #endif /* WC_NO_CACHE_RESISTANT */
+#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
 #endif /* HAVE_ED25519 */
     .text
     .align 4
@@ -2671,6 +2657,7 @@ fe_sq:
     bl fe_sq_op
     pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     .size fe_sq,.-fe_sq
+#ifdef HAVE_CURVE25519
     .text
     .align 4
     .globl fe_mul121666
@@ -2725,89 +2712,20 @@ curve25519:
     str r2, [sp, #168]
     mov r1, #0
     str r1, [sp, #172]
-    # Set one
-    mov r10, #1
-    mov r11, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0]
-    str r11, [r0, #4]
-#else
-    strd r10, r11, [r0]
-#endif
-    mov r10, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0, #8]
-    str r11, [r0, #12]
-#else
-    strd r10, r11, [r0, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0, #16]
-    str r11, [r0, #20]
-#else
-    strd r10, r11, [r0, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0, #24]
-    str r11, [r0, #28]
-#else
-    strd r10, r11, [r0, #24]
-#endif
-    # Set zero
+    mov r4, #1
+    mov r5, #0
+    mov r6, #0
+    mov r7, #0
+    mov r8, #0
+    mov r9, #0
     mov r10, #0
     mov r11, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp]
-    str r11, [sp, #4]
-#else
-    strd r10, r11, [sp]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #8]
-    str r11, [sp, #12]
-#else
-    strd r10, r11, [sp, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #16]
-    str r11, [sp, #20]
-#else
-    strd r10, r11, [sp, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #24]
-    str r11, [sp, #28]
-#else
-    strd r10, r11, [sp, #24]
-#endif
-    # Set one
-    mov r10, #1
-    mov r11, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #32]
-    str r11, [sp, #36]
-#else
-    strd r10, r11, [sp, #32]
-#endif
-    mov r10, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #40]
-    str r11, [sp, #44]
-#else
-    strd r10, r11, [sp, #40]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #48]
-    str r11, [sp, #52]
-#else
-    strd r10, r11, [sp, #48]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #56]
-    str r11, [sp, #60]
-#else
-    strd r10, r11, [sp, #56]
-#endif
+    stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}
+    add r3, sp, #32
+    stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}
+    mov r4, #0
+    mov r3, sp
+    stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}
     add r3, sp, #0x40
     # Copy
     ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11}
@@ -2830,18 +2748,10 @@ L_curve25519_bits:
     ldr r0, [sp, #160]
     # Conditional Swap
     rsb r1, r1, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [r0]
-    ldr r5, [r0, #4]
-#else
-    ldrd r4, r5, [r0]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #64]
-    ldr r7, [sp, #68]
-#else
-    ldrd r6, r7, [sp, #64]
-#endif
+    mov r3, r0
+    add r12, sp, #0x40
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -2850,30 +2760,10 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [r0]
-    str r5, [r0, #4]
-#else
-    strd r4, r5, [r0]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #64]
-    str r7, [sp, #68]
-#else
-    strd r6, r7, [sp, #64]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [r0, #8]
-    ldr r5, [r0, #12]
-#else
-    ldrd r4, r5, [r0, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #72]
-    ldr r7, [sp, #76]
-#else
-    ldrd r6, r7, [sp, #72]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -2882,30 +2772,10 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [r0, #8]
-    str r5, [r0, #12]
-#else
-    strd r4, r5, [r0, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #72]
-    str r7, [sp, #76]
-#else
-    strd r6, r7, [sp, #72]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [r0, #16]
-    ldr r5, [r0, #20]
-#else
-    ldrd r4, r5, [r0, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #80]
-    ldr r7, [sp, #84]
-#else
-    ldrd r6, r7, [sp, #80]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -2914,30 +2784,10 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [r0, #16]
-    str r5, [r0, #20]
-#else
-    strd r4, r5, [r0, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #80]
-    str r7, [sp, #84]
-#else
-    strd r6, r7, [sp, #80]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [r0, #24]
-    ldr r5, [r0, #28]
-#else
-    ldrd r4, r5, [r0, #24]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #88]
-    ldr r7, [sp, #92]
-#else
-    ldrd r6, r7, [sp, #88]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -2946,33 +2796,15 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [r0, #24]
-    str r5, [r0, #28]
-#else
-    strd r4, r5, [r0, #24]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #88]
-    str r7, [sp, #92]
-#else
-    strd r6, r7, [sp, #88]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
     ldr r1, [sp, #172]
     # Conditional Swap
     rsb r1, r1, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [sp]
-    ldr r5, [sp, #4]
-#else
-    ldrd r4, r5, [sp]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #32]
-    ldr r7, [sp, #36]
-#else
-    ldrd r6, r7, [sp, #32]
-#endif
+    mov r3, sp
+    add r12, sp, #32
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -2981,30 +2813,10 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [sp]
-    str r5, [sp, #4]
-#else
-    strd r4, r5, [sp]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #32]
-    str r7, [sp, #36]
-#else
-    strd r6, r7, [sp, #32]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [sp, #8]
-    ldr r5, [sp, #12]
-#else
-    ldrd r4, r5, [sp, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #40]
-    ldr r7, [sp, #44]
-#else
-    ldrd r6, r7, [sp, #40]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -3013,30 +2825,10 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [sp, #8]
-    str r5, [sp, #12]
-#else
-    strd r4, r5, [sp, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #40]
-    str r7, [sp, #44]
-#else
-    strd r6, r7, [sp, #40]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [sp, #16]
-    ldr r5, [sp, #20]
-#else
-    ldrd r4, r5, [sp, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #48]
-    ldr r7, [sp, #52]
-#else
-    ldrd r6, r7, [sp, #48]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -3045,30 +2837,10 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [sp, #16]
-    str r5, [sp, #20]
-#else
-    strd r4, r5, [sp, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #48]
-    str r7, [sp, #52]
-#else
-    strd r6, r7, [sp, #48]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r4, [sp, #24]
-    ldr r5, [sp, #28]
-#else
-    ldrd r4, r5, [sp, #24]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    ldr r6, [sp, #56]
-    ldr r7, [sp, #60]
-#else
-    ldrd r6, r7, [sp, #56]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
+    ldm r3, {r4, r5}
+    ldm r12, {r6, r7}
     eor r8, r4, r6
     eor r9, r5, r7
     and r8, r8, r1
@@ -3077,18 +2849,8 @@ L_curve25519_bits:
     eor r5, r5, r9
     eor r6, r6, r8
     eor r7, r7, r9
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r4, [sp, #24]
-    str r5, [sp, #28]
-#else
-    strd r4, r5, [sp, #24]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r6, [sp, #56]
-    str r7, [sp, #60]
-#else
-    strd r6, r7, [sp, #56]
-#endif
+    stm r3!, {r4, r5}
+    stm r12!, {r6, r7}
     ldr r1, [sp, #184]
     str r1, [sp, #172]
     mov r3, sp
@@ -3329,89 +3091,20 @@ curve25519:
     str r4, [sp, #188]
     mov r1, #0
     str r1, [sp, #164]
-    # Set one
-    mov r10, #1
-    mov r11, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0]
-    str r11, [r0, #4]
-#else
-    strd r10, r11, [r0]
-#endif
-    mov r10, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0, #8]
-    str r11, [r0, #12]
-#else
-    strd r10, r11, [r0, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0, #16]
-    str r11, [r0, #20]
-#else
-    strd r10, r11, [r0, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [r0, #24]
-    str r11, [r0, #28]
-#else
-    strd r10, r11, [r0, #24]
-#endif
-    # Set zero
+    mov r4, #1
+    mov r5, #0
+    mov r6, #0
+    mov r7, #0
+    mov r8, #0
+    mov r9, #0
    mov r10, #0
     mov r11, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp]
-    str r11, [sp, #4]
-#else
-    strd r10, r11, [sp]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #8]
-    str r11, [sp, #12]
-#else
-    strd r10, r11, [sp, #8]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #16]
-    str r11, [sp, #20]
-#else
-    strd r10, r11, [sp, #16]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #24]
-    str r11, [sp, #28]
-#else
-    strd r10, r11, [sp, #24]
-#endif
-    # Set one
-    mov r10, #1
-    mov r11, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #32]
-    str r11, [sp, #36]
-#else
-    strd r10, r11, [sp, #32]
-#endif
-    mov r10, #0
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #40]
-    str r11, [sp, #44]
-#else
-    strd r10, r11, [sp, #40]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #48]
-    str r11, [sp, #52]
-#else
-    strd r10, r11, [sp, #48]
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-    str r10, [sp, #56]
-    str r11, [sp, #60]
-#else
-    strd r10, r11, [sp, #56]
-#endif
+    stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}
+    add r3, sp, #32
+    stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}
+    mov r4, #0
+    mov r3, sp
+    stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}
     add r3, sp, #0x40
     # Copy
     ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11}
@@ -3675,6 +3368,7 @@ L_curve25519_inv_8:
     pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     .size curve25519,.-curve25519
 #endif /* WC_NO_CACHE_RESISTANT */
+#endif /* HAVE_CURVE25519 */
 #ifdef HAVE_ED25519
     .text
     .align 4
@@ -4974,13 +4668,14 @@ sc_reduce:
     add sp, sp, #52
     pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     .size sc_reduce,.-sc_reduce
+#ifdef HAVE_ED25519_SIGN
     .text
     .align 4
     .globl sc_muladd
     .type sc_muladd, %function
 sc_muladd:
     push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-    sub sp, sp, #0x50
+    sub sp, sp, #0x70
     add lr, sp, #0x44
     stm lr, {r0, r1, r3}
     mov lr, r2
@@ -5082,7 +4777,7 @@ sc_muladd:
     mov r3, r12
     add lr, sp, #32
     stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}
-    ldr r0, [sp, #68]
+    add r0, sp, #0x50
     # Add c to a * b
     ldr lr, [sp, #76]
     ldm sp!, {r2, r3, r4, r5, r6, r7, r8, r9}
@@ -5502,11 +5197,20 @@ sc_muladd:
     adcs r8, r8, #0
     adc r9, r9, r1
     bfc r9, #28, #4
+    ldr r0, [sp, #68]
     # Store result
-    stm r0, {r2, r3, r4, r5, r6, r7, r8, r9}
-    add sp, sp, #0x50
+    str r2, [r0]
+    str r3, [r0, #4]
+    str r4, [r0, #8]
+    str r5, [r0, #12]
+    str r6, [r0, #16]
+    str r7, [r0, #20]
+    str r8, [r0, #24]
+    str r9, [r0, #28]
+    add sp, sp, #0x70
     pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     .size sc_muladd,.-sc_muladd
+#endif /* HAVE_ED25519_SIGN */
 #endif /* HAVE_ED25519 */
 #endif /* !CURVE25519_SMALL || !ED25519_SMALL */
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
index 627b74edb..1de8c9c77 100644
--- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
@@ -378,9 +378,23 @@ void fe_frombytes(fe out_p, const unsigned char* in_p)
     register const unsigned char* in asm ("r1") = (const unsigned char*)in_p;
 
     __asm__ __volatile__ (
-        "ldm %[in], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
+        "ldr r2, [%[in]]\n\t"
+        "ldr r3, [%[in], #4]\n\t"
+        "ldr r4, [%[in], #8]\n\t"
+        "ldr r5, [%[in], #12]\n\t"
+        "ldr r6, [%[in], #16]\n\t"
+        "ldr r7, [%[in], #20]\n\t"
+        "ldr r8, [%[in], #24]\n\t"
+        "ldr r9, [%[in], #28]\n\t"
         "bfc r9, #31, #1\n\t"
-        "stm %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
+        "str r2, [%[out]]\n\t"
+        "str r3, [%[out], #4]\n\t"
+        "str r4, [%[out], #8]\n\t"
+        "str r5, [%[out], #12]\n\t"
+        "str r6, [%[out], #16]\n\t"
+        "str r7, [%[out], #20]\n\t"
+        "str r8, [%[out], #24]\n\t"
+        "str r9, [%[out], #28]\n\t"
         : [out] "+r" (out), [in] "+r" (in)
         :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
@@ -413,7 +427,14 @@ void fe_tobytes(unsigned char* out_p, const fe n_p)
         "adcs r8, r8, #0\n\t"
         "adc r9, r9, #0\n\t"
         "bfc r9, #31, #1\n\t"
-        "stm %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
+        "str r2, [%[out]]\n\t"
+        "str r3, [%[out], #4]\n\t"
+        "str r4, [%[out], #8]\n\t"
+        "str r5, [%[out], #12]\n\t"
+        "str r6, [%[out], #16]\n\t"
+        "str r7, [%[out], #20]\n\t"
+        "str r8, [%[out], #24]\n\t"
+        "str r9, [%[out], #28]\n\t"
         : [out] "+r" (out), [n] "+r" (n)
         :
         : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12"
@@ -428,34 +449,16 @@ void fe_1(fe n_p)
         /* Set one */
         "mov r2, #1\n\t"
         "mov r3, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n]]\n\t"
-        "str r3, [%[n], #4]\n\t"
-#else
-        "strd r2, r3, [%[n]]\n\t"
-#endif
-        "mov r2, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n], #8]\n\t"
-        "str r3, [%[n], #12]\n\t"
-#else
-        "strd r2, r3, [%[n], #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n], #16]\n\t"
-        "str r3, [%[n], #20]\n\t"
-#else
-        "strd r2, r3, [%[n], #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n], #24]\n\t"
-        "str r3, [%[n], #28]\n\t"
-#else
-        "strd r2, r3, [%[n], #24]\n\t"
-#endif
+        "mov r4, #0\n\t"
+        "mov r5, #0\n\t"
+        "mov r6, #0\n\t"
+        "mov r7, #0\n\t"
+        "mov r8, #0\n\t"
+        "mov r9, #0\n\t"
+        "stm %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
         : [n] "+r" (n)
         :
-        : "memory", "r2", "r3"
+        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -467,33 +470,16 @@ void fe_0(fe n_p)
         /* Set zero */
         "mov r2, #0\n\t"
         "mov r3, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n]]\n\t"
-        "str r3, [%[n], #4]\n\t"
-#else
-        "strd r2, r3, [%[n]]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n], #8]\n\t"
-        "str r3, [%[n], #12]\n\t"
-#else
-        "strd r2, r3, [%[n], #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n], #16]\n\t"
-        "str r3, [%[n], #20]\n\t"
-#else
-        "strd r2, r3, [%[n], #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r2, [%[n], #24]\n\t"
-        "str r3, [%[n], #28]\n\t"
-#else
-        "strd r2, r3, [%[n], #24]\n\t"
-#endif
+        "mov r4, #0\n\t"
+        "mov r5, #0\n\t"
+        "mov r6, #0\n\t"
+        "mov r7, #0\n\t"
+        "mov r8, #0\n\t"
+        "mov r9, #0\n\t"
+        "stm %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
         : [n] "+r" (n)
         :
-        : "memory", "r2", "r3"
+        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -650,6 +636,7 @@ int fe_isnegative(const fe a_p)
     return (uint32_t)(size_t)a;
 }
 
+#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN)
 #ifndef WC_NO_CACHE_RESISTANT
 void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p)
 {
@@ -2466,6 +2453,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p)
 }
 
 #endif /* WC_NO_CACHE_RESISTANT */
+#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
 #endif /* HAVE_ED25519 */
 void fe_mul_op(void);
 void fe_mul_op()
@@ -2756,6 +2744,7 @@ void fe_sq(fe r_p, const fe a_p)
     );
 }
 
+#ifdef HAVE_CURVE25519
 void fe_mul121666(fe r_p, fe a_p)
 {
     register sword32* r asm ("r0") = (sword32*)r_p;
@@ -2815,89 +2804,20 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "str %[a], [sp, #168]\n\t"
         "mov %[n], #0\n\t"
         "str %[n], [sp, #172]\n\t"
-        /* Set one */
-        "mov r10, #1\n\t"
-        "mov r11, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r]]\n\t"
-        "str r11, [%[r], #4]\n\t"
-#else
-        "strd r10, r11, [%[r]]\n\t"
-#endif
-        "mov r10, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r], #8]\n\t"
-        "str r11, [%[r], #12]\n\t"
-#else
-        "strd r10, r11, [%[r], #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r], #16]\n\t"
-        "str r11, [%[r], #20]\n\t"
-#else
-        "strd r10, r11, [%[r], #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r], #24]\n\t"
-        "str r11, [%[r], #28]\n\t"
-#else
-        "strd r10, r11, [%[r], #24]\n\t"
-#endif
-        /* Set zero */
+        "mov r4, #1\n\t"
+        "mov r5, #0\n\t"
+        "mov r6, #0\n\t"
+        "mov r7, #0\n\t"
+        "mov r8, #0\n\t"
+        "mov r9, #0\n\t"
         "mov r10, #0\n\t"
         "mov r11, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp]\n\t"
-        "str r11, [sp, #4]\n\t"
-#else
-        "strd r10, r11, [sp]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #8]\n\t"
-        "str r11, [sp, #12]\n\t"
-#else
-        "strd r10, r11, [sp, #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #16]\n\t"
-        "str r11, [sp, #20]\n\t"
-#else
-        "strd r10, r11, [sp, #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #24]\n\t"
-        "str r11, [sp, #28]\n\t"
-#else
-        "strd r10, r11, [sp, #24]\n\t"
-#endif
-        /* Set one */
-        "mov r10, #1\n\t"
-        "mov r11, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #32]\n\t"
-        "str r11, [sp, #36]\n\t"
-#else
-        "strd r10, r11, [sp, #32]\n\t"
-#endif
-        "mov r10, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #40]\n\t"
-        "str r11, [sp, #44]\n\t"
-#else
-        "strd r10, r11, [sp, #40]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #48]\n\t"
-        "str r11, [sp, #52]\n\t"
-#else
-        "strd r10, r11, [sp, #48]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #56]\n\t"
-        "str r11, [sp, #60]\n\t"
-#else
-        "strd r10, r11, [sp, #56]\n\t"
-#endif
+        "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
+        "add r3, sp, #32\n\t"
+        "stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
+        "mov r4, #0\n\t"
+        "mov r3, sp\n\t"
+        "stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         "add r3, sp, #0x40\n\t"
         /* Copy */
         "ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
@@ -2922,18 +2842,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "ldr %[r], [sp, #160]\n\t"
         /* Conditional Swap */
         "rsb %[n], %[n], #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [%[r]]\n\t"
-        "ldr r5, [%[r], #4]\n\t"
-#else
-        "ldrd r4, r5, [%[r]]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #64]\n\t"
-        "ldr r7, [sp, #68]\n\t"
-#else
-        "ldrd r6, r7, [sp, #64]\n\t"
-#endif
+        "mov r3, r0\n\t"
+        "add r12, sp, #0x40\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -2942,30 +2854,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [%[r]]\n\t"
-        "str r5, [%[r], #4]\n\t"
-#else
-        "strd r4, r5, [%[r]]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #64]\n\t"
-        "str r7, [sp, #68]\n\t"
-#else
-        "strd r6, r7, [sp, #64]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [%[r], #8]\n\t"
-        "ldr r5, [%[r], #12]\n\t"
-#else
-        "ldrd r4, r5, [%[r], #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #72]\n\t"
-        "ldr r7, [sp, #76]\n\t"
-#else
-        "ldrd r6, r7, [sp, #72]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -2974,30 +2866,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [%[r], #8]\n\t"
-        "str r5, [%[r], #12]\n\t"
-#else
-        "strd r4, r5, [%[r], #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #72]\n\t"
-        "str r7, [sp, #76]\n\t"
-#else
-        "strd r6, r7, [sp, #72]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [%[r], #16]\n\t"
-        "ldr r5, [%[r], #20]\n\t"
-#else
-        "ldrd r4, r5, [%[r], #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #80]\n\t"
-        "ldr r7, [sp, #84]\n\t"
-#else
-        "ldrd r6, r7, [sp, #80]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -3006,30 +2878,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [%[r], #16]\n\t"
-        "str r5, [%[r], #20]\n\t"
-#else
-        "strd r4, r5, [%[r], #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #80]\n\t"
-        "str r7, [sp, #84]\n\t"
-#else
-        "strd r6, r7, [sp, #80]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [%[r], #24]\n\t"
-        "ldr r5, [%[r], #28]\n\t"
-#else
-        "ldrd r4, r5, [%[r], #24]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #88]\n\t"
-        "ldr r7, [sp, #92]\n\t"
-#else
-        "ldrd r6, r7, [sp, #88]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -3038,33 +2890,15 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [%[r], #24]\n\t"
-        "str r5, [%[r], #28]\n\t"
-#else
-        "strd r4, r5, [%[r], #24]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #88]\n\t"
-        "str r7, [sp, #92]\n\t"
-#else
-        "strd r6, r7, [sp, #88]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
         "ldr %[n], [sp, #172]\n\t"
         /* Conditional Swap */
         "rsb %[n], %[n], #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [sp]\n\t"
-        "ldr r5, [sp, #4]\n\t"
-#else
-        "ldrd r4, r5, [sp]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #32]\n\t"
-        "ldr r7, [sp, #36]\n\t"
-#else
-        "ldrd r6, r7, [sp, #32]\n\t"
-#endif
+        "mov r3, sp\n\t"
+        "add r12, sp, #32\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -3073,30 +2907,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [sp]\n\t"
-        "str r5, [sp, #4]\n\t"
-#else
-        "strd r4, r5, [sp]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #32]\n\t"
-        "str r7, [sp, #36]\n\t"
-#else
-        "strd r6, r7, [sp, #32]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [sp, #8]\n\t"
-        "ldr r5, [sp, #12]\n\t"
-#else
-        "ldrd r4, r5, [sp, #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #40]\n\t"
-        "ldr r7, [sp, #44]\n\t"
-#else
-        "ldrd r6, r7, [sp, #40]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -3105,30 +2919,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [sp, #8]\n\t"
-        "str r5, [sp, #12]\n\t"
-#else
-        "strd r4, r5, [sp, #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #40]\n\t"
-        "str r7, [sp, #44]\n\t"
-#else
-        "strd r6, r7, [sp, #40]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [sp, #16]\n\t"
-        "ldr r5, [sp, #20]\n\t"
-#else
-        "ldrd r4, r5, [sp, #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #48]\n\t"
-        "ldr r7, [sp, #52]\n\t"
-#else
-        "ldrd r6, r7, [sp, #48]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -3137,30 +2931,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [sp, #16]\n\t"
-        "str r5, [sp, #20]\n\t"
-#else
-        "strd r4, r5, [sp, #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #48]\n\t"
-        "str r7, [sp, #52]\n\t"
-#else
-        "strd r6, r7, [sp, #48]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r4, [sp, #24]\n\t"
-        "ldr r5, [sp, #28]\n\t"
-#else
-        "ldrd r4, r5, [sp, #24]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "ldr r6, [sp, #56]\n\t"
-        "ldr r7, [sp, #60]\n\t"
-#else
-        "ldrd r6, r7, [sp, #56]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
+        "ldm r3, {r4, r5}\n\t"
+        "ldm r12, {r6, r7}\n\t"
         "eor r8, r4, r6\n\t"
         "eor r9, r5, r7\n\t"
         "and r8, r8, %[n]\n\t"
@@ -3169,18 +2943,8 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "eor r5, r5, r9\n\t"
         "eor r6, r6, r8\n\t"
         "eor r7, r7, r9\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r4, [sp, #24]\n\t"
-        "str r5, [sp, #28]\n\t"
-#else
-        "strd r4, r5, [sp, #24]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r6, [sp, #56]\n\t"
-        "str r7, [sp, #60]\n\t"
-#else
-        "strd r6, r7, [sp, #56]\n\t"
-#endif
+        "stm r3!, {r4, r5}\n\t"
+        "stm r12!, {r6, r7}\n\t"
         "ldr %[n], [sp, #184]\n\t"
         "str %[n], [sp, #172]\n\t"
         "mov r3, sp\n\t"
@@ -3435,89 +3199,20 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "str r4, [sp, #188]\n\t"
         "mov %[n], #0\n\t"
         "str %[n], [sp, #164]\n\t"
-        /* Set one */
-        "mov r10, #1\n\t"
-        "mov r11, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r]]\n\t"
-        "str r11, [%[r], #4]\n\t"
-#else
-        "strd r10, r11, [%[r]]\n\t"
-#endif
-        "mov r10, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r], #8]\n\t"
-        "str r11, [%[r], #12]\n\t"
-#else
-        "strd r10, r11, [%[r], #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r], #16]\n\t"
-        "str r11, [%[r], #20]\n\t"
-#else
-        "strd r10, r11, [%[r], #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [%[r], #24]\n\t"
-        "str r11, [%[r], #28]\n\t"
-#else
-        "strd r10, r11, [%[r], #24]\n\t"
-#endif
-        /* Set zero */
+        "mov r4, #1\n\t"
+        "mov r5, #0\n\t"
+        "mov r6, #0\n\t"
+        "mov r7, #0\n\t"
+        "mov r8, #0\n\t"
+        "mov r9, #0\n\t"
         "mov r10, #0\n\t"
         "mov r11, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp]\n\t"
-        "str r11, [sp, #4]\n\t"
-#else
-        "strd r10, r11, [sp]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #8]\n\t"
-        "str r11, [sp, #12]\n\t"
-#else
-        "strd r10, r11, [sp, #8]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #16]\n\t"
-        "str r11, [sp, #20]\n\t"
-#else
-        "strd r10, r11, [sp, #16]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #24]\n\t"
-        "str r11, [sp, #28]\n\t"
-#else
-        "strd r10, r11, [sp, #24]\n\t"
-#endif
-        /* Set one */
-        "mov r10, #1\n\t"
-        "mov r11, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #32]\n\t"
-        "str r11, [sp, #36]\n\t"
-#else
-        "strd r10, r11, [sp, #32]\n\t"
-#endif
-        "mov r10, #0\n\t"
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #40]\n\t"
-        "str r11, [sp, #44]\n\t"
-#else
-        "strd r10, r11, [sp, #40]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #48]\n\t"
-        "str r11, [sp, #52]\n\t"
-#else
-        "strd r10, r11, [sp, #48]\n\t"
-#endif
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
-        "str r10, [sp, #56]\n\t"
-        "str r11, [sp, #60]\n\t"
-#else
-        "strd r10, r11, [sp, #56]\n\t"
-#endif
+        "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
+        "add r3, sp, #32\n\t"
+        "stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
+        "mov r4, #0\n\t"
+        "mov r3, sp\n\t"
+        "stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
        "add r3, sp, #0x40\n\t"
         /* Copy */
         "ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
@@ -3795,6 +3490,7 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
 }
 
 #endif /* WC_NO_CACHE_RESISTANT */
+#endif /* HAVE_CURVE25519 */
 #ifdef HAVE_ED25519
 void fe_invert(fe r_p, const fe a_p)
 {
@@ -5157,6 +4853,7 @@ void sc_reduce(byte* s_p)
     );
 }
 
+#ifdef HAVE_ED25519_SIGN
 void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
 {
     register byte* s asm ("r0") = (byte*)s_p;
@@ -5165,7 +4862,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
     register const byte* c asm ("r3") = (const byte*)c_p;
 
     __asm__ __volatile__ (
-        "sub sp, sp, #0x50\n\t"
+        "sub sp, sp, #0x70\n\t"
         "add lr, sp, #0x44\n\t"
         "stm lr, {%[s], %[a], %[c]}\n\t"
         "mov lr, %[b]\n\t"
@@ -5267,7 +4964,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "mov %[c], r12\n\t"
         "add lr, sp, #32\n\t"
         "stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t"
-        "ldr %[s], [sp, #68]\n\t"
+        "add %[s], sp, #0x50\n\t"
         /* Add c to a * b */
         "ldr lr, [sp, #76]\n\t"
         "ldm sp!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
@@ -5687,15 +5384,24 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "adcs r8, r8, #0\n\t"
         "adc r9, r9, %[a]\n\t"
         "bfc r9, #28, #4\n\t"
+        "ldr %[s], [sp, #68]\n\t"
         /* Store result */
-        "stm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
-        "add sp, sp, #0x50\n\t"
+        "str %[b], [%[s]]\n\t"
+        "str %[c], [%[s], #4]\n\t"
+        "str r4, [%[s], #8]\n\t"
+        "str r5, [%[s], #12]\n\t"
+        "str r6, [%[s], #16]\n\t"
+        "str r7, [%[s], #20]\n\t"
+        "str r8, [%[s], #24]\n\t"
+        "str r9, [%[s], #28]\n\t"
+        "add sp, sp, #0x70\n\t"
         : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
         :
         : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr"
     );
 }
 
+#endif /* HAVE_ED25519_SIGN */
 #endif /* HAVE_ED25519 */
 #endif /* !CURVE25519_SMALL || !ED25519_SMALL */
diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S b/wolfcrypt/src/port/arm/thumb2-curve25519.S
index d46e13d05..b836b4749 100644
--- a/wolfcrypt/src/port/arm/thumb2-curve25519.S
+++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S
@@ -236,11 +236,25 @@ fe_add:
     .type fe_frombytes, %function
 fe_frombytes:
     PUSH {r4, r5, r6, r7, r8, r9, lr}
-    LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9}
+    LDR r2, [r1]
+    LDR r3, [r1, #4]
+    LDR r4, [r1, #8]
+    LDR r5, [r1, #12]
+    LDR r6, [r1, #16]
+    LDR r7, [r1, #20]
+    LDR r8, [r1, #24]
+    LDR r9, [r1, #28]
     BFC r9, #31, #1
-    STM r0, {r2, r3, r4, r5, r6, r7, r8, r9}
+    STR r2, [r0]
+    STR r3, [r0, #4]
+    STR r4, [r0, #8]
+    STR r5, [r0, #12]
+    STR r6, [r0, #16]
+    STR r7, [r0, #20]
+    STR r8, [r0, #24]
+    STR r9, [r0, #28]
     POP {r4, r5, r6, r7, r8, r9, pc}
-    # Cycle Count = 35
+    # Cycle Count = 49
     .size fe_frombytes,.-fe_frombytes
     .text
     .align 4
@@ -268,9 +282,16 @@ fe_tobytes:
     ADCS r8, r8, #0x0
     ADC r9, r9, #0x0
     BFC r9, #31, #1
-    STM r0, {r2, r3, r4, r5, r6, r7, r8, r9}
+    STR r2, [r0]
+    STR r3, [r0, #4]
+    STR r4, [r0, #8]
+    STR r5, [r0, #12]
+    STR r6, [r0, #16]
+    STR r7, [r0, #20]
+    STR r8, [r0, #24]
+    STR r9, [r0, #28]
     POP {r4, r5, r6, r7, r8, r9, r10, pc}
-    # Cycle Count = 55
+    # Cycle Count = 62
     .size fe_tobytes,.-fe_tobytes
     .text
     .align 4
@@ -1490,6 +1511,386 @@ fe_cmov_table:
 #endif /* WC_NO_CACHE_RESISTANT */
 #endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
 #endif /* HAVE_ED25519 */
+#ifdef WOLFSSL_SP_NO_UMAAL
+    .text
+    .align 4
+    .globl fe_mul_op
+    .type fe_mul_op, %function
+fe_mul_op:
+    PUSH {lr}
+    SUB sp, sp, #0x28
+    STR r0, [sp, #36]
+    MOV r0, #0x0
+    LDR r12, [r1]
+    # A[0] * B[0]
+    LDR lr, [r2]
+    UMULL r3, r4, r12, lr
+    # A[0] * B[2]
+    LDR lr, [r2, #8]
+    UMULL r5, r6, r12, lr
+    # A[0] * B[4]
+    LDR lr, [r2, #16]
+    UMULL r7, r8, r12, lr
+    # A[0] * B[6]
+    LDR lr, [r2, #24]
+    UMULL r9, r10, r12, lr
+    STR r3, [sp]
+    # A[0] * B[1]
+    LDR lr, [r2, #4]
+    MOV r11, r0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[0] * B[3]
+    LDR lr, [r2, #12]
+    ADCS r6, r6, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r6, r11, r12, lr
+    ADDS r7, r7, r11
+    # A[0] * B[5]
+    LDR lr, [r2, #20]
+    ADCS r8, r8, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r8, r11, r12, lr
+    ADDS r9, r9, r11
+    # A[0] * B[7]
+    LDR lr, [r2, #28]
+    ADCS r10, r10, #0x0
+    ADC r3, r0, #0x0
+    UMLAL r10, r3, r12, lr
+    # A[1] * B[0]
+    LDR r12, [r1, #4]
+    LDR lr, [r2]
+    MOV r11, #0x0
+    UMLAL r4, r11, r12, lr
+    STR r4, [sp, #4]
+    ADDS r5, r5, r11
+    # A[1] * B[1]
+    LDR lr, [r2, #4]
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, lr
+    ADDS r6, r6, r11
+    # A[1] * B[2]
+    LDR lr, [r2, #8]
+    ADC r11, r0, #0x0
+    UMLAL r6, r11, r12, lr
+    ADDS r7, r7, r11
+    # A[1] * B[3]
+    LDR lr, [r2, #12]
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, lr
+    ADDS r8, r8, r11
+    # A[1] * B[4]
+    LDR lr, [r2, #16]
+    ADC r11, r0, #0x0
+    UMLAL r8, r11, r12, lr
+    ADDS r9, r9, r11
+    # A[1] * B[5]
+    LDR lr, [r2, #20]
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    ADDS r10, r10, r11
+    # A[1] * B[6]
+    LDR lr, [r2, #24]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[1] * B[7]
+    LDR lr, [r2, #28]
+    ADC r4, r0, #0x0
+    UMLAL r3, r4, r12, lr
+    # A[2] * B[0]
+    LDR r12, [r1, #8]
+    LDR lr, [r2]
+    MOV r11, #0x0
+    UMLAL r5, r11, r12, lr
+    STR r5, [sp, #8]
+    ADDS r6, r6, r11
+    # A[2] * B[1]
+    LDR lr, [r2, #4]
+    ADC r11, r0, #0x0
+    UMLAL r6, r11, r12, lr
+    ADDS r7, r7, r11
+    # A[2] * B[2]
+    LDR lr, [r2, #8]
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, lr
+    ADDS r8, r8, r11
+    # A[2] * B[3]
+    LDR lr, [r2, #12]
+    ADC r11, r0, #0x0
+    UMLAL r8, r11, r12, lr
+    ADDS r9, r9, r11
+    # A[2] * B[4]
+    LDR lr, [r2, #16]
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    ADDS r10, r10, r11
+    # A[2] * B[5]
+    LDR lr, [r2, #20]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[2] * B[6]
+    LDR lr, [r2, #24]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[2] * B[7]
+    LDR lr, [r2, #28]
+    ADC r5, r0, #0x0
+    UMLAL r4, r5, r12, lr
+    # A[3] * B[0]
+    LDR r12, [r1, #12]
+    LDR lr, [r2]
+    MOV r11, #0x0
+    UMLAL r6, r11, r12, lr
+    STR r6, [sp, #12]
+    ADDS r7, r7, r11
+    # A[3] * B[1]
+    LDR lr, [r2, #4]
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, lr
+    ADDS r8, r8, r11
+    # A[3] * B[2]
+    LDR lr, [r2, #8]
+    ADC r11, r0, #0x0
+    UMLAL r8, r11, r12, lr
+    ADDS r9, r9, r11
+    # A[3] * B[3]
+    LDR lr, [r2, #12]
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    ADDS r10, r10, r11
+    # A[3] * B[4]
+    LDR lr, [r2, #16]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[3] * B[5]
+    LDR lr, [r2, #20]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[3] * B[6]
+    LDR lr, [r2, #24]
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[3] * B[7]
+    LDR lr, [r2, #28]
+    ADC r6, r0, #0x0
+    UMLAL r5, r6, r12, lr
+    # A[4] * B[0]
+    LDR r12, [r1, #16]
+    LDR lr, [r2]
+    MOV r11, #0x0
+    UMLAL r7, r11, r12, lr
+    STR r7, [sp, #16]
+    ADDS r8, r8, r11
+    # A[4] * B[1]
+    LDR lr, [r2, #4]
+    ADC r11, r0, #0x0
+    UMLAL r8, r11, r12, lr
+    ADDS r9, r9, r11
+    # A[4] * B[2]
+    LDR lr, [r2, #8]
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    ADDS r10, r10, r11
+    # A[4] * B[3]
+    LDR lr, [r2, #12]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[4] * B[4]
+    LDR lr, [r2, #16]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[4] * B[5]
+    LDR lr, [r2, #20]
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[4] * B[6]
+    LDR lr, [r2, #24]
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, lr
+    ADDS r6, r6, r11
+    # A[4] * B[7]
+    LDR lr, [r2, #28]
+    ADC r7, r0, #0x0
+    UMLAL r6, r7, r12, lr
+    # A[5] * B[0]
+    LDR r12, [r1, #20]
+    LDR lr, [r2]
+    MOV r11, #0x0
+    UMLAL r8, r11, r12, lr
+    STR r8, [sp, #20]
+    ADDS r9, r9, r11
+    # A[5] * B[1]
+    LDR lr, [r2, #4]
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    ADDS r10, r10, r11
+    # A[5] * B[2]
+    LDR lr, [r2, #8]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[5] * B[3]
+    LDR lr, [r2, #12]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[5] * B[4]
+    LDR lr, [r2, #16]
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[5] * B[5]
+    LDR lr, [r2, #20]
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, lr
+    ADDS r6, r6, r11
+    # A[5] * B[6]
+    LDR lr, [r2, #24]
+    ADC r11, r0, #0x0
+    UMLAL r6, r11, r12, lr
+    ADDS r7, r7, r11
+    # A[5] * B[7]
+    LDR lr, [r2, #28]
+    ADC r8, r0, #0x0
+    UMLAL r7, r8, r12, lr
+    # A[6] * B[0]
+    LDR r12, [r1, #24]
+    LDR lr, [r2]
+    MOV r11, #0x0
+    UMLAL r9, r11, r12, lr
+    STR r9, [sp, #24]
+    ADDS r10, r10, r11
+    # A[6] * B[1]
+    LDR lr, [r2, #4]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[6] * B[2]
+    LDR lr, [r2, #8]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[6] * B[3]
+    LDR lr, [r2, #12]
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[6] * B[4]
+    LDR lr, [r2, #16]
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, lr
+    ADDS r6, r6, r11
+    # A[6] * B[5]
+    LDR lr, [r2, #20]
+    ADC r11, r0, #0x0
+    UMLAL r6, r11, r12, lr
+    ADDS r7, r7, r11
+    # A[6] * B[6]
+    LDR lr, [r2, #24]
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, lr
+    ADDS r8, r8, r11
+    # A[6] * B[7]
+    LDR lr, [r2, #28]
+    ADC r9, r0, #0x0
+    UMLAL r8, r9, r12, lr
+    # A[7] * B[0]
+    LDR r12, [r1, #28]
+    LDR lr, [r2]
+    MOV r11, #0x0
+    UMLAL r10, r11, r12, lr
+    STR r10, [sp, #28]
+    ADDS r3, r3, r11
+    # A[7] * B[1]
+    LDR lr, [r2, #4]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[7] * B[2]
+    LDR lr, [r2, #8]
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[7] * B[3]
+    LDR lr, [r2, #12]
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, lr
+    ADDS r6, r6, r11
+    # A[7] * B[4]
+    LDR lr, [r2, #16]
+    ADC r11, r0, #0x0
+    UMLAL r6, r11, r12, lr
+    ADDS r7, r7, r11
+    # A[7] * B[5]
+    LDR lr, [r2, #20]
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, lr
+    ADDS r8, r8, r11
+    # A[7] * B[6]
+    LDR lr, [r2, #24]
+    ADC r11, r0, #0x0
+    UMLAL r8, r11, r12, lr
+    ADDS r9, r9, r11
+    # A[7] * B[7]
+    LDR lr, [r2, #28]
+    ADC r10, r0, #0x0
+    UMLAL r9, r10, r12, lr
+    # Reduce
+    LDR r2, [sp, #28]
+    MOV lr, sp
+    MOV r12, #0x26
+    UMULL r10, r11, r10, r12
+    ADDS r10, r10, r2
+    ADC r11, r11, #0x0
+    MOV r12, #0x13
+    LSL r11, r11, #1
+    ORR r11, r11, r10, LSR #31
+    MUL r11, r11, r12
+    LDM lr!, {r1, r2}
+    MOV r12, #0x26
+    ADDS r1, r1, r11
+    ADC r11, r0, #0x0
+    UMLAL r1, r11, r3, r12
+    ADDS r2, r2, r11
+    ADC r11, r0, #0x0
+    UMLAL r2, r11, r4, r12
+    LDM lr!, {r3, r4}
+    ADDS r3, r3, r11
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r5, r12
+    ADDS r4, r4, r11
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r6, r12
+    LDM lr!, {r5, r6}
+    ADDS r5, r5, r11
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r7, r12
+    ADDS r6, r6, r11
+    ADC r11, r0, #0x0
+    UMLAL r6, r11, r8, r12
+    LDM lr!, {r7, r8}
+    ADDS r7, r7, r11
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r9, r12
+    BFC r10, #31, #1
+    ADDS r8, r10, r11
+    # Store
+    LDR r0, [sp, #36]
+    STM r0, {r1, r2, r3, r4, r5, r6, r7, r8}
+    ADD sp, sp, #0x28
+    POP {pc}
+    # Cycle Count = 406
+    .size fe_mul_op,.-fe_mul_op
+#else
     .text
     .align 4
     .globl fe_mul_op
@@ -1622,6 +2023,7 @@ fe_mul_op:
     POP {pc}
     # Cycle Count = 239
     .size fe_mul_op,.-fe_mul_op
+#endif /* WOLFSSL_SP_NO_UMAAL */
     .text
     .align 4
     .globl fe_mul
@@ -1632,6 +2034,279 @@ fe_mul:
     POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     # Cycle Count = 24
     .size fe_mul,.-fe_mul
+#ifdef WOLFSSL_SP_NO_UMAAL
+    .text
+    .align 4
+    .globl fe_sq_op
+    .type fe_sq_op, %function
+fe_sq_op:
+    PUSH {lr}
+    SUB sp, sp, #0x44
+    STR r0, [sp, #64]
+    # Square
+    MOV r0, #0x0
+    LDR r12, [r1]
+    # A[0] * A[1]
+    LDR lr, [r1, #4]
+    UMULL r4, r5, r12, lr
+    # A[0] * A[3]
+    LDR lr, [r1, #12]
+    UMULL r6, r7, r12, lr
+    # A[0] * A[5]
+    LDR lr, [r1, #20]
+    UMULL r8, r9, r12, lr
+    # A[0] * A[7]
+    LDR lr, [r1, #28]
+    UMULL r10, r3, r12, lr
+    # A[0] * A[2]
+    LDR lr, [r1, #8]
+    MOV r11, #0x0
+    UMLAL r5, r11, r12, lr
+    ADDS r6, r6, r11
+    # A[0] * A[4]
+    LDR lr, [r1, #16]
+    ADCS r7, r7, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, lr
+    ADDS r8, r8, r11
+    # A[0] * A[6]
+    LDR lr, [r1, #24]
+    ADCS r9, r9, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    ADDS r10, r10, r11
+    ADCS r3, r3, #0x0
+    STR r4, [sp, #4]
+    STR r5, [sp, #8]
+    # A[1] * A[2]
+    LDR r12, [r1, #4]
+    LDR lr, [r1, #8]
+    MOV r11, #0x0
+    UMLAL r6, r11, r12, lr
+    STR r6, [sp, #12]
+    ADDS r7, r7, r11
+    # A[1] * A[3]
+    LDR lr, [r1, #12]
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, lr
+    STR r7, [sp, #16]
+    ADDS r8, r8, r11
+    # A[1] * A[4]
+    LDR lr, [r1, #16]
+    ADC r11, r0, #0x0
+    UMLAL r8, r11, r12, lr
+    ADDS r9, r9, r11
+    # A[1] * A[5]
+    LDR lr, [r1, #20]
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    ADDS r10, r10, r11
+    # A[1] * A[6]
+    LDR lr, [r1, #24]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[1] * A[7]
+    LDR lr, [r1, #28]
+    ADC r4, r0, #0x0
+    UMLAL r3, r4, r12, lr
+    # A[2] * A[3]
+    LDR r12, [r1, #8]
+    LDR lr, [r1, #12]
+    MOV r11, #0x0
+    UMLAL r8, r11, r12, lr
+    STR r8, [sp, #20]
+    ADDS r9, r9, r11
+    # A[2] * A[4]
+    LDR lr, [r1, #16]
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, lr
+    STR r9, [sp, #24]
+    ADDS r10, r10, r11
+    # A[2] * A[5]
+    LDR lr, [r1, #20]
+    ADC r11, r0, #0x0
+    UMLAL r10, r11, r12, lr
+    ADDS r3, r3, r11
+    # A[2] * A[6]
+    LDR lr, [r1, #24]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[2] * A[7]
+    LDR lr, [r1, #28]
+    ADC r5, r0, #0x0
+    UMLAL r4, r5, r12, lr
+    # A[3] * A[4]
+    LDR r12, [r1, #12]
+    LDR lr, [r1, #16]
+    MOV r11, #0x0
+    UMLAL r10, r11, r12, lr
+    STR r10, [sp, #28]
+    ADDS r3, r3, r11
+    # A[3] * A[5]
+    LDR lr, [r1, #20]
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, lr
+    ADDS r4, r4, r11
+    # A[3] * A[6]
+    LDR lr, [r1, #24]
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[3] * A[7]
+    LDR lr, [r1, #28]
+    ADC r6, r0, #0x0
+    UMLAL r5, r6, r12, lr
+    # A[4] * A[5]
+    LDR r12, [r1, #16]
+    LDR lr, [r1, #20]
+    MOV r11, #0x0
+    UMLAL r4, r11, r12, lr
+    ADDS r5, r5, r11
+    # A[4] * A[6]
+    LDR lr, [r1, #24]
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, lr
+    ADDS r6, r6, r11
+    # A[4] * A[7]
+    LDR lr, [r1, #28]
+    ADC r7, r0, #0x0
+    UMLAL r6, r7, r12, lr
+    # A[5] * A[6]
+    LDR r12, [r1, #20]
+    LDR lr, [r1, #24]
+    MOV r11, #0x0
+    UMLAL r6, r11, r12, lr
+    ADDS r7, r7, r11
+    # A[5] * A[7]
+    LDR lr, [r1, #28]
+    ADC r8, r0, #0x0
+    UMLAL r7, r8, r12, lr
+    # A[6] * A[7]
+    LDR r12, [r1, #24]
+    LDR lr, [r1, #28]
+    MOV r9, #0x0
+    UMLAL r8, r9, r12, lr
+    ADD lr, sp, #0x20
+    STM lr, {r3, r4, r5, r6, r7, r8, r9}
+    ADD lr, sp, #0x4
+    LDM lr, {r4, r5, r6, r7, r8, r9, r10}
+    ADDS r4, r4, r4
+    ADCS r5, r5, r5
+    ADCS r6, r6, r6
+    ADCS r7, r7, r7
+    ADCS r8, r8, r8
+    ADCS r9, r9, r9
+    ADCS r10, r10, r10
+    STM lr!, {r4, r5, r6, r7, r8, r9, r10}
+    LDM lr, {r3, r4, r5, r6, r7, r8, r9}
+    ADCS r3, r3, r3
+    ADCS r4, r4, r4
+    ADCS r5, r5, r5
+    ADCS r6, r6, r6
+    ADCS r7, r7, r7
+    ADCS r8, r8, r8
+    ADCS r9, r9, r9
+    ADC r10, r0, #0x0
+    STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}
+    ADD lr, sp, #0x4
+    LDM lr, {r4, r5, r6, r7, r8, r9, r10}
+    MOV lr, sp
+    # A[0] * A[0]
+    LDR r12, [r1]
+    UMULL r3, r11, r12, r12
+    ADDS r4, r4, r11
+    # A[1] * A[1]
+    LDR r12, [r1, #4]
+    ADCS r5, r5, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, r12
+    ADDS r6, r6, r11
+    # A[2] * A[2]
+    LDR r12, [r1, #8]
+    ADCS r7, r7, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, r12
+    ADDS r8, r8, r11
+    # A[3] * A[3]
+    LDR r12, [r1, #12]
+    ADCS r9, r9, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r9, r11, r12, r12
+    ADDS r10, r10, r11
+    STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10}
+    LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10}
+    # A[4] * A[4]
+    LDR r12, [r1, #16]
+    ADCS r3, r3, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r12, r12
+    ADDS r4, r4, r11
+    # A[5] * A[5]
+    LDR r12, [r1, #20]
+    ADCS r5, r5, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r5, r11, r12, r12
+    ADDS r6, r6, r11
+    # A[6] * A[6]
+    LDR r12, [r1, #24]
+    ADCS r7, r7, #0x0
+    ADC r11, r0, #0x0
+    UMLAL r7, r11, r12, r12
+    ADDS r8, r8, r11
+    # A[7] * A[7]
+    LDR r12, [r1, #28]
+    ADCS r9, r9, #0x0
+    ADC r10, r10, #0x0
+    UMLAL r9, r10, r12, r12
+    # Reduce
+    LDR r2, [sp, #28]
+    MOV lr, sp
+    MOV r12, #0x26
+    UMULL r10, r11, r10, r12
+    ADDS r10, r10, r2
+    ADC r11, r11, #0x0
+    MOV r12, #0x13
+    LSL r11, r11, #1
+    ORR r11, r11, r10, LSR #31
+    MUL r11, r11, r12
+    LDM lr!, {r1, r2}
+    MOV r12, #0x26
+    ADDS r1, r1, r11
+    ADC r11, r0, #0x0
+    UMLAL r1, r11, r3, r12
+    ADDS r2, r2, r11
+    ADC r11, r0, #0x0
+    UMLAL r2, r11, r4, r12
+    LDM lr!, {r3, r4}
+    ADDS r3, r3, r11
+    ADC r11, r0, #0x0
+    UMLAL r3, r11, r5, r12
+    ADDS r4, r4, r11
+    ADC r11, r0, #0x0
+    UMLAL r4, r11, r6, r12
+    LDM lr!, {r5, r6}
+    ADDS r5, r5, r11
+    ADC r11, r0, #0x0
+    UMLAL
r5, r11, r7, r12 + ADDS r6, r6, r11 + ADC r11, r0, #0x0 + UMLAL r6, r11, r8, r12 + LDM lr!, {r7, r8} + ADDS r7, r7, r11 + ADC r11, r0, #0x0 + UMLAL r7, r11, r9, r12 + BFC r10, #31, #1 + ADDS r8, r10, r11 + # Store + LDR r0, [sp, #64] + STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} + ADD sp, sp, #0x44 + POP {pc} + # Cycle Count = 355 + .size fe_sq_op,.-fe_sq_op +#else .text .align 4 .globl fe_sq_op @@ -1750,6 +2425,7 @@ fe_sq_op: POP {pc} # Cycle Count = 179 .size fe_sq_op,.-fe_sq_op +#endif /* WOLFSSL_SP_NO_UMAAL */ .text .align 4 .globl fe_sq @@ -1761,6 +2437,57 @@ fe_sq: # Cycle Count = 24 .size fe_sq,.-fe_sq #ifdef HAVE_CURVE25519 +#ifdef WOLFSSL_SP_NO_UMAAL + .text + .align 4 + .globl fe_mul121666 + .type fe_mul121666, %function +fe_mul121666: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + # Multiply by 121666 + LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} + MOV r12, #0xdb42 + MOVT r12, #0x1 + UMULL r2, r10, r2, r12 + UMULL r3, r11, r3, r12 + ADDS r3, r3, r10 + ADC r11, r11, #0x0 + UMULL r4, r10, r4, r12 + ADDS r4, r4, r11 + ADC r10, r10, #0x0 + UMULL r5, r11, r5, r12 + ADDS r5, r5, r10 + ADC r11, r11, #0x0 + UMULL r6, r10, r6, r12 + ADDS r6, r6, r11 + ADC r10, r10, #0x0 + UMULL r7, r11, r7, r12 + ADDS r7, r7, r10 + ADC r11, r11, #0x0 + UMULL r8, r10, r8, r12 + ADDS r8, r8, r11 + ADC r10, r10, #0x0 + UMULL r9, r11, r9, r12 + ADDS r9, r9, r10 + MOV r12, #0x13 + ADC r11, r11, #0x0 + LSL r11, r11, #1 + ORR r11, r11, r9, LSR #31 + MUL r11, r11, r12 + ADDS r2, r2, r11 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + BFC r9, #31, #1 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 75 + .size fe_mul121666,.-fe_mul121666 +#else .text .align 4 .globl fe_mul121666 @@ -1797,6 +2524,7 @@ fe_mul121666: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} # Cycle Count = 69 .size fe_mul121666,.-fe_mul121666 +#endif /* WOLFSSL_SP_NO_UMAAL */ #ifndef WC_NO_CACHE_RESISTANT .text .align 4 @@ -2630,6 +3358,312 @@ L_fe_invert8: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} # Cycle Count = 292 .size fe_invert,.-fe_invert +#ifdef WOLFSSL_SP_NO_UMAAL + .text + .align 4 + .globl fe_sq2 + .type fe_sq2, %function +fe_sq2: + PUSH {lr} + SUB sp, sp, #0x44 + STR r0, [sp, #64] + # Square * 2 + MOV r0, #0x0 + LDR r12, [r1] + # A[0] * A[1] + LDR lr, [r1, #4] + UMULL r4, r5, r12, lr + # A[0] * A[3] + LDR lr, [r1, #12] + UMULL r6, r7, r12, lr + # A[0] * A[5] + LDR lr, [r1, #20] + UMULL r8, r9, r12, lr + # A[0] * A[7] + LDR lr, [r1, #28] + UMULL r10, r3, r12, lr + # A[0] * A[2] + LDR lr, [r1, #8] + MOV r11, #0x0 + UMLAL r5, r11, r12, lr + ADDS r6, r6, r11 + # A[0] * A[4] + LDR lr, [r1, #16] + ADCS r7, r7, #0x0 + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, lr + ADDS r8, r8, r11 + # A[0] * A[6] + LDR lr, [r1, #24] + ADCS r9, r9, #0x0 + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + ADDS r10, r10, r11 + ADCS r3, r3, #0x0 + STR r4, [sp, #4] + STR r5, [sp, #8] + # A[1] * A[2] + LDR r12, [r1, #4] + LDR lr, [r1, #8] + MOV r11, #0x0 + UMLAL r6, r11, r12, lr + STR r6, [sp, #12] + ADDS r7, r7, r11 + # A[1] * A[3] + LDR lr, [r1, #12] + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, lr + STR r7, [sp, #16] + ADDS r8, r8, r11 + # A[1] * A[4] + LDR lr, [r1, #16] + ADC r11, r0, #0x0 + UMLAL r8, r11, r12, lr + ADDS r9, r9, r11 + # A[1] * A[5] + LDR lr, [r1, #20] + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + ADDS r10, r10, r11 + # A[1] * A[6] + LDR lr, [r1, #24] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, 
r11 + # A[1] * A[7] + LDR lr, [r1, #28] + ADC r4, r0, #0x0 + UMLAL r3, r4, r12, lr + # A[2] * A[3] + LDR r12, [r1, #8] + LDR lr, [r1, #12] + MOV r11, #0x0 + UMLAL r8, r11, r12, lr + STR r8, [sp, #20] + ADDS r9, r9, r11 + # A[2] * A[4] + LDR lr, [r1, #16] + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + STR r9, [sp, #24] + ADDS r10, r10, r11 + # A[2] * A[5] + LDR lr, [r1, #20] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, r11 + # A[2] * A[6] + LDR lr, [r1, #24] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[2] * A[7] + LDR lr, [r1, #28] + ADC r5, r0, #0x0 + UMLAL r4, r5, r12, lr + # A[3] * A[4] + LDR r12, [r1, #12] + LDR lr, [r1, #16] + MOV r11, #0x0 + UMLAL r10, r11, r12, lr + STR r10, [sp, #28] + ADDS r3, r3, r11 + # A[3] * A[5] + LDR lr, [r1, #20] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[3] * A[6] + LDR lr, [r1, #24] + ADC r11, r0, #0x0 + UMLAL r4, r11, r12, lr + ADDS r5, r5, r11 + # A[3] * A[7] + LDR lr, [r1, #28] + ADC r6, r0, #0x0 + UMLAL r5, r6, r12, lr + # A[4] * A[5] + LDR r12, [r1, #16] + LDR lr, [r1, #20] + MOV r11, #0x0 + UMLAL r4, r11, r12, lr + ADDS r5, r5, r11 + # A[4] * A[6] + LDR lr, [r1, #24] + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, lr + ADDS r6, r6, r11 + # A[4] * A[7] + LDR lr, [r1, #28] + ADC r7, r0, #0x0 + UMLAL r6, r7, r12, lr + # A[5] * A[6] + LDR r12, [r1, #20] + LDR lr, [r1, #24] + MOV r11, #0x0 + UMLAL r6, r11, r12, lr + ADDS r7, r7, r11 + # A[5] * A[7] + LDR lr, [r1, #28] + ADC r8, r0, #0x0 + UMLAL r7, r8, r12, lr + # A[6] * A[7] + LDR r12, [r1, #24] + LDR lr, [r1, #28] + MOV r9, #0x0 + UMLAL r8, r9, r12, lr + ADD lr, sp, #0x20 + STM lr, {r3, r4, r5, r6, r7, r8, r9} + ADD lr, sp, #0x4 + LDM lr, {r4, r5, r6, r7, r8, r9, r10} + ADDS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADCS r7, r7, r7 + ADCS r8, r8, r8 + ADCS r9, r9, r9 + ADCS r10, r10, r10 + STM lr!, {r4, r5, r6, r7, r8, r9, r10} + LDM lr, {r3, r4, r5, r6, r7, r8, r9} + ADCS r3, r3, r3 + ADCS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADCS r7, r7, r7 + ADCS r8, r8, r8 + ADCS r9, r9, r9 + ADC r10, r0, #0x0 + STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} + ADD lr, sp, #0x4 + LDM lr, {r4, r5, r6, r7, r8, r9, r10} + MOV lr, sp + # A[0] * A[0] + LDR r12, [r1] + UMULL r3, r11, r12, r12 + ADDS r4, r4, r11 + # A[1] * A[1] + LDR r12, [r1, #4] + ADCS r5, r5, #0x0 + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, r12 + ADDS r6, r6, r11 + # A[2] * A[2] + LDR r12, [r1, #8] + ADCS r7, r7, #0x0 + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, r12 + ADDS r8, r8, r11 + # A[3] * A[3] + LDR r12, [r1, #12] + ADCS r9, r9, #0x0 + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, r12 + ADDS r10, r10, r11 + STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10} + LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10} + # A[4] * A[4] + LDR r12, [r1, #16] + ADCS r3, r3, #0x0 + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, r12 + ADDS r4, r4, r11 + # A[5] * A[5] + LDR r12, [r1, #20] + ADCS r5, r5, #0x0 + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, r12 + ADDS r6, r6, r11 + # A[6] * A[6] + LDR r12, [r1, #24] + ADCS r7, r7, #0x0 + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, r12 + ADDS r8, r8, r11 + # A[7] * A[7] + LDR r12, [r1, #28] + ADCS r9, r9, #0x0 + ADC r10, r10, #0x0 + UMLAL r9, r10, r12, r12 + # Reduce + LDR r2, [sp, #28] + MOV lr, sp + MOV r12, #0x26 + UMULL r10, r11, r10, r12 + ADDS r10, r10, r2 + ADC r11, r11, #0x0 + MOV r12, #0x13 + LSL r11, r11, #1 + ORR r11, r11, r10, LSR #31 + MUL r11, r11, r12 + LDM lr!, {r1, r2} + MOV r12, #0x26 + ADDS r1, r1, r11 + ADC r11, r0, #0x0 + UMLAL r1, r11, r3, r12 + ADDS r2, r2, r11 + 
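
Every "# Reduce" block above (fe_mul_op, fe_sq_op, and here in fe_sq2) uses the same curve25519 fold: with p = 2^255 - 19 we have 2^256 ≡ 38 (mod p) and 2^255 ≡ 19 (mod p), which is why the constants 0x26 (38) and 0x13 (19) appear. The high eight limbs of the 512-bit product are multiplied by 38 and added into the low eight, the bits at and above 2^255 are folded once more through 19, and BFC clears bit 255. A mathematically equivalent C sketch (fold_mod_p is an illustrative helper; the asm interleaves the two passes rather than running them back to back):

#include <stdint.h>

/* Fold a 16-limb (512-bit) product t into 8 limbs congruent mod
 * p = 2^255 - 19. The result may still be a partially reduced
 * representative; the final freeze happens in fe_tobytes. */
static void fold_mod_p(uint32_t r[8], const uint32_t t[16])
{
    uint32_t lo[8];
    uint64_t c = 0;
    uint64_t top;
    int i;

    for (i = 0; i < 8; i++) {          /* lo[i] = t[i] + 38*t[i+8] */
        c += (uint64_t)t[i] + (uint64_t)38 * t[i + 8];
        lo[i] = (uint32_t)c;
        c >>= 32;
    }
    top = (c << 1) | (lo[7] >> 31);    /* everything at/above 2^255 */
    lo[7] &= 0x7fffffffU;              /* the BFC ..., #31, #1 */
    c = (uint64_t)19 * top;            /* 2^255 == 19 (mod p) */
    for (i = 0; i < 8; i++) {
        c += lo[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
}
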
ADC r11, r0, #0x0 + UMLAL r2, r11, r4, r12 + LDM lr!, {r3, r4} + ADDS r3, r3, r11 + ADC r11, r0, #0x0 + UMLAL r3, r11, r5, r12 + ADDS r4, r4, r11 + ADC r11, r0, #0x0 + UMLAL r4, r11, r6, r12 + LDM lr!, {r5, r6} + ADDS r5, r5, r11 + ADC r11, r0, #0x0 + UMLAL r5, r11, r7, r12 + ADDS r6, r6, r11 + ADC r11, r0, #0x0 + UMLAL r6, r11, r8, r12 + LDM lr!, {r7, r8} + ADDS r7, r7, r11 + ADC r11, r0, #0x0 + UMLAL r7, r11, r9, r12 + BFC r10, #31, #1 + ADDS r8, r10, r11 + # Reduce if top bit set + MOV r12, #0x13 + AND r11, r12, r8, ASR #31 + ADDS r1, r1, r11 + ADCS r2, r2, #0x0 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + BFC r8, #31, #1 + ADCS r7, r7, #0x0 + ADC r8, r8, #0x0 + # Double + ADDS r1, r1, r1 + ADCS r2, r2, r2 + ADCS r3, r3, r3 + ADCS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADCS r7, r7, r7 + ADC r8, r8, r8 + # Reduce if top bit set + MOV r12, #0x13 + AND r11, r12, r8, ASR #31 + ADDS r1, r1, r11 + ADCS r2, r2, #0x0 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + BFC r8, #31, #1 + ADCS r7, r7, #0x0 + ADC r8, r8, #0x0 + # Store + LDR r0, [sp, #64] + STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} + ADD sp, sp, #0x44 + POP {pc} + # Cycle Count = 385 + .size fe_sq2,.-fe_sq2 +#else .text .align 4 .globl fe_sq2 @@ -2783,6 +3817,7 @@ fe_sq2: POP {pc} # Cycle Count = 213 .size fe_sq2,.-fe_sq2 +#endif /* WOLFSSL_SP_NO_UMAAL */ .text .align 4 .globl fe_pow22523 @@ -3360,13 +4395,441 @@ ge_sub: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} # Cycle Count = 138 .size ge_sub,.-ge_sub +#ifdef WOLFSSL_SP_NO_UMAAL .text .align 4 .globl sc_reduce .type sc_reduce, %function sc_reduce: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} - SUB sp, sp, #0x34 + SUB sp, sp, #0x38 + STR r0, [sp, #52] + # Load bits 252-511 + ADD r0, r0, #0x1c + LDM r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} + LSR lr, r9, #24 + LSL r9, r9, #4 + ORR r9, r9, r8, LSR #28 + LSL r8, r8, #4 + ORR r8, r8, r7, LSR #28 + LSL r7, r7, #4 + ORR r7, r7, r6, LSR #28 + LSL r6, r6, #4 + ORR r6, r6, r5, LSR #28 + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, #4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + BFC r9, #28, #4 + SUB r0, r0, #0x1c + # Add order times bits 504..511 + MOV r10, #0x2c13 + MOVT r10, #0xa30a + MOV r11, #0x9ce5 + MOVT r11, #0xa7ed + MOV r1, #0x0 + UMLAL r2, r1, r10, lr + ADDS r3, r3, r1 + MOV r1, #0x0 + ADC r1, r1, #0x0 + UMLAL r3, r1, r11, lr + MOV r10, #0x6329 + MOVT r10, #0x5d08 + MOV r11, #0x621 + MOVT r11, #0xeb21 + ADDS r4, r4, r1 + MOV r1, #0x0 + ADC r1, r1, #0x0 + UMLAL r4, r1, r10, lr + ADDS r5, r5, r1 + MOV r1, #0x0 + ADC r1, r1, #0x0 + UMLAL r5, r1, r11, lr + ADDS r6, r6, r1 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + SUBS r6, r6, lr + SBCS r7, r7, #0x0 + SBCS r8, r8, #0x0 + SBC r9, r9, #0x0 + # Sub product of top 8 words and order + MOV r12, sp + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM r0!, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + ADDS r10, 
r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + BFC r11, #28, #4 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r0, r0, #0x10 + SUB r12, r12, #0x20 + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV lr, #0x0 + LDM r12, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV lr, #0x0 + LDM r12, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV lr, #0x0 + LDM r12, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 + # Subtract at 4 * 32 + LDM r12, {r10, r11} + SUBS r10, r10, r2 + SBCS r11, r11, r3 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r4 + SBCS r11, r11, r5 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r6 + SBCS r11, r11, r7 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r8 + SBC r11, r11, r9 + STM r12!, {r10, r11} + SUB r12, r12, #0x24 + ASR lr, r11, #25 + # Conditionally subtract order starting at bit 125 + MOV r1, #0xa0000000 + MOV r2, #0xba7d + MOVT r2, #0x4b9e + MOV r3, #0x4c63 + MOVT r3, #0xcb02 + MOV r4, #0xf39a + MOVT r4, #0xd45e + MOV r5, #0xdf3b + MOVT r5, #0x29b + MOV r9, #0x2000000 + AND r1, r1, lr + AND r2, r2, lr + AND r3, r3, lr + AND r4, r4, lr + AND r5, r5, lr + AND r9, r9, lr + LDM r12, {r10, r11} + ADDS r10, r10, r1 + ADCS r11, r11, r2 + STM r12!, {r10, r11} + LDM 
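
The scalar reduction above works modulo the group order L = 2^252 + d, with d = 0x14def9dea2f79cd65812631a5cf5d3ed, so 2^252 ≡ -d (mod L): a value split as hi*2^252 + lo (the LSL/ORR shuffle under "# Load bits 252-511") reduces to lo - hi*d. To stay unsigned, the code instead multiplies hi by 2^128 - d, the two's-complement negation of d whose limbs are the constants loaded above (0xa30a2c13, 0xa7ed9ce5, 0x5d086329, 0xeb210621, one pass per "* -..." comment), and the "# Subtract at 4 * 32" step then removes the surplus hi*2^128. A short C sketch deriving those multipliers (D and neg_d are illustrative names, not wolfSSL identifiers):

#include <stdint.h>

/* d = L - 2^252, little-endian 32-bit limbs */
static const uint32_t D[4] = {
    0x5cf5d3edU, 0x5812631aU, 0xa2f79cd6U, 0x14def9deU
};

/* nd = 2^128 - d, i.e. ~d + 1 across the four limbs */
static void neg_d(uint32_t nd[4])
{
    uint64_t c = 1;
    int i;
    for (i = 0; i < 4; i++) {
        c += (uint32_t)~D[i];
        nd[i] = (uint32_t)c;  /* 0xa30a2c13, 0xa7ed9ce5,
                                 0x5d086329, 0xeb210621 */
        c >>= 32;
    }
}

One such pass takes the 512-bit input down to roughly 385 bits; a second pass over "bits 252-376" plus the conditional correction steps visible above land the result in [0, L).
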
r12, {r10, r11} + ADCS r10, r10, r3 + ADCS r11, r11, r4 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, r5 + ADCS r11, r11, #0x0 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, #0x0 + ADCS r11, r11, #0x0 + STM r12!, {r10, r11} + LDM r12, {r10} + ADCS r10, r10, #0x0 + STM r12!, {r10} + SUB r0, r0, #0x10 + MOV r12, sp + # Load bits 252-376 + ADD r12, r12, #0x1c + LDM r12, {r1, r2, r3, r4, r5} + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, #4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + BFC r5, #29, #3 + SUB r12, r12, #0x1c + # Sub product of top 4 words and order + MOV r0, sp + # * -5cf5d3ed + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, lr, r2, r1 + ADDS r7, r7, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r7, lr, r3, r1 + ADDS r8, r8, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r8, lr, r4, r1 + ADDS r9, r9, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r9, lr, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # * -5812631b + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV r10, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, r10, r2, r1 + ADDS r7, r7, r10 + MOV r10, #0x0 + ADC r10, r10, #0x0 + UMLAL r7, r10, r3, r1 + ADDS r8, r8, r10 + MOV r10, #0x0 + ADC r10, r10, #0x0 + UMLAL r8, r10, r4, r1 + ADDS r9, r9, r10 + MOV r10, #0x0 + ADC r10, r10, #0x0 + UMLAL r9, r10, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # * -a2f79cd7 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV r11, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, r11, r2, r1 + ADDS r7, r7, r11 + MOV r11, #0x0 + ADC r11, r11, #0x0 + UMLAL r7, r11, r3, r1 + ADDS r8, r8, r11 + MOV r11, #0x0 + ADC r11, r11, #0x0 + UMLAL r8, r11, r4, r1 + ADDS r9, r9, r11 + MOV r11, #0x0 + ADC r11, r11, #0x0 + UMLAL r9, r11, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # * -14def9df + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV r12, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, r12, r2, r1 + ADDS r7, r7, r12 + MOV r12, #0x0 + ADC r12, r12, #0x0 + UMLAL r7, r12, r3, r1 + ADDS r8, r8, r12 + MOV r12, #0x0 + ADC r12, r12, #0x0 + UMLAL r8, r12, r4, r1 + ADDS r9, r9, r12 + MOV r12, #0x0 + ADC r12, r12, #0x0 + UMLAL r9, r12, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # Add overflows at 4 * 32 + LDM r0, {r6, r7, r8, r9} + BFC r9, #28, #4 + ADDS r6, r6, lr + ADCS r7, r7, r10 + ADCS r8, r8, r11 + ADC r9, r9, r12 + # Subtract top at 4 * 32 + SUBS r6, r6, r2 + SBCS r7, r7, r3 + SBCS r8, r8, r4 + SBCS r9, r9, r5 + SBC r1, r1, r1 + SUB r0, r0, #0x10 + LDM r0, {r2, r3, r4, r5} + MOV r10, #0xd3ed + MOVT r10, #0x5cf5 + MOV r11, #0x631a + MOVT r11, #0x5812 + MOV r12, #0x9cd6 + MOVT r12, #0xa2f7 + MOV lr, #0xf9de + MOVT lr, #0x14de + AND r10, r10, r1 + AND r11, r11, r1 + AND r12, r12, r1 + AND lr, lr, r1 + ADDS r2, r2, r10 + ADCS r3, r3, r11 + ADCS r4, r4, r12 + ADCS r5, r5, lr + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + AND r1, r1, #0x10000000 + ADCS r8, r8, #0x0 + ADC r9, r9, r1 + BFC r9, #28, #4 + # Store result + LDR r0, [sp, #52] + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + ADD sp, sp, #0x38 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 588 + .size sc_reduce,.-sc_reduce +#else + .text + .align 4 + .globl sc_reduce + .type sc_reduce, %function +sc_reduce: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x38 + STR r0, [sp, #52] # Load bits 252-511 ADD r0, r0, #0x1c LDM r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} @@ -3412,96 +4875,107 @@ sc_reduce: SBCS r8, r8, #0x0 SBC r9, r9, #0x0 # 
Sub product of top 8 words and order + MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 - LDM r0!, {r10, r11, r12} + LDM r0!, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM r0!, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + UMAAL r10, lr, r4, r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} LDM r0!, {r10, r11} UMAAL r10, lr, r8, r1 BFC r11, #28, #4 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} + STM r12!, {r10, r11, lr} SUB r0, r0, #0x10 - SUB sp, sp, #0x20 + SUB r12, r12, #0x20 MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV lr, #0x0 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r4, r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} UMAAL r10, lr, r8, r1 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} - SUB sp, sp, #0x20 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 MOV r1, #0x6329 MOVT r1, #0x5d08 MOV lr, #0x0 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r4, r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} UMAAL r10, lr, r8, r1 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} - SUB sp, sp, #0x20 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 MOV r1, #0x621 MOVT r1, #0xeb21 MOV lr, #0x0 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r4, r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} UMAAL r10, lr, r8, r1 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} - SUB sp, sp, #0x20 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 # Subtract at 4 * 32 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 - SBCS r12, r12, r4 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - SBCS r10, r10, r5 - SBCS r11, r11, r6 - SBCS r12, r12, r7 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r4 + SBCS r11, r11, r5 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r6 + SBCS r11, r11, r7 + STM r12!, {r10, r11} + LDM r12, {r10, r11} SBCS r10, r10, r8 SBC r11, r11, r9 - STM sp!, {r10, r11} - SUB sp, sp, #0x24 + STM r12!, {r10, r11} + SUB r12, r12, #0x24 ASR lr, r11, #25 # Conditionally subtract order 
starting at bit 125 MOV r1, #0xa0000000 @@ -3520,26 +4994,30 @@ sc_reduce: AND r4, r4, lr AND r5, r5, lr AND r9, r9, lr - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} ADDS r10, r10, r1 ADCS r11, r11, r2 - ADCS r12, r12, r3 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - ADCS r10, r10, r4 - ADCS r11, r11, r5 - ADCS r12, r12, #0x0 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, r3 + ADCS r11, r11, r4 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, r5 + ADCS r11, r11, #0x0 + STM r12!, {r10, r11} + LDM r12, {r10, r11} ADCS r10, r10, #0x0 ADCS r11, r11, #0x0 - ADCS r12, r12, r9 - STM sp!, {r10, r11, r12} - SUB sp, sp, #0x30 + STM r12!, {r10, r11} + LDM r12, {r10} + ADCS r10, r10, #0x0 + STM r12!, {r10} SUB r0, r0, #0x10 + MOV r12, sp # Load bits 252-376 - ADD sp, sp, #0x1c - LDM sp, {r1, r2, r3, r4, r5} + ADD r12, r12, #0x1c + LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 ORR r5, r5, r4, LSR #28 LSL r4, r4, #4 @@ -3549,54 +5027,55 @@ sc_reduce: LSL r2, r2, #4 ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 - SUB sp, sp, #0x1c - # Sub product of top 8 words and order + SUB r12, r12, #0x1c + # Sub product of top 4 words and order + MOV r0, sp # * -5cf5d3ed MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, lr, r2, r1 UMAAL r7, lr, r3, r1 UMAAL r8, lr, r4, r1 UMAAL r9, lr, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # * -5812631b MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, r10, r2, r1 UMAAL r7, r10, r3, r1 UMAAL r8, r10, r4, r1 UMAAL r9, r10, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # * -a2f79cd7 MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, r11, r2, r1 UMAAL r7, r11, r3, r1 UMAAL r8, r11, r4, r1 UMAAL r9, r11, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # * -14def9df MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, r12, r2, r1 UMAAL r7, r12, r3, r1 UMAAL r8, r12, r4, r1 UMAAL r9, r12, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # Add overflows at 4 * 32 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 @@ -3608,8 +5087,8 @@ sc_reduce: SBCS r8, r8, r4 SBCS r9, r9, r5 SBC r1, r1, r1 - SUB sp, sp, #0x10 - LDM sp, {r2, r3, r4, r5} + SUB r0, r0, #0x10 + LDM r0, {r2, r3, r4, r5} MOV r10, #0xd3ed MOVT r10, #0x5cf5 MOV r11, #0x631a @@ -3633,12 +5112,798 @@ sc_reduce: ADC r9, r9, r1 BFC r9, #28, #4 # Store result + LDR r0, [sp, #52] STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} - ADD sp, sp, #0x34 + ADD sp, sp, #0x38 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 482 + # Cycle Count = 502 .size sc_reduce,.-sc_reduce +#endif /* WOLFSSL_SP_NO_UMAAL */ #ifdef HAVE_ED25519_SIGN +#ifdef WOLFSSL_SP_NO_UMAAL + .text + .align 4 + .globl sc_muladd + .type sc_muladd, %function +sc_muladd: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x50 + ADD lr, sp, #0x44 + STM lr, {r0, r1, r3} + MOV r0, #0x0 + LDR r12, [r1] + # A[0] * B[0] + LDR lr, [r2] + UMULL r3, r4, r12, lr + # A[0] * B[2] + LDR lr, [r2, #8] + UMULL r5, r6, r12, lr + # A[0] * B[4] + LDR lr, [r2, #16] + UMULL r7, r8, r12, lr + # A[0] * 
B[6] + LDR lr, [r2, #24] + UMULL r9, r10, r12, lr + STR r3, [sp] + # A[0] * B[1] + LDR lr, [r2, #4] + MOV r11, r0 + UMLAL r4, r11, r12, lr + ADDS r5, r5, r11 + # A[0] * B[3] + LDR lr, [r2, #12] + ADCS r6, r6, #0x0 + ADC r11, r0, #0x0 + UMLAL r6, r11, r12, lr + ADDS r7, r7, r11 + # A[0] * B[5] + LDR lr, [r2, #20] + ADCS r8, r8, #0x0 + ADC r11, r0, #0x0 + UMLAL r8, r11, r12, lr + ADDS r9, r9, r11 + # A[0] * B[7] + LDR lr, [r2, #28] + ADCS r10, r10, #0x0 + ADC r3, r0, #0x0 + UMLAL r10, r3, r12, lr + # A[1] * B[0] + LDR r12, [r1, #4] + LDR lr, [r2] + MOV r11, #0x0 + UMLAL r4, r11, r12, lr + STR r4, [sp, #4] + ADDS r5, r5, r11 + # A[1] * B[1] + LDR lr, [r2, #4] + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, lr + ADDS r6, r6, r11 + # A[1] * B[2] + LDR lr, [r2, #8] + ADC r11, r0, #0x0 + UMLAL r6, r11, r12, lr + ADDS r7, r7, r11 + # A[1] * B[3] + LDR lr, [r2, #12] + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, lr + ADDS r8, r8, r11 + # A[1] * B[4] + LDR lr, [r2, #16] + ADC r11, r0, #0x0 + UMLAL r8, r11, r12, lr + ADDS r9, r9, r11 + # A[1] * B[5] + LDR lr, [r2, #20] + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + ADDS r10, r10, r11 + # A[1] * B[6] + LDR lr, [r2, #24] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, r11 + # A[1] * B[7] + LDR lr, [r2, #28] + ADC r4, r0, #0x0 + UMLAL r3, r4, r12, lr + # A[2] * B[0] + LDR r12, [r1, #8] + LDR lr, [r2] + MOV r11, #0x0 + UMLAL r5, r11, r12, lr + STR r5, [sp, #8] + ADDS r6, r6, r11 + # A[2] * B[1] + LDR lr, [r2, #4] + ADC r11, r0, #0x0 + UMLAL r6, r11, r12, lr + ADDS r7, r7, r11 + # A[2] * B[2] + LDR lr, [r2, #8] + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, lr + ADDS r8, r8, r11 + # A[2] * B[3] + LDR lr, [r2, #12] + ADC r11, r0, #0x0 + UMLAL r8, r11, r12, lr + ADDS r9, r9, r11 + # A[2] * B[4] + LDR lr, [r2, #16] + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + ADDS r10, r10, r11 + # A[2] * B[5] + LDR lr, [r2, #20] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, r11 + # A[2] * B[6] + LDR lr, [r2, #24] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[2] * B[7] + LDR lr, [r2, #28] + ADC r5, r0, #0x0 + UMLAL r4, r5, r12, lr + # A[3] * B[0] + LDR r12, [r1, #12] + LDR lr, [r2] + MOV r11, #0x0 + UMLAL r6, r11, r12, lr + STR r6, [sp, #12] + ADDS r7, r7, r11 + # A[3] * B[1] + LDR lr, [r2, #4] + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, lr + ADDS r8, r8, r11 + # A[3] * B[2] + LDR lr, [r2, #8] + ADC r11, r0, #0x0 + UMLAL r8, r11, r12, lr + ADDS r9, r9, r11 + # A[3] * B[3] + LDR lr, [r2, #12] + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + ADDS r10, r10, r11 + # A[3] * B[4] + LDR lr, [r2, #16] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, r11 + # A[3] * B[5] + LDR lr, [r2, #20] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[3] * B[6] + LDR lr, [r2, #24] + ADC r11, r0, #0x0 + UMLAL r4, r11, r12, lr + ADDS r5, r5, r11 + # A[3] * B[7] + LDR lr, [r2, #28] + ADC r6, r0, #0x0 + UMLAL r5, r6, r12, lr + # A[4] * B[0] + LDR r12, [r1, #16] + LDR lr, [r2] + MOV r11, #0x0 + UMLAL r7, r11, r12, lr + STR r7, [sp, #16] + ADDS r8, r8, r11 + # A[4] * B[1] + LDR lr, [r2, #4] + ADC r11, r0, #0x0 + UMLAL r8, r11, r12, lr + ADDS r9, r9, r11 + # A[4] * B[2] + LDR lr, [r2, #8] + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + ADDS r10, r10, r11 + # A[4] * B[3] + LDR lr, [r2, #12] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, r11 + # A[4] * B[4] + LDR lr, [r2, #16] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[4] * B[5] + LDR lr, [r2, #20] + ADC r11, r0, #0x0 + UMLAL r4, r11, r12, lr + ADDS 
r5, r5, r11 + # A[4] * B[6] + LDR lr, [r2, #24] + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, lr + ADDS r6, r6, r11 + # A[4] * B[7] + LDR lr, [r2, #28] + ADC r7, r0, #0x0 + UMLAL r6, r7, r12, lr + # A[5] * B[0] + LDR r12, [r1, #20] + LDR lr, [r2] + MOV r11, #0x0 + UMLAL r8, r11, r12, lr + STR r8, [sp, #20] + ADDS r9, r9, r11 + # A[5] * B[1] + LDR lr, [r2, #4] + ADC r11, r0, #0x0 + UMLAL r9, r11, r12, lr + ADDS r10, r10, r11 + # A[5] * B[2] + LDR lr, [r2, #8] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, r11 + # A[5] * B[3] + LDR lr, [r2, #12] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[5] * B[4] + LDR lr, [r2, #16] + ADC r11, r0, #0x0 + UMLAL r4, r11, r12, lr + ADDS r5, r5, r11 + # A[5] * B[5] + LDR lr, [r2, #20] + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, lr + ADDS r6, r6, r11 + # A[5] * B[6] + LDR lr, [r2, #24] + ADC r11, r0, #0x0 + UMLAL r6, r11, r12, lr + ADDS r7, r7, r11 + # A[5] * B[7] + LDR lr, [r2, #28] + ADC r8, r0, #0x0 + UMLAL r7, r8, r12, lr + # A[6] * B[0] + LDR r12, [r1, #24] + LDR lr, [r2] + MOV r11, #0x0 + UMLAL r9, r11, r12, lr + STR r9, [sp, #24] + ADDS r10, r10, r11 + # A[6] * B[1] + LDR lr, [r2, #4] + ADC r11, r0, #0x0 + UMLAL r10, r11, r12, lr + ADDS r3, r3, r11 + # A[6] * B[2] + LDR lr, [r2, #8] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[6] * B[3] + LDR lr, [r2, #12] + ADC r11, r0, #0x0 + UMLAL r4, r11, r12, lr + ADDS r5, r5, r11 + # A[6] * B[4] + LDR lr, [r2, #16] + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, lr + ADDS r6, r6, r11 + # A[6] * B[5] + LDR lr, [r2, #20] + ADC r11, r0, #0x0 + UMLAL r6, r11, r12, lr + ADDS r7, r7, r11 + # A[6] * B[6] + LDR lr, [r2, #24] + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, lr + ADDS r8, r8, r11 + # A[6] * B[7] + LDR lr, [r2, #28] + ADC r9, r0, #0x0 + UMLAL r8, r9, r12, lr + # A[7] * B[0] + LDR r12, [r1, #28] + LDR lr, [r2] + MOV r11, #0x0 + UMLAL r10, r11, r12, lr + STR r10, [sp, #28] + ADDS r3, r3, r11 + # A[7] * B[1] + LDR lr, [r2, #4] + ADC r11, r0, #0x0 + UMLAL r3, r11, r12, lr + ADDS r4, r4, r11 + # A[7] * B[2] + LDR lr, [r2, #8] + ADC r11, r0, #0x0 + UMLAL r4, r11, r12, lr + ADDS r5, r5, r11 + # A[7] * B[3] + LDR lr, [r2, #12] + ADC r11, r0, #0x0 + UMLAL r5, r11, r12, lr + ADDS r6, r6, r11 + # A[7] * B[4] + LDR lr, [r2, #16] + ADC r11, r0, #0x0 + UMLAL r6, r11, r12, lr + ADDS r7, r7, r11 + # A[7] * B[5] + LDR lr, [r2, #20] + ADC r11, r0, #0x0 + UMLAL r7, r11, r12, lr + ADDS r8, r8, r11 + # A[7] * B[6] + LDR lr, [r2, #24] + ADC r11, r0, #0x0 + UMLAL r8, r11, r12, lr + ADDS r9, r9, r11 + # A[7] * B[7] + LDR lr, [r2, #28] + ADC r10, r0, #0x0 + UMLAL r9, r10, r12, lr + ADD lr, sp, #0x20 + STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} + MOV r0, sp + # Add c to a * b + LDR lr, [sp, #76] + LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + LDM lr!, {r1, r10, r11, r12} + ADDS r2, r2, r1 + ADCS r3, r3, r10 + ADCS r4, r4, r11 + ADCS r5, r5, r12 + LDM lr!, {r1, r10, r11, r12} + ADCS r6, r6, r1 + ADCS r7, r7, r10 + ADCS r8, r8, r11 + ADCS r9, r9, r12 + MOV r1, r9 + STM r0!, {r2, r3, r4, r5, r6, r7, r8, r9} + LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + ADCS r2, r2, #0x0 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + SUB r0, r0, #0x20 + # Get 252..503 and 504..507 + LSR lr, r9, #24 + LSL r9, r9, #4 + ORR r9, r9, r8, LSR #28 + LSL r8, r8, #4 + ORR r8, r8, r7, LSR #28 + LSL r7, r7, #4 + ORR r7, r7, r6, LSR #28 + LSL r6, r6, #4 + ORR r6, r6, r5, LSR #28 + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, 
#4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + BFC r9, #28, #4 + # Add order times bits 504..507 + MOV r10, #0x2c13 + MOVT r10, #0xa30a + MOV r11, #0x9ce5 + MOVT r11, #0xa7ed + MOV r1, #0x0 + UMLAL r2, r1, r10, lr + ADDS r3, r3, r1 + MOV r1, #0x0 + ADC r1, r1, #0x0 + UMLAL r3, r1, r11, lr + MOV r10, #0x6329 + MOVT r10, #0x5d08 + MOV r11, #0x621 + MOVT r11, #0xeb21 + ADDS r4, r4, r1 + MOV r1, #0x0 + ADC r1, r1, #0x0 + UMLAL r4, r1, r10, lr + ADDS r5, r5, r1 + MOV r1, #0x0 + ADC r1, r1, #0x0 + UMLAL r5, r1, r11, lr + ADDS r6, r6, r1 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + SUBS r6, r6, lr + SBCS r7, r7, #0x0 + SBCS r8, r8, #0x0 + SBC r9, r9, #0x0 + # Sub product of top 8 words and order + MOV r12, sp + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM r0!, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + BFC r11, #28, #4 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r0, r0, #0x10 + SUB r12, r12, #0x20 + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV lr, #0x0 + LDM r12, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV lr, #0x0 + LDM r12, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV lr, #0x0 + LDM r12, {r10, r11} + UMLAL r10, lr, r2, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r3, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, 
r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r4, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r6, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADDS r10, r10, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r10, lr, r8, r1 + ADDS r11, r11, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r11, lr, r9, r1 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 + # Subtract at 4 * 32 + LDM r12, {r10, r11} + SUBS r10, r10, r2 + SBCS r11, r11, r3 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r4 + SBCS r11, r11, r5 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r6 + SBCS r11, r11, r7 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r8 + SBC r11, r11, r9 + STM r12!, {r10, r11} + SUB r12, r12, #0x24 + ASR lr, r11, #25 + # Conditionally subtract order starting at bit 125 + MOV r1, #0xa0000000 + MOV r2, #0xba7d + MOVT r2, #0x4b9e + MOV r3, #0x4c63 + MOVT r3, #0xcb02 + MOV r4, #0xf39a + MOVT r4, #0xd45e + MOV r5, #0xdf3b + MOVT r5, #0x29b + MOV r9, #0x2000000 + AND r1, r1, lr + AND r2, r2, lr + AND r3, r3, lr + AND r4, r4, lr + AND r5, r5, lr + AND r9, r9, lr + LDM r12, {r10, r11} + ADDS r10, r10, r1 + ADCS r11, r11, r2 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, r3 + ADCS r11, r11, r4 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, r5 + ADCS r11, r11, #0x0 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, #0x0 + ADCS r11, r11, #0x0 + STM r12!, {r10, r11} + LDM r12, {r10} + ADCS r10, r10, #0x0 + STM r12!, {r10} + SUB r0, r0, #0x10 + MOV r12, sp + # Load bits 252-376 + ADD r12, r12, #0x1c + LDM r12, {r1, r2, r3, r4, r5} + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, #4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + BFC r5, #29, #3 + SUB r12, r12, #0x1c + # Sub product of top 4 words and order + MOV r0, sp + # * -5cf5d3ed + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, lr, r2, r1 + ADDS r7, r7, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r7, lr, r3, r1 + ADDS r8, r8, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r8, lr, r4, r1 + ADDS r9, r9, lr + MOV lr, #0x0 + ADC lr, lr, #0x0 + UMLAL r9, lr, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # * -5812631b + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV r10, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, r10, r2, r1 + ADDS r7, r7, r10 + MOV r10, #0x0 + ADC r10, r10, #0x0 + UMLAL r7, r10, r3, r1 + ADDS r8, r8, r10 + MOV r10, #0x0 + ADC r10, r10, #0x0 + UMLAL r8, r10, r4, r1 + ADDS r9, r9, r10 + MOV r10, #0x0 + ADC r10, r10, #0x0 + UMLAL r9, r10, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # * -a2f79cd7 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV r11, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, r11, r2, r1 + ADDS r7, r7, r11 + MOV r11, #0x0 + ADC r11, r11, #0x0 + UMLAL r7, r11, r3, r1 + ADDS r8, r8, r11 + MOV r11, #0x0 + ADC r11, r11, #0x0 + UMLAL r8, r11, r4, r1 + ADDS r9, r9, r11 + MOV r11, #0x0 + ADC r11, r11, #0x0 + UMLAL r9, r11, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # * -14def9df + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV r12, #0x0 + LDM r0, {r6, r7, r8, r9} + UMLAL r6, r12, r2, r1 + ADDS r7, r7, r12 + MOV r12, #0x0 + ADC r12, r12, #0x0 + UMLAL r7, r12, r3, r1 + ADDS r8, r8, r12 + MOV r12, 
#0x0 + ADC r12, r12, #0x0 + UMLAL r8, r12, r4, r1 + ADDS r9, r9, r12 + MOV r12, #0x0 + ADC r12, r12, #0x0 + UMLAL r9, r12, r5, r1 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 + # Add overflows at 4 * 32 + LDM r0, {r6, r7, r8, r9} + BFC r9, #28, #4 + ADDS r6, r6, lr + ADCS r7, r7, r10 + ADCS r8, r8, r11 + ADC r9, r9, r12 + # Subtract top at 4 * 32 + SUBS r6, r6, r2 + SBCS r7, r7, r3 + SBCS r8, r8, r4 + SBCS r9, r9, r5 + SBC r1, r1, r1 + SUB r0, r0, #0x10 + LDM r0, {r2, r3, r4, r5} + MOV r10, #0xd3ed + MOVT r10, #0x5cf5 + MOV r11, #0x631a + MOVT r11, #0x5812 + MOV r12, #0x9cd6 + MOVT r12, #0xa2f7 + MOV lr, #0xf9de + MOVT lr, #0x14de + AND r10, r10, r1 + AND r11, r11, r1 + AND r12, r12, r1 + AND lr, lr, r1 + ADDS r2, r2, r10 + ADCS r3, r3, r11 + ADCS r4, r4, r12 + ADCS r5, r5, lr + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + AND r1, r1, #0x10000000 + ADCS r8, r8, #0x0 + ADC r9, r9, r1 + BFC r9, #28, #4 + LDR r0, [sp, #68] + # Store result + STR r2, [r0] + STR r3, [r0, #4] + STR r4, [r0, #8] + STR r5, [r0, #12] + STR r6, [r0, #16] + STR r7, [r0, #20] + STR r8, [r0, #24] + STR r9, [r0, #28] + ADD sp, sp, #0x50 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 994 + .size sc_muladd,.-sc_muladd +#else .text .align 4 .globl sc_muladd @@ -3747,10 +6012,10 @@ sc_muladd: MOV r3, r12 ADD lr, sp, #0x20 STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} - LDR r0, [sp, #68] + MOV r0, sp # Add c to a * b LDR lr, [sp, #76] - LDM sp!, {r2, r3, r4, r5, r6, r7, r8, r9} + LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} LDM lr!, {r1, r10, r11, r12} ADDS r2, r2, r1 ADCS r3, r3, r10 @@ -3762,8 +6027,8 @@ sc_muladd: ADCS r8, r8, r11 ADCS r9, r9, r12 MOV r1, r9 - STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} - LDM sp, {r2, r3, r4, r5, r6, r7, r8, r9} + STM r0!, {r2, r3, r4, r5, r6, r7, r8, r9} + LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} ADCS r2, r2, #0x0 ADCS r3, r3, #0x0 ADCS r4, r4, #0x0 @@ -3772,10 +6037,9 @@ sc_muladd: ADCS r7, r7, #0x0 ADCS r8, r8, #0x0 ADC r9, r9, #0x0 - SUB sp, sp, #0x20 + SUB r0, r0, #0x20 # Get 252..503 and 504..507 LSR lr, r9, #24 - BFC r9, #24, #8 LSL r9, r9, #4 ORR r9, r9, r8, LSR #28 LSL r8, r8, #4 @@ -3792,6 +6056,7 @@ sc_muladd: ORR r3, r3, r2, LSR #28 LSL r2, r2, #4 ORR r2, r2, r1, LSR #28 + BFC r9, #28, #4 # Add order times bits 504..507 MOV r10, #0x2c13 MOVT r10, #0xa30a @@ -3815,96 +6080,107 @@ sc_muladd: SBCS r8, r8, #0x0 SBC r9, r9, #0x0 # Sub product of top 8 words and order + MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 - LDM r0!, {r10, r11, r12} + LDM r0!, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM r0!, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + UMAAL r10, lr, r4, r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r0!, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} LDM r0!, {r10, r11} UMAAL r10, lr, r8, r1 BFC r11, #28, #4 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} + STM r12!, {r10, r11, lr} SUB r0, r0, #0x10 - SUB sp, sp, #0x20 + SUB r12, r12, #0x20 MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV lr, #0x0 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r4, 
r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} UMAAL r10, lr, r8, r1 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} - SUB sp, sp, #0x20 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 MOV r1, #0x6329 MOVT r1, #0x5d08 MOV lr, #0x0 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r4, r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} UMAAL r10, lr, r8, r1 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} - SUB sp, sp, #0x20 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 MOV r1, #0x621 MOVT r1, #0xeb21 MOV lr, #0x0 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} UMLAL r10, lr, r2, r1 UMAAL r11, lr, r3, r1 - UMAAL r12, lr, r4, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - UMAAL r10, lr, r5, r1 - UMAAL r11, lr, r6, r1 - UMAAL r12, lr, r7, r1 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r4, r1 + UMAAL r11, lr, r5, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + UMAAL r10, lr, r6, r1 + UMAAL r11, lr, r7, r1 + STM r12!, {r10, r11} + LDM r12, {r10, r11} UMAAL r10, lr, r8, r1 UMAAL r11, lr, r9, r1 - STM sp!, {r10, r11, lr} - SUB sp, sp, #0x20 + STM r12!, {r10, r11, lr} + SUB r12, r12, #0x20 # Subtract at 4 * 32 - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 - SBCS r12, r12, r4 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - SBCS r10, r10, r5 - SBCS r11, r11, r6 - SBCS r12, r12, r7 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r4 + SBCS r11, r11, r5 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + SBCS r10, r10, r6 + SBCS r11, r11, r7 + STM r12!, {r10, r11} + LDM r12, {r10, r11} SBCS r10, r10, r8 SBC r11, r11, r9 - STM sp!, {r10, r11} - SUB sp, sp, #0x24 + STM r12!, {r10, r11} + SUB r12, r12, #0x24 ASR lr, r11, #25 # Conditionally subtract order starting at bit 125 MOV r1, #0xa0000000 @@ -3923,26 +6199,30 @@ sc_muladd: AND r4, r4, lr AND r5, r5, lr AND r9, r9, lr - LDM sp, {r10, r11, r12} + LDM r12, {r10, r11} ADDS r10, r10, r1 ADCS r11, r11, r2 - ADCS r12, r12, r3 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} - ADCS r10, r10, r4 - ADCS r11, r11, r5 - ADCS r12, r12, #0x0 - STM sp!, {r10, r11, r12} - LDM sp, {r10, r11, r12} + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, r3 + ADCS r11, r11, r4 + STM r12!, {r10, r11} + LDM r12, {r10, r11} + ADCS r10, r10, r5 + ADCS r11, r11, #0x0 + STM r12!, {r10, r11} + LDM r12, {r10, r11} ADCS r10, r10, #0x0 ADCS r11, r11, #0x0 - ADCS r12, r12, r9 - STM sp!, {r10, r11, r12} - SUB sp, sp, #0x30 + STM r12!, {r10, r11} + LDM r12, {r10} + ADCS r10, r10, #0x0 + STM r12!, {r10} SUB r0, r0, #0x10 + MOV r12, sp # Load bits 252-376 - ADD sp, sp, #0x1c - LDM sp, {r1, r2, r3, r4, r5} + ADD r12, r12, #0x1c + LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 ORR r5, r5, r4, LSR #28 LSL r4, r4, #4 @@ -3952,54 +6232,55 @@ sc_muladd: LSL r2, r2, #4 ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 - SUB sp, sp, #0x1c - # Sub product of top 8 words and order + SUB r12, 
r12, #0x1c + # Sub product of top 4 words and order + MOV r0, sp # * -5cf5d3ed MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, lr, r2, r1 UMAAL r7, lr, r3, r1 UMAAL r8, lr, r4, r1 UMAAL r9, lr, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # * -5812631b MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, r10, r2, r1 UMAAL r7, r10, r3, r1 UMAAL r8, r10, r4, r1 UMAAL r9, r10, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # * -a2f79cd7 MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, r11, r2, r1 UMAAL r7, r11, r3, r1 UMAAL r8, r11, r4, r1 UMAAL r9, r11, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # * -14def9df MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} UMLAL r6, r12, r2, r1 UMAAL r7, r12, r3, r1 UMAAL r8, r12, r4, r1 UMAAL r9, r12, r5, r1 - STM sp, {r6, r7, r8, r9} - ADD sp, sp, #0x4 + STM r0, {r6, r7, r8, r9} + ADD r0, r0, #0x4 # Add overflows at 4 * 32 - LDM sp, {r6, r7, r8, r9} + LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 @@ -4011,8 +6292,8 @@ sc_muladd: SBCS r8, r8, r4 SBCS r9, r9, r5 SBC r1, r1, r1 - SUB sp, sp, #0x10 - LDM sp, {r2, r3, r4, r5} + SUB r0, r0, #0x10 + LDM r0, {r2, r3, r4, r5} MOV r10, #0xd3ed MOVT r10, #0x5cf5 MOV r11, #0x631a @@ -4035,12 +6316,21 @@ sc_muladd: ADCS r8, r8, #0x0 ADC r9, r9, r1 BFC r9, #28, #4 + LDR r0, [sp, #68] # Store result - STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + STR r2, [r0] + STR r3, [r0, #4] + STR r4, [r0, #8] + STR r5, [r0, #12] + STR r6, [r0, #16] + STR r7, [r0, #20] + STR r8, [r0, #24] + STR r9, [r0, #28] ADD sp, sp, #0x50 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 728 + # Cycle Count = 752 .size sc_muladd,.-sc_muladd +#endif /* WOLFSSL_SP_NO_UMAAL */ #endif /* HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c index 43c79b5ed..4df206607 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c +++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c @@ -260,9 +260,23 @@ void fe_frombytes(fe out_p, const unsigned char* in_p) register const unsigned char* in asm ("r1") = (const unsigned char*)in_p; __asm__ __volatile__ ( - "LDM %[in], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "LDR r2, [%[in]]\n\t" + "LDR r3, [%[in], #4]\n\t" + "LDR r4, [%[in], #8]\n\t" + "LDR r5, [%[in], #12]\n\t" + "LDR r6, [%[in], #16]\n\t" + "LDR r7, [%[in], #20]\n\t" + "LDR r8, [%[in], #24]\n\t" + "LDR r9, [%[in], #28]\n\t" "BFC r9, #31, #1\n\t" - "STM %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "STR r2, [%[out]]\n\t" + "STR r3, [%[out], #4]\n\t" + "STR r4, [%[out], #8]\n\t" + "STR r5, [%[out], #12]\n\t" + "STR r6, [%[out], #16]\n\t" + "STR r7, [%[out], #20]\n\t" + "STR r8, [%[out], #24]\n\t" + "STR r9, [%[out], #28]\n\t" : [out] "+r" (out), [in] "+r" (in) : : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" @@ -295,7 +309,14 @@ void fe_tobytes(unsigned char* out_p, const fe n_p) "ADCS r8, r8, #0x0\n\t" "ADC r9, r9, #0x0\n\t" "BFC r9, #31, #1\n\t" - "STM %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "STR r2, [%[out]]\n\t" + "STR r3, [%[out], #4]\n\t" + "STR r4, [%[out], #8]\n\t" + "STR r5, [%[out], #12]\n\t" + "STR r6, 
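
The fe_frombytes/fe_tobytes hunks here replace one LDM/STM block transfer with eight word-wise LDR/STR accesses. LDM and STM always demand word-aligned addresses on these cores, while single LDR/STR word accesses can tolerate unaligned ones (when alignment trapping is disabled), so the byte buffers handed to these routines no longer need to be 4-byte aligned. A portable C rendering of the same access pattern (load_u32/store_u32 are illustrative helpers, not the wolfSSL implementation):

#include <stdint.h>
#include <string.h>

/* Read a 32-bit word through a possibly unaligned byte pointer; on
 * unaligned-capable targets this typically compiles to a single LDR. */
static uint32_t load_u32(const unsigned char *p)
{
    uint32_t w;
    memcpy(&w, p, sizeof(w));
    return w;
}

/* Store likewise; typically a single STR. */
static void store_u32(unsigned char *p, uint32_t w)
{
    memcpy(p, &w, sizeof(w));
}
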
[%[out], #16]\n\t" + "STR r7, [%[out], #20]\n\t" + "STR r8, [%[out], #24]\n\t" + "STR r9, [%[out], #28]\n\t" : [out] "+r" (out), [n] "+r" (n) : : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" @@ -1544,6 +1565,387 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ +#ifdef WOLFSSL_SP_NO_UMAAL +void fe_mul_op(void); +void fe_mul_op() +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x28\n\t" + "STR r0, [sp, #36]\n\t" + "MOV r0, #0x0\n\t" + "LDR r12, [r1]\n\t" + /* A[0] * B[0] */ + "LDR lr, [r2]\n\t" + "UMULL r3, r4, r12, lr\n\t" + /* A[0] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "UMULL r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "UMULL r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "UMULL r9, r10, r12, lr\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "MOV r11, r0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[0] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[0] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[0] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r3, r0, #0x0\n\t" + "UMLAL r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "LDR r12, [r1, #4]\n\t" + "LDR lr, [r2]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "STR r4, [sp, #4]\n\t" + "ADDS r5, r5, r11\n\t" + /* A[1] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[1] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[1] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADC r4, r0, #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "LDR r12, [r1, #8]\n\t" + "LDR lr, [r2]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "STR r5, [sp, #8]\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[2] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[2] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[2] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADC r5, r0, #0x0\n\t" + "UMLAL r4, r5, r12, 
lr\n\t" + /* A[3] * B[0] */ + "LDR r12, [r1, #12]\n\t" + "LDR lr, [r2]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[3] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[3] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[3] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADC r6, r0, #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "LDR r12, [r1, #16]\n\t" + "LDR lr, [r2]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[4] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[4] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[4] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[4] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[4] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADC r7, r0, #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "LDR r12, [r1, #20]\n\t" + "LDR lr, [r2]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[5] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[5] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[5] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[5] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[5] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADC r8, r0, #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "LDR r12, [r1, #24]\n\t" + "LDR lr, [r2]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[6] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[6] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, 
r11\n\t" + /* A[6] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[6] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[6] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[6] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADC r9, r0, #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "LDR r12, [r1, #28]\n\t" + "LDR lr, [r2]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[7] * B[1] */ + "LDR lr, [r2, #4]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[7] * B[2] */ + "LDR lr, [r2, #8]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[7] * B[3] */ + "LDR lr, [r2, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[7] * B[4] */ + "LDR lr, [r2, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[7] * B[5] */ + "LDR lr, [r2, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * B[6] */ + "LDR lr, [r2, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[7] * B[7] */ + "LDR lr, [r2, #28]\n\t" + "ADC r10, r0, #0x0\n\t" + "UMLAL r9, r10, r12, lr\n\t" + /* Reduce */ + "LDR r2, [sp, #28]\n\t" + "MOV lr, sp\n\t" + "MOV r12, #0x26\n\t" + "UMULL r10, r11, r10, r12\n\t" + "ADDS r10, r10, r2\n\t" + "ADC r11, r11, #0x0\n\t" + "MOV r12, #0x13\n\t" + "LSL r11, r11, #1\n\t" + "ORR r11, r11, r10, LSR #31\n\t" + "MUL r11, r11, r12\n\t" + "LDM lr!, {r1, r2}\n\t" + "MOV r12, #0x26\n\t" + "ADDS r1, r1, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r1, r11, r3, r12\n\t" + "ADDS r2, r2, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r2, r11, r4, r12\n\t" + "LDM lr!, {r3, r4}\n\t" + "ADDS r3, r3, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r5, r12\n\t" + "ADDS r4, r4, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r6, r12\n\t" + "LDM lr!, {r5, r6}\n\t" + "ADDS r5, r5, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r7, r12\n\t" + "ADDS r6, r6, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r8, r12\n\t" + "LDM lr!, {r7, r8}\n\t" + "ADDS r7, r7, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r9, r12\n\t" + "BFC r10, #31, #1\n\t" + "ADDS r8, r10, r11\n\t" + /* Store */ + "LDR r0, [sp, #36]\n\t" + "STM r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x28\n\t" + : + : + : "memory", "lr" + ); +} + +#else void fe_mul_op(void); void fe_mul_op() { @@ -1677,6 +2079,7 @@ void fe_mul_op() ); } +#endif /* WOLFSSL_SP_NO_UMAAL */ void fe_mul(fe r_p, const fe a_p, const fe b_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -1691,6 +2094,280 @@ void fe_mul(fe r_p, const fe a_p, const fe b_p) ); } +#ifdef WOLFSSL_SP_NO_UMAAL +void fe_sq_op(void); +void fe_sq_op() +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x44\n\t" + "STR r0, [sp, #64]\n\t" + /* Square */ + "MOV r0, #0x0\n\t" + "LDR r12, [r1]\n\t" + /* A[0] * A[1] */ + "LDR lr, [r1, #4]\n\t" + "UMULL r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "LDR lr, [r1, #12]\n\t" + "UMULL r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "LDR lr, [r1, #20]\n\t" + 
"UMULL r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "UMULL r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "LDR lr, [r1, #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[0] * A[4] */ + "LDR lr, [r1, #16]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[0] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + "ADCS r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + "STR r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "LDR r12, [r1, #4]\n\t" + "LDR lr, [r1, #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * A[3] */ + "LDR lr, [r1, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * A[4] */ + "LDR lr, [r1, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * A[5] */ + "LDR lr, [r1, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[1] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r4, r0, #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "LDR r12, [r1, #8]\n\t" + "LDR lr, [r1, #12]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * A[4] */ + "LDR lr, [r1, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * A[5] */ + "LDR lr, [r1, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[2] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r5, r0, #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "LDR r12, [r1, #12]\n\t" + "LDR lr, [r1, #16]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * A[5] */ + "LDR lr, [r1, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r6, r0, #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "LDR r12, [r1, #16]\n\t" + "LDR lr, [r1, #20]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r7, r0, #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "LDR r12, [r1, #20]\n\t" + "LDR lr, [r1, #24]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r8, r0, #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "LDR r12, [r1, #24]\n\t" + "LDR lr, [r1, #28]\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" 
+ "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "STM lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADCS r3, r3, r3\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADC r10, r0, #0x0\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "MOV lr, sp\n\t" + /* A[0] * A[0] */ + "LDR r12, [r1]\n\t" + "UMULL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[1] * A[1] */ + "LDR r12, [r1, #4]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * A[2] */ + "LDR r12, [r1, #8]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * A[3] */ + "LDR r12, [r1, #12]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, r12\n\t" + "ADDS r10, r10, r11\n\t" + "STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "LDR r12, [r1, #16]\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * A[5] */ + "LDR r12, [r1, #20]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * A[6] */ + "LDR r12, [r1, #24]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * A[7] */ + "LDR r12, [r1, #28]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r10, r10, #0x0\n\t" + "UMLAL r9, r10, r12, r12\n\t" + /* Reduce */ + "LDR r2, [sp, #28]\n\t" + "MOV lr, sp\n\t" + "MOV r12, #0x26\n\t" + "UMULL r10, r11, r10, r12\n\t" + "ADDS r10, r10, r2\n\t" + "ADC r11, r11, #0x0\n\t" + "MOV r12, #0x13\n\t" + "LSL r11, r11, #1\n\t" + "ORR r11, r11, r10, LSR #31\n\t" + "MUL r11, r11, r12\n\t" + "LDM lr!, {r1, r2}\n\t" + "MOV r12, #0x26\n\t" + "ADDS r1, r1, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r1, r11, r3, r12\n\t" + "ADDS r2, r2, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r2, r11, r4, r12\n\t" + "LDM lr!, {r3, r4}\n\t" + "ADDS r3, r3, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r5, r12\n\t" + "ADDS r4, r4, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r6, r12\n\t" + "LDM lr!, {r5, r6}\n\t" + "ADDS r5, r5, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r7, r12\n\t" + "ADDS r6, r6, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r8, r12\n\t" + "LDM lr!, {r7, r8}\n\t" + "ADDS r7, r7, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r9, r12\n\t" + "BFC r10, #31, #1\n\t" + "ADDS r8, r10, r11\n\t" + /* Store */ + "LDR r0, [sp, #64]\n\t" + "STM r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x44\n\t" + : + : + : "memory", "lr" + ); +} + +#else void fe_sq_op(void); void fe_sq_op() { @@ -1810,6 +2487,7 @@ void fe_sq_op() ); } +#endif /* WOLFSSL_SP_NO_UMAAL */ void fe_sq(fe r_p, const fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -1824,6 +2502,60 @@ void fe_sq(fe r_p, const fe a_p) } #ifdef HAVE_CURVE25519 +#ifdef WOLFSSL_SP_NO_UMAAL +void fe_mul121666(fe r_p, fe a_p) +{ + register sword32* r asm ("r0") = (sword32*)r_p; + register sword32* a asm ("r1") = (sword32*)a_p; + + __asm__ __volatile__ ( + /* Multiply by 121666 */ 
+ "LDM %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "MOV r12, #0xdb42\n\t" + "MOVT r12, #0x1\n\t" + "UMULL r2, r10, r2, r12\n\t" + "UMULL r3, r11, r3, r12\n\t" + "ADDS r3, r3, r10\n\t" + "ADC r11, r11, #0x0\n\t" + "UMULL r4, r10, r4, r12\n\t" + "ADDS r4, r4, r11\n\t" + "ADC r10, r10, #0x0\n\t" + "UMULL r5, r11, r5, r12\n\t" + "ADDS r5, r5, r10\n\t" + "ADC r11, r11, #0x0\n\t" + "UMULL r6, r10, r6, r12\n\t" + "ADDS r6, r6, r11\n\t" + "ADC r10, r10, #0x0\n\t" + "UMULL r7, r11, r7, r12\n\t" + "ADDS r7, r7, r10\n\t" + "ADC r11, r11, #0x0\n\t" + "UMULL r8, r10, r8, r12\n\t" + "ADDS r8, r8, r11\n\t" + "ADC r10, r10, #0x0\n\t" + "UMULL r9, r11, r9, r12\n\t" + "ADDS r9, r9, r10\n\t" + "MOV r12, #0x13\n\t" + "ADC r11, r11, #0x0\n\t" + "LSL r11, r11, #1\n\t" + "ORR r11, r11, r9, LSR #31\n\t" + "MUL r11, r11, r12\n\t" + "ADDS r2, r2, r11\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "BFC r9, #31, #1\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "STM %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + ); +} + +#else void fe_mul121666(fe r_p, fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -1863,6 +2595,7 @@ void fe_mul121666(fe r_p, fe a_p) ); } +#endif /* WOLFSSL_SP_NO_UMAAL */ #ifndef WC_NO_CACHE_RESISTANT int curve25519(byte* r_p, const byte* n_p, const byte* a_p) { @@ -2736,6 +3469,315 @@ void fe_invert(fe r_p, const fe a_p) ); } +#ifdef WOLFSSL_SP_NO_UMAAL +void fe_sq2(fe r_p, const fe a_p) +{ + register sword32* r asm ("r0") = (sword32*)r_p; + register const sword32* a asm ("r1") = (const sword32*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x44\n\t" + "STR r0, [sp, #64]\n\t" + /* Square * 2 */ + "MOV r0, #0x0\n\t" + "LDR r12, [r1]\n\t" + /* A[0] * A[1] */ + "LDR lr, [r1, #4]\n\t" + "UMULL r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "LDR lr, [r1, #12]\n\t" + "UMULL r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "LDR lr, [r1, #20]\n\t" + "UMULL r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "UMULL r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "LDR lr, [r1, #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[0] * A[4] */ + "LDR lr, [r1, #16]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[0] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + "ADCS r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + "STR r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "LDR r12, [r1, #4]\n\t" + "LDR lr, [r1, #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * A[3] */ + "LDR lr, [r1, #12]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * A[4] */ + "LDR lr, [r1, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * A[5] */ + "LDR lr, [r1, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[1] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r4, r0, #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "LDR r12, 
[r1, #8]\n\t" + "LDR lr, [r1, #12]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * A[4] */ + "LDR lr, [r1, #16]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * A[5] */ + "LDR lr, [r1, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[2] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r5, r0, #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "LDR r12, [r1, #12]\n\t" + "LDR lr, [r1, #16]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * A[5] */ + "LDR lr, [r1, #20]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r6, r0, #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "LDR r12, [r1, #16]\n\t" + "LDR lr, [r1, #20]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * A[6] */ + "LDR lr, [r1, #24]\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r7, r0, #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "LDR r12, [r1, #20]\n\t" + "LDR lr, [r1, #24]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * A[7] */ + "LDR lr, [r1, #28]\n\t" + "ADC r8, r0, #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "LDR r12, [r1, #24]\n\t" + "LDR lr, [r1, #28]\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "STM lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADCS r3, r3, r3\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADC r10, r0, #0x0\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "MOV lr, sp\n\t" + /* A[0] * A[0] */ + "LDR r12, [r1]\n\t" + "UMULL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[1] * A[1] */ + "LDR r12, [r1, #4]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * A[2] */ + "LDR r12, [r1, #8]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * A[3] */ + "LDR r12, [r1, #12]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r9, r11, r12, r12\n\t" + "ADDS r10, r10, r11\n\t" + "STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "LDR r12, [r1, #16]\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * A[5] */ + "LDR r12, [r1, 
#20]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * A[6] */ + "LDR r12, [r1, #24]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * A[7] */ + "LDR r12, [r1, #28]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r10, r10, #0x0\n\t" + "UMLAL r9, r10, r12, r12\n\t" + /* Reduce */ + "LDR r2, [sp, #28]\n\t" + "MOV lr, sp\n\t" + "MOV r12, #0x26\n\t" + "UMULL r10, r11, r10, r12\n\t" + "ADDS r10, r10, r2\n\t" + "ADC r11, r11, #0x0\n\t" + "MOV r12, #0x13\n\t" + "LSL r11, r11, #1\n\t" + "ORR r11, r11, r10, LSR #31\n\t" + "MUL r11, r11, r12\n\t" + "LDM lr!, {r1, r2}\n\t" + "MOV r12, #0x26\n\t" + "ADDS r1, r1, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r1, r11, r3, r12\n\t" + "ADDS r2, r2, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r2, r11, r4, r12\n\t" + "LDM lr!, {r3, r4}\n\t" + "ADDS r3, r3, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r3, r11, r5, r12\n\t" + "ADDS r4, r4, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r4, r11, r6, r12\n\t" + "LDM lr!, {r5, r6}\n\t" + "ADDS r5, r5, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r5, r11, r7, r12\n\t" + "ADDS r6, r6, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r6, r11, r8, r12\n\t" + "LDM lr!, {r7, r8}\n\t" + "ADDS r7, r7, r11\n\t" + "ADC r11, r0, #0x0\n\t" + "UMLAL r7, r11, r9, r12\n\t" + "BFC r10, #31, #1\n\t" + "ADDS r8, r10, r11\n\t" + /* Reduce if top bit set */ + "MOV r12, #0x13\n\t" + "AND r11, r12, r8, ASR #31\n\t" + "ADDS r1, r1, r11\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "BFC r8, #31, #1\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r8, r8, #0x0\n\t" + /* Double */ + "ADDS r1, r1, r1\n\t" + "ADCS r2, r2, r2\n\t" + "ADCS r3, r3, r3\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADC r8, r8, r8\n\t" + /* Reduce if top bit set */ + "MOV r12, #0x13\n\t" + "AND r11, r12, r8, ASR #31\n\t" + "ADDS r1, r1, r11\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "BFC r8, #31, #1\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r8, r8, #0x0\n\t" + /* Store */ + "LDR r0, [sp, #64]\n\t" + "STM r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr" + ); +} + +#else void fe_sq2(fe r_p, const fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -2892,6 +3934,7 @@ void fe_sq2(fe r_p, const fe a_p) ); } +#endif /* WOLFSSL_SP_NO_UMAAL */ void fe_pow22523(fe r_p, const fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -3505,12 +4548,442 @@ void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) ); } +#ifdef WOLFSSL_SP_NO_UMAAL void sc_reduce(byte* s_p) { register byte* s asm ("r0") = (byte*)s_p; __asm__ __volatile__ ( - "SUB sp, sp, #0x34\n\t" + "SUB sp, sp, #0x38\n\t" + "STR %[s], [sp, #52]\n\t" + /* Load bits 252-511 */ + "ADD %[s], %[s], #0x1c\n\t" + "LDM %[s], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "LSR lr, r9, #24\n\t" + "LSL r9, r9, #4\n\t" + "ORR r9, r9, r8, LSR #28\n\t" + "LSL r8, r8, #4\n\t" + "ORR r8, r8, r7, LSR #28\n\t" + "LSL r7, r7, #4\n\t" + "ORR r7, r7, r6, LSR #28\n\t" + "LSL r6, r6, #4\n\t" + "ORR r6, r6, r5, LSR #28\n\t" + "LSL r5, r5, #4\n\t" + "ORR r5, r5, r4, LSR #28\n\t" + "LSL r4, r4, #4\n\t" + "ORR r4, r4, r3, LSR #28\n\t" + "LSL r3, r3, #4\n\t" + "ORR r3, r3, r2, LSR #28\n\t" + 
"LSL r2, r2, #4\n\t" + "ORR r2, r2, r1, LSR #28\n\t" + "BFC r9, #28, #4\n\t" + "SUB %[s], %[s], #0x1c\n\t" + /* Add order times bits 504..511 */ + "MOV r10, #0x2c13\n\t" + "MOVT r10, #0xa30a\n\t" + "MOV r11, #0x9ce5\n\t" + "MOVT r11, #0xa7ed\n\t" + "MOV r1, #0x0\n\t" + "UMLAL r2, r1, r10, lr\n\t" + "ADDS r3, r3, r1\n\t" + "MOV r1, #0x0\n\t" + "ADC r1, r1, #0x0\n\t" + "UMLAL r3, r1, r11, lr\n\t" + "MOV r10, #0x6329\n\t" + "MOVT r10, #0x5d08\n\t" + "MOV r11, #0x621\n\t" + "MOVT r11, #0xeb21\n\t" + "ADDS r4, r4, r1\n\t" + "MOV r1, #0x0\n\t" + "ADC r1, r1, #0x0\n\t" + "UMLAL r4, r1, r10, lr\n\t" + "ADDS r5, r5, r1\n\t" + "MOV r1, #0x0\n\t" + "ADC r1, r1, #0x0\n\t" + "UMLAL r5, r1, r11, lr\n\t" + "ADDS r6, r6, r1\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "SUBS r6, r6, lr\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC r9, r9, #0x0\n\t" + /* Sub product of top 8 words and order */ + "MOV r12, sp\n\t" + "MOV r1, #0x2c13\n\t" + "MOVT r1, #0xa30a\n\t" + "MOV lr, #0x0\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r3, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r4, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r6, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r8, r1\n\t" + "BFC r11, #28, #4\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r9, r1\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB %[s], %[s], #0x10\n\t" + "SUB r12, r12, #0x20\n\t" + "MOV r1, #0x9ce5\n\t" + "MOVT r1, #0xa7ed\n\t" + "MOV lr, #0x0\n\t" + "LDM r12, {r10, r11}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r3, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r4, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r6, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r8, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r9, r1\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB r12, r12, #0x20\n\t" + "MOV r1, #0x6329\n\t" + "MOVT r1, #0x5d08\n\t" + "MOV lr, #0x0\n\t" + "LDM r12, {r10, r11}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r3, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, 
r4, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r6, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r8, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r9, r1\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB r12, r12, #0x20\n\t" + "MOV r1, #0x621\n\t" + "MOVT r1, #0xeb21\n\t" + "MOV lr, #0x0\n\t" + "LDM r12, {r10, r11}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r3, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r4, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r6, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r8, r1\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r9, r1\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB r12, r12, #0x20\n\t" + /* Subtract at 4 * 32 */ + "LDM r12, {r10, r11}\n\t" + "SUBS r10, r10, r2\n\t" + "SBCS r11, r11, r3\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "SBCS r10, r10, r4\n\t" + "SBCS r11, r11, r5\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "SBCS r10, r10, r6\n\t" + "SBCS r11, r11, r7\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "SBCS r10, r10, r8\n\t" + "SBC r11, r11, r9\n\t" + "STM r12!, {r10, r11}\n\t" + "SUB r12, r12, #0x24\n\t" + "ASR lr, r11, #25\n\t" + /* Conditionally subtract order starting at bit 125 */ + "MOV r1, #0xa0000000\n\t" + "MOV r2, #0xba7d\n\t" + "MOVT r2, #0x4b9e\n\t" + "MOV r3, #0x4c63\n\t" + "MOVT r3, #0xcb02\n\t" + "MOV r4, #0xf39a\n\t" + "MOVT r4, #0xd45e\n\t" + "MOV r5, #0xdf3b\n\t" + "MOVT r5, #0x29b\n\t" + "MOV r9, #0x2000000\n\t" + "AND r1, r1, lr\n\t" + "AND r2, r2, lr\n\t" + "AND r3, r3, lr\n\t" + "AND r4, r4, lr\n\t" + "AND r5, r5, lr\n\t" + "AND r9, r9, lr\n\t" + "LDM r12, {r10, r11}\n\t" + "ADDS r10, r10, r1\n\t" + "ADCS r11, r11, r2\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADCS r10, r10, r3\n\t" + "ADCS r11, r11, r4\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADCS r10, r10, r5\n\t" + "ADCS r11, r11, #0x0\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10}\n\t" + "ADCS r10, r10, #0x0\n\t" + "STM r12!, {r10}\n\t" + "SUB %[s], %[s], #0x10\n\t" + "MOV r12, sp\n\t" + /* Load bits 252-376 */ + "ADD r12, r12, #0x1c\n\t" + "LDM r12, {r1, r2, r3, r4, r5}\n\t" + "LSL r5, r5, #4\n\t" + "ORR r5, r5, r4, LSR #28\n\t" + "LSL r4, r4, #4\n\t" + "ORR r4, r4, r3, LSR #28\n\t" + "LSL r3, r3, #4\n\t" + "ORR r3, r3, r2, LSR #28\n\t" 
+ "LSL r2, r2, #4\n\t" + "ORR r2, r2, r1, LSR #28\n\t" + "BFC r5, #29, #3\n\t" + "SUB r12, r12, #0x1c\n\t" + /* Sub product of top 4 words and order */ + "MOV %[s], sp\n\t" + /* * -5cf5d3ed */ + "MOV r1, #0x2c13\n\t" + "MOVT r1, #0xa30a\n\t" + "MOV lr, #0x0\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" + "UMLAL r6, lr, r2, r1\n\t" + "ADDS r7, r7, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r7, lr, r3, r1\n\t" + "ADDS r8, r8, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r8, lr, r4, r1\n\t" + "ADDS r9, r9, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r9, lr, r5, r1\n\t" + "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" + /* * -5812631b */ + "MOV r1, #0x9ce5\n\t" + "MOVT r1, #0xa7ed\n\t" + "MOV r10, #0x0\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" + "UMLAL r6, r10, r2, r1\n\t" + "ADDS r7, r7, r10\n\t" + "MOV r10, #0x0\n\t" + "ADC r10, r10, #0x0\n\t" + "UMLAL r7, r10, r3, r1\n\t" + "ADDS r8, r8, r10\n\t" + "MOV r10, #0x0\n\t" + "ADC r10, r10, #0x0\n\t" + "UMLAL r8, r10, r4, r1\n\t" + "ADDS r9, r9, r10\n\t" + "MOV r10, #0x0\n\t" + "ADC r10, r10, #0x0\n\t" + "UMLAL r9, r10, r5, r1\n\t" + "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" + /* * -a2f79cd7 */ + "MOV r1, #0x6329\n\t" + "MOVT r1, #0x5d08\n\t" + "MOV r11, #0x0\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" + "UMLAL r6, r11, r2, r1\n\t" + "ADDS r7, r7, r11\n\t" + "MOV r11, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "UMLAL r7, r11, r3, r1\n\t" + "ADDS r8, r8, r11\n\t" + "MOV r11, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "UMLAL r8, r11, r4, r1\n\t" + "ADDS r9, r9, r11\n\t" + "MOV r11, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "UMLAL r9, r11, r5, r1\n\t" + "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" + /* * -14def9df */ + "MOV r1, #0x621\n\t" + "MOVT r1, #0xeb21\n\t" + "MOV r12, #0x0\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" + "UMLAL r6, r12, r2, r1\n\t" + "ADDS r7, r7, r12\n\t" + "MOV r12, #0x0\n\t" + "ADC r12, r12, #0x0\n\t" + "UMLAL r7, r12, r3, r1\n\t" + "ADDS r8, r8, r12\n\t" + "MOV r12, #0x0\n\t" + "ADC r12, r12, #0x0\n\t" + "UMLAL r8, r12, r4, r1\n\t" + "ADDS r9, r9, r12\n\t" + "MOV r12, #0x0\n\t" + "ADC r12, r12, #0x0\n\t" + "UMLAL r9, r12, r5, r1\n\t" + "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" + /* Add overflows at 4 * 32 */ + "LDM %[s], {r6, r7, r8, r9}\n\t" + "BFC r9, #28, #4\n\t" + "ADDS r6, r6, lr\n\t" + "ADCS r7, r7, r10\n\t" + "ADCS r8, r8, r11\n\t" + "ADC r9, r9, r12\n\t" + /* Subtract top at 4 * 32 */ + "SUBS r6, r6, r2\n\t" + "SBCS r7, r7, r3\n\t" + "SBCS r8, r8, r4\n\t" + "SBCS r9, r9, r5\n\t" + "SBC r1, r1, r1\n\t" + "SUB %[s], %[s], #0x10\n\t" + "LDM %[s], {r2, r3, r4, r5}\n\t" + "MOV r10, #0xd3ed\n\t" + "MOVT r10, #0x5cf5\n\t" + "MOV r11, #0x631a\n\t" + "MOVT r11, #0x5812\n\t" + "MOV r12, #0x9cd6\n\t" + "MOVT r12, #0xa2f7\n\t" + "MOV lr, #0xf9de\n\t" + "MOVT lr, #0x14de\n\t" + "AND r10, r10, r1\n\t" + "AND r11, r11, r1\n\t" + "AND r12, r12, r1\n\t" + "AND lr, lr, r1\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADCS r5, r5, lr\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "AND r1, r1, #0x10000000\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, r1\n\t" + "BFC r9, #28, #4\n\t" + /* Store result */ + "LDR %[s], [sp, #52]\n\t" + "STM %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x38\n\t" + : [s] "+r" (s) + : + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +void sc_reduce(byte* s_p) +{ + register byte* s asm 
("r0") = (byte*)s_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x38\n\t" + "STR %[s], [sp, #52]\n\t" /* Load bits 252-511 */ "ADD %[s], %[s], #0x1c\n\t" "LDM %[s], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" @@ -3556,96 +5029,107 @@ void sc_reduce(byte* s_p) "SBCS r8, r8, #0x0\n\t" "SBC r9, r9, #0x0\n\t" /* Sub product of top 8 words and order */ + "MOV r12, sp\n\t" "MOV r1, #0x2c13\n\t" "MOVT r1, #0xa30a\n\t" "MOV lr, #0x0\n\t" - "LDM %[s]!, {r10, r11, r12}\n\t" + "LDM %[s]!, {r10, r11}\n\t" "UMLAL r10, lr, r2, r1\n\t" "UMAAL r11, lr, r3, r1\n\t" - "UMAAL r12, lr, r4, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM %[s]!, {r10, r11, r12}\n\t" - "UMAAL r10, lr, r5, r1\n\t" - "UMAAL r11, lr, r6, r1\n\t" - "UMAAL r12, lr, r7, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "UMAAL r10, lr, r4, r1\n\t" + "UMAAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "UMAAL r10, lr, r6, r1\n\t" + "UMAAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" "LDM %[s]!, {r10, r11}\n\t" "UMAAL r10, lr, r8, r1\n\t" "BFC r11, #28, #4\n\t" "UMAAL r11, lr, r9, r1\n\t" - "STM sp!, {r10, r11, lr}\n\t" + "STM r12!, {r10, r11, lr}\n\t" "SUB %[s], %[s], #0x10\n\t" - "SUB sp, sp, #0x20\n\t" + "SUB r12, r12, #0x20\n\t" "MOV r1, #0x9ce5\n\t" "MOVT r1, #0xa7ed\n\t" "MOV lr, #0x0\n\t" - "LDM sp, {r10, r11, r12}\n\t" + "LDM r12, {r10, r11}\n\t" "UMLAL r10, lr, r2, r1\n\t" "UMAAL r11, lr, r3, r1\n\t" - "UMAAL r12, lr, r4, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11, r12}\n\t" - "UMAAL r10, lr, r5, r1\n\t" - "UMAAL r11, lr, r6, r1\n\t" - "UMAAL r12, lr, r7, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11}\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "UMAAL r10, lr, r4, r1\n\t" + "UMAAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "UMAAL r10, lr, r6, r1\n\t" + "UMAAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" "UMAAL r10, lr, r8, r1\n\t" "UMAAL r11, lr, r9, r1\n\t" - "STM sp!, {r10, r11, lr}\n\t" - "SUB sp, sp, #0x20\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB r12, r12, #0x20\n\t" "MOV r1, #0x6329\n\t" "MOVT r1, #0x5d08\n\t" "MOV lr, #0x0\n\t" - "LDM sp, {r10, r11, r12}\n\t" + "LDM r12, {r10, r11}\n\t" "UMLAL r10, lr, r2, r1\n\t" "UMAAL r11, lr, r3, r1\n\t" - "UMAAL r12, lr, r4, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11, r12}\n\t" - "UMAAL r10, lr, r5, r1\n\t" - "UMAAL r11, lr, r6, r1\n\t" - "UMAAL r12, lr, r7, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11}\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "UMAAL r10, lr, r4, r1\n\t" + "UMAAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "UMAAL r10, lr, r6, r1\n\t" + "UMAAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" "UMAAL r10, lr, r8, r1\n\t" "UMAAL r11, lr, r9, r1\n\t" - "STM sp!, {r10, r11, lr}\n\t" - "SUB sp, sp, #0x20\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB r12, r12, #0x20\n\t" "MOV r1, #0x621\n\t" "MOVT r1, #0xeb21\n\t" "MOV lr, #0x0\n\t" - "LDM sp, {r10, r11, r12}\n\t" + "LDM r12, {r10, r11}\n\t" "UMLAL r10, lr, r2, r1\n\t" "UMAAL r11, lr, r3, r1\n\t" - "UMAAL r12, lr, r4, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11, r12}\n\t" - "UMAAL r10, lr, r5, r1\n\t" - "UMAAL r11, lr, r6, r1\n\t" - "UMAAL r12, lr, r7, r1\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11}\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, 
r11}\n\t" + "UMAAL r10, lr, r4, r1\n\t" + "UMAAL r11, lr, r5, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "UMAAL r10, lr, r6, r1\n\t" + "UMAAL r11, lr, r7, r1\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" "UMAAL r10, lr, r8, r1\n\t" "UMAAL r11, lr, r9, r1\n\t" - "STM sp!, {r10, r11, lr}\n\t" - "SUB sp, sp, #0x20\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB r12, r12, #0x20\n\t" /* Subtract at 4 * 32 */ - "LDM sp, {r10, r11, r12}\n\t" + "LDM r12, {r10, r11}\n\t" "SUBS r10, r10, r2\n\t" "SBCS r11, r11, r3\n\t" - "SBCS r12, r12, r4\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11, r12}\n\t" - "SBCS r10, r10, r5\n\t" - "SBCS r11, r11, r6\n\t" - "SBCS r12, r12, r7\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11}\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "SBCS r10, r10, r4\n\t" + "SBCS r11, r11, r5\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "SBCS r10, r10, r6\n\t" + "SBCS r11, r11, r7\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" "SBCS r10, r10, r8\n\t" "SBC r11, r11, r9\n\t" - "STM sp!, {r10, r11}\n\t" - "SUB sp, sp, #0x24\n\t" + "STM r12!, {r10, r11}\n\t" + "SUB r12, r12, #0x24\n\t" "ASR lr, r11, #25\n\t" /* Conditionally subtract order starting at bit 125 */ "MOV r1, #0xa0000000\n\t" @@ -3664,26 +5148,30 @@ void sc_reduce(byte* s_p) "AND r4, r4, lr\n\t" "AND r5, r5, lr\n\t" "AND r9, r9, lr\n\t" - "LDM sp, {r10, r11, r12}\n\t" + "LDM r12, {r10, r11}\n\t" "ADDS r10, r10, r1\n\t" "ADCS r11, r11, r2\n\t" - "ADCS r12, r12, r3\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11, r12}\n\t" - "ADCS r10, r10, r4\n\t" - "ADCS r11, r11, r5\n\t" - "ADCS r12, r12, #0x0\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "LDM sp, {r10, r11, r12}\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADCS r10, r10, r3\n\t" + "ADCS r11, r11, r4\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" + "ADCS r10, r10, r5\n\t" + "ADCS r11, r11, #0x0\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10, r11}\n\t" "ADCS r10, r10, #0x0\n\t" "ADCS r11, r11, #0x0\n\t" - "ADCS r12, r12, r9\n\t" - "STM sp!, {r10, r11, r12}\n\t" - "SUB sp, sp, #0x30\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM r12, {r10}\n\t" + "ADCS r10, r10, #0x0\n\t" + "STM r12!, {r10}\n\t" "SUB %[s], %[s], #0x10\n\t" + "MOV r12, sp\n\t" /* Load bits 252-376 */ - "ADD sp, sp, #0x1c\n\t" - "LDM sp, {r1, r2, r3, r4, r5}\n\t" + "ADD r12, r12, #0x1c\n\t" + "LDM r12, {r1, r2, r3, r4, r5}\n\t" "LSL r5, r5, #4\n\t" "ORR r5, r5, r4, LSR #28\n\t" "LSL r4, r4, #4\n\t" @@ -3693,54 +5181,55 @@ void sc_reduce(byte* s_p) "LSL r2, r2, #4\n\t" "ORR r2, r2, r1, LSR #28\n\t" "BFC r5, #29, #3\n\t" - "SUB sp, sp, #0x1c\n\t" - /* Sub product of top 8 words and order */ + "SUB r12, r12, #0x1c\n\t" + /* Sub product of top 4 words and order */ + "MOV %[s], sp\n\t" /* * -5cf5d3ed */ "MOV r1, #0x2c13\n\t" "MOVT r1, #0xa30a\n\t" "MOV lr, #0x0\n\t" - "LDM sp, {r6, r7, r8, r9}\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" "UMLAL r6, lr, r2, r1\n\t" "UMAAL r7, lr, r3, r1\n\t" "UMAAL r8, lr, r4, r1\n\t" "UMAAL r9, lr, r5, r1\n\t" - "STM sp, {r6, r7, r8, r9}\n\t" - "ADD sp, sp, #0x4\n\t" + "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" /* * -5812631b */ "MOV r1, #0x9ce5\n\t" "MOVT r1, #0xa7ed\n\t" "MOV r10, #0x0\n\t" - "LDM sp, {r6, r7, r8, r9}\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" "UMLAL r6, r10, r2, r1\n\t" "UMAAL r7, r10, r3, r1\n\t" "UMAAL r8, r10, r4, r1\n\t" "UMAAL r9, r10, r5, r1\n\t" - "STM sp, {r6, r7, r8, r9}\n\t" - "ADD sp, sp, #0x4\n\t" 
+ "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" /* * -a2f79cd7 */ "MOV r1, #0x6329\n\t" "MOVT r1, #0x5d08\n\t" "MOV r11, #0x0\n\t" - "LDM sp, {r6, r7, r8, r9}\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" "UMLAL r6, r11, r2, r1\n\t" "UMAAL r7, r11, r3, r1\n\t" "UMAAL r8, r11, r4, r1\n\t" "UMAAL r9, r11, r5, r1\n\t" - "STM sp, {r6, r7, r8, r9}\n\t" - "ADD sp, sp, #0x4\n\t" + "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" /* * -14def9df */ "MOV r1, #0x621\n\t" "MOVT r1, #0xeb21\n\t" "MOV r12, #0x0\n\t" - "LDM sp, {r6, r7, r8, r9}\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" "UMLAL r6, r12, r2, r1\n\t" "UMAAL r7, r12, r3, r1\n\t" "UMAAL r8, r12, r4, r1\n\t" "UMAAL r9, r12, r5, r1\n\t" - "STM sp, {r6, r7, r8, r9}\n\t" - "ADD sp, sp, #0x4\n\t" + "STM %[s], {r6, r7, r8, r9}\n\t" + "ADD %[s], %[s], #0x4\n\t" /* Add overflows at 4 * 32 */ - "LDM sp, {r6, r7, r8, r9}\n\t" + "LDM %[s], {r6, r7, r8, r9}\n\t" "BFC r9, #28, #4\n\t" "ADDS r6, r6, lr\n\t" "ADCS r7, r7, r10\n\t" @@ -3752,8 +5241,8 @@ void sc_reduce(byte* s_p) "SBCS r8, r8, r4\n\t" "SBCS r9, r9, r5\n\t" "SBC r1, r1, r1\n\t" - "SUB sp, sp, #0x10\n\t" - "LDM sp, {r2, r3, r4, r5}\n\t" + "SUB %[s], %[s], #0x10\n\t" + "LDM %[s], {r2, r3, r4, r5}\n\t" "MOV r10, #0xd3ed\n\t" "MOVT r10, #0x5cf5\n\t" "MOV r11, #0x631a\n\t" @@ -3777,15 +5266,806 @@ void sc_reduce(byte* s_p) "ADC r9, r9, r1\n\t" "BFC r9, #28, #4\n\t" /* Store result */ + "LDR %[s], [sp, #52]\n\t" "STM %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" - "ADD sp, sp, #0x34\n\t" + "ADD sp, sp, #0x38\n\t" : [s] "+r" (s) : : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); } +#endif /* WOLFSSL_SP_NO_UMAAL */ #ifdef HAVE_ED25519_SIGN +#ifdef WOLFSSL_SP_NO_UMAAL +void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) +{ + register byte* s asm ("r0") = (byte*)s_p; + register const byte* a asm ("r1") = (const byte*)a_p; + register const byte* b asm ("r2") = (const byte*)b_p; + register const byte* c asm ("r3") = (const byte*)c_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x50\n\t" + "ADD lr, sp, #0x44\n\t" + "STM lr, {%[s], %[a], %[c]}\n\t" + "MOV %[r], #0x0\n\t" + "LDR r12, [%[a]]\n\t" + /* A[0] * B[0] */ + "LDR lr, [%[b]]\n\t" + "UMULL %[c], r4, r12, lr\n\t" + /* A[0] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "UMULL r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "UMULL r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "UMULL r9, r10, r12, lr\n\t" + "STR %[c], [sp]\n\t" + /* A[0] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "MOV r11, %[r]\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[0] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[0] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[0] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC %[c], %[r], #0x0\n\t" + "UMLAL r10, %[c], r12, lr\n\t" + /* A[1] * B[0] */ + "LDR r12, [%[a], #4]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "STR r4, [sp, #4]\n\t" + "ADDS r5, r5, r11\n\t" + /* A[1] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[1] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* 
A[1] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS %[c], %[c], r11\n\t" + /* A[1] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r4, %[r], #0x0\n\t" + "UMLAL %[c], r4, r12, lr\n\t" + /* A[2] * B[0] */ + "LDR r12, [%[a], #8]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "STR r5, [sp, #8]\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[2] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[2] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS %[c], %[c], r11\n\t" + /* A[2] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL %[c], r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r5, %[r], #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "LDR r12, [%[a], #12]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[3] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[3] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[3] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS %[c], %[c], r11\n\t" + /* A[3] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL %[c], r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r6, %[r], #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "LDR r12, [%[a], #16]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[4] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[4] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[4] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS %[c], %[c], r11\n\t" + /* A[4] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL %[c], r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[4] * B[5] */ + "LDR lr, [%[b], #20]\n\t" 
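For reference, the sc_muladd being added here computes s = (a*b + c) mod L over 32-byte little-endian scalars; in Ed25519 signing it produces the S half of the signature, S = (r + H(R,A,M)*k) mod L. A byte-wise reference of the contract (assumed shape only, reusing sc_reduce from above; not the optimized code path):

    #include <stdint.h>
    #include <string.h>

    void sc_reduce(uint8_t* s);  /* defined earlier in this file (byte*) */

    static void sc_muladd_ref(uint8_t s[32], const uint8_t a[32],
                              const uint8_t b[32], const uint8_t c[32])
    {
        uint8_t t[64] = {0};
        uint32_t cc;
        int i, j;

        for (i = 0; i < 32; i++) {       /* t = a * b, schoolbook */
            cc = 0;
            for (j = 0; j < 32; j++) {
                cc += (uint32_t)t[i + j] + (uint32_t)a[i] * b[j];
                t[i + j] = (uint8_t)cc;
                cc >>= 8;
            }
            t[i + 32] = (uint8_t)cc;
        }
        cc = 0;                          /* t += c */
        for (i = 0; i < 32; i++) {
            cc += (uint32_t)t[i] + c[i];
            t[i] = (uint8_t)cc;
            cc >>= 8;
        }
        for (i = 32; cc != 0 && i < 64; i++) {
            cc += t[i];
            t[i] = (uint8_t)cc;
            cc >>= 8;
        }
        sc_reduce(t);                    /* reduce mod L into t[0..31] */
        memcpy(s, t, 32);
    }

The assembly instead multiplies in 32-bit limbs and inlines the same reduction, which is why the body mirrors sc_reduce from the "Get 252..503" extraction onwards.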
+ "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r7, %[r], #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "LDR r12, [%[a], #20]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[5] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[5] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS %[c], %[c], r11\n\t" + /* A[5] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL %[c], r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[5] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[5] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r8, %[r], #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "LDR r12, [%[a], #24]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[6] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS %[c], %[c], r11\n\t" + /* A[6] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL %[c], r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[6] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[6] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[6] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[6] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r9, %[r], #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "LDR r12, [%[a], #28]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS %[c], %[c], r11\n\t" + /* A[7] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL %[c], r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[7] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[7] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[7] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[7] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[7] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r10, %[r], #0x0\n\t" + "UMLAL r9, r10, r12, 
lr\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t" + "MOV %[s], sp\n\t" + /* Add c to a * b */ + "LDR lr, [sp, #76]\n\t" + "LDM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "LDM lr!, {%[a], r10, r11, r12}\n\t" + "ADDS %[b], %[b], %[a]\n\t" + "ADCS %[c], %[c], r10\n\t" + "ADCS r4, r4, r11\n\t" + "ADCS r5, r5, r12\n\t" + "LDM lr!, {%[a], r10, r11, r12}\n\t" + "ADCS r6, r6, %[a]\n\t" + "ADCS r7, r7, r10\n\t" + "ADCS r8, r8, r11\n\t" + "ADCS r9, r9, r12\n\t" + "MOV %[a], r9\n\t" + "STM %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "LDM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ADCS %[b], %[b], #0x0\n\t" + "ADCS %[c], %[c], #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "SUB %[s], %[s], #0x20\n\t" + /* Get 252..503 and 504..507 */ + "LSR lr, r9, #24\n\t" + "LSL r9, r9, #4\n\t" + "ORR r9, r9, r8, LSR #28\n\t" + "LSL r8, r8, #4\n\t" + "ORR r8, r8, r7, LSR #28\n\t" + "LSL r7, r7, #4\n\t" + "ORR r7, r7, r6, LSR #28\n\t" + "LSL r6, r6, #4\n\t" + "ORR r6, r6, r5, LSR #28\n\t" + "LSL r5, r5, #4\n\t" + "ORR r5, r5, r4, LSR #28\n\t" + "LSL r4, r4, #4\n\t" + "ORR r4, r4, %[c], LSR #28\n\t" + "LSL %[c], %[c], #4\n\t" + "ORR %[c], %[c], %[b], LSR #28\n\t" + "LSL %[b], %[b], #4\n\t" + "ORR %[b], %[b], %[a], LSR #28\n\t" + "BFC r9, #28, #4\n\t" + /* Add order times bits 504..507 */ + "MOV r10, #0x2c13\n\t" + "MOVT r10, #0xa30a\n\t" + "MOV r11, #0x9ce5\n\t" + "MOVT r11, #0xa7ed\n\t" + "MOV %[a], #0x0\n\t" + "UMLAL %[b], %[a], r10, lr\n\t" + "ADDS %[c], %[c], %[a]\n\t" + "MOV %[a], #0x0\n\t" + "ADC %[a], %[a], #0x0\n\t" + "UMLAL %[c], %[a], r11, lr\n\t" + "MOV r10, #0x6329\n\t" + "MOVT r10, #0x5d08\n\t" + "MOV r11, #0x621\n\t" + "MOVT r11, #0xeb21\n\t" + "ADDS r4, r4, %[a]\n\t" + "MOV %[a], #0x0\n\t" + "ADC %[a], %[a], #0x0\n\t" + "UMLAL r4, %[a], r10, lr\n\t" + "ADDS r5, r5, %[a]\n\t" + "MOV %[a], #0x0\n\t" + "ADC %[a], %[a], #0x0\n\t" + "UMLAL r5, %[a], r11, lr\n\t" + "ADDS r6, r6, %[a]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "SUBS r6, r6, lr\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC r9, r9, #0x0\n\t" + /* Sub product of top 8 words and order */ + "MOV r12, sp\n\t" + "MOV %[a], #0x2c13\n\t" + "MOVT %[a], #0xa30a\n\t" + "MOV lr, #0x0\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "UMLAL r10, lr, %[b], %[a]\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, %[c], %[a]\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r4, %[a]\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r5, %[a]\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r6, %[a]\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r7, %[a]\n\t" + "STM r12!, {r10, r11}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "ADDS r10, r10, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r10, lr, r8, %[a]\n\t" + "BFC r11, #28, #4\n\t" + "ADDS r11, r11, lr\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "UMLAL r11, lr, r9, %[a]\n\t" + "STM r12!, {r10, r11, lr}\n\t" + "SUB %[s], %[s], #0x10\n\t" + "SUB r12, r12, #0x20\n\t" + "MOV %[a], #0x9ce5\n\t" + "MOVT %[a], #0xa7ed\n\t" + "MOV lr, 
#0x0\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMLAL r10, lr, %[b], %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, %[c], %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r4, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r5, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r6, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r7, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r8, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r9, %[a]\n\t"
+        "STM r12!, {r10, r11, lr}\n\t"
+        "SUB r12, r12, #0x20\n\t"
+        "MOV %[a], #0x6329\n\t"
+        "MOVT %[a], #0x5d08\n\t"
+        "MOV lr, #0x0\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMLAL r10, lr, %[b], %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, %[c], %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r4, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r5, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r6, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r7, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r8, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r9, %[a]\n\t"
+        "STM r12!, {r10, r11, lr}\n\t"
+        "SUB r12, r12, #0x20\n\t"
+        "MOV %[a], #0x621\n\t"
+        "MOVT %[a], #0xeb21\n\t"
+        "MOV lr, #0x0\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMLAL r10, lr, %[b], %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, %[c], %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r4, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r5, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r6, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r7, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r10, lr, r8, %[a]\n\t"
+        "ADDS r11, r11, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r11, lr, r9, %[a]\n\t"
+        "STM r12!, {r10, r11, lr}\n\t"
+        "SUB r12, r12, #0x20\n\t"
+        /* Subtract at 4 * 32 */
+        "LDM r12, {r10, r11}\n\t"
+        "SUBS r10, r10, %[b]\n\t"
+        "SBCS r11, r11, %[c]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "SBCS r10, r10, r4\n\t"
+        "SBCS r11, r11, r5\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "SBCS r10, r10, r6\n\t"
+        "SBCS r11, r11, r7\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "SBCS r10, r10, r8\n\t"
+        "SBC r11, r11, r9\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "SUB r12, r12, #0x24\n\t"
+        "ASR lr, r11, #25\n\t"
+        /* Conditionally subtract order starting at bit 125 */
+        "MOV %[a], #0xa0000000\n\t"
+        "MOV %[b], #0xba7d\n\t"
+        "MOVT %[b], #0x4b9e\n\t"
+        "MOV %[c], #0x4c63\n\t"
+        "MOVT %[c], #0xcb02\n\t"
+        "MOV r4, #0xf39a\n\t"
+        "MOVT r4, #0xd45e\n\t"
+        "MOV r5, #0xdf3b\n\t"
+        "MOVT r5, #0x29b\n\t"
+        "MOV r9, #0x2000000\n\t"
+        "AND %[a], %[a], lr\n\t"
+        "AND %[b], %[b], lr\n\t"
+        "AND %[c], %[c], lr\n\t"
+        "AND r4, r4, lr\n\t"
+        "AND r5, r5, lr\n\t"
+        "AND r9, r9, lr\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADDS r10, r10, %[a]\n\t"
+        "ADCS r11, r11, %[b]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADCS r10, r10, %[c]\n\t"
+        "ADCS r11, r11, r4\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADCS r10, r10, r5\n\t"
+        "ADCS r11, r11, #0x0\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADCS r10, r10, #0x0\n\t"
+        "ADCS r11, r11, #0x0\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10}\n\t"
+        "ADCS r10, r10, #0x0\n\t"
+        "STM r12!, {r10}\n\t"
+        "SUB %[s], %[s], #0x10\n\t"
+        "MOV r12, sp\n\t"
+        /* Load bits 252-376 */
+        "ADD r12, r12, #0x1c\n\t"
+        "LDM r12, {%[a], %[b], %[c], r4, r5}\n\t"
+        "LSL r5, r5, #4\n\t"
+        "ORR r5, r5, r4, LSR #28\n\t"
+        "LSL r4, r4, #4\n\t"
+        "ORR r4, r4, %[c], LSR #28\n\t"
+        "LSL %[c], %[c], #4\n\t"
+        "ORR %[c], %[c], %[b], LSR #28\n\t"
+        "LSL %[b], %[b], #4\n\t"
+        "ORR %[b], %[b], %[a], LSR #28\n\t"
+        "BFC r5, #29, #3\n\t"
+        "SUB r12, r12, #0x1c\n\t"
+        /* Sub product of top 4 words and order */
+        "MOV %[s], sp\n\t"
+        /* * -5cf5d3ed */
+        "MOV %[a], #0x2c13\n\t"
+        "MOVT %[a], #0xa30a\n\t"
+        "MOV lr, #0x0\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
+        "UMLAL r6, lr, %[b], %[a]\n\t"
+        "ADDS r7, r7, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r7, lr, %[c], %[a]\n\t"
+        "ADDS r8, r8, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r8, lr, r4, %[a]\n\t"
+        "ADDS r9, r9, lr\n\t"
+        "MOV lr, #0x0\n\t"
+        "ADC lr, lr, #0x0\n\t"
+        "UMLAL r9, lr, r5, %[a]\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
+        /* * -5812631b */
+        "MOV %[a], #0x9ce5\n\t"
+        "MOVT %[a], #0xa7ed\n\t"
+        "MOV r10, #0x0\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
+        "UMLAL r6, r10, %[b], %[a]\n\t"
+        "ADDS r7, r7, r10\n\t"
+        "MOV r10, #0x0\n\t"
+        "ADC r10, r10, #0x0\n\t"
+        "UMLAL r7, r10, %[c], %[a]\n\t"
+        "ADDS r8, r8, r10\n\t"
+        "MOV r10, #0x0\n\t"
+        "ADC r10, r10, #0x0\n\t"
+        "UMLAL r8, r10, r4, %[a]\n\t"
+        "ADDS r9, r9, r10\n\t"
+        "MOV r10, #0x0\n\t"
+        "ADC r10, r10, #0x0\n\t"
+        "UMLAL r9, r10, r5, %[a]\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
+        /* * -a2f79cd7 */
+        "MOV %[a], #0x6329\n\t"
+        "MOVT %[a], #0x5d08\n\t"
+        "MOV r11, #0x0\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
+        "UMLAL r6, r11, %[b], %[a]\n\t"
+        "ADDS r7, r7, r11\n\t"
+        "MOV r11, #0x0\n\t"
+        "ADC r11, r11, #0x0\n\t"
+        "UMLAL r7, r11, %[c], %[a]\n\t"
+        "ADDS r8, r8, r11\n\t"
+        "MOV r11, #0x0\n\t"
+        "ADC r11, r11, #0x0\n\t"
+        "UMLAL r8, r11, r4, %[a]\n\t"
+        "ADDS r9, r9, r11\n\t"
+        "MOV r11, #0x0\n\t"
+        "ADC r11, r11, #0x0\n\t"
+        "UMLAL r9, r11, r5, %[a]\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
+        /* * -14def9df */
+        "MOV %[a], #0x621\n\t"
+        "MOVT %[a], #0xeb21\n\t"
+        "MOV r12, #0x0\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
+        "UMLAL r6, r12, %[b], %[a]\n\t"
+        "ADDS r7, r7, r12\n\t"
+        "MOV r12, #0x0\n\t"
+        "ADC r12, r12, #0x0\n\t"
+        "UMLAL r7, r12, %[c], %[a]\n\t"
+        "ADDS r8, r8, r12\n\t"
+        "MOV r12, #0x0\n\t"
+        "ADC r12, r12, #0x0\n\t"
+        "UMLAL r8, r12, r4, %[a]\n\t"
+        "ADDS r9, r9, r12\n\t"
+        "MOV r12, #0x0\n\t"
+        "ADC r12, r12, #0x0\n\t"
+        "UMLAL r9, r12, r5, %[a]\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
+        /* Add overflows at 4 * 32 */
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
+        "BFC r9, #28, #4\n\t"
+        "ADDS r6, r6, lr\n\t"
+        "ADCS r7, r7, r10\n\t"
+        "ADCS r8, r8, r11\n\t"
+        "ADC r9, r9, r12\n\t"
+        /* Subtract top at 4 * 32 */
+        "SUBS r6, r6, %[b]\n\t"
+        "SBCS r7, r7, %[c]\n\t"
+        "SBCS r8, r8, r4\n\t"
+        "SBCS r9, r9, r5\n\t"
+        "SBC %[a], %[a], %[a]\n\t"
+        "SUB %[s], %[s], #0x10\n\t"
+        "LDM %[s], {%[b], %[c], r4, r5}\n\t"
+        "MOV r10, #0xd3ed\n\t"
+        "MOVT r10, #0x5cf5\n\t"
+        "MOV r11, #0x631a\n\t"
+        "MOVT r11, #0x5812\n\t"
+        "MOV r12, #0x9cd6\n\t"
+        "MOVT r12, #0xa2f7\n\t"
+        "MOV lr, #0xf9de\n\t"
+        "MOVT lr, #0x14de\n\t"
+        "AND r10, r10, %[a]\n\t"
+        "AND r11, r11, %[a]\n\t"
+        "AND r12, r12, %[a]\n\t"
+        "AND lr, lr, %[a]\n\t"
+        "ADDS %[b], %[b], r10\n\t"
+        "ADCS %[c], %[c], r11\n\t"
+        "ADCS r4, r4, r12\n\t"
+        "ADCS r5, r5, lr\n\t"
+        "ADCS r6, r6, #0x0\n\t"
+        "ADCS r7, r7, #0x0\n\t"
+        "AND %[a], %[a], #0x10000000\n\t"
+        "ADCS r8, r8, #0x0\n\t"
+        "ADC r9, r9, %[a]\n\t"
+        "BFC r9, #28, #4\n\t"
+        "LDR %[s], [sp, #68]\n\t"
+        /* Store result */
+        "STR %[b], [%[s]]\n\t"
+        "STR %[c], [%[s], #4]\n\t"
+        "STR r4, [%[s], #8]\n\t"
+        "STR r5, [%[s], #12]\n\t"
+        "STR r6, [%[s], #16]\n\t"
+        "STR r7, [%[s], #20]\n\t"
+        "STR r8, [%[s], #24]\n\t"
+        "STR r9, [%[s], #28]\n\t"
+        "ADD sp, sp, #0x50\n\t"
+        : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
+        :
+        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr"
+    );
+}
+
+#else
 void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
 {
     register byte* s asm ("r0") = (byte*)s_p;
@@ -3896,10 +6176,10 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "MOV %[c], r12\n\t"
         "ADD lr, sp, #0x20\n\t"
         "STM lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t"
-        "LDR %[s], [sp, #68]\n\t"
+        "MOV %[s], sp\n\t"
         /* Add c to a * b */
         "LDR lr, [sp, #76]\n\t"
-        "LDM sp!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
+        "LDM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
         "LDM lr!, {%[a], r10, r11, r12}\n\t"
         "ADDS %[b], %[b], %[a]\n\t"
         "ADCS %[c], %[c], r10\n\t"
@@ -3911,8 +6191,8 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "ADCS r8, r8, r11\n\t"
         "ADCS r9, r9, r12\n\t"
         "MOV %[a], r9\n\t"
-        "STM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
-        "LDM sp, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
+        "STM %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
+        "LDM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
         "ADCS %[b], %[b], #0x0\n\t"
         "ADCS %[c], %[c], #0x0\n\t"
         "ADCS r4, r4, #0x0\n\t"
@@ -3921,10 +6201,9 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "ADCS r7, r7, #0x0\n\t"
         "ADCS r8, r8, #0x0\n\t"
         "ADC r9, r9, #0x0\n\t"
-        "SUB sp, sp, #0x20\n\t"
+        "SUB %[s], %[s], #0x20\n\t"
         /* Get 252..503 and 504..507 */
         "LSR lr, r9, #24\n\t"
-        "BFC r9, #24, #8\n\t"
        "LSL r9, r9, #4\n\t"
         "ORR r9, r9, r8, LSR #28\n\t"
         "LSL r8, r8, #4\n\t"
@@ -3941,6 +6220,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "ORR %[c], %[c], %[b], LSR #28\n\t"
         "LSL %[b], %[b], #4\n\t"
         "ORR %[b], %[b], %[a], LSR #28\n\t"
+        "BFC r9, #28, #4\n\t"
         /* Add order times bits 504..507 */
         "MOV r10, #0x2c13\n\t"
         "MOVT r10, #0xa30a\n\t"
@@ -3964,96 +6244,107 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "SBCS r8, r8, #0x0\n\t"
         "SBC r9, r9, #0x0\n\t"
         /* Sub product of top 8 words and order */
+        "MOV r12, sp\n\t"
         "MOV %[a], #0x2c13\n\t"
         "MOVT %[a], #0xa30a\n\t"
         "MOV lr, #0x0\n\t"
-        "LDM %[s]!, {r10, r11, r12}\n\t"
+        "LDM %[s]!, {r10, r11}\n\t"
         "UMLAL r10, lr, %[b], %[a]\n\t"
         "UMAAL r11, lr, %[c], %[a]\n\t"
-        "UMAAL r12, lr, r4, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM %[s]!, {r10, r11, r12}\n\t"
-        "UMAAL r10, lr, r5, %[a]\n\t"
-        "UMAAL r11, lr, r6, %[a]\n\t"
-        "UMAAL r12, lr, r7, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM %[s]!, {r10, r11}\n\t"
+        "UMAAL r10, lr, r4, %[a]\n\t"
+        "UMAAL r11, lr, r5, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM %[s]!, {r10, r11}\n\t"
+        "UMAAL r10, lr, r6, %[a]\n\t"
+        "UMAAL r11, lr, r7, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
         "LDM %[s]!, {r10, r11}\n\t"
         "UMAAL r10, lr, r8, %[a]\n\t"
         "BFC r11, #28, #4\n\t"
         "UMAAL r11, lr, r9, %[a]\n\t"
-        "STM sp!, {r10, r11, lr}\n\t"
+        "STM r12!, {r10, r11, lr}\n\t"
         "SUB %[s], %[s], #0x10\n\t"
-        "SUB sp, sp, #0x20\n\t"
+        "SUB r12, r12, #0x20\n\t"
         "MOV %[a], #0x9ce5\n\t"
         "MOVT %[a], #0xa7ed\n\t"
         "MOV lr, #0x0\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "UMLAL r10, lr, %[b], %[a]\n\t"
         "UMAAL r11, lr, %[c], %[a]\n\t"
-        "UMAAL r12, lr, r4, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
-        "UMAAL r10, lr, r5, %[a]\n\t"
-        "UMAAL r11, lr, r6, %[a]\n\t"
-        "UMAAL r12, lr, r7, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11}\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMAAL r10, lr, r4, %[a]\n\t"
+        "UMAAL r11, lr, r5, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMAAL r10, lr, r6, %[a]\n\t"
+        "UMAAL r11, lr, r7, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "UMAAL r10, lr, r8, %[a]\n\t"
         "UMAAL r11, lr, r9, %[a]\n\t"
-        "STM sp!, {r10, r11, lr}\n\t"
-        "SUB sp, sp, #0x20\n\t"
+        "STM r12!, {r10, r11, lr}\n\t"
+        "SUB r12, r12, #0x20\n\t"
         "MOV %[a], #0x6329\n\t"
         "MOVT %[a], #0x5d08\n\t"
         "MOV lr, #0x0\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "UMLAL r10, lr, %[b], %[a]\n\t"
         "UMAAL r11, lr, %[c], %[a]\n\t"
-        "UMAAL r12, lr, r4, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
-        "UMAAL r10, lr, r5, %[a]\n\t"
-        "UMAAL r11, lr, r6, %[a]\n\t"
-        "UMAAL r12, lr, r7, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11}\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMAAL r10, lr, r4, %[a]\n\t"
+        "UMAAL r11, lr, r5, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMAAL r10, lr, r6, %[a]\n\t"
+        "UMAAL r11, lr, r7, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "UMAAL r10, lr, r8, %[a]\n\t"
         "UMAAL r11, lr, r9, %[a]\n\t"
-        "STM sp!, {r10, r11, lr}\n\t"
-        "SUB sp, sp, #0x20\n\t"
+        "STM r12!, {r10, r11, lr}\n\t"
+        "SUB r12, r12, #0x20\n\t"
         "MOV %[a], #0x621\n\t"
         "MOVT %[a], #0xeb21\n\t"
         "MOV lr, #0x0\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "UMLAL r10, lr, %[b], %[a]\n\t"
         "UMAAL r11, lr, %[c], %[a]\n\t"
-        "UMAAL r12, lr, r4, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
-        "UMAAL r10, lr, r5, %[a]\n\t"
-        "UMAAL r11, lr, r6, %[a]\n\t"
-        "UMAAL r12, lr, r7, %[a]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11}\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMAAL r10, lr, r4, %[a]\n\t"
+        "UMAAL r11, lr, r5, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "UMAAL r10, lr, r6, %[a]\n\t"
+        "UMAAL r11, lr, r7, %[a]\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "UMAAL r10, lr, r8, %[a]\n\t"
         "UMAAL r11, lr, r9, %[a]\n\t"
-        "STM sp!, {r10, r11, lr}\n\t"
-        "SUB sp, sp, #0x20\n\t"
+        "STM r12!, {r10, r11, lr}\n\t"
+        "SUB r12, r12, #0x20\n\t"
         /* Subtract at 4 * 32 */
-        "LDM sp, {r10, r11, r12}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "SUBS r10, r10, %[b]\n\t"
         "SBCS r11, r11, %[c]\n\t"
-        "SBCS r12, r12, r4\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
-        "SBCS r10, r10, r5\n\t"
-        "SBCS r11, r11, r6\n\t"
-        "SBCS r12, r12, r7\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11}\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "SBCS r10, r10, r4\n\t"
+        "SBCS r11, r11, r5\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "SBCS r10, r10, r6\n\t"
+        "SBCS r11, r11, r7\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "SBCS r10, r10, r8\n\t"
         "SBC r11, r11, r9\n\t"
-        "STM sp!, {r10, r11}\n\t"
-        "SUB sp, sp, #0x24\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "SUB r12, r12, #0x24\n\t"
         "ASR lr, r11, #25\n\t"
         /* Conditionally subtract order starting at bit 125 */
         "MOV %[a], #0xa0000000\n\t"
@@ -4072,26 +6363,30 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "AND r4, r4, lr\n\t"
         "AND r5, r5, lr\n\t"
         "AND r9, r9, lr\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "ADDS r10, r10, %[a]\n\t"
         "ADCS r11, r11, %[b]\n\t"
-        "ADCS r12, r12, %[c]\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
-        "ADCS r10, r10, r4\n\t"
-        "ADCS r11, r11, r5\n\t"
-        "ADCS r12, r12, #0x0\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "LDM sp, {r10, r11, r12}\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADCS r10, r10, %[c]\n\t"
+        "ADCS r11, r11, r4\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
+        "ADCS r10, r10, r5\n\t"
+        "ADCS r11, r11, #0x0\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10, r11}\n\t"
         "ADCS r10, r10, #0x0\n\t"
         "ADCS r11, r11, #0x0\n\t"
-        "ADCS r12, r12, r9\n\t"
-        "STM sp!, {r10, r11, r12}\n\t"
-        "SUB sp, sp, #0x30\n\t"
+        "STM r12!, {r10, r11}\n\t"
+        "LDM r12, {r10}\n\t"
+        "ADCS r10, r10, #0x0\n\t"
+        "STM r12!, {r10}\n\t"
         "SUB %[s], %[s], #0x10\n\t"
+        "MOV r12, sp\n\t"
         /* Load bits 252-376 */
-        "ADD sp, sp, #0x1c\n\t"
-        "LDM sp, {%[a], %[b], %[c], r4, r5}\n\t"
+        "ADD r12, r12, #0x1c\n\t"
+        "LDM r12, {%[a], %[b], %[c], r4, r5}\n\t"
         "LSL r5, r5, #4\n\t"
         "ORR r5, r5, r4, LSR #28\n\t"
         "LSL r4, r4, #4\n\t"
@@ -4101,54 +6396,55 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "LSL %[b], %[b], #4\n\t"
         "ORR %[b], %[b], %[a], LSR #28\n\t"
         "BFC r5, #29, #3\n\t"
-        "SUB sp, sp, #0x1c\n\t"
-        /* Sub product of top 8 words and order */
+        "SUB r12, r12, #0x1c\n\t"
+        /* Sub product of top 4 words and order */
+        "MOV %[s], sp\n\t"
         /* * -5cf5d3ed */
         "MOV %[a], #0x2c13\n\t"
         "MOVT %[a], #0xa30a\n\t"
         "MOV lr, #0x0\n\t"
-        "LDM sp, {r6, r7, r8, r9}\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
         "UMLAL r6, lr, %[b], %[a]\n\t"
         "UMAAL r7, lr, %[c], %[a]\n\t"
         "UMAAL r8, lr, r4, %[a]\n\t"
         "UMAAL r9, lr, r5, %[a]\n\t"
-        "STM sp, {r6, r7, r8, r9}\n\t"
-        "ADD sp, sp, #0x4\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
        /* * -5812631b */
         "MOV %[a], #0x9ce5\n\t"
         "MOVT %[a], #0xa7ed\n\t"
         "MOV r10, #0x0\n\t"
-        "LDM sp, {r6, r7, r8, r9}\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
         "UMLAL r6, r10, %[b], %[a]\n\t"
         "UMAAL r7, r10, %[c], %[a]\n\t"
         "UMAAL r8, r10, r4, %[a]\n\t"
         "UMAAL r9, r10, r5, %[a]\n\t"
-        "STM sp, {r6, r7, r8, r9}\n\t"
-        "ADD sp, sp, #0x4\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
         /* * -a2f79cd7 */
         "MOV %[a], #0x6329\n\t"
         "MOVT %[a], #0x5d08\n\t"
         "MOV r11, #0x0\n\t"
-        "LDM sp, {r6, r7, r8, r9}\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
         "UMLAL r6, r11, %[b], %[a]\n\t"
         "UMAAL r7, r11, %[c], %[a]\n\t"
         "UMAAL r8, r11, r4, %[a]\n\t"
         "UMAAL r9, r11, r5, %[a]\n\t"
-        "STM sp, {r6, r7, r8, r9}\n\t"
-        "ADD sp, sp, #0x4\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
         /* * -14def9df */
         "MOV %[a], #0x621\n\t"
         "MOVT %[a], #0xeb21\n\t"
         "MOV r12, #0x0\n\t"
-        "LDM sp, {r6, r7, r8, r9}\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
         "UMLAL r6, r12, %[b], %[a]\n\t"
         "UMAAL r7, r12, %[c], %[a]\n\t"
         "UMAAL r8, r12, r4, %[a]\n\t"
         "UMAAL r9, r12, r5, %[a]\n\t"
-        "STM sp, {r6, r7, r8, r9}\n\t"
-        "ADD sp, sp, #0x4\n\t"
+        "STM %[s], {r6, r7, r8, r9}\n\t"
+        "ADD %[s], %[s], #0x4\n\t"
         /* Add overflows at 4 * 32 */
-        "LDM sp, {r6, r7, r8, r9}\n\t"
+        "LDM %[s], {r6, r7, r8, r9}\n\t"
         "BFC r9, #28, #4\n\t"
         "ADDS r6, r6, lr\n\t"
         "ADCS r7, r7, r10\n\t"
@@ -4160,8 +6456,8 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "SBCS r8, r8, r4\n\t"
         "SBCS r9, r9, r5\n\t"
         "SBC %[a], %[a], %[a]\n\t"
-        "SUB sp, sp, #0x10\n\t"
-        "LDM sp, {%[b], %[c], r4, r5}\n\t"
+        "SUB %[s], %[s], #0x10\n\t"
+        "LDM %[s], {%[b], %[c], r4, r5}\n\t"
         "MOV r10, #0xd3ed\n\t"
         "MOVT r10, #0x5cf5\n\t"
         "MOV r11, #0x631a\n\t"
@@ -4184,8 +6480,16 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "ADCS r8, r8, #0x0\n\t"
         "ADC r9, r9, %[a]\n\t"
         "BFC r9, #28, #4\n\t"
+        "LDR %[s], [sp, #68]\n\t"
         /* Store result */
-        "STM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
+        "STR %[b], [%[s]]\n\t"
+        "STR %[c], [%[s], #4]\n\t"
+        "STR r4, [%[s], #8]\n\t"
+        "STR r5, [%[s], #12]\n\t"
+        "STR r6, [%[s], #16]\n\t"
+        "STR r7, [%[s], #20]\n\t"
+        "STR r8, [%[s], #24]\n\t"
+        "STR r9, [%[s], #28]\n\t"
         "ADD sp, sp, #0x50\n\t"
         : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
         :
@@ -4193,6 +6497,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
     );
 }
 
+#endif /* WOLFSSL_SP_NO_UMAAL */
 #endif /* HAVE_ED25519_SIGN */
 
 #endif /* HAVE_ED25519 */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S
index e9721428e..91dc10b37 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S
@@ -125,10 +125,14 @@ Transform_Sha256_Len:
         # Start of loop processing a block
 L_SHA256_transform_len_begin:
         # Load, Reverse and Store W - 64 bytes
-        LDRD r4, r5, [r1]
-        LDRD r6, r7, [r1, #8]
-        LDRD r8, r9, [r1, #16]
-        LDRD r10, r11, [r1, #24]
+        LDR r4, [r1]
+        LDR r5, [r1, #4]
+        LDR r6, [r1, #8]
+        LDR r7, [r1, #12]
+        LDR r8, [r1, #16]
+        LDR r9, [r1, #20]
+        LDR r10, [r1, #24]
+        LDR r11, [r1, #28]
         REV r4, r4
         REV r5, r5
         REV r6, r6
@@ -141,10 +145,14 @@ L_SHA256_transform_len_begin:
         STRD r6, r7, [sp, #8]
         STRD r8, r9, [sp, #16]
         STRD r10, r11, [sp, #24]
-        LDRD r4, r5, [r1, #32]
-        LDRD r6, r7, [r1, #40]
-        LDRD r8, r9, [r1, #48]
-        LDRD r10, r11, [r1, #56]
+        LDR r4, [r1, #32]
+        LDR r5, [r1, #36]
+        LDR r6, [r1, #40]
+        LDR r7, [r1, #44]
+        LDR r8, [r1, #48]
+        LDR r9, [r1, #52]
+        LDR r10, [r1, #56]
+        LDR r11, [r1, #60]
         REV r4, r4
         REV r5, r5
         REV r6, r6
@@ -1461,7 +1469,7 @@ L_SHA256_transform_len_start:
         BNE L_SHA256_transform_len_begin
         ADD sp, sp, #0xc0
         POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-        # Cycle Count = 1866
+        # Cycle Count = 1874
         .size Transform_Sha256_Len,.-Transform_Sha256_Len
 #endif /* WOLFSSL_ARMASM_NO_NEON */
 #endif /* !NO_SHA256 */
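Splitting each LDRD into two LDRs is what lets the unaligned-input tests pass on Thumb-2: LDRD/STRD always require word alignment, while a plain LDR may be unaligned on ARMv7(-M) when unaligned access support is enabled, at a small cycle cost (1866 to 1874 above). Portable C gets the same safety with memcpy; a sketch of the load-plus-REV step (the helper name is illustrative only):

#include <stdint.h>
#include <string.h>

/* Load one big-endian SHA-256 message word from a possibly unaligned
 * pointer: memcpy compiles down to an unaligned-capable load (like LDR),
 * and the byte swap mirrors the REV step on a little-endian target. */
static uint32_t load_be32(const uint8_t* p)
{
    uint32_t w;
    memcpy(&w, p, sizeof(w));
#if defined(__GNUC__)
    return __builtin_bswap32(w);
#else
    return (w >> 24) | ((w >> 8) & 0x0000ff00U) |
           ((w << 8) & 0x00ff0000U) | (w << 24);
#endif
}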
diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
index f7b396a82..a21a607fe 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
@@ -84,10 +84,14 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
         "\n"
     "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load, Reverse and Store W - 64 bytes */
-        "LDRD r4, r5, [%[data]]\n\t"
-        "LDRD r6, r7, [%[data], #8]\n\t"
-        "LDRD r8, r9, [%[data], #16]\n\t"
-        "LDRD r10, r11, [%[data], #24]\n\t"
+        "LDR r4, [%[data]]\n\t"
+        "LDR r5, [%[data], #4]\n\t"
+        "LDR r6, [%[data], #8]\n\t"
+        "LDR r7, [%[data], #12]\n\t"
+        "LDR r8, [%[data], #16]\n\t"
+        "LDR r9, [%[data], #20]\n\t"
+        "LDR r10, [%[data], #24]\n\t"
+        "LDR r11, [%[data], #28]\n\t"
         "REV r4, r4\n\t"
         "REV r5, r5\n\t"
         "REV r6, r6\n\t"
@@ -100,10 +104,14 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
         "STRD r6, r7, [sp, #8]\n\t"
         "STRD r8, r9, [sp, #16]\n\t"
         "STRD r10, r11, [sp, #24]\n\t"
-        "LDRD r4, r5, [%[data], #32]\n\t"
-        "LDRD r6, r7, [%[data], #40]\n\t"
-        "LDRD r8, r9, [%[data], #48]\n\t"
-        "LDRD r10, r11, [%[data], #56]\n\t"
+        "LDR r4, [%[data], #32]\n\t"
+        "LDR r5, [%[data], #36]\n\t"
+        "LDR r6, [%[data], #40]\n\t"
+        "LDR r7, [%[data], #44]\n\t"
+        "LDR r8, [%[data], #48]\n\t"
+        "LDR r9, [%[data], #52]\n\t"
+        "LDR r10, [%[data], #56]\n\t"
+        "LDR r11, [%[data], #60]\n\t"
         "REV r4, r4\n\t"
         "REV r5, r5\n\t"
         "REV r6, r6\n\t"
diff --git a/wolfssl/wolfcrypt/curve25519.h b/wolfssl/wolfcrypt/curve25519.h
index 7f6bed03b..7ebb2d542 100644
--- a/wolfssl/wolfcrypt/curve25519.h
+++ b/wolfssl/wolfcrypt/curve25519.h
@@ -61,9 +61,9 @@ typedef struct {
 /* ECC point, the internal structure is Little endian
  * the mathematical functions used the endianness */
 typedef struct ECPoint {
-    byte point[CURVE25519_KEYSIZE];
+    ALIGN16 byte point[CURVE25519_KEYSIZE];
 #ifdef FREESCALE_LTC_ECC
-    byte pointY[CURVE25519_KEYSIZE];
+    ALIGN16 byte pointY[CURVE25519_KEYSIZE];
 #endif
     byte pointSz;
 } ECPoint;
@@ -80,8 +80,8 @@ struct curve25519_key {
                              curve in dp */
     const curve25519_set_type* dp;   /* domain parameters, either points to
                                         curves (idx >= 0) or user supplied */
-    ECPoint   p;        /* public point for key */
-    byte      k[CURVE25519_KEYSIZE]; /* private scaler for key */
+    ECPoint p;                       /* public point for key */
+    ALIGN16 byte k[CURVE25519_KEYSIZE]; /* private scalar for key */
 
 #ifdef WOLFSSL_ASYNC_CRYPT
     WC_ASYNC_DEV asyncDev;
diff --git a/wolfssl/wolfcrypt/ed25519.h b/wolfssl/wolfcrypt/ed25519.h
index 8306f44b1..7d14418f0 100644
--- a/wolfssl/wolfcrypt/ed25519.h
+++ b/wolfssl/wolfcrypt/ed25519.h
@@ -85,12 +85,12 @@ enum {
 
 /* An ED25519 Key */
 struct ed25519_key {
-    byte   p[ED25519_PUB_KEY_SIZE]; /* compressed public key */
-    byte   k[ED25519_PRV_KEY_SIZE]; /* private key : 32 secret -- 32 public */
+    ALIGN16 byte p[ED25519_PUB_KEY_SIZE]; /* compressed public key */
+    ALIGN16 byte k[ED25519_PRV_KEY_SIZE]; /* private key: 32 secret, 32 pub */
 #ifdef FREESCALE_LTC_ECC
     /* uncompressed point coordinates */
-    byte pointX[ED25519_KEY_SIZE]; /* recovered X coordinate */
-    byte pointY[ED25519_KEY_SIZE]; /* Y coordinate is the public key with The most significant bit of the final octet always zero. */
+    ALIGN16 byte pointX[ED25519_KEY_SIZE]; /* recovered X coordinate */
+    ALIGN16 byte pointY[ED25519_KEY_SIZE]; /* Y coordinate is the public key with the most significant bit of the final octet always zero. */
 #endif
 #ifdef WOLFSSL_SE050
     word32 keyId;
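The ALIGN16 annotations on the key and point buffers are the other half of the alignment story: rather than teaching every assembly routine to tolerate arbitrary addresses, the buffers the assembly touches are pinned to a 16-byte boundary. wolfSSL defines ALIGN16 centrally (in wolfssl/wolfcrypt/types.h); the sketch below only shows the usual compiler-specific spellings such a macro expands to, under an assumed name:

#if defined(__GNUC__) || defined(__clang__)
    #define MY_ALIGN16 __attribute__((aligned(16)))
#elif defined(_MSC_VER)
    #define MY_ALIGN16 __declspec(align(16))
#else
    #define MY_ALIGN16 /* no-op: fall back to natural alignment */
#endif

/* With the attribute on the member, LDM/LDRD-style accesses from the
 * hand-written assembly never see a misaligned start address. */
struct key_sketch {
    MY_ALIGN16 unsigned char k[32]; /* e.g. a Curve25519 private scalar */
};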
diff --git a/wolfssl/wolfcrypt/ge_operations.h b/wolfssl/wolfcrypt/ge_operations.h
index c8a8b8a94..4438f8fca 100644
--- a/wolfssl/wolfcrypt/ge_operations.h
+++ b/wolfssl/wolfcrypt/ge_operations.h
@@ -46,15 +46,15 @@ Representations:
 */
 
 #ifdef ED25519_SMALL
-    typedef byte     ge[F25519_SIZE];
+    ALIGN16 typedef byte     ge[F25519_SIZE];
 #elif defined(CURVED25519_ASM_64BIT)
-    typedef sword64  ge[4];
+    ALIGN16 typedef sword64  ge[4];
 #elif defined(CURVED25519_ASM_32BIT)
-    typedef sword32  ge[8];
+    ALIGN16 typedef sword32  ge[8];
 #elif defined(CURVED25519_128BIT)
-    typedef sword64  ge[5];
+    ALIGN16 typedef sword64  ge[5];
 #else
-    typedef sword32  ge[10];
+    ALIGN16 typedef sword32  ge[10];
 #endif
 
 typedef struct {
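Placing ALIGN16 on the ge typedefs, rather than on each declaration, makes the alignment part of the type itself, so stack temporaries and struct members declared as ge inherit it automatically. A standalone sketch of the same idea (GCC/Clang attribute syntax, hypothetical type name):

#include <stdint.h>
#include <assert.h>

/* The attribute on the typedef makes every object of the type 16-byte
 * aligned, wherever it is declared. */
typedef int32_t ge_sketch[8] __attribute__((aligned(16)));

int main(void)
{
    ge_sketch t;                          /* a stack temporary */
    assert(((uintptr_t)&t & 0xf) == 0);   /* holds on GCC/Clang targets */
    return 0;
}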