Merge pull request #8020 from SparkiDev/arm32_base_chacha20_poly1305

ARM32 ChaCha20, Poly1305: assembly code
David Garske
2024-09-30 06:53:37 -07:00
committed by GitHub
21 changed files with 2490 additions and 294 deletions
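This PR adds base ARM32 (non-NEON) assembly implementations of ChaCha20 and Poly1305 and wires them into the automake build. For orientation, below is a minimal caller-side sketch of the existing one-shot AEAD API that these primitives back; it is not part of the diff and assumes the standard wolfCrypt names from wolfssl/wolfcrypt/chacha20_poly1305.h (wc_ChaCha20Poly1305_Encrypt and the CHACHA20_POLY1305_AEAD_* size constants).

/* Caller-side sketch only: exercises the ChaCha20-Poly1305 code paths that
 * the new ARM32 assembly accelerates when WOLFSSL_ARMASM is defined. */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/chacha20_poly1305.h>

static int aead_encrypt_example(void)
{
    byte key[CHACHA20_POLY1305_AEAD_KEYSIZE] = {0};   /* 32-byte key   */
    byte iv[CHACHA20_POLY1305_AEAD_IV_SIZE]  = {0};   /* 12-byte nonce */
    byte aad[4]  = {1, 2, 3, 4};
    byte msg[32] = {0};
    byte out[32];
    byte tag[CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE];

    /* Returns 0 on success; the key stream and MAC come from whichever back
     * end (generic C, NEON, or the new ARM32 assembly) was selected at build
     * time. */
    return wc_ChaCha20Poly1305_Encrypt(key, iv, aad, sizeof(aad),
                                       msg, sizeof(msg), out, tag);
}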

View File: src/include.am

@ -164,13 +164,11 @@ if BUILD_ARMASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c
endif BUILD_ARMASM
if BUILD_ARMASM_NEON
if !BUILD_ARMASM_CRYPTO
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S
endif !BUILD_ARMASM_INLINE
endif !BUILD_ARMASM_CRYPTO
else
if BUILD_ARMASM
if BUILD_ARMASM_INLINE
@ -336,13 +334,11 @@ if BUILD_ARMASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c
endif BUILD_ARMASM
if BUILD_ARMASM_NEON
if !BUILD_ARMASM_CRYPTO
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S
endif !BUILD_ARMASM_INLINE
endif !BUILD_ARMASM_CRYPTO
else
if BUILD_ARMASM
if BUILD_ARMASM_INLINE
@ -701,7 +697,6 @@ if BUILD_ARMASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c
endif BUILD_ARMASM
if BUILD_ARMASM_NEON
if !BUILD_ARMASM_CRYPTO
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
@ -709,7 +704,6 @@ else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm.S
endif !BUILD_ARMASM_INLINE
endif !BUILD_ARMASM_CRYPTO
else
if BUILD_ARMASM
if BUILD_ARMASM_INLINE
@ -924,8 +918,10 @@ if BUILD_ARMASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-poly1305.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305.c
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
endif !BUILD_ARMASM_INLINE
endif
@ -999,17 +995,17 @@ endif
if BUILD_CHACHA
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha.c
if BUILD_ARMASM_NEON
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c
else
if BUILD_ARMASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha.c
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm.S
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm.S
endif !BUILD_ARMASM_INLINE
endif BUILD_ARMASM
else
if BUILD_RISCV_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-chacha.c
endif BUILD_RISCV_ASM
@ -1018,7 +1014,7 @@ if BUILD_INTELASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha_asm.S
endif BUILD_INTELASM
endif !BUILD_X86_ASM
endif !BUILD_ARMASM_NEON
endif !BUILD_ARMASM
if BUILD_POLY1305
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c
endif BUILD_POLY1305
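Net effect of the automake changes above: ARM32 and Thumb-2 builds without NEON now compile the new ChaCha20 and Poly1305 assembly sources, and BUILD_ARMASM_INLINE selects the inline-assembly *_c.c variant instead of the standalone .S file. A rough sketch of how those conditionals correspond to the preprocessor guards inside the generated sources (the configure wiring itself is outside this diff):

/* Sketch: build conditionals vs. the guards in the generated files.
 *   BUILD_ARMASM        -> WOLFSSL_ARMASM
 *   BUILD_ARMASM_NEON   -> NEON usable (otherwise WOLFSSL_ARMASM_NO_NEON)
 *   BUILD_ARMASM_INLINE -> WOLFSSL_ARMASM_INLINE (compile *_c.c, not *.S)
 */
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
/* armv8-32-chacha-asm.S / armv8-32-chacha-asm_c.c and the matching
 * poly1305 files provide the ARM32 routines in this configuration. */
#endif
#endif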

View File: wolfcrypt/src/chacha.c

@ -72,8 +72,7 @@ Public domain.
#endif /* HAVE_CHACHA */
#if defined(WOLFSSL_ARMASM) && (!defined(WOLFSSL_ARMASM_NO_NEON) || \
defined(__thumb__))
#if defined(WOLFSSL_ARMASM)
/* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */
#elif defined(WOLFSSL_RISCV_ASM)
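With the NEON/Thumb-only condition dropped, the dispatch at the top of chacha.c reduces to the shape below (context sketch, not an additional hunk):

#if defined(WOLFSSL_ARMASM)
    /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */
#elif defined(WOLFSSL_RISCV_ASM)
    /* implementation is located in wolfcrypt/src/port/riscv/riscv-64-chacha.c */
#else
    /* generic C implementation follows */
#endif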

View File: wolfcrypt/src/poly1305.c

@ -232,7 +232,7 @@ extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
}
#endif/* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */
/* if not 64 bit then use 32 bit */
#elif !defined(WOLFSSL_ARMASM) || !defined(__thumb__)
#elif !defined(WOLFSSL_ARMASM)
static word32 U8TO32(const byte *p)
{
@ -269,8 +269,7 @@ static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
}
#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \
!defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM)
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
/*
This local function operates on a message with a given number of bytes
with a given ctx pointer to a Poly1305 structure.
@ -789,8 +788,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
return 0;
}
#endif /* (!WOLFSSL_ARMASM || (!__aarch64__ && !__thumb__)) &&
* !WOLFSSL_RISCV_ASM */
#endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */
int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
@ -885,8 +883,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
/* process full blocks */
if (bytes >= POLY1305_BLOCK_SIZE) {
size_t want = ((size_t)bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1));
#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \
!defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM)
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
int ret;
ret = poly1305_blocks(ctx, m, want);
if (ret != 0)
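These poly1305.c hunks swap the aarch64/Thumb-only conditions for a plain WOLFSSL_ARMASM check, so the generic poly1305_blocks()/wc_Poly1305Final() C path is built only when no assembly back end is present. The 32-bit path leans on two small endian helpers that the hunk shows only by name; typical definitions look like the sketch below (the real bodies live in poly1305.c and are not part of this diff; word32, byte and WC_INLINE are wolfSSL's types):

/* Sketch of the little-endian helpers referenced above. */
static word32 U8TO32(const byte* p)
{
    return ((word32)p[0]) |
           ((word32)p[1] << 8) |
           ((word32)p[2] << 16) |
           ((word32)p[3] << 24);
}

/* Writes a 32-bit length as the low half of a little-endian 64-bit value,
 * as used when folding the AAD/ciphertext lengths into the final block. */
static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
{
    outLe64[0] = (byte)(inLe32);
    outLe64[1] = (byte)(inLe32 >>  8);
    outLe64[2] = (byte)(inLe32 >> 16);
    outLe64[3] = (byte)(inLe32 >> 24);
    outLe64[4] = 0;
    outLe64[5] = 0;
    outLe64[6] = 0;
    outLe64[7] = 0;
}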

View File: wolfcrypt/src/port/arm/armv8-32-aes-asm.S

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./aes/aes.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
* ruby ./aes/aes.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
*/
#ifdef HAVE_CONFIG_H

View File: wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./aes/aes.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.c
* ruby ./aes/aes.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.c
*/
#ifdef HAVE_CONFIG_H
@ -123,7 +124,9 @@ static const uint32_t L_AES_ARM32_td_data[] = {
};
#endif /* HAVE_AES_DECRYPT */
#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
static const uint32_t L_AES_ARM32_te_data[] = {
0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b,
0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
@ -191,15 +194,19 @@ static const uint32_t L_AES_ARM32_te_data[] = {
0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616,
};
#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
#ifdef HAVE_AES_DECRYPT
static const uint32_t* L_AES_ARM32_td = L_AES_ARM32_td_data;
#endif /* HAVE_AES_DECRYPT */
#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
static const uint32_t* L_AES_ARM32_te = L_AES_ARM32_te_data;
#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
#ifdef HAVE_AES_DECRYPT
void AES_invert_key(unsigned char* ks, word32 rounds);
void AES_invert_key(unsigned char* ks_p, word32 rounds_p);
void AES_invert_key(unsigned char* ks_p, word32 rounds_p)
{
register unsigned char* ks asm ("r0") = (unsigned char*)ks_p;
@ -401,9 +408,12 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p)
"str r8, [%[ks]], #4\n\t"
"subs r11, r11, #1\n\t"
"bne L_AES_invert_key_mix_loop_%=\n\t"
: [ks] "+r" (ks), [rounds] "+r" (rounds), [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c), [L_AES_ARM32_td] "+r" (L_AES_ARM32_td_c)
: [ks] "+r" (ks), [rounds] "+r" (rounds),
[L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c),
[L_AES_ARM32_td] "+r" (L_AES_ARM32_td_c)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11"
);
}
@ -411,17 +421,20 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p)
static const uint32_t L_AES_ARM32_rcon[] = {
0x01000000, 0x02000000, 0x04000000, 0x08000000,
0x10000000, 0x20000000, 0x40000000, 0x80000000,
0x1b000000, 0x36000000,
0x1b000000, 0x36000000
};
void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks);
void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char* ks_p)
void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p,
unsigned char* ks_p);
void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p,
unsigned char* ks_p)
{
register const unsigned char* key asm ("r0") = (const unsigned char*)key_p;
register word32 len asm ("r1") = (word32)len_p;
register unsigned char* ks asm ("r2") = (unsigned char*)ks_p;
register uint32_t* L_AES_ARM32_te_c asm ("r3") = (uint32_t*)L_AES_ARM32_te;
register uint32_t* L_AES_ARM32_rcon_c asm ("r4") = (uint32_t*)&L_AES_ARM32_rcon;
register uint32_t* L_AES_ARM32_rcon_c asm ("r4") =
(uint32_t*)&L_AES_ARM32_rcon;
__asm__ __volatile__ (
"mov r8, %[L_AES_ARM32_te]\n\t"
@ -922,14 +935,18 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char
"bne L_AES_set_encrypt_key_loop_128_%=\n\t"
"\n"
"L_AES_set_encrypt_key_end_%=: \n\t"
: [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c), [L_AES_ARM32_rcon] "+r" (L_AES_ARM32_rcon_c)
: [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks),
[L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c),
[L_AES_ARM32_rcon] "+r" (L_AES_ARM32_rcon_c)
:
: "memory", "r12", "lr", "r5", "r6", "r7", "r8", "cc"
: "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8"
);
}
void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks);
void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t* ks_p)
void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p,
const uint32_t* ks_p);
void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p,
const uint32_t* ks_p)
{
register const uint32_t* te asm ("r0") = (const uint32_t*)te_p;
register int nr asm ("r1") = (int)nr_p;
@ -1573,23 +1590,27 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
: [te] "+r" (te), [nr] "+r" (nr), [len] "+r" (len), [ks] "+r" (ks)
: [te] "+r" (te), [nr] "+r" (nr), [len] "+r" (len), [ks] "+r" (ks)
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
static const uint32_t* L_AES_ARM32_te_ecb = L_AES_ARM32_te_data;
void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr);
void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p)
void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p);
void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p)
{
register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
register unsigned char* out asm ("r1") = (unsigned char*)out_p;
register unsigned long len asm ("r2") = (unsigned long)len_p;
register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
register int nr asm ("r4") = (int)nr_p;
register uint32_t* L_AES_ARM32_te_ecb_c asm ("r5") = (uint32_t*)L_AES_ARM32_te_ecb;
register uint32_t* L_AES_ARM32_te_ecb_c asm ("r5") =
(uint32_t*)L_AES_ARM32_te_ecb;
__asm__ __volatile__ (
"mov lr, %[in]\n\t"
@ -1822,17 +1843,23 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
"\n"
"L_AES_ECB_encrypt_end_%=: \n\t"
"pop {%[ks]}\n\t"
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c)
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
[nr] "+r" (nr), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c)
:
: "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11"
);
}
#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT ||
* WOLFSSL_AES_COUNTER */
#ifdef HAVE_AES_CBC
static const uint32_t* L_AES_ARM32_te_cbc = L_AES_ARM32_te_data;
void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv);
void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p)
void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* iv_p);
void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* iv_p)
{
register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
register unsigned char* out asm ("r1") = (unsigned char*)out_p;
@ -1840,7 +1867,8 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
register int nr asm ("r4") = (int)nr_p;
register unsigned char* iv asm ("r5") = (unsigned char*)iv_p;
register uint32_t* L_AES_ARM32_te_cbc_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_cbc;
register uint32_t* L_AES_ARM32_te_cbc_c asm ("r6") =
(uint32_t*)L_AES_ARM32_te_cbc;
__asm__ __volatile__ (
"mov r8, r4\n\t"
@ -2088,17 +2116,23 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
"L_AES_CBC_encrypt_end_%=: \n\t"
"pop {%[ks], r9}\n\t"
"stm r9, {r4, r5, r6, r7}\n\t"
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_ARM32_te_cbc] "+r" (L_AES_ARM32_te_cbc_c)
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
[nr] "+r" (nr), [iv] "+r" (iv),
[L_AES_ARM32_te_cbc] "+r" (L_AES_ARM32_te_cbc_c)
:
: "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
);
}
#endif /* HAVE_AES_CBC */
#ifdef WOLFSSL_AES_COUNTER
static const uint32_t* L_AES_ARM32_te_ctr = L_AES_ARM32_te_data;
void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr);
void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p)
void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* ctr_p);
void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* ctr_p)
{
register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
register unsigned char* out asm ("r1") = (unsigned char*)out_p;
@ -2106,7 +2140,8 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
register int nr asm ("r4") = (int)nr_p;
register unsigned char* ctr asm ("r5") = (unsigned char*)ctr_p;
register uint32_t* L_AES_ARM32_te_ctr_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_ctr;
register uint32_t* L_AES_ARM32_te_ctr_c asm ("r6") =
(uint32_t*)L_AES_ARM32_te_ctr;
__asm__ __volatile__ (
"mov r12, r4\n\t"
@ -2356,16 +2391,19 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
"rev r7, r7\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
"stm r8, {r4, r5, r6, r7}\n\t"
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_ARM32_te_ctr] "+r" (L_AES_ARM32_te_ctr_c)
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
[nr] "+r" (nr), [ctr] "+r" (ctr),
[L_AES_ARM32_te_ctr] "+r" (L_AES_ARM32_te_ctr_c)
:
: "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
);
}
#endif /* WOLFSSL_AES_COUNTER */
#ifdef HAVE_AES_DECRYPT
#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC)
void AES_decrypt_block(const uint32_t* td, int nr, const uint8_t* td4);
#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
defined(HAVE_AES_CBC)
void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p);
void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p)
{
register const uint32_t* td asm ("r0") = (const uint32_t*)td_p;
@ -3009,9 +3047,9 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p)
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
: [td] "+r" (td), [nr] "+r" (nr), [td4] "+r" (td4)
: [td] "+r" (td), [nr] "+r" (nr), [td4] "+r" (td4)
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -3052,16 +3090,20 @@ static const unsigned char L_AES_ARM32_td4[] = {
};
#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr);
void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p)
void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p);
void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p)
{
register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
register unsigned char* out asm ("r1") = (unsigned char*)out_p;
register unsigned long len asm ("r2") = (unsigned long)len_p;
register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
register int nr asm ("r4") = (int)nr_p;
register uint32_t* L_AES_ARM32_td_ecb_c asm ("r5") = (uint32_t*)L_AES_ARM32_td_ecb;
register unsigned char* L_AES_ARM32_td4_c asm ("r6") = (unsigned char*)&L_AES_ARM32_td4;
register uint32_t* L_AES_ARM32_td_ecb_c asm ("r5") =
(uint32_t*)L_AES_ARM32_td_ecb;
register unsigned char* L_AES_ARM32_td4_c asm ("r6") =
(unsigned char*)&L_AES_ARM32_td4;
__asm__ __volatile__ (
"mov r8, r4\n\t"
@ -3291,16 +3333,22 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
"bne L_AES_ECB_decrypt_loop_block_128_%=\n\t"
"\n"
"L_AES_ECB_decrypt_end_%=: \n\t"
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c), [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
[nr] "+r" (nr), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c),
[L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
:
: "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
);
}
#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
#ifdef HAVE_AES_CBC
void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv);
void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p)
void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* iv_p);
void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* iv_p)
{
register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
register unsigned char* out asm ("r1") = (unsigned char*)out_p;
@ -3308,8 +3356,10 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
register int nr asm ("r4") = (int)nr_p;
register unsigned char* iv asm ("r5") = (unsigned char*)iv_p;
register uint32_t* L_AES_ARM32_td_ecb_c asm ("r6") = (uint32_t*)L_AES_ARM32_td_ecb;
register unsigned char* L_AES_ARM32_td4_c asm ("r7") = (unsigned char*)&L_AES_ARM32_td4;
register uint32_t* L_AES_ARM32_td_ecb_c asm ("r6") =
(uint32_t*)L_AES_ARM32_td_ecb;
register unsigned char* L_AES_ARM32_td4_c asm ("r7") =
(unsigned char*)&L_AES_ARM32_td4;
__asm__ __volatile__ (
"mov r8, r4\n\t"
@ -3923,9 +3973,12 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
"\n"
"L_AES_CBC_decrypt_end_%=: \n\t"
"pop {%[ks]-r4}\n\t"
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c), [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
[nr] "+r" (nr), [iv] "+r" (iv),
[L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c),
[L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
:
: "memory", "r12", "lr", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r8", "r9", "r10", "r11"
);
}
@ -3940,14 +3993,18 @@ static const uint32_t L_GCM_gmult_len_r[] = {
0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000,
};
void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned char* data, unsigned long len);
void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, const unsigned char* data_p, unsigned long len_p)
void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p,
const unsigned char* data_p, unsigned long len_p);
void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p,
const unsigned char* data_p, unsigned long len_p)
{
register unsigned char* x asm ("r0") = (unsigned char*)x_p;
register const unsigned char** m asm ("r1") = (const unsigned char**)m_p;
register const unsigned char* data asm ("r2") = (const unsigned char*)data_p;
register const unsigned char* data asm ("r2") =
(const unsigned char*)data_p;
register unsigned long len asm ("r3") = (unsigned long)len_p;
register uint32_t* L_GCM_gmult_len_r_c asm ("r4") = (uint32_t*)&L_GCM_gmult_len_r;
register uint32_t* L_GCM_gmult_len_r_c asm ("r4") =
(uint32_t*)&L_GCM_gmult_len_r;
__asm__ __volatile__ (
"mov lr, %[L_GCM_gmult_len_r]\n\t"
@ -4521,15 +4578,21 @@ void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, const unsigned
"subs %[len], %[len], #16\n\t"
"add %[data], %[data], #16\n\t"
"bne L_GCM_gmult_len_start_block_%=\n\t"
: [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c)
: [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len),
[L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c)
:
: "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10",
"r11"
);
}
static const uint32_t* L_AES_ARM32_te_gcm = L_AES_ARM32_te_data;
void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr);
void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p)
void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* ctr_p);
void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p,
unsigned long len_p, const unsigned char* ks_p, int nr_p,
unsigned char* ctr_p)
{
register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
register unsigned char* out asm ("r1") = (unsigned char*)out_p;
@ -4537,7 +4600,8 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
register int nr asm ("r4") = (int)nr_p;
register unsigned char* ctr asm ("r5") = (unsigned char*)ctr_p;
register uint32_t* L_AES_ARM32_te_gcm_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_gcm;
register uint32_t* L_AES_ARM32_te_gcm_c asm ("r6") =
(uint32_t*)L_AES_ARM32_te_gcm;
__asm__ __volatile__ (
"mov r12, r4\n\t"
@ -4778,9 +4842,11 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l
"rev r7, r7\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
"stm r8, {r4, r5, r6, r7}\n\t"
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c)
: [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
[nr] "+r" (nr), [ctr] "+r" (ctr),
[L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c)
:
: "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
);
}
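Nearly all of the armv8-32-aes-asm_c.c changes above are mechanical reformatting of the regenerated output: long prototypes and operand lists are wrapped to fit, and every clobber list now puts "cc" right after "memory" before the scratch registers, with no functional change. A tiny standalone illustration of that extended-asm convention (hypothetical function, ARM32 only, not taken from the diff):

/* Illustration of the clobber-list style used throughout the regenerated
 * file: "memory" and "cc" first, then any scratch registers. */
static unsigned int add_three(unsigned int x)
{
    __asm__ __volatile__ (
        "adds %[x], %[x], #3\n\t"  /* flag-setting add, hence the "cc" clobber */
        : [x] "+r" (x)
        :
        : "memory", "cc"
    );
    return x;
}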

View File: wolfcrypt/src/port/arm/armv8-32-chacha-asm.S

@ -0,0 +1,523 @@
/* armv8-32-chacha-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./chacha/chacha.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_CHACHA
.text
.align 4
.globl wc_chacha_setiv
.type wc_chacha_setiv, %function
wc_chacha_setiv:
push {r4, lr}
add r3, r0, #52
ldr r4, [r1]
ldr r12, [r1, #4]
ldr lr, [r1, #8]
str r2, [r0, #48]
#ifdef BIG_ENDIAN_ORDER
rev r4, r4
rev r12, r12
rev lr, lr
#endif /* BIG_ENDIAN_ORDER */
stm r3, {r4, r12, lr}
pop {r4, pc}
.size wc_chacha_setiv,.-wc_chacha_setiv
.text
.type L_chacha_arm32_constants, %object
.size L_chacha_arm32_constants, 32
.align 4
L_chacha_arm32_constants:
.word 0x61707865
.word 0x3120646e
.word 0x79622d36
.word 0x6b206574
.word 0x61707865
.word 0x3320646e
.word 0x79622d32
.word 0x6b206574
.text
.align 4
.globl wc_chacha_setkey
.type wc_chacha_setkey, %function
wc_chacha_setkey:
push {r4, r5, lr}
adr r3, L_chacha_arm32_constants
subs r2, r2, #16
add r3, r3, r2
# Start state with constants
ldm r3, {r4, r5, r12, lr}
stm r0!, {r4, r5, r12, lr}
# Next is first 16 bytes of key.
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r12, [r1, #8]
ldr lr, [r1, #12]
#ifdef BIG_ENDIAN_ORDER
rev r4, r4
rev r5, r5
rev r12, r12
rev lr, lr
#endif /* BIG_ENDIAN_ORDER */
stm r0!, {r4, r5, r12, lr}
# Next 16 bytes of key.
beq L_chacha_arm32_setkey_same_keyb_ytes
# Update key pointer for next 16 bytes.
add r1, r1, r2
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r12, [r1, #8]
ldr lr, [r1, #12]
L_chacha_arm32_setkey_same_keyb_ytes:
stm r0, {r4, r5, r12, lr}
pop {r4, r5, pc}
.size wc_chacha_setkey,.-wc_chacha_setkey
#ifdef WOLFSSL_ARMASM_NO_NEON
.text
.align 4
.globl wc_chacha_crypt_bytes
.type wc_chacha_crypt_bytes, %function
wc_chacha_crypt_bytes:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #52
mov lr, r0
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r0, [sp, #32]
str r1, [sp, #36]
#else
strd r0, r1, [sp, #32]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r2, [sp, #40]
str r3, [sp, #44]
#else
strd r2, r3, [sp, #40]
#endif
L_chacha_arm32_crypt_block:
# Put x[12]..x[15] onto stack.
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
ldr r4, [lr, #48]
ldr r5, [lr, #52]
#else
ldrd r4, r5, [lr, #48]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
ldr r6, [lr, #56]
ldr r7, [lr, #60]
#else
ldrd r6, r7, [lr, #56]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r4, [sp, #16]
str r5, [sp, #20]
#else
strd r4, r5, [sp, #16]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r6, [sp, #24]
str r7, [sp, #28]
#else
strd r6, r7, [sp, #24]
#endif
# Load x[0]..x[12] into registers.
ldm lr, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12}
# 10x 2 full rounds to perform.
mov lr, #10
str lr, [sp, #48]
L_chacha_arm32_crypt_loop:
# 0, 4, 8, 12
# 1, 5, 9, 13
ldr lr, [sp, #20]
add r0, r0, r4
add r1, r1, r5
eor r12, r12, r0
eor lr, lr, r1
ror r12, r12, #16
ror lr, lr, #16
add r8, r8, r12
add r9, r9, lr
eor r4, r4, r8
eor r5, r5, r9
ror r4, r4, #20
ror r5, r5, #20
add r0, r0, r4
add r1, r1, r5
eor r12, r12, r0
eor lr, lr, r1
ror r12, r12, #24
ror lr, lr, #24
add r8, r8, r12
add r9, r9, lr
eor r4, r4, r8
eor r5, r5, r9
ror r4, r4, #25
ror r5, r5, #25
str r12, [sp, #16]
str lr, [sp, #20]
# 2, 6, 10, 14
# 3, 7, 11, 15
ldr r12, [sp, #24]
ldr lr, [sp, #28]
add r2, r2, r6
add r3, r3, r7
eor r12, r12, r2
eor lr, lr, r3
ror r12, r12, #16
ror lr, lr, #16
add r10, r10, r12
add r11, r11, lr
eor r6, r6, r10
eor r7, r7, r11
ror r6, r6, #20
ror r7, r7, #20
add r2, r2, r6
add r3, r3, r7
eor r12, r12, r2
eor lr, lr, r3
ror r12, r12, #24
ror lr, lr, #24
add r10, r10, r12
add r11, r11, lr
eor r6, r6, r10
eor r7, r7, r11
ror r6, r6, #25
ror r7, r7, #25
# 3, 4, 9, 14
# 0, 5, 10, 15
add r3, r3, r4
add r0, r0, r5
eor r12, r12, r3
eor lr, lr, r0
ror r12, r12, #16
ror lr, lr, #16
add r9, r9, r12
add r10, r10, lr
eor r4, r4, r9
eor r5, r5, r10
ror r4, r4, #20
ror r5, r5, #20
add r3, r3, r4
add r0, r0, r5
eor r12, r12, r3
eor lr, lr, r0
ror r12, r12, #24
ror lr, lr, #24
add r9, r9, r12
add r10, r10, lr
eor r4, r4, r9
eor r5, r5, r10
ror r4, r4, #25
ror r5, r5, #25
str r12, [sp, #24]
str lr, [sp, #28]
ldr r12, [sp, #16]
ldr lr, [sp, #20]
# 1, 6, 11, 12
# 2, 7, 8, 13
add r1, r1, r6
add r2, r2, r7
eor r12, r12, r1
eor lr, lr, r2
ror r12, r12, #16
ror lr, lr, #16
add r11, r11, r12
add r8, r8, lr
eor r6, r6, r11
eor r7, r7, r8
ror r6, r6, #20
ror r7, r7, #20
add r1, r1, r6
add r2, r2, r7
eor r12, r12, r1
eor lr, lr, r2
ror r12, r12, #24
ror lr, lr, #24
add r11, r11, r12
add r8, r8, lr
eor r6, r6, r11
eor r7, r7, r8
ror r6, r6, #25
ror r7, r7, #25
str lr, [sp, #20]
# Check if we have done enough rounds.
ldr lr, [sp, #48]
subs lr, lr, #1
str lr, [sp, #48]
bgt L_chacha_arm32_crypt_loop
stm sp, {r8, r9, r10, r11, r12}
ldr lr, [sp, #32]
mov r12, sp
# Add in original state
ldm lr!, {r8, r9, r10, r11}
add r0, r0, r8
add r1, r1, r9
add r2, r2, r10
add r3, r3, r11
ldm lr!, {r8, r9, r10, r11}
add r4, r4, r8
add r5, r5, r9
add r6, r6, r10
add r7, r7, r11
ldm r12, {r8, r9}
ldm lr!, {r10, r11}
add r8, r8, r10
add r9, r9, r11
stm r12!, {r8, r9}
ldm r12, {r8, r9}
ldm lr!, {r10, r11}
add r8, r8, r10
add r9, r9, r11
stm r12!, {r8, r9}
ldm r12, {r8, r9}
ldm lr!, {r10, r11}
add r8, r8, r10
add r9, r9, r11
add r10, r10, #1
stm r12!, {r8, r9}
str r10, [lr, #-8]
ldm r12, {r8, r9}
ldm lr, {r10, r11}
add r8, r8, r10
add r9, r9, r11
stm r12, {r8, r9}
ldr r12, [sp, #44]
cmp r12, #0x40
blt L_chacha_arm32_crypt_lt_block
ldr r12, [sp, #40]
ldr lr, [sp, #36]
# XOR state into 64 bytes.
ldr r8, [r12]
ldr r9, [r12, #4]
ldr r10, [r12, #8]
ldr r11, [r12, #12]
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
eor r3, r3, r11
str r0, [lr]
str r1, [lr, #4]
str r2, [lr, #8]
str r3, [lr, #12]
ldr r8, [r12, #16]
ldr r9, [r12, #20]
ldr r10, [r12, #24]
ldr r11, [r12, #28]
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
str r4, [lr, #16]
str r5, [lr, #20]
str r6, [lr, #24]
str r7, [lr, #28]
ldr r4, [sp]
ldr r5, [sp, #4]
ldr r6, [sp, #8]
ldr r7, [sp, #12]
ldr r8, [r12, #32]
ldr r9, [r12, #36]
ldr r10, [r12, #40]
ldr r11, [r12, #44]
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
str r4, [lr, #32]
str r5, [lr, #36]
str r6, [lr, #40]
str r7, [lr, #44]
ldr r4, [sp, #16]
ldr r5, [sp, #20]
ldr r6, [sp, #24]
ldr r7, [sp, #28]
ldr r8, [r12, #48]
ldr r9, [r12, #52]
ldr r10, [r12, #56]
ldr r11, [r12, #60]
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
str r4, [lr, #48]
str r5, [lr, #52]
str r6, [lr, #56]
str r7, [lr, #60]
ldr r3, [sp, #44]
add r12, r12, #0x40
add lr, lr, #0x40
str r12, [sp, #40]
str lr, [sp, #36]
subs r3, r3, #0x40
ldr lr, [sp, #32]
str r3, [sp, #44]
bne L_chacha_arm32_crypt_block
b L_chacha_arm32_crypt_done
L_chacha_arm32_crypt_lt_block:
# Store in over field of ChaCha.
ldr lr, [sp, #32]
add r12, lr, #0x44
stm r12!, {r0, r1, r2, r3, r4, r5, r6, r7}
ldm sp, {r0, r1, r2, r3, r4, r5, r6, r7}
stm r12, {r0, r1, r2, r3, r4, r5, r6, r7}
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
ldr r2, [sp, #40]
ldr r3, [sp, #44]
#else
ldrd r2, r3, [sp, #40]
#endif
ldr r1, [sp, #36]
rsb r12, r3, #0x40
str r12, [lr, #64]
add lr, lr, #0x44
L_chacha_arm32_crypt_16byte_loop:
cmp r3, #16
blt L_chacha_arm32_crypt_word_loop
# 16 bytes of state XORed into message.
ldm lr!, {r4, r5, r6, r7}
ldr r8, [r2]
ldr r9, [r2, #4]
ldr r10, [r2, #8]
ldr r11, [r2, #12]
eor r8, r8, r4
eor r9, r9, r5
eor r10, r10, r6
eor r11, r11, r7
subs r3, r3, #16
str r8, [r1]
str r9, [r1, #4]
str r10, [r1, #8]
str r11, [r1, #12]
beq L_chacha_arm32_crypt_done
add r2, r2, #16
add r1, r1, #16
b L_chacha_arm32_crypt_16byte_loop
L_chacha_arm32_crypt_word_loop:
cmp r3, #4
blt L_chacha_arm32_crypt_byte_start
# 4 bytes of state XORed into message.
ldr r4, [lr]
ldr r8, [r2]
eor r8, r8, r4
subs r3, r3, #4
str r8, [r1]
beq L_chacha_arm32_crypt_done
add lr, lr, #4
add r2, r2, #4
add r1, r1, #4
b L_chacha_arm32_crypt_word_loop
L_chacha_arm32_crypt_byte_start:
ldr r4, [lr]
L_chacha_arm32_crypt_byte_loop:
ldrb r8, [r2]
eor r8, r8, r4
subs r3, r3, #1
strb r8, [r1]
beq L_chacha_arm32_crypt_done
lsr r4, r4, #8
add r2, r2, #1
add r1, r1, #1
b L_chacha_arm32_crypt_byte_loop
L_chacha_arm32_crypt_done:
add sp, sp, #52
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes
.text
.align 4
.globl wc_chacha_use_over
.type wc_chacha_use_over, %function
wc_chacha_use_over:
push {r4, r5, r6, r7, r8, r9, lr}
L_chacha_arm32_over_16byte_loop:
cmp r3, #16
blt L_chacha_arm32_over_word_loop
# 16 bytes of state XORed into message.
ldr r12, [r0]
ldr lr, [r0, #4]
ldr r4, [r0, #8]
ldr r5, [r0, #12]
ldr r6, [r2]
ldr r7, [r2, #4]
ldr r8, [r2, #8]
ldr r9, [r2, #12]
eor r12, r12, r6
eor lr, lr, r7
eor r4, r4, r8
eor r5, r5, r9
subs r3, r3, #16
str r12, [r1]
str lr, [r1, #4]
str r4, [r1, #8]
str r5, [r1, #12]
beq L_chacha_arm32_over_done
add r0, r0, #16
add r2, r2, #16
add r1, r1, #16
b L_chacha_arm32_over_16byte_loop
L_chacha_arm32_over_word_loop:
cmp r3, #4
blt L_chacha_arm32_over_byte_loop
# 4 bytes of state XORed into message.
ldr r12, [r0]
ldr r6, [r2]
eor r12, r12, r6
subs r3, r3, #4
str r12, [r1]
beq L_chacha_arm32_over_done
add r0, r0, #4
add r2, r2, #4
add r1, r1, #4
b L_chacha_arm32_over_word_loop
L_chacha_arm32_over_byte_loop:
# 4 bytes of state XORed into message.
ldrb r12, [r0]
ldrb r6, [r2]
eor r12, r12, r6
subs r3, r3, #1
strb r12, [r1]
beq L_chacha_arm32_over_done
add r0, r0, #1
add r2, r2, #1
add r1, r1, #1
b L_chacha_arm32_over_byte_loop
L_chacha_arm32_over_done:
pop {r4, r5, r6, r7, r8, r9, pc}
.size wc_chacha_use_over,.-wc_chacha_use_over
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#endif /* !WOLFSSL_ARMASM_INLINE */
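In the scalar (WOLFSSL_ARMASM_NO_NEON) loop above, x[0]..x[11] stay in registers while x[12]..x[15] are spilled to the stack and rotated through r12/lr, and each ror by 16/20/24/25 is the right-rotate form of the quarter-round left-rotates by 16/12/8/7. For comparison, a plain C reference quarter round (RFC 7539 style, not part of this change):

/* Reference ChaCha quarter round; rotating right by 32-n equals rotating
 * left by n, which is how the ror immediates above map to the spec. */
#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

static void chacha_quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
    x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);  /* asm: ror #16 */
    x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);  /* asm: ror #20 */
    x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d],  8);  /* asm: ror #24 */
    x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b],  7);  /* asm: ror #25 */
}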

View File: wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c

@ -0,0 +1,576 @@
/* armv8-32-chacha-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./chacha/chacha.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.c
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
#endif /* __IAR_SYSTEMS_ICC__ */
#ifdef __KEIL__
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef HAVE_CHACHA
#include <wolfssl/wolfcrypt/chacha.h>
void wc_chacha_setiv(word32* x_p, const byte* iv_p, word32 counter_p)
{
register word32* x asm ("r0") = (word32*)x_p;
register const byte* iv asm ("r1") = (const byte*)iv_p;
register word32 counter asm ("r2") = (word32)counter_p;
__asm__ __volatile__ (
"add r3, %[x], #52\n\t"
"ldr r4, [%[iv]]\n\t"
"ldr r12, [%[iv], #4]\n\t"
"ldr lr, [%[iv], #8]\n\t"
"str %[counter], [%[x], #48]\n\t"
#ifdef BIG_ENDIAN_ORDER
"rev r4, r4\n\t"
"rev r12, r12\n\t"
"rev lr, lr\n\t"
#endif /* BIG_ENDIAN_ORDER */
"stm r3, {r4, r12, lr}\n\t"
: [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter)
:
: "memory", "cc", "r3", "r12", "lr", "r4"
);
}
static const uint32_t L_chacha_arm32_constants[] = {
0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
};
void wc_chacha_setkey(word32* x_p, const byte* key_p, word32 keySz_p)
{
register word32* x asm ("r0") = (word32*)x_p;
register const byte* key asm ("r1") = (const byte*)key_p;
register word32 keySz asm ("r2") = (word32)keySz_p;
register uint32_t* L_chacha_arm32_constants_c asm ("r3") =
(uint32_t*)&L_chacha_arm32_constants;
__asm__ __volatile__ (
"subs %[keySz], %[keySz], #16\n\t"
"add r3, r3, %[keySz]\n\t"
/* Start state with constants */
"ldm r3, {r4, r5, r12, lr}\n\t"
"stm %[x]!, {r4, r5, r12, lr}\n\t"
/* Next is first 16 bytes of key. */
"ldr r4, [%[key]]\n\t"
"ldr r5, [%[key], #4]\n\t"
"ldr r12, [%[key], #8]\n\t"
"ldr lr, [%[key], #12]\n\t"
#ifdef BIG_ENDIAN_ORDER
"rev r4, r4\n\t"
"rev r5, r5\n\t"
"rev r12, r12\n\t"
"rev lr, lr\n\t"
#endif /* BIG_ENDIAN_ORDER */
"stm %[x]!, {r4, r5, r12, lr}\n\t"
/* Next 16 bytes of key. */
"beq L_chacha_arm32_setkey_same_keyb_ytes_%=\n\t"
/* Update key pointer for next 16 bytes. */
"add %[key], %[key], %[keySz]\n\t"
"ldr r4, [%[key]]\n\t"
"ldr r5, [%[key], #4]\n\t"
"ldr r12, [%[key], #8]\n\t"
"ldr lr, [%[key], #12]\n\t"
"\n"
"L_chacha_arm32_setkey_same_keyb_ytes_%=: \n\t"
"stm %[x], {r4, r5, r12, lr}\n\t"
: [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz),
[L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c)
:
: "memory", "cc", "r12", "lr", "r4", "r5"
);
}
#ifdef WOLFSSL_ARMASM_NO_NEON
void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p,
word32 len_p)
{
register ChaCha* ctx asm ("r0") = (ChaCha*)ctx_p;
register byte* c asm ("r1") = (byte*)c_p;
register const byte* m asm ("r2") = (const byte*)m_p;
register word32 len asm ("r3") = (word32)len_p;
__asm__ __volatile__ (
"sub sp, sp, #52\n\t"
"mov lr, %[ctx]\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str %[ctx], [sp, #32]\n\t"
"str %[c], [sp, #36]\n\t"
#else
"strd %[ctx], %[c], [sp, #32]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str %[m], [sp, #40]\n\t"
"str %[len], [sp, #44]\n\t"
#else
"strd %[m], %[len], [sp, #40]\n\t"
#endif
"\n"
"L_chacha_arm32_crypt_block_%=: \n\t"
/* Put x[12]..x[15] onto stack. */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"ldr r4, [lr, #48]\n\t"
"ldr r5, [lr, #52]\n\t"
#else
"ldrd r4, r5, [lr, #48]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"ldr r6, [lr, #56]\n\t"
"ldr r7, [lr, #60]\n\t"
#else
"ldrd r6, r7, [lr, #56]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str r4, [sp, #16]\n\t"
"str r5, [sp, #20]\n\t"
#else
"strd r4, r5, [sp, #16]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str r6, [sp, #24]\n\t"
"str r7, [sp, #28]\n\t"
#else
"strd r6, r7, [sp, #24]\n\t"
#endif
/* Load x[0]..x[12] into registers. */
"ldm lr, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
/* 10x 2 full rounds to perform. */
"mov lr, #10\n\t"
"str lr, [sp, #48]\n\t"
"\n"
"L_chacha_arm32_crypt_loop_%=: \n\t"
/* 0, 4, 8, 12 */
/* 1, 5, 9, 13 */
"ldr lr, [sp, #20]\n\t"
"add %[ctx], %[ctx], r4\n\t"
"add %[c], %[c], r5\n\t"
"eor r12, r12, %[ctx]\n\t"
"eor lr, lr, %[c]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r8, r8, r12\n\t"
"add r9, r9, lr\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"ror r4, r4, #20\n\t"
"ror r5, r5, #20\n\t"
"add %[ctx], %[ctx], r4\n\t"
"add %[c], %[c], r5\n\t"
"eor r12, r12, %[ctx]\n\t"
"eor lr, lr, %[c]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r8, r8, r12\n\t"
"add r9, r9, lr\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"ror r4, r4, #25\n\t"
"ror r5, r5, #25\n\t"
"str r12, [sp, #16]\n\t"
"str lr, [sp, #20]\n\t"
/* 2, 6, 10, 14 */
/* 3, 7, 11, 15 */
"ldr r12, [sp, #24]\n\t"
"ldr lr, [sp, #28]\n\t"
"add %[m], %[m], r6\n\t"
"add %[len], %[len], r7\n\t"
"eor r12, r12, %[m]\n\t"
"eor lr, lr, %[len]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r10, r10, r12\n\t"
"add r11, r11, lr\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"ror r6, r6, #20\n\t"
"ror r7, r7, #20\n\t"
"add %[m], %[m], r6\n\t"
"add %[len], %[len], r7\n\t"
"eor r12, r12, %[m]\n\t"
"eor lr, lr, %[len]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r10, r10, r12\n\t"
"add r11, r11, lr\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"ror r6, r6, #25\n\t"
"ror r7, r7, #25\n\t"
/* 3, 4, 9, 14 */
/* 0, 5, 10, 15 */
"add %[len], %[len], r4\n\t"
"add %[ctx], %[ctx], r5\n\t"
"eor r12, r12, %[len]\n\t"
"eor lr, lr, %[ctx]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r9, r9, r12\n\t"
"add r10, r10, lr\n\t"
"eor r4, r4, r9\n\t"
"eor r5, r5, r10\n\t"
"ror r4, r4, #20\n\t"
"ror r5, r5, #20\n\t"
"add %[len], %[len], r4\n\t"
"add %[ctx], %[ctx], r5\n\t"
"eor r12, r12, %[len]\n\t"
"eor lr, lr, %[ctx]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r9, r9, r12\n\t"
"add r10, r10, lr\n\t"
"eor r4, r4, r9\n\t"
"eor r5, r5, r10\n\t"
"ror r4, r4, #25\n\t"
"ror r5, r5, #25\n\t"
"str r12, [sp, #24]\n\t"
"str lr, [sp, #28]\n\t"
"ldr r12, [sp, #16]\n\t"
"ldr lr, [sp, #20]\n\t"
/* 1, 6, 11, 12 */
/* 2, 7, 8, 13 */
"add %[c], %[c], r6\n\t"
"add %[m], %[m], r7\n\t"
"eor r12, r12, %[c]\n\t"
"eor lr, lr, %[m]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r11, r11, r12\n\t"
"add r8, r8, lr\n\t"
"eor r6, r6, r11\n\t"
"eor r7, r7, r8\n\t"
"ror r6, r6, #20\n\t"
"ror r7, r7, #20\n\t"
"add %[c], %[c], r6\n\t"
"add %[m], %[m], r7\n\t"
"eor r12, r12, %[c]\n\t"
"eor lr, lr, %[m]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r11, r11, r12\n\t"
"add r8, r8, lr\n\t"
"eor r6, r6, r11\n\t"
"eor r7, r7, r8\n\t"
"ror r6, r6, #25\n\t"
"ror r7, r7, #25\n\t"
"str lr, [sp, #20]\n\t"
/* Check if we have done enough rounds. */
"ldr lr, [sp, #48]\n\t"
"subs lr, lr, #1\n\t"
"str lr, [sp, #48]\n\t"
"bgt L_chacha_arm32_crypt_loop_%=\n\t"
"stm sp, {r8, r9, r10, r11, r12}\n\t"
"ldr lr, [sp, #32]\n\t"
"mov r12, sp\n\t"
/* Add in original state */
"ldm lr!, {r8, r9, r10, r11}\n\t"
"add %[ctx], %[ctx], r8\n\t"
"add %[c], %[c], r9\n\t"
"add %[m], %[m], r10\n\t"
"add %[len], %[len], r11\n\t"
"ldm lr!, {r8, r9, r10, r11}\n\t"
"add r4, r4, r8\n\t"
"add r5, r5, r9\n\t"
"add r6, r6, r10\n\t"
"add r7, r7, r11\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr!, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"stm r12!, {r8, r9}\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr!, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"stm r12!, {r8, r9}\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr!, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"add r10, r10, #1\n\t"
"stm r12!, {r8, r9}\n\t"
"str r10, [lr, #-8]\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"stm r12, {r8, r9}\n\t"
"ldr r12, [sp, #44]\n\t"
"cmp r12, #0x40\n\t"
"blt L_chacha_arm32_crypt_lt_block_%=\n\t"
"ldr r12, [sp, #40]\n\t"
"ldr lr, [sp, #36]\n\t"
/* XOR state into 64 bytes. */
"ldr r8, [r12]\n\t"
"ldr r9, [r12, #4]\n\t"
"ldr r10, [r12, #8]\n\t"
"ldr r11, [r12, #12]\n\t"
"eor %[ctx], %[ctx], r8\n\t"
"eor %[c], %[c], r9\n\t"
"eor %[m], %[m], r10\n\t"
"eor %[len], %[len], r11\n\t"
"str %[ctx], [lr]\n\t"
"str %[c], [lr, #4]\n\t"
"str %[m], [lr, #8]\n\t"
"str %[len], [lr, #12]\n\t"
"ldr r8, [r12, #16]\n\t"
"ldr r9, [r12, #20]\n\t"
"ldr r10, [r12, #24]\n\t"
"ldr r11, [r12, #28]\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"str r4, [lr, #16]\n\t"
"str r5, [lr, #20]\n\t"
"str r6, [lr, #24]\n\t"
"str r7, [lr, #28]\n\t"
"ldr r4, [sp]\n\t"
"ldr r5, [sp, #4]\n\t"
"ldr r6, [sp, #8]\n\t"
"ldr r7, [sp, #12]\n\t"
"ldr r8, [r12, #32]\n\t"
"ldr r9, [r12, #36]\n\t"
"ldr r10, [r12, #40]\n\t"
"ldr r11, [r12, #44]\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"str r4, [lr, #32]\n\t"
"str r5, [lr, #36]\n\t"
"str r6, [lr, #40]\n\t"
"str r7, [lr, #44]\n\t"
"ldr r4, [sp, #16]\n\t"
"ldr r5, [sp, #20]\n\t"
"ldr r6, [sp, #24]\n\t"
"ldr r7, [sp, #28]\n\t"
"ldr r8, [r12, #48]\n\t"
"ldr r9, [r12, #52]\n\t"
"ldr r10, [r12, #56]\n\t"
"ldr r11, [r12, #60]\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"str r4, [lr, #48]\n\t"
"str r5, [lr, #52]\n\t"
"str r6, [lr, #56]\n\t"
"str r7, [lr, #60]\n\t"
"ldr %[len], [sp, #44]\n\t"
"add r12, r12, #0x40\n\t"
"add lr, lr, #0x40\n\t"
"str r12, [sp, #40]\n\t"
"str lr, [sp, #36]\n\t"
"subs %[len], %[len], #0x40\n\t"
"ldr lr, [sp, #32]\n\t"
"str %[len], [sp, #44]\n\t"
"bne L_chacha_arm32_crypt_block_%=\n\t"
"b L_chacha_arm32_crypt_done_%=\n\t"
"\n"
"L_chacha_arm32_crypt_lt_block_%=: \n\t"
/* Store in over field of ChaCha. */
"ldr lr, [sp, #32]\n\t"
"add r12, lr, #0x44\n\t"
"stm r12!, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
"ldm sp, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
"stm r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"ldr %[m], [sp, #40]\n\t"
"ldr %[len], [sp, #44]\n\t"
#else
"ldrd %[m], %[len], [sp, #40]\n\t"
#endif
"ldr %[c], [sp, #36]\n\t"
"rsb r12, %[len], #0x40\n\t"
"str r12, [lr, #64]\n\t"
"add lr, lr, #0x44\n\t"
"\n"
"L_chacha_arm32_crypt_16byte_loop_%=: \n\t"
"cmp %[len], #16\n\t"
"blt L_chacha_arm32_crypt_word_loop_%=\n\t"
/* 16 bytes of state XORed into message. */
"ldm lr!, {r4, r5, r6, r7}\n\t"
"ldr r8, [%[m]]\n\t"
"ldr r9, [%[m], #4]\n\t"
"ldr r10, [%[m], #8]\n\t"
"ldr r11, [%[m], #12]\n\t"
"eor r8, r8, r4\n\t"
"eor r9, r9, r5\n\t"
"eor r10, r10, r6\n\t"
"eor r11, r11, r7\n\t"
"subs %[len], %[len], #16\n\t"
"str r8, [%[c]]\n\t"
"str r9, [%[c], #4]\n\t"
"str r10, [%[c], #8]\n\t"
"str r11, [%[c], #12]\n\t"
"beq L_chacha_arm32_crypt_done_%=\n\t"
"add %[m], %[m], #16\n\t"
"add %[c], %[c], #16\n\t"
"b L_chacha_arm32_crypt_16byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_crypt_word_loop_%=: \n\t"
"cmp %[len], #4\n\t"
"blt L_chacha_arm32_crypt_byte_start_%=\n\t"
/* 4 bytes of state XORed into message. */
"ldr r4, [lr]\n\t"
"ldr r8, [%[m]]\n\t"
"eor r8, r8, r4\n\t"
"subs %[len], %[len], #4\n\t"
"str r8, [%[c]]\n\t"
"beq L_chacha_arm32_crypt_done_%=\n\t"
"add lr, lr, #4\n\t"
"add %[m], %[m], #4\n\t"
"add %[c], %[c], #4\n\t"
"b L_chacha_arm32_crypt_word_loop_%=\n\t"
"\n"
"L_chacha_arm32_crypt_byte_start_%=: \n\t"
"ldr r4, [lr]\n\t"
"\n"
"L_chacha_arm32_crypt_byte_loop_%=: \n\t"
"ldrb r8, [%[m]]\n\t"
"eor r8, r8, r4\n\t"
"subs %[len], %[len], #1\n\t"
"strb r8, [%[c]]\n\t"
"beq L_chacha_arm32_crypt_done_%=\n\t"
"lsr r4, r4, #8\n\t"
"add %[m], %[m], #1\n\t"
"add %[c], %[c], #1\n\t"
"b L_chacha_arm32_crypt_byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_crypt_done_%=: \n\t"
"add sp, sp, #52\n\t"
: [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len)
:
: "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11"
);
}
void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p,
word32 len_p)
{
register byte* over asm ("r0") = (byte*)over_p;
register byte* output asm ("r1") = (byte*)output_p;
register const byte* input asm ("r2") = (const byte*)input_p;
register word32 len asm ("r3") = (word32)len_p;
__asm__ __volatile__ (
"\n"
"L_chacha_arm32_over_16byte_loop_%=: \n\t"
"cmp %[len], #16\n\t"
"blt L_chacha_arm32_over_word_loop_%=\n\t"
/* 16 bytes of state XORed into message. */
"ldr r12, [%[over]]\n\t"
"ldr lr, [%[over], #4]\n\t"
"ldr r4, [%[over], #8]\n\t"
"ldr r5, [%[over], #12]\n\t"
"ldr r6, [%[input]]\n\t"
"ldr r7, [%[input], #4]\n\t"
"ldr r8, [%[input], #8]\n\t"
"ldr r9, [%[input], #12]\n\t"
"eor r12, r12, r6\n\t"
"eor lr, lr, r7\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"subs %[len], %[len], #16\n\t"
"str r12, [%[output]]\n\t"
"str lr, [%[output], #4]\n\t"
"str r4, [%[output], #8]\n\t"
"str r5, [%[output], #12]\n\t"
"beq L_chacha_arm32_over_done_%=\n\t"
"add %[over], %[over], #16\n\t"
"add %[input], %[input], #16\n\t"
"add %[output], %[output], #16\n\t"
"b L_chacha_arm32_over_16byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_over_word_loop_%=: \n\t"
"cmp %[len], #4\n\t"
"blt L_chacha_arm32_over_byte_loop_%=\n\t"
/* 4 bytes of state XORed into message. */
"ldr r12, [%[over]]\n\t"
"ldr r6, [%[input]]\n\t"
"eor r12, r12, r6\n\t"
"subs %[len], %[len], #4\n\t"
"str r12, [%[output]]\n\t"
"beq L_chacha_arm32_over_done_%=\n\t"
"add %[over], %[over], #4\n\t"
"add %[input], %[input], #4\n\t"
"add %[output], %[output], #4\n\t"
"b L_chacha_arm32_over_word_loop_%=\n\t"
"\n"
"L_chacha_arm32_over_byte_loop_%=: \n\t"
/* 4 bytes of state XORed into message. */
"ldrb r12, [%[over]]\n\t"
"ldrb r6, [%[input]]\n\t"
"eor r12, r12, r6\n\t"
"subs %[len], %[len], #1\n\t"
"strb r12, [%[output]]\n\t"
"beq L_chacha_arm32_over_done_%=\n\t"
"add %[over], %[over], #1\n\t"
"add %[input], %[input], #1\n\t"
"add %[output], %[output], #1\n\t"
"b L_chacha_arm32_over_byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_over_done_%=: \n\t"
: [over] "+r" (over), [output] "+r" (output), [input] "+r" (input),
[len] "+r" (len)
:
: "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9"
);
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */
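One detail worth noting in wc_chacha_setkey above: the eight-word table holds the "expand 16-byte k" row followed by the "expand 32-byte k" row, and (keySz - 16) is added as a byte offset to select the right row; for a 16-byte key the branch then reuses the same 16 key bytes for both key halves. A C sketch of that logic (hypothetical helper, little-endian host assumed; counter and IV are set separately by wc_chacha_setiv):

#include <stdint.h>
#include <string.h>

/* Sketch of the constant-row selection and key layout done by the assembly. */
static void chacha_setkey_sketch(uint32_t x[16], const uint8_t* key,
                                 uint32_t keySz /* 16 or 32 */)
{
    static const uint32_t constants[8] = {
        0x61707865, 0x3120646e, 0x79622d36, 0x6b206574, /* "expand 16-byte k" */
        0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, /* "expand 32-byte k" */
    };
    const uint32_t* row = constants + (keySz - 16) / 4; /* byte offset in asm */

    memcpy(&x[0], row, 16);                 /* constants               */
    memcpy(&x[4], key, 16);                 /* first 16 bytes of key   */
    memcpy(&x[8], key + (keySz - 16), 16);  /* rest, or the same bytes */
    /* x[12..15] (block counter and IV) are filled in by wc_chacha_setiv. */
}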

View File: wolfcrypt/src/port/arm/armv8-32-curve25519.S

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S
* ruby ./x25519/x25519.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S
*/
#ifdef HAVE_CONFIG_H

View File: wolfcrypt/src/port/arm/armv8-32-curve25519_c.c

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
* ruby ./x25519/x25519.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
*/
#ifdef HAVE_CONFIG_H
@ -282,7 +283,7 @@ void fe_add_sub_op()
/* Done Add-Sub */
:
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -324,7 +325,7 @@ void fe_sub_op()
/* Done Sub */
:
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -336,9 +337,10 @@ void fe_sub(fe r_p, const fe a_p, const fe b_p)
__asm__ __volatile__ (
"bl fe_sub_op\n\t"
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -381,7 +383,7 @@ void fe_add_op()
/* Done Add */
:
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -393,9 +395,10 @@ void fe_add(fe r_p, const fe a_p, const fe b_p)
__asm__ __volatile__ (
"bl fe_add_op\n\t"
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -427,9 +430,9 @@ void fe_frombytes(fe out_p, const unsigned char* in_p)
"str r7, [%[out], #20]\n\t"
"str r8, [%[out], #24]\n\t"
"str r9, [%[out], #28]\n\t"
: [out] "+r" (out), [in] "+r" (in)
: [out] "+r" (out), [in] "+r" (in)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
);
}
@ -471,9 +474,9 @@ void fe_tobytes(unsigned char* out_p, const fe n_p)
"str r7, [%[out], #20]\n\t"
"str r8, [%[out], #24]\n\t"
"str r9, [%[out], #28]\n\t"
: [out] "+r" (out), [n] "+r" (n)
: [out] "+r" (out), [n] "+r" (n)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12"
);
}
@ -494,7 +497,7 @@ void fe_1(fe n_p)
"stm %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
: [n] "+r" (n)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
);
}
@ -515,7 +518,7 @@ void fe_0(fe n_p)
"stm %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
: [n] "+r" (n)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
);
}
@ -574,9 +577,9 @@ void fe_copy(fe r_p, const fe a_p)
#else
"strd r4, r5, [%[r], #24]\n\t"
#endif
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "r2", "r3", "r4", "r5", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5"
);
}
@ -601,9 +604,9 @@ void fe_neg(fe r_p, const fe a_p)
"sbcs r4, lr, r4\n\t"
"sbc r5, r12, r5\n\t"
"stm %[r]!, {r2, r3, r4, r5}\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "r2", "r3", "r4", "r5", "r12", "lr", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r12", "lr"
);
}
@ -645,7 +648,8 @@ int fe_isnonzero(const fe a_p)
"orr %[a], r2, r4\n\t"
: [a] "+r" (a)
:
: "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "cc"
: "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
"r12"
);
return (uint32_t)(size_t)a;
}
@ -671,7 +675,7 @@ int fe_isnegative(const fe a_p)
"eor %[a], %[a], r1\n\t"
: [a] "+r" (a)
:
: "memory", "r1", "r2", "r3", "r4", "r5", "cc"
: "memory", "cc", "r1", "r2", "r3", "r4", "r5"
);
return (uint32_t)(size_t)a;
}
@ -2405,9 +2409,10 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p)
#else
"strd r8, r9, [%[r], #88]\n\t"
#endif
: [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
: [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r3", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r3", "r10",
"r11", "r12", "lr"
);
}
@ -2525,9 +2530,10 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p)
"and r7, r7, lr\n\t"
"stm %[r]!, {r4, r5, r6, r7}\n\t"
"sub %[base], %[base], %[b]\n\t"
: [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
: [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -2914,7 +2920,7 @@ void fe_mul_op()
"add sp, sp, #40\n\t"
:
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -3057,7 +3063,7 @@ void fe_mul_op()
"add sp, sp, #16\n\t"
:
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -3070,9 +3076,10 @@ void fe_mul(fe r_p, const fe a_p, const fe b_p)
__asm__ __volatile__ (
"bl fe_mul_op\n\t"
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -3349,7 +3356,7 @@ void fe_sq_op()
"add sp, sp, #0x44\n\t"
:
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -3478,7 +3485,7 @@ void fe_sq_op()
"stm lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
:
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -3490,9 +3497,10 @@ void fe_sq(fe r_p, const fe a_p)
__asm__ __volatile__ (
"bl fe_sq_op\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "r11", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
"lr", "r10", "r11"
);
}
@ -3562,9 +3570,10 @@ void fe_mul121666(fe r_p, fe a_p)
"adcs r8, r8, #0\n\t"
"adc r9, r9, #0\n\t"
"stm %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
"lr", "r10"
);
}
@ -3620,9 +3629,10 @@ void fe_mul121666(fe r_p, fe a_p)
"adcs r8, r8, #0\n\t"
"adc r9, r9, #0\n\t"
"stm %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
"lr", "r10"
);
}
@ -4010,9 +4020,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
"bl fe_mul_op\n\t"
"mov r0, #0\n\t"
"add sp, sp, #0xbc\n\t"
: [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
: [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "lr", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
"r3", "r12", "lr"
);
return (uint32_t)(size_t)r;
}
@ -4323,9 +4334,10 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
"stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
"mov r0, #0\n\t"
"add sp, sp, #0xc0\n\t"
: [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
: [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "lr", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
"r3", "r12", "lr"
);
return (uint32_t)(size_t)r;
}
@ -4497,9 +4509,10 @@ void fe_invert(fe r_p, const fe a_p)
"ldr %[a], [sp, #132]\n\t"
"ldr %[r], [sp, #128]\n\t"
"add sp, sp, #0x88\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8",
"r9", "r10", "r11"
);
}
@ -4817,9 +4830,9 @@ void fe_sq2(fe r_p, const fe a_p)
"ldr r0, [sp, #64]\n\t"
"stm r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
"add sp, sp, #0x44\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -4996,9 +5009,9 @@ void fe_sq2(fe r_p, const fe a_p)
"stm r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
"mov r0, r12\n\t"
"mov r1, lr\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "lr", "cc"
: "memory", "cc", "lr"
);
}
@ -5167,9 +5180,10 @@ void fe_pow22523(fe r_p, const fe a_p)
"ldr %[a], [sp, #100]\n\t"
"ldr %[r], [sp, #96]\n\t"
"add sp, sp, #0x68\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [r] "+r" (r), [a] "+r" (a)
:
: "memory", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8",
"r9", "r10", "r11"
);
}
@ -5197,9 +5211,10 @@ void ge_p1p1_to_p2(ge_p2 * r_p, const ge_p1p1 * p_p)
"add r0, r0, #0x40\n\t"
"bl fe_mul_op\n\t"
"add sp, sp, #8\n\t"
: [r] "+r" (r), [p] "+r" (p)
: [r] "+r" (r), [p] "+r" (p)
:
: "memory", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8",
"r9", "r10", "r11"
);
}
@ -5232,9 +5247,10 @@ void ge_p1p1_to_p3(ge_p3 * r_p, const ge_p1p1 * p_p)
"add r0, r0, #0x60\n\t"
"bl fe_mul_op\n\t"
"add sp, sp, #8\n\t"
: [r] "+r" (r), [p] "+r" (p)
: [r] "+r" (r), [p] "+r" (p)
:
: "memory", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8",
"r9", "r10", "r11"
);
}
@ -5279,9 +5295,10 @@ void ge_p2_dbl(ge_p1p1 * r_p, const ge_p2 * p_p)
"mov r1, r0\n\t"
"bl fe_sub_op\n\t"
"add sp, sp, #8\n\t"
: [r] "+r" (r), [p] "+r" (p)
: [r] "+r" (r), [p] "+r" (p)
:
: "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -5365,9 +5382,10 @@ void ge_madd(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p)
"add r1, r0, #32\n\t"
"bl fe_add_sub_op\n\t"
"add sp, sp, #12\n\t"
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -5452,9 +5470,10 @@ void ge_msub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p)
"add r0, r0, #32\n\t"
"bl fe_add_sub_op\n\t"
"add sp, sp, #12\n\t"
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -5539,9 +5558,10 @@ void ge_add(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p)
"add r0, r0, #32\n\t"
"bl fe_add_sub_op\n\t"
"add sp, sp, #44\n\t"
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -5626,9 +5646,10 @@ void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p)
"add r0, r0, #0x40\n\t"
"bl fe_add_sub_op\n\t"
"add sp, sp, #44\n\t"
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
: [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
:
: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "lr"
);
}
@ -6408,7 +6429,8 @@ void sc_reduce(byte* s_p)
"add sp, sp, #56\n\t"
: [s] "+r" (s)
:
: "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11", "r12", "lr"
);
}
@ -7059,7 +7081,8 @@ void sc_reduce(byte* s_p)
"add sp, sp, #56\n\t"
: [s] "+r" (s)
:
: "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11", "r12", "lr"
);
}
@ -7076,7 +7099,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
__asm__ __volatile__ (
"sub sp, sp, #0x50\n\t"
"add lr, sp, #0x44\n\t"
"stm lr, {%[s], %[a], %[c]}\n\t"
"stm lr, {r0, r1, r3}\n\t"
"mov %[s], #0\n\t"
"ldr r12, [%[a]]\n\t"
/* A[0] * B[0] */
@ -7402,24 +7425,24 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"adc r10, %[s], #0\n\t"
"umlal r9, r10, r12, lr\n\t"
"add lr, sp, #32\n\t"
"stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t"
"stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
"mov %[s], sp\n\t"
/* Add c to a * b */
"ldr lr, [sp, #76]\n\t"
"ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
"ldm lr!, {%[a], r10, r11, r12}\n\t"
"ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
"ldm lr!, {r1, r10, r11, r12}\n\t"
"adds %[b], %[b], %[a]\n\t"
"adcs %[c], %[c], r10\n\t"
"adcs r4, r4, r11\n\t"
"adcs r5, r5, r12\n\t"
"ldm lr!, {%[a], r10, r11, r12}\n\t"
"ldm lr!, {r1, r10, r11, r12}\n\t"
"adcs r6, r6, %[a]\n\t"
"adcs r7, r7, r10\n\t"
"adcs r8, r8, r11\n\t"
"adcs r9, r9, r12\n\t"
"mov %[a], r9\n\t"
"stm %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
"ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
"stm %[s]!, {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
"ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
"adcs %[b], %[b], #0\n\t"
"adcs %[c], %[c], #0\n\t"
"adcs r4, r4, #0\n\t"
@ -7918,7 +7941,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"mov r12, sp\n\t"
/* Load bits 252-376 */
"add r12, r12, #28\n\t"
"ldm r12, {%[a], %[b], %[c], r4, r5}\n\t"
"ldm r12, {r1, r2, r3, r4, r5}\n\t"
"lsl r5, r5, #4\n\t"
"orr r5, r5, r4, lsr #28\n\t"
"lsl r4, r4, #4\n\t"
@ -8097,7 +8120,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"sbcs r9, r9, r5\n\t"
"sbc %[a], %[a], %[a]\n\t"
"sub %[s], %[s], #16\n\t"
"ldm %[s], {%[b], %[c], r4, r5}\n\t"
"ldm %[s], {r2, r3, r4, r5}\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"mov r10, #0x5c\n\t"
"lsl r10, r10, #8\n\t"
@ -8199,9 +8222,10 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"str r8, [%[s], #24]\n\t"
"str r9, [%[s], #28]\n\t"
"add sp, sp, #0x50\n\t"
: [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
: [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
"r12", "lr"
);
}
@ -8216,9 +8240,9 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
__asm__ __volatile__ (
"sub sp, sp, #0x50\n\t"
"add lr, sp, #0x44\n\t"
"stm lr, {%[s], %[a], %[c]}\n\t"
"stm lr, {r0, r1, r3}\n\t"
"mov lr, %[b]\n\t"
"ldm %[a], {%[s], %[a], %[b], %[c]}\n\t"
"ldm %[a], {r0, r1, r2, r3}\n\t"
"ldm lr!, {r4, r5, r6}\n\t"
"umull r10, r11, %[s], r4\n\t"
"umull r12, r7, %[a], r4\n\t"
@ -8263,7 +8287,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"umaal r4, r6, %[b], r7\n\t"
"sub lr, lr, #16\n\t"
"umaal r5, r6, %[c], r7\n\t"
"ldm %[s], {%[s], %[a], %[b], %[c]}\n\t"
"ldm %[s], {r0, r1, r2, r3}\n\t"
"str r6, [sp, #64]\n\t"
"ldm lr!, {r6}\n\t"
"mov r7, #0\n\t"
@ -8315,24 +8339,24 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"umaal r9, r10, %[c], lr\n\t"
"mov %[c], r12\n\t"
"add lr, sp, #32\n\t"
"stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t"
"stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
"mov %[s], sp\n\t"
/* Add c to a * b */
"ldr lr, [sp, #76]\n\t"
"ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
"ldm lr!, {%[a], r10, r11, r12}\n\t"
"ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
"ldm lr!, {r1, r10, r11, r12}\n\t"
"adds %[b], %[b], %[a]\n\t"
"adcs %[c], %[c], r10\n\t"
"adcs r4, r4, r11\n\t"
"adcs r5, r5, r12\n\t"
"ldm lr!, {%[a], r10, r11, r12}\n\t"
"ldm lr!, {r1, r10, r11, r12}\n\t"
"adcs r6, r6, %[a]\n\t"
"adcs r7, r7, r10\n\t"
"adcs r8, r8, r11\n\t"
"adcs r9, r9, r12\n\t"
"mov %[a], r9\n\t"
"stm %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
"ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t"
"stm %[s]!, {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
"ldm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
"adcs %[b], %[b], #0\n\t"
"adcs %[c], %[c], #0\n\t"
"adcs r4, r4, #0\n\t"
@ -8738,7 +8762,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"mov r12, sp\n\t"
/* Load bits 252-376 */
"add r12, r12, #28\n\t"
"ldm r12, {%[a], %[b], %[c], r4, r5}\n\t"
"ldm r12, {r1, r2, r3, r4, r5}\n\t"
"lsl r5, r5, #4\n\t"
"orr r5, r5, r4, lsr #28\n\t"
"lsl r4, r4, #4\n\t"
@ -8881,7 +8905,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"sbcs r9, r9, r5\n\t"
"sbc %[a], %[a], %[a]\n\t"
"sub %[s], %[s], #16\n\t"
"ldm %[s], {%[b], %[c], r4, r5}\n\t"
"ldm %[s], {r2, r3, r4, r5}\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"mov r10, #0x5c\n\t"
"lsl r10, r10, #8\n\t"
@ -8983,9 +9007,10 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
"str r8, [%[s], #24]\n\t"
"str r9, [%[s], #28]\n\t"
"add sp, sp, #0x50\n\t"
: [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
: [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
"r12", "lr"
);
}

View File

@ -0,0 +1,357 @@
/* armv8-32-poly1305-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./poly1305/poly1305.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_POLY1305
.text
.align 4
.globl poly1305_blocks_arm32_16
.type poly1305_blocks_arm32_16, %function
poly1305_blocks_arm32_16:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #28
cmp r2, #0
beq L_poly1305_arm32_16_done
add lr, sp, #12
stm lr, {r0, r1, r2, r3}
# Get h pointer
add lr, r0, #16
ldm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_loop:
# Add m to h
ldr r1, [sp, #16]
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r9, [r1, #8]
ldr r10, [r1, #12]
ldr r11, [sp, #24]
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r9
adcs r7, r7, r10
add r1, r1, #16
adc r8, r8, r11
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
stm lr, {r4, r5, r6, r7, r8}
#else
# h[0]-h[2] in r4-r6 for multiplication.
str r7, [lr, #12]
str r8, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
str r1, [sp, #16]
ldr r1, [sp, #12]
# Multiply h by r
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
ldr r3, [r1]
eor r0, r0, r0
# r[0] * h[0]
# h[0] in r4
umull r4, r5, r3, r4
# r[0] * h[2]
# h[2] in r6
umull r6, r7, r3, r6
# r[0] * h[4]
# h[4] in r8
mul r8, r3, r8
# r[0] * h[1]
ldr r2, [lr, #4]
mov r12, r0
umlal r5, r12, r3, r2
# r[0] * h[3]
ldr r2, [lr, #12]
adds r6, r6, r12
adc r7, r7, r0
umlal r7, r8, r3, r2
# r[1] * h[0]
ldr r3, [r1, #4]
ldr r2, [lr]
mov r12, r0
umlal r5, r12, r3, r2
# r[1] * h[1]
ldr r2, [lr, #4]
adds r6, r6, r12
adc r12, r0, r0
umlal r6, r12, r3, r2
# r[1] * h[2]
ldr r2, [lr, #8]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[1] * h[3]
ldr r2, [lr, #12]
adds r8, r8, r12
adc r9, r0, r0
umlal r8, r9, r3, r2
# r[1] * h[4]
ldr r2, [lr, #16]
mla r9, r3, r2, r9
# r[2] * h[0]
ldr r3, [r1, #8]
ldr r2, [lr]
mov r12, r0
umlal r6, r12, r3, r2
# r[2] * h[1]
ldr r2, [lr, #4]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[2] * h[2]
ldr r2, [lr, #8]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[2] * h[3]
ldr r2, [lr, #12]
adds r9, r9, r12
adc r10, r0, r0
umlal r9, r10, r3, r2
# r[2] * h[4]
ldr r2, [lr, #16]
mla r10, r3, r2, r10
# r[3] * h[0]
ldr r3, [r1, #12]
ldr r2, [lr]
mov r12, r0
umlal r7, r12, r3, r2
# r[3] * h[1]
ldr r2, [lr, #4]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[3] * h[2]
ldr r2, [lr, #8]
adds r9, r9, r12
adc r10, r10, r0
umlal r9, r10, r3, r2
# r[3] * h[3]
ldr r2, [lr, #12]
mov r11, r0
umlal r10, r11, r3, r2
# r[3] * h[4]
ldr r2, [lr, #16]
mov r12, r0
mla r11, r3, r2, r11
#else
ldm r1, {r0, r1, r2, r3}
# r[0] * h[0]
umull r10, r11, r0, r4
# r[1] * h[0]
umull r12, r7, r1, r4
# r[0] * h[1]
umaal r11, r12, r0, r5
# r[2] * h[0]
umull r8, r9, r2, r4
# r[1] * h[1]
umaal r12, r8, r1, r5
# r[0] * h[2]
umaal r12, r7, r0, r6
# r[3] * h[0]
umaal r8, r9, r3, r4
stm sp, {r10, r11, r12}
# r[2] * h[1]
umaal r7, r8, r2, r5
# Replace h[0] with h[3]
ldr r4, [lr, #12]
# r[1] * h[2]
umull r10, r11, r1, r6
# r[2] * h[2]
umaal r8, r9, r2, r6
# r[0] * h[3]
umaal r7, r10, r0, r4
# r[3] * h[1]
umaal r8, r11, r3, r5
# r[1] * h[3]
umaal r8, r10, r1, r4
# r[3] * h[2]
umaal r9, r11, r3, r6
# r[2] * h[3]
umaal r9, r10, r2, r4
# Replace h[1] with h[4]
ldr r5, [lr, #16]
# r[3] * h[3]
umaal r10, r11, r3, r4
mov r12, #0
# r[0] * h[4]
umaal r8, r12, r0, r5
# r[1] * h[4]
umaal r9, r12, r1, r5
# r[2] * h[4]
umaal r10, r12, r2, r5
# r[3] * h[4]
umaal r11, r12, r3, r5
# DONE
ldm sp, {r4, r5, r6}
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
# r12 will be zero because r is masked.
# Load length
ldr r2, [sp, #20]
# Reduce mod 2^130 - 5
bic r3, r8, #3
and r8, r8, #3
adds r4, r4, r3
lsr r3, r3, #2
adcs r5, r5, r9
orr r3, r3, r9, LSL #30
adcs r6, r6, r10
lsr r9, r9, #2
adcs r7, r7, r11
orr r9, r9, r10, LSL #30
adc r8, r8, r12
lsr r10, r10, #2
adds r4, r4, r3
orr r10, r10, r11, LSL #30
adcs r5, r5, r9
lsr r11, r11, #2
adcs r6, r6, r10
adcs r7, r7, r11
adc r8, r8, r12
# Sub 16 from length.
subs r2, r2, #16
# Store length.
str r2, [sp, #20]
# Loop again if more message to do.
bgt L_poly1305_arm32_16_loop
stm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_done:
add sp, sp, #28
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_blocks_arm32_16,.-poly1305_blocks_arm32_16
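The reduction step in the loop above relies on 2^130 being congruent to 5 modulo p = 2^130 - 5: every bit of the h*r product at or above bit 130 is folded back into the low 130 bits multiplied by 5, and the assembly computes that multiply as 4*x + x by adding the extracted high limbs once in place (already scaled by 4 relative to bit 130, since only two bits of the top limb are kept) and once shifted right by two. A much-simplified C sketch of the same folding, covering only the case where the excess sits in the top limb (the assembly additionally folds the upper product limbs held in r9-r11); this is illustrative only, not the wolfSSL code:

#include <stdint.h>

/* Sketch: weak reduction of h modulo p = 2^130 - 5. h[0..4] are
 * little-endian 32-bit limbs and the value may exceed 2^130 by a few
 * bits in h[4]. Uses 2^130 == 5 (mod p), so h = low130(h) + 5*(h >> 130). */
static void poly1305_weak_reduce_sketch(uint32_t h[5])
{
    uint64_t c = (uint64_t)(h[4] >> 2) * 5; /* bits >= 2^130, folded times 5 */
    uint64_t t;

    h[4] &= 3;                              /* keep the two bits below 2^130 */
    t = (uint64_t)h[0] + c;         h[0] = (uint32_t)t;
    t = (uint64_t)h[1] + (t >> 32); h[1] = (uint32_t)t;
    t = (uint64_t)h[2] + (t >> 32); h[2] = (uint32_t)t;
    t = (uint64_t)h[3] + (t >> 32); h[3] = (uint32_t)t;
    h[4] += (uint32_t)(t >> 32);
}

Expressing the times-5 as two chained add/adc passes lets the assembly avoid issuing another multiply after the umull/umaal chain.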
.text
.type L_poly1305_arm32_clamp, %object
.size L_poly1305_arm32_clamp, 16
.align 4
L_poly1305_arm32_clamp:
.word 0xfffffff
.word 0xffffffc
.word 0xffffffc
.word 0xffffffc
.text
.align 4
.globl poly1305_set_key
.type poly1305_set_key, %function
poly1305_set_key:
push {r4, r5, r6, r7, r8, lr}
# Load mask.
adr lr, L_poly1305_arm32_clamp
ldm lr, {r6, r7, r8, r12}
# Load and cache padding.
ldr r2, [r1, #16]
ldr r3, [r1, #20]
ldr r4, [r1, #24]
ldr r5, [r1, #28]
add lr, r0, #36
stm lr, {r2, r3, r4, r5}
# Load, mask and store r.
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r4, [r1, #8]
ldr r5, [r1, #12]
and r2, r2, r6
and r3, r3, r7
and r4, r4, r8
and r5, r5, r12
add lr, r0, #0
stm lr, {r2, r3, r4, r5}
# h (accumulator) = 0
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
eor r12, r12, r12
add lr, r0, #16
eor r5, r5, r5
stm lr, {r5, r6, r7, r8, r12}
# Zero leftover
str r5, [r0, #52]
pop {r4, r5, r6, r7, r8, pc}
.size poly1305_set_key,.-poly1305_set_key
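L_poly1305_arm32_clamp above is the standard RFC 8439 clamping of r: the top four bits of every 32-bit word and the bottom two bits of words 1-3 are cleared before r is stored at the start of the context. A small C sketch of the same masking (illustrative only; the function name is hypothetical and a little-endian word load of the key is assumed, matching the ldr sequence above):

#include <stdint.h>
#include <string.h>

/* Sketch: clamp the first 16 key bytes into r, matching the
 * L_poly1305_arm32_clamp table (word 0 & 0x0fffffff, words 1-3 &
 * 0x0ffffffc). Assumes a little-endian host for the memcpy word load. */
static void poly1305_clamp_r_sketch(uint32_t r[4], const uint8_t key[16])
{
    static const uint32_t clamp[4] = {
        0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc
    };
    int i;
    for (i = 0; i < 4; i++) {
        uint32_t w;
        memcpy(&w, key + 4 * i, sizeof(w));
        r[i] = w & clamp[i];
    }
}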
.text
.align 4
.globl poly1305_final
.type poly1305_final, %function
poly1305_final:
push {r4, r5, r6, r7, r8, r9, lr}
add r9, r0, #16
ldm r9, {r4, r5, r6, r7, r8}
# Add 5 and check for h larger than p.
adds r2, r4, #5
adcs r2, r5, #0
adcs r2, r6, #0
adcs r2, r7, #0
adc r2, r8, #0
sub r2, r2, #4
lsr r2, r2, #31
sub r2, r2, #1
and r2, r2, #5
# Add 0/5 to h.
adds r4, r4, r2
adcs r5, r5, #0
adcs r6, r6, #0
adc r7, r7, #0
# Add padding
add r9, r0, #36
ldm r9, {r2, r3, r12, lr}
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r12
adc r7, r7, lr
# Store MAC
str r4, [r1]
str r5, [r1, #4]
str r6, [r1, #8]
str r7, [r1, #12]
# Zero out h.
eor r4, r4, r4
eor r5, r5, r5
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
add r9, r0, #16
stm r9, {r4, r5, r6, r7, r8}
# Zero out r.
add r9, r0, #0
stm r9, {r4, r5, r6, r7}
# Zero out padding.
add r9, r0, #36
stm r9, {r4, r5, r6, r7}
pop {r4, r5, r6, r7, r8, r9, pc}
.size poly1305_final,.-poly1305_final
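The final reduction above uses the usual constant-time trick: propagate the carries of h + 5 into the top limb and test whether bit 130 would be set. If h + 5 reaches 2^130 then h >= p, so adding 5 and keeping only the low 128 bits yields h - p; otherwise nothing is added. The sub/lsr/sub/and sequence turns that comparison into a 0-or-5 value without branching. An illustrative C sketch of the selection (not the wolfSSL code):

#include <stdint.h>

/* Sketch: constant-time final reduction of the Poly1305 accumulator h
 * (five 32-bit limbs) modulo p = 2^130 - 5. When h >= p, h + 5 >= 2^130,
 * so adding 5 and keeping the low 128 bits gives (h - p) mod 2^128;
 * otherwise 0 is added. */
static void poly1305_final_reduce_sketch(uint32_t h[5])
{
    uint64_t t;
    uint32_t top;
    uint32_t add;

    /* Top limb of h + 5 after full carry propagation. */
    t = (uint64_t)h[0] + 5;
    t = (uint64_t)h[1] + (t >> 32);
    t = (uint64_t)h[2] + (t >> 32);
    t = (uint64_t)h[3] + (t >> 32);
    t = (uint64_t)h[4] + (t >> 32);
    top = (uint32_t)t;

    /* add = 5 when top >= 4 (i.e. h + 5 >= 2^130), else 0; no branches. */
    add = (((top - 4) >> 31) - 1) & 5;

    t = (uint64_t)h[0] + add;        h[0] = (uint32_t)t;
    t = (uint64_t)h[1] + (t >> 32);  h[1] = (uint32_t)t;
    t = (uint64_t)h[2] + (t >> 32);  h[2] = (uint32_t)t;
    t = (uint64_t)h[3] + (t >> 32);  h[3] = (uint32_t)t;
    /* h[0..3] now holds (h mod p) mod 2^128; the cached pad is added next
     * and the result is the 16-byte MAC, as the assembly does above. */
}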
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#endif /* !WOLFSSL_ARMASM_INLINE */

View File

@ -0,0 +1,395 @@
/* armv8-32-poly1305-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./poly1305/poly1305.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.c
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
#endif /* __IAR_SYSTEMS_ICC__ */
#ifdef __KEIL__
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef HAVE_POLY1305
#include <wolfssl/wolfcrypt/poly1305.h>
void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
int notLast_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const byte* m asm ("r1") = (const byte*)m_p;
register word32 len asm ("r2") = (word32)len_p;
register int notLast asm ("r3") = (int)notLast_p;
__asm__ __volatile__ (
"sub sp, sp, #28\n\t"
"cmp %[len], #0\n\t"
"beq L_poly1305_arm32_16_done_%=\n\t"
"add lr, sp, #12\n\t"
"stm lr, {r0, r1, r2, r3}\n\t"
/* Get h pointer */
"add lr, %[ctx], #16\n\t"
"ldm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_loop_%=: \n\t"
/* Add m to h */
"ldr %[m], [sp, #16]\n\t"
"ldr %[len], [%[m]]\n\t"
"ldr %[notLast], [%[m], #4]\n\t"
"ldr r9, [%[m], #8]\n\t"
"ldr r10, [%[m], #12]\n\t"
"ldr r11, [sp, #24]\n\t"
"adds r4, r4, %[len]\n\t"
"adcs r5, r5, %[notLast]\n\t"
"adcs r6, r6, r9\n\t"
"adcs r7, r7, r10\n\t"
"add %[m], %[m], #16\n\t"
"adc r8, r8, r11\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
"stm lr, {r4, r5, r6, r7, r8}\n\t"
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
"str r7, [lr, #12]\n\t"
"str r8, [lr, #16]\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
"str %[m], [sp, #16]\n\t"
"ldr %[m], [sp, #12]\n\t"
/* Multiply h by r */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
"ldr %[notLast], [%[m]]\n\t"
"eor %[ctx], %[ctx], %[ctx]\n\t"
/* r[0] * h[0] */
/* h[0] in r4 */
"umull r4, r5, %[notLast], r4\n\t"
/* r[0] * h[2] */
/* h[2] in r6 */
"umull r6, r7, %[notLast], r6\n\t"
/* r[0] * h[4] */
/* h[4] in r8 */
"mul r8, %[notLast], r8\n\t"
/* r[0] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[0] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r6, r6, r12\n\t"
"adc r7, r7, %[ctx]\n\t"
"umlal r7, r8, %[notLast], %[len]\n\t"
/* r[1] * h[0] */
"ldr %[notLast], [%[m], #4]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[1] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r6, r6, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[1] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[1] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r8, r8, r12\n\t"
"adc r9, %[ctx], %[ctx]\n\t"
"umlal r8, r9, %[notLast], %[len]\n\t"
/* r[1] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r9, %[notLast], %[len], r9\n\t"
/* r[2] * h[0] */
"ldr %[notLast], [%[m], #8]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[2] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[2] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[2] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, %[ctx], %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[2] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r10, %[notLast], %[len], r10\n\t"
/* r[3] * h[0] */
"ldr %[notLast], [%[m], #12]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[3] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[3] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, r10, %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[3] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"mov r11, %[ctx]\n\t"
"umlal r10, r11, %[notLast], %[len]\n\t"
/* r[3] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mov r12, %[ctx]\n\t"
"mla r11, %[notLast], %[len], r11\n\t"
#else
"ldm %[m], {r0, r1, r2, r3}\n\t"
/* r[0] * h[0] */
"umull r10, r11, %[ctx], r4\n\t"
/* r[1] * h[0] */
"umull r12, r7, %[m], r4\n\t"
/* r[0] * h[1] */
"umaal r11, r12, %[ctx], r5\n\t"
/* r[2] * h[0] */
"umull r8, r9, %[len], r4\n\t"
/* r[1] * h[1] */
"umaal r12, r8, %[m], r5\n\t"
/* r[0] * h[2] */
"umaal r12, r7, %[ctx], r6\n\t"
/* r[3] * h[0] */
"umaal r8, r9, %[notLast], r4\n\t"
"stm sp, {r10, r11, r12}\n\t"
/* r[2] * h[1] */
"umaal r7, r8, %[len], r5\n\t"
/* Replace h[0] with h[3] */
"ldr r4, [lr, #12]\n\t"
/* r[1] * h[2] */
"umull r10, r11, %[m], r6\n\t"
/* r[2] * h[2] */
"umaal r8, r9, %[len], r6\n\t"
/* r[0] * h[3] */
"umaal r7, r10, %[ctx], r4\n\t"
/* r[3] * h[1] */
"umaal r8, r11, %[notLast], r5\n\t"
/* r[1] * h[3] */
"umaal r8, r10, %[m], r4\n\t"
/* r[3] * h[2] */
"umaal r9, r11, %[notLast], r6\n\t"
/* r[2] * h[3] */
"umaal r9, r10, %[len], r4\n\t"
/* Replace h[1] with h[4] */
"ldr r5, [lr, #16]\n\t"
/* r[3] * h[3] */
"umaal r10, r11, %[notLast], r4\n\t"
"mov r12, #0\n\t"
/* r[0] * h[4] */
"umaal r8, r12, %[ctx], r5\n\t"
/* r[1] * h[4] */
"umaal r9, r12, %[m], r5\n\t"
/* r[2] * h[4] */
"umaal r10, r12, %[len], r5\n\t"
/* r[3] * h[4] */
"umaal r11, r12, %[notLast], r5\n\t"
/* DONE */
"ldm sp, {r4, r5, r6}\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
/* r12 will be zero because r is masked. */
/* Load length */
"ldr %[len], [sp, #20]\n\t"
/* Reduce mod 2^130 - 5 */
"bic %[notLast], r8, #3\n\t"
"and r8, r8, #3\n\t"
"adds r4, r4, %[notLast]\n\t"
"lsr %[notLast], %[notLast], #2\n\t"
"adcs r5, r5, r9\n\t"
"orr %[notLast], %[notLast], r9, LSL #30\n\t"
"adcs r6, r6, r10\n\t"
"lsr r9, r9, #2\n\t"
"adcs r7, r7, r11\n\t"
"orr r9, r9, r10, LSL #30\n\t"
"adc r8, r8, r12\n\t"
"lsr r10, r10, #2\n\t"
"adds r4, r4, %[notLast]\n\t"
"orr r10, r10, r11, LSL #30\n\t"
"adcs r5, r5, r9\n\t"
"lsr r11, r11, #2\n\t"
"adcs r6, r6, r10\n\t"
"adcs r7, r7, r11\n\t"
"adc r8, r8, r12\n\t"
/* Sub 16 from length. */
"subs %[len], %[len], #16\n\t"
/* Store length. */
"str %[len], [sp, #20]\n\t"
/* Loop again if more message to do. */
"bgt L_poly1305_arm32_16_loop_%=\n\t"
"stm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_done_%=: \n\t"
"add sp, sp, #28\n\t"
: [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len),
[notLast] "+r" (notLast)
:
: "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11"
);
}
static const uint32_t L_poly1305_arm32_clamp[] = {
0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
};
void poly1305_set_key(Poly1305* ctx_p, const byte* key_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const byte* key asm ("r1") = (const byte*)key_p;
register uint32_t* L_poly1305_arm32_clamp_c asm ("r2") =
(uint32_t*)&L_poly1305_arm32_clamp;
__asm__ __volatile__ (
/* Load mask. */
"mov lr, %[L_poly1305_arm32_clamp]\n\t"
"ldm lr, {r6, r7, r8, r12}\n\t"
/* Load and cache padding. */
"ldr r2, [%[key], #16]\n\t"
"ldr r3, [%[key], #20]\n\t"
"ldr r4, [%[key], #24]\n\t"
"ldr r5, [%[key], #28]\n\t"
"add lr, %[ctx], #36\n\t"
"stm lr, {r2, r3, r4, r5}\n\t"
/* Load, mask and store r. */
"ldr r2, [%[key]]\n\t"
"ldr r3, [%[key], #4]\n\t"
"ldr r4, [%[key], #8]\n\t"
"ldr r5, [%[key], #12]\n\t"
"and r2, r2, r6\n\t"
"and r3, r3, r7\n\t"
"and r4, r4, r8\n\t"
"and r5, r5, r12\n\t"
"add lr, %[ctx], #0\n\t"
"stm lr, {r2, r3, r4, r5}\n\t"
/* h (accumulator) = 0 */
"eor r6, r6, r6\n\t"
"eor r7, r7, r7\n\t"
"eor r8, r8, r8\n\t"
"eor r12, r12, r12\n\t"
"add lr, %[ctx], #16\n\t"
"eor r5, r5, r5\n\t"
"stm lr, {r5, r6, r7, r8, r12}\n\t"
/* Zero leftover */
"str r5, [%[ctx], #52]\n\t"
: [ctx] "+r" (ctx), [key] "+r" (key),
[L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c)
:
: "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
);
}
void poly1305_final(Poly1305* ctx_p, byte* mac_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register byte* mac asm ("r1") = (byte*)mac_p;
__asm__ __volatile__ (
"add r9, %[ctx], #16\n\t"
"ldm r9, {r4, r5, r6, r7, r8}\n\t"
/* Add 5 and check for h larger than p. */
"adds r2, r4, #5\n\t"
"adcs r2, r5, #0\n\t"
"adcs r2, r6, #0\n\t"
"adcs r2, r7, #0\n\t"
"adc r2, r8, #0\n\t"
"sub r2, r2, #4\n\t"
"lsr r2, r2, #31\n\t"
"sub r2, r2, #1\n\t"
"and r2, r2, #5\n\t"
/* Add 0/5 to h. */
"adds r4, r4, r2\n\t"
"adcs r5, r5, #0\n\t"
"adcs r6, r6, #0\n\t"
"adc r7, r7, #0\n\t"
/* Add padding */
"add r9, %[ctx], #36\n\t"
"ldm r9, {r2, r3, r12, lr}\n\t"
"adds r4, r4, r2\n\t"
"adcs r5, r5, r3\n\t"
"adcs r6, r6, r12\n\t"
"adc r7, r7, lr\n\t"
/* Store MAC */
"str r4, [%[mac]]\n\t"
"str r5, [%[mac], #4]\n\t"
"str r6, [%[mac], #8]\n\t"
"str r7, [%[mac], #12]\n\t"
/* Zero out h. */
"eor r4, r4, r4\n\t"
"eor r5, r5, r5\n\t"
"eor r6, r6, r6\n\t"
"eor r7, r7, r7\n\t"
"eor r8, r8, r8\n\t"
"add r9, %[ctx], #16\n\t"
"stm r9, {r4, r5, r6, r7, r8}\n\t"
/* Zero out r. */
"add r9, %[ctx], #0\n\t"
"stm r9, {r4, r5, r6, r7}\n\t"
/* Zero out padding. */
"add r9, %[ctx], #36\n\t"
"stm r9, {r4, r5, r6, r7}\n\t"
: [ctx] "+r" (ctx), [mac] "+r" (mac)
:
: "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
"r9"
);
}
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */

View File

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha2/sha256.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
* ruby ./sha2/sha256.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
*/
#ifdef HAVE_CONFIG_H

View File

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha2/sha256.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.c
* ruby ./sha2/sha256.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.c
*/
#ifdef HAVE_CONFIG_H
@ -74,13 +75,14 @@ static const uint32_t L_SHA256_transform_len_k[] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len);
void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p);
void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
{
register wc_Sha256* sha256 asm ("r0") = (wc_Sha256*)sha256_p;
register const byte* data asm ("r1") = (const byte*)data_p;
register word32 len asm ("r2") = (word32)len_p;
register uint32_t* L_SHA256_transform_len_k_c asm ("r3") = (uint32_t*)&L_SHA256_transform_len_k;
register uint32_t* L_SHA256_transform_len_k_c asm ("r3") =
(uint32_t*)&L_SHA256_transform_len_k;
__asm__ __volatile__ (
"sub sp, sp, #0xc0\n\t"
@ -1732,9 +1734,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
"add %[data], %[data], #0x40\n\t"
"bne L_SHA256_transform_len_begin_%=\n\t"
"add sp, sp, #0xc0\n\t"
: [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c)
: [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len),
[L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
"r12"
);
}
@ -1761,13 +1765,14 @@ static const uint32_t L_SHA256_transform_neon_len_k[] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len);
void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p);
void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
{
register wc_Sha256* sha256 asm ("r0") = (wc_Sha256*)sha256_p;
register const byte* data asm ("r1") = (const byte*)data_p;
register word32 len asm ("r2") = (word32)len_p;
register uint32_t* L_SHA256_transform_neon_len_k_c asm ("r3") = (uint32_t*)&L_SHA256_transform_neon_len_k;
register uint32_t* L_SHA256_transform_neon_len_k_c asm ("r3") =
(uint32_t*)&L_SHA256_transform_neon_len_k;
__asm__ __volatile__ (
"sub sp, sp, #24\n\t"
@ -2794,9 +2799,12 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
"str r10, [sp, #8]\n\t"
"bne L_SHA256_transform_neon_len_begin_%=\n\t"
"add sp, sp, #24\n\t"
: [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), [L_SHA256_transform_neon_len_k] "+r" (L_SHA256_transform_neon_len_k_c)
: [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len),
[L_SHA256_transform_neon_len_k] "+r" (L_SHA256_transform_neon_len_k_c)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr",
"r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
"d10", "d11"
);
}

View File

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha3/sha3.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
* ruby ./sha3/sha3.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
*/
#ifdef HAVE_CONFIG_H
@ -32,6 +33,8 @@
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_SHA3
#ifndef WOLFSSL_ARMASM_NO_NEON
.text
.type L_sha3_arm2_neon_rt, %object
.size L_sha3_arm2_neon_rt, 192
@ -85,60 +88,6 @@ L_sha3_arm2_neon_rt:
.word 0x0
.word 0x80008008
.word 0x80000000
.text
.type L_sha3_arm2_rt, %object
.size L_sha3_arm2_rt, 192
.align 4
L_sha3_arm2_rt:
.word 0x1
.word 0x0
.word 0x8082
.word 0x0
.word 0x808a
.word 0x80000000
.word 0x80008000
.word 0x80000000
.word 0x808b
.word 0x0
.word 0x80000001
.word 0x0
.word 0x80008081
.word 0x80000000
.word 0x8009
.word 0x80000000
.word 0x8a
.word 0x0
.word 0x88
.word 0x0
.word 0x80008009
.word 0x0
.word 0x8000000a
.word 0x0
.word 0x8000808b
.word 0x0
.word 0x8b
.word 0x80000000
.word 0x8089
.word 0x80000000
.word 0x8003
.word 0x80000000
.word 0x8002
.word 0x80000000
.word 0x80
.word 0x80000000
.word 0x800a
.word 0x0
.word 0x8000000a
.word 0x80000000
.word 0x80008081
.word 0x80000000
.word 0x8080
.word 0x80000000
.word 0x80000001
.word 0x0
.word 0x80008008
.word 0x80000000
#ifndef WOLFSSL_ARMASM_NO_NEON
.text
.align 4
.globl BlockSha3
@ -407,6 +356,59 @@ L_sha3_arm32_neon_begin:
.size BlockSha3,.-BlockSha3
#endif /* WOLFSSL_ARMASM_NO_NEON */
#ifdef WOLFSSL_ARMASM_NO_NEON
.text
.type L_sha3_arm2_rt, %object
.size L_sha3_arm2_rt, 192
.align 4
L_sha3_arm2_rt:
.word 0x1
.word 0x0
.word 0x8082
.word 0x0
.word 0x808a
.word 0x80000000
.word 0x80008000
.word 0x80000000
.word 0x808b
.word 0x0
.word 0x80000001
.word 0x0
.word 0x80008081
.word 0x80000000
.word 0x8009
.word 0x80000000
.word 0x8a
.word 0x0
.word 0x88
.word 0x0
.word 0x80008009
.word 0x0
.word 0x8000000a
.word 0x0
.word 0x8000808b
.word 0x0
.word 0x8b
.word 0x80000000
.word 0x8089
.word 0x80000000
.word 0x8003
.word 0x80000000
.word 0x8002
.word 0x80000000
.word 0x80
.word 0x80000000
.word 0x800a
.word 0x0
.word 0x8000000a
.word 0x80000000
.word 0x80008081
.word 0x80000000
.word 0x8080
.word 0x80000000
.word 0x80000001
.word 0x0
.word 0x80008008
.word 0x80000000
.text
.align 4
.globl BlockSha3
@ -2391,6 +2393,7 @@ L_sha3_arm32_begin:
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size BlockSha3,.-BlockSha3
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */

View File

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha3/sha3.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.c
* ruby ./sha3/sha3.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.c
*/
#ifdef HAVE_CONFIG_H
@ -51,6 +52,8 @@
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef WOLFSSL_SHA3
#ifndef WOLFSSL_ARMASM_NO_NEON
static const uint64_t L_sha3_arm2_neon_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
@ -66,29 +69,13 @@ static const uint64_t L_sha3_arm2_neon_rt[] = {
0x0000000080000001UL, 0x8000000080008008UL,
};
static const uint64_t L_sha3_arm2_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
0x000000000000808bUL, 0x0000000080000001UL,
0x8000000080008081UL, 0x8000000000008009UL,
0x000000000000008aUL, 0x0000000000000088UL,
0x0000000080008009UL, 0x000000008000000aUL,
0x000000008000808bUL, 0x800000000000008bUL,
0x8000000000008089UL, 0x8000000000008003UL,
0x8000000000008002UL, 0x8000000000000080UL,
0x000000000000800aUL, 0x800000008000000aUL,
0x8000000080008081UL, 0x8000000000008080UL,
0x0000000080000001UL, 0x8000000080008008UL,
};
#include <wolfssl/wolfcrypt/sha3.h>
#ifndef WOLFSSL_ARMASM_NO_NEON
void BlockSha3(word64* state_p)
{
register word64* state asm ("r0") = (word64*)state_p;
register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt;
register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt;
register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") =
(uint64_t*)&L_sha3_arm2_neon_rt;
__asm__ __volatile__ (
"sub sp, sp, #16\n\t"
@ -348,25 +335,43 @@ void BlockSha3(word64* state_p)
"vst1.8 {d20-d23}, [%[state]]!\n\t"
"vst1.8 {d24}, [%[state]]\n\t"
"add sp, sp, #16\n\t"
: [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c)
: [state] "+r" (state),
[L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c)
:
: "memory", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc"
: "memory", "cc", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
"d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
"d26", "d27", "d28", "d29", "d30", "d31"
);
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#ifdef WOLFSSL_ARMASM_NO_NEON
static const uint64_t L_sha3_arm2_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
0x000000000000808bUL, 0x0000000080000001UL,
0x8000000080008081UL, 0x8000000000008009UL,
0x000000000000008aUL, 0x0000000000000088UL,
0x0000000080008009UL, 0x000000008000000aUL,
0x000000008000808bUL, 0x800000000000008bUL,
0x8000000000008089UL, 0x8000000000008003UL,
0x8000000000008002UL, 0x8000000000000080UL,
0x000000000000800aUL, 0x800000008000000aUL,
0x8000000080008081UL, 0x8000000000008080UL,
0x0000000080000001UL, 0x8000000080008008UL,
};
#include <wolfssl/wolfcrypt/sha3.h>
#ifdef WOLFSSL_ARMASM_NO_NEON
void BlockSha3(word64* state_p)
{
register word64* state asm ("r0") = (word64*)state_p;
register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt;
register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt;
register uint64_t* L_sha3_arm2_rt_c asm ("r1") =
(uint64_t*)&L_sha3_arm2_rt;
__asm__ __volatile__ (
"sub sp, sp, #0xcc\n\t"
"mov r1, %[L_sha3_arm2_rt]\n\t"
"mov r2, #12\n\t"
"\n"
"L_sha3_arm32_begin_%=: \n\t"
@ -2341,13 +2346,15 @@ void BlockSha3(word64* state_p)
"subs r2, r2, #1\n\t"
"bne L_sha3_arm32_begin_%=\n\t"
"add sp, sp, #0xcc\n\t"
: [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c)
: [state] "+r" (state), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c)
:
: "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
: "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
"r9", "r10", "r11"
);
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */

View File

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
* ruby ./sha2/sha512.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
*/
#ifdef HAVE_CONFIG_H

View File

@ -21,7 +21,8 @@
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
* ruby ./sha2/sha512.rb arm32 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
*/
#ifdef HAVE_CONFIG_H
@ -98,13 +99,14 @@ static const uint64_t L_SHA512_transform_len_k[] = {
0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
};
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p);
void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p)
{
register wc_Sha512* sha512 asm ("r0") = (wc_Sha512*)sha512_p;
register const byte* data asm ("r1") = (const byte*)data_p;
register word32 len asm ("r2") = (word32)len_p;
register uint64_t* L_SHA512_transform_len_k_c asm ("r3") = (uint64_t*)&L_SHA512_transform_len_k;
register uint64_t* L_SHA512_transform_len_k_c asm ("r3") =
(uint64_t*)&L_SHA512_transform_len_k;
__asm__ __volatile__ (
"sub sp, sp, #0xc0\n\t"
@ -7601,9 +7603,11 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p)
"bne L_SHA512_transform_len_begin_%=\n\t"
"eor r0, r0, r0\n\t"
"add sp, sp, #0xc0\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c)
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len),
[L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c)
:
: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
"r12"
);
}
@ -7654,13 +7658,14 @@ static const uint64_t L_SHA512_transform_neon_len_k[] = {
0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
};
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p);
void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p)
{
register wc_Sha512* sha512 asm ("r0") = (wc_Sha512*)sha512_p;
register const byte* data asm ("r1") = (const byte*)data_p;
register word32 len asm ("r2") = (word32)len_p;
register uint64_t* L_SHA512_transform_neon_len_k_c asm ("r3") = (uint64_t*)&L_SHA512_transform_neon_len_k;
register uint64_t* L_SHA512_transform_neon_len_k_c asm ("r3") =
(uint64_t*)&L_SHA512_transform_neon_len_k;
__asm__ __volatile__ (
/* Load digest into working vars */
@ -9151,9 +9156,12 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p)
"subs %[len], %[len], #0x80\n\t"
"sub r3, r3, #0x280\n\t"
"bne L_SHA512_transform_neon_len_begin_%=\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), [L_SHA512_transform_neon_len_k] "+r" (L_SHA512_transform_neon_len_k_c)
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len),
[L_SHA512_transform_neon_len_k] "+r" (L_SHA512_transform_neon_len_k_c)
:
: "memory", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc"
: "memory", "cc", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}

View File

@ -29,7 +29,7 @@
#include <wolfssl/wolfcrypt/settings.h>
#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON)
#if defined(WOLFSSL_ARMASM)
#ifdef HAVE_CHACHA
#include <wolfssl/wolfcrypt/chacha.h>
@ -73,15 +73,43 @@
 * Set up iv (nonce). Earlier versions used 64 bits instead of 96; this version
 * uses the typical AEAD 96-bit nonce and can do record sizes of 256 GB.
*/
int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
int wc_Chacha_SetIV(ChaCha* ctx, const byte* iv, word32 counter)
{
#ifndef __aarch64__
int ret = 0;
#ifdef CHACHA_AEAD_TEST
word32 i;
printf("NONCE : ");
if (iv != NULL) {
for (i = 0; i < CHACHA_IV_BYTES; i++) {
printf("%02x", iv[i]);
}
}
printf("\n\n");
#endif
/* Validate parameters. */
if ((ctx == NULL) || (iv == NULL)) {
ret = BAD_FUNC_ARG;
}
if (ret == 0) {
/* No unused bytes to XOR into input. */
ctx->left = 0;
/* Set counter and IV into state. */
wc_chacha_setiv(ctx->X, iv, counter);
}
return ret;
#else
word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */
#ifdef CHACHA_AEAD_TEST
word32 i;
printf("NONCE : ");
for (i = 0; i < CHACHA_IV_BYTES; i++) {
printf("%02x", inIv[i]);
printf("%02x", iv[i]);
}
printf("\n\n");
#endif
@ -89,7 +117,7 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
if (ctx == NULL)
return BAD_FUNC_ARG;
XMEMCPY(temp, inIv, CHACHA_IV_BYTES);
XMEMCPY(temp, iv, CHACHA_IV_BYTES);
ctx->left = 0;
ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */
@ -98,18 +126,54 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */
return 0;
#endif
}
#ifdef __aarch64__
/* "expand 32-byte k" as unsigned 32 byte */
static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
/* "expand 16-byte k" as unsigned 16 byte */
static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574};
#endif
/**
* Key setup. 8 word iv (nonce)
*/
int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
{
#ifndef __aarch64__
int ret = 0;
#ifdef CHACHA_AEAD_TEST
printf("ChaCha key used :\n");
if (key != NULL) {
word32 i;
for (i = 0; i < keySz; i++) {
printf("%02x", key[i]);
if ((i % 8) == 7)
printf("\n");
}
}
printf("\n\n");
#endif
/* Validate parameters. */
if ((ctx == NULL) || (key == NULL)) {
ret = BAD_FUNC_ARG;
}
else if ((keySz != (CHACHA_MAX_KEY_SZ / 2)) &&
(keySz != CHACHA_MAX_KEY_SZ )) {
ret = BAD_FUNC_ARG;
}
if (ret == 0) {
ctx->left = 0;
wc_chacha_setkey(ctx->X, key, keySz);
}
return ret;
#else
const word32* constants;
const byte* k;
@ -169,8 +233,10 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
ctx->left = 0;
return 0;
#endif
}
#ifndef WOLFSSL_ARMASM_NO_NEON
static const word32 L_chacha20_neon_inc_first_word[] = {
0x1,
0x0,
@ -2815,7 +2881,6 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
}
/**
* Encrypt a stream of bytes
*/
@ -2862,40 +2927,68 @@ static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c,
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
}
}
#endif
/**
* API to encrypt/decrypt a message of any size.
*/
int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
word32 msglen)
word32 len)
{
#ifdef WOLFSSL_ARMASM_NO_NEON
int ret = 0;
if ((ctx == NULL) || (output == NULL) || (input == NULL)) {
ret = BAD_FUNC_ARG;
}
    /* Handle leftover bytes from the last block. */
if ((ret == 0) && (len > 0) && (ctx->left > 0)) {
byte* over = ((byte*)ctx->over) + CHACHA_CHUNK_BYTES - ctx->left;
word32 l = min(len, ctx->left);
wc_chacha_use_over(over, output, input, l);
ctx->left -= l;
input += l;
output += l;
len -= l;
}
if ((ret == 0) && (len != 0)) {
wc_chacha_crypt_bytes(ctx, output, input, len);
}
return ret;
#else
if (ctx == NULL || output == NULL || input == NULL)
return BAD_FUNC_ARG;
/* handle left overs */
if (msglen > 0 && ctx->left > 0) {
if (len > 0 && ctx->left > 0) {
byte* out;
word32 i;
out = (byte*)ctx->over + CHACHA_CHUNK_BYTES - ctx->left;
for (i = 0; i < msglen && i < ctx->left; i++) {
for (i = 0; i < len && i < ctx->left; i++) {
output[i] = (byte)(input[i] ^ out[i]);
}
ctx->left -= i;
msglen -= i;
len -= i;
output += i;
input += i;
}
if (msglen == 0) {
if (len == 0) {
return 0;
}
wc_Chacha_encrypt_bytes(ctx, input, output, msglen);
wc_Chacha_encrypt_bytes(ctx, input, output, len);
return 0;
#endif
}
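Taken together with the new wc_Chacha_SetKey and wc_Chacha_SetIV paths above, the non-NEON ARM32 build now behaves the same as the NEON one from the caller's point of view. A minimal usage sketch (error handling trimmed; the helper name and buffers are illustrative):

#include <wolfssl/wolfcrypt/chacha.h>

/* Sketch: one-shot ChaCha20 encrypt/decrypt with a 256-bit key and a
 * 96-bit IV, block counter starting at 0. */
static int chacha_crypt_sketch(const byte key[CHACHA_MAX_KEY_SZ],
                               const byte iv[CHACHA_IV_BYTES],
                               const byte* in, byte* out, word32 sz)
{
    ChaCha ctx;
    int ret = wc_Chacha_SetKey(&ctx, key, CHACHA_MAX_KEY_SZ);
    if (ret == 0)
        ret = wc_Chacha_SetIV(&ctx, iv, 0);
    if (ret == 0)
        ret = wc_Chacha_Process(&ctx, out, in, sz);
    return ret;
}

Splitting the message across several wc_Chacha_Process calls is also fine: the leftover handling above consumes any buffered keystream bytes before new blocks are generated.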
#endif /* HAVE_CHACHA */
#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_ARMASM */

View File

@ -32,7 +32,6 @@
#include <wolfssl/wolfcrypt/types.h>
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifdef HAVE_POLY1305
#include <wolfssl/wolfcrypt/poly1305.h>
@ -49,6 +48,8 @@
#include <stdio.h>
#endif
#ifdef __aarch64__
static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
const unsigned char *m, size_t bytes)
{
@ -1118,6 +1119,127 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
return 0;
}
#endif /* HAVE_POLY1305 */
#else
#ifdef __thumb__
/* Process 16 bytes of message at a time.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
* @param [in] bytes Length of message in bytes.
*/
void poly1305_blocks_thumb2(Poly1305* ctx, const unsigned char* m,
size_t bytes)
{
poly1305_blocks_thumb2_16(ctx, m, bytes, 1);
}
/* Process 16 bytes of message.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
*/
void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m)
{
poly1305_blocks_thumb2_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
}
#else
/* Process 16 bytes of message at a time.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
* @param [in] bytes Length of message in bytes.
*/
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes)
{
poly1305_blocks_arm32_16(ctx, m, bytes, 1);
}
/* Process 16 bytes of message.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
*/
void poly1305_block_arm32(Poly1305* ctx, const unsigned char* m)
{
poly1305_blocks_arm32_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
}
#endif
/* Set the key for the Poly1305 operation.
*
* @param [in] ctx Poly1305 context.
* @param [in] key Key data to use.
* @param [in] keySz Size of key in bytes. Must be 32.
* @return 0 on success.
* @return BAD_FUNC_ARG when ctx or key is NULL or keySz is not 32.
*/
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
{
int ret = 0;
#ifdef CHACHA_AEAD_TEST
word32 k;
printf("Poly key used:\n");
if (key != NULL) {
for (k = 0; k < keySz; k++) {
printf("%02x", key[k]);
if ((k+1) % 8 == 0)
printf("\n");
}
}
printf("\n");
#endif
/* Validate parameters. */
if ((ctx == NULL) || (key == NULL) || (keySz != 32)) {
ret = BAD_FUNC_ARG;
}
if (ret == 0) {
poly1305_set_key(ctx, key);
}
return ret;
}
/* Finalize the Poly1305 operation calculating the MAC.
*
* @param [in] ctx Poly1305 context.
 * @param [in] mac  Buffer to hold the MAC. Must be at least 16 bytes long.
* @return 0 on success.
* @return BAD_FUNC_ARG when ctx or mac is NULL.
*/
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
{
int ret = 0;
/* Validate parameters. */
if ((ctx == NULL) || (mac == NULL)) {
ret = BAD_FUNC_ARG;
}
/* Process the remaining partial block - last block. */
if (ret == 0) {
if (ctx->leftover) {
size_t i = ctx->leftover;
ctx->buffer[i++] = 1;
for (; i < POLY1305_BLOCK_SIZE; i++) {
ctx->buffer[i] = 0;
}
#ifdef __thumb__
poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE,
0);
#else
poly1305_blocks_arm32_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0);
#endif
}
poly1305_final(ctx, mac);
}
return ret;
}
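With these wrappers the ARM32 build exposes the same wc_Poly1305* API as the other ports; full blocks are expected to reach poly1305_blocks_arm32_16() through the poly1305_blocks mapping set up in poly1305.h later in this change. A minimal usage sketch (error handling trimmed; the helper name is illustrative):

#include <wolfssl/wolfcrypt/poly1305.h>

/* Sketch: one-shot Poly1305 MAC over a message using the public API. */
static int poly1305_mac_sketch(const byte key[32], const byte* msg,
                               word32 msgSz, byte mac[16])
{
    Poly1305 ctx;
    int ret = wc_Poly1305SetKey(&ctx, key, 32);
    if (ret == 0)
        ret = wc_Poly1305Update(&ctx, msg, msgSz);
    if (ret == 0)
        ret = wc_Poly1305Final(&ctx, mac);
    return ret;
}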
#endif /* __aarch64__ */
#endif /* HAVE_POLY1305 */
#endif /* WOLFSSL_ARMASM */

View File

@ -107,12 +107,18 @@ WOLFSSL_API int wc_XChacha_SetKey(ChaCha *ctx, const byte *key, word32 keySz,
word32 counter);
#endif
#if defined(WOLFSSL_ARMASM) && defined(__thumb__)
#if defined(WOLFSSL_ARMASM)
#ifndef __aarch64__
void wc_chacha_setiv(word32* x, const byte* iv, word32 counter);
void wc_chacha_setkey(word32* x, const byte* key, word32 keySz);
#endif
#if defined(WOLFSSL_ARMASM_NO_NEON) || defined(__thumb__)
void wc_chacha_use_over(byte* over, byte* output, const byte* input,
word32 len);
void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len);
#endif
#endif

View File

@ -98,7 +98,7 @@ typedef struct Poly1305 {
word64 leftover;
unsigned char buffer[POLY1305_BLOCK_SIZE];
unsigned char finished;
#elif defined(WOLFSSL_ARMASM) && defined(__thumb__)
#elif defined(WOLFSSL_ARMASM)
word32 r[4];
word32 h[5];
word32 pad[4];
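For the ARM32 path the new assembly addresses the context by fixed byte offsets, and those line up with this member order: r at offset 0, h at 16 and pad at 36 (poly1305_set_key and poly1305_final above use exactly these constants). A compile-time sanity check, as a sketch assuming this WOLFSSL_ARMASM branch of the struct is the one selected and word32 is 4 bytes with no padding between the members:

#include <stddef.h>
#include <wolfssl/wolfcrypt/poly1305.h>

/* Sketch: the ARM32 assembly stores r at ctx+0, h at ctx+16 and the pad
 * at ctx+36; these checks fail to compile if the layout ever diverges. */
typedef char poly1305_r_offset_ok[(offsetof(Poly1305, r) == 0) ? 1 : -1];
typedef char poly1305_h_offset_ok[(offsetof(Poly1305, h) == 16) ? 1 : -1];
typedef char poly1305_pad_offset_ok[(offsetof(Poly1305, pad) == 36) ? 1 : -1];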
@ -147,16 +147,16 @@ WOLFSSL_API int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz,
WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional,
word32 addSz, const byte* input, word32 sz, byte* tag, word32 tagSz);
#if defined(__aarch64__ ) && defined(WOLFSSL_ARMASM)
#if defined(WOLFSSL_ARMASM)
#if defined(__aarch64__ )
#define poly1305_blocks poly1305_blocks_aarch64
#define poly1305_block poly1305_block_aarch64
void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
size_t bytes);
void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m);
#endif
#if defined(__thumb__ ) && defined(WOLFSSL_ARMASM)
#else
#if defined(__thumb__)
#define poly1305_blocks poly1305_blocks_thumb2
#define poly1305_block poly1305_block_thumb2
@ -166,9 +166,20 @@ void poly1305_block_thumb2(Poly1305* ctx, const unsigned char *m);
void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m,
word32 len, int notLast);
#else
#define poly1305_blocks poly1305_blocks_arm32
#define poly1305_block poly1305_block_arm32
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char *m, size_t bytes);
void poly1305_block_arm32(Poly1305* ctx, const unsigned char *m);
void poly1305_blocks_arm32_16(Poly1305* ctx, const unsigned char* m, word32 len,
int notLast);
#endif
void poly1305_set_key(Poly1305* ctx, const byte* key);
void poly1305_final(Poly1305* ctx, byte* mac);
#endif
#endif /* WOLFSSL_ARMASM */
#if defined(WOLFSSL_RISCV_ASM)
#define poly1305_blocks poly1305_blocks_riscv64