Poly1305 ARM32 NEON: add implementation

Add an assembly implementation of Poly1305 using the ARM32 NEON instruction set.

For the Poly1305 ARM32 base implementation:
  Rename poly1305_blocks_arm32_16 to poly1305_arm32_blocks_16.

poly1305.c:
  ARM32 NEON - buffer up to 4 blocks before processing.
  x86_64 - only calculate the powers of r once after the key is set.
test.c: Poly1305 testing with multiple updates of varying sizes.
benchmark: chacha20-poly1305 now uses AAD.
author Sean Parkinson
date 2025-01-09 11:39:50 +10:00
parent 71b7d0c9de
commit ecacbae3a0
7 changed files with 2025 additions and 16 deletions
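
For reference, the multi-update pattern that the new test in test.c exercises looks roughly like this. This is a minimal sketch against the existing wolfCrypt Poly1305 API; the chunking helper and its name are illustrative only and not part of this change.

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/poly1305.h>

    /* MAC a message in chunkSz-byte pieces (chunkSz must be at least 1).
     * wc_Poly1305Update buffers any partial block internally (up to 4
     * blocks on ARM32 NEON) until enough data has arrived to process. */
    static int poly1305_mac_chunked(const byte* key, const byte* msg,
                                    word32 msgSz, word32 chunkSz,
                                    byte tag[16])
    {
        Poly1305 ctx;
        word32 i;
        int ret = wc_Poly1305SetKey(&ctx, key, 32);
        for (i = 0; (ret == 0) && (i < msgSz); i += chunkSz) {
            word32 len = msgSz - i;
            if (len > chunkSz)
                len = chunkSz;
            ret = wc_Poly1305Update(&ctx, msg + i, len);
        }
        if (ret == 0)
            ret = wc_Poly1305Final(&ctx, tag);  /* writes the 16-byte tag */
        return ret;
    }

Regardless of chunkSz, the resulting tag must match a single-shot update over the whole message, which is what the new loop in test.c checks.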


@@ -770,7 +770,8 @@
#define BENCH_RNG 0x00000001
#define BENCH_SCRYPT 0x00000002
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) || \
(defined(HAVE_CHACHA) && defined(HAVE_POLY1305))
/* Define AES_AUTH_ADD_SZ already here, since it's used in the
* static declaration of `bench_Usage_msg1`. */
#if !defined(AES_AUTH_ADD_SZ) && \
@@ -1945,10 +1946,13 @@ static const char* bench_result_words2[][5] = {
#define BENCH_MIN_RUNTIME_SEC 1.0F
#endif
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) || \
(defined(HAVE_CHACHA) && defined(HAVE_POLY1305))
static word32 aesAuthAddSz = AES_AUTH_ADD_SZ;
#endif
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
#define AES_AUTH_TAG_SZ 16
#define BENCH_CIPHER_ADD AES_AUTH_TAG_SZ
static word32 aesAuthAddSz = AES_AUTH_ADD_SZ;
#if !defined(AES_AAD_OPTIONS_DEFAULT)
#if !defined(NO_MAIN_DRIVER)
#define AES_AAD_OPTIONS_DEFAULT 0x1U
@@ -6059,15 +6063,19 @@ void bench_chacha20_poly1305_aead(void)
int ret = 0, i, count;
DECLARE_MULTI_VALUE_STATS_VARS()
WC_DECLARE_VAR(bench_additional, byte, AES_AUTH_ADD_SZ, HEAP_HINT);
WC_DECLARE_VAR(authTag, byte, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE, HEAP_HINT);
WC_ALLOC_VAR(bench_additional, byte, AES_AUTH_ADD_SZ, HEAP_HINT);
WC_ALLOC_VAR(authTag, byte, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE, HEAP_HINT);
XMEMSET(bench_additional, 0, AES_AUTH_ADD_SZ);
XMEMSET(authTag, 0, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE);
bench_stats_start(&count, &start);
do {
for (i = 0; i < numBlocks; i++) {
ret = wc_ChaCha20Poly1305_Encrypt(bench_key, bench_iv, NULL, 0,
bench_plain, bench_size, bench_cipher, authTag);
ret = wc_ChaCha20Poly1305_Encrypt(bench_key, bench_iv,
bench_additional, aesAuthAddSz, bench_plain, bench_size,
bench_cipher, authTag);
if (ret < 0) {
printf("wc_ChaCha20Poly1305_Encrypt error: %d\n", ret);
goto exit;
@@ -6089,6 +6097,7 @@ void bench_chacha20_poly1305_aead(void)
exit:
WC_FREE_VAR(authTag, HEAP_HINT);
WC_FREE_VAR(bench_additional, HEAP_HINT);
}
#endif /* HAVE_CHACHA && HAVE_POLY1305 */


@@ -529,6 +529,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
#endif
poly1305_setkey_avx(ctx, key);
RESTORE_VECTOR_REGISTERS();
ctx->started = 0;
#elif defined(POLY130564)
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
@@ -813,13 +814,49 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
printf("\n");
#endif
#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_THUMB2) && \
!defined(WOLFSSL_ARMASM_NO_NEON)
/* handle leftover */
if (ctx->leftover) {
size_t want = sizeof(ctx->buffer) - ctx->leftover;
if (want > bytes)
want = bytes;
for (i = 0; i < want; i++)
ctx->buffer[ctx->leftover + i] = m[i];
bytes -= (word32)want;
m += want;
ctx->leftover += want;
if (ctx->leftover < sizeof(ctx->buffer)) {
return 0;
}
poly1305_blocks(ctx, ctx->buffer, sizeof(ctx->buffer));
ctx->leftover = 0;
}
/* process full blocks */
if (bytes >= sizeof(ctx->buffer)) {
size_t want = bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1);
poly1305_blocks(ctx, m, want);
m += want;
bytes -= (word32)want;
}
/* store leftover */
if (bytes) {
for (i = 0; i < bytes; i++)
ctx->buffer[ctx->leftover + i] = m[i];
ctx->leftover += bytes;
}
#else
#ifdef USE_INTEL_POLY1305_SPEEDUP
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
/* handle leftover */
if (ctx->leftover) {
size_t want = sizeof(ctx->buffer) - ctx->leftover;
if (want > bytes)
@@ -835,8 +872,10 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
return 0;
}
if (!ctx->started)
if (!ctx->started) {
poly1305_calc_powers_avx2(ctx);
ctx->started = 1;
}
poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
ctx->leftover = 0;
}
@@ -845,8 +884,10 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
if (bytes >= sizeof(ctx->buffer)) {
size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
if (!ctx->started)
if (!ctx->started) {
poly1305_calc_powers_avx2(ctx);
ctx->started = 1;
}
poly1305_blocks_avx2(ctx, m, want);
m += want;
bytes -= (word32)want;
@@ -902,6 +943,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
ctx->leftover += bytes;
}
}
#endif
return 0;
}


@@ -34,11 +34,12 @@
#if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_POLY1305
#ifdef WOLFSSL_ARMASM_NO_NEON
.text
.align 4
.globl poly1305_blocks_arm32_16
.type poly1305_blocks_arm32_16, %function
poly1305_blocks_arm32_16:
.globl poly1305_arm32_blocks_16
.type poly1305_arm32_blocks_16, %function
poly1305_arm32_blocks_16:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #28
cmp r2, #0
@@ -247,7 +248,7 @@ L_poly1305_arm32_16_loop:
L_poly1305_arm32_16_done:
add sp, sp, #28
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_blocks_arm32_16,.-poly1305_blocks_arm32_16
.size poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16
.text
.type L_poly1305_arm32_clamp, %object
.size L_poly1305_arm32_clamp, 16
@@ -347,6 +348,941 @@ poly1305_final:
stm r9, {r4, r5, r6, r7}
pop {r4, r5, r6, r7, r8, r9, pc}
.size poly1305_final,.-poly1305_final
#else
.text
.align 4
.globl poly1305_arm32_blocks_16
.type poly1305_arm32_blocks_16, %function
poly1305_arm32_blocks_16:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #28
cmp r2, #0
beq L_poly1305_arm32_16_done
add lr, sp, #12
stm lr, {r0, r1, r2, r3}
# Get h pointer
add lr, r0, #16
ldm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_loop:
# Add m to h
ldr r1, [sp, #16]
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r9, [r1, #8]
ldr r10, [r1, #12]
ldr r11, [sp, #24]
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r9
adcs r7, r7, r10
add r1, r1, #16
adc r8, r8, r11
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
stm lr, {r4, r5, r6, r7, r8}
#else
# h[0]-h[2] in r4-r6 for multiplication.
str r7, [lr, #12]
str r8, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
str r1, [sp, #16]
ldr r1, [sp, #12]
# Multiply h by r
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
ldr r3, [r1]
eor r0, r0, r0
# r[0] * h[0]
# h[0] in r4
umull r4, r5, r3, r4
# r[0] * h[2]
# h[2] in r6
umull r6, r7, r3, r6
# r[0] * h[4]
# h[4] in r8
mul r8, r3, r8
# r[0] * h[1]
ldr r2, [lr, #4]
mov r12, r0
umlal r5, r12, r3, r2
# r[0] * h[3]
ldr r2, [lr, #12]
adds r6, r6, r12
adc r7, r7, r0
umlal r7, r8, r3, r2
# r[1] * h[0]
ldr r3, [r1, #4]
ldr r2, [lr]
mov r12, r0
umlal r5, r12, r3, r2
# r[1] * h[1]
ldr r2, [lr, #4]
adds r6, r6, r12
adc r12, r0, r0
umlal r6, r12, r3, r2
# r[1] * h[2]
ldr r2, [lr, #8]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[1] * h[3]
ldr r2, [lr, #12]
adds r8, r8, r12
adc r9, r0, r0
umlal r8, r9, r3, r2
# r[1] * h[4]
ldr r2, [lr, #16]
mla r9, r3, r2, r9
# r[2] * h[0]
ldr r3, [r1, #8]
ldr r2, [lr]
mov r12, r0
umlal r6, r12, r3, r2
# r[2] * h[1]
ldr r2, [lr, #4]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[2] * h[2]
ldr r2, [lr, #8]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[2] * h[3]
ldr r2, [lr, #12]
adds r9, r9, r12
adc r10, r0, r0
umlal r9, r10, r3, r2
# r[2] * h[4]
ldr r2, [lr, #16]
mla r10, r3, r2, r10
# r[3] * h[0]
ldr r3, [r1, #12]
ldr r2, [lr]
mov r12, r0
umlal r7, r12, r3, r2
# r[3] * h[1]
ldr r2, [lr, #4]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[3] * h[2]
ldr r2, [lr, #8]
adds r9, r9, r12
adc r10, r10, r0
umlal r9, r10, r3, r2
# r[3] * h[3]
ldr r2, [lr, #12]
mov r11, r0
umlal r10, r11, r3, r2
# r[3] * h[4]
ldr r2, [lr, #16]
mov r12, r0
mla r11, r3, r2, r11
#else
ldm r1, {r0, r1, r2, r3}
# r[0] * h[0]
umull r10, r11, r0, r4
# r[1] * h[0]
umull r12, r7, r1, r4
# r[0] * h[1]
umaal r11, r12, r0, r5
# r[2] * h[0]
umull r8, r9, r2, r4
# r[1] * h[1]
umaal r12, r8, r1, r5
# r[0] * h[2]
umaal r12, r7, r0, r6
# r[3] * h[0]
umaal r8, r9, r3, r4
stm sp, {r10, r11, r12}
# r[2] * h[1]
umaal r7, r8, r2, r5
# Replace h[0] with h[3]
ldr r4, [lr, #12]
# r[1] * h[2]
umull r10, r11, r1, r6
# r[2] * h[2]
umaal r8, r9, r2, r6
# r[0] * h[3]
umaal r7, r10, r0, r4
# r[3] * h[1]
umaal r8, r11, r3, r5
# r[1] * h[3]
umaal r8, r10, r1, r4
# r[3] * h[2]
umaal r9, r11, r3, r6
# r[2] * h[3]
umaal r9, r10, r2, r4
# Replace h[1] with h[4]
ldr r5, [lr, #16]
# r[3] * h[3]
umaal r10, r11, r3, r4
mov r12, #0
# r[0] * h[4]
umaal r8, r12, r0, r5
# r[1] * h[4]
umaal r9, r12, r1, r5
# r[2] * h[4]
umaal r10, r12, r2, r5
# r[3] * h[4]
umaal r11, r12, r3, r5
# DONE
ldm sp, {r4, r5, r6}
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
# r12 will be zero because r is masked.
# Load length
ldr r2, [sp, #20]
# Reduce mod 2^130 - 5
bic r3, r8, #0x3
and r8, r8, #3
adds r4, r4, r3
lsr r3, r3, #2
adcs r5, r5, r9
orr r3, r3, r9, LSL #30
adcs r6, r6, r10
lsr r9, r9, #2
adcs r7, r7, r11
orr r9, r9, r10, LSL #30
adc r8, r8, r12
lsr r10, r10, #2
adds r4, r4, r3
orr r10, r10, r11, LSL #30
adcs r5, r5, r9
lsr r11, r11, #2
adcs r6, r6, r10
adcs r7, r7, r11
adc r8, r8, r12
# Sub 16 from length.
subs r2, r2, #16
# Store length.
str r2, [sp, #20]
# Loop again if more message to do.
bgt L_poly1305_arm32_16_loop
stm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_done:
add sp, sp, #28
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16
.text
.align 4
.globl poly1305_arm32_blocks
.type poly1305_arm32_blocks, %function
poly1305_arm32_blocks:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vpush {d8-d15}
cmp r2, #16
add r12, r0, #16
bgt L_poly1305_arm32_blocks_begin_neon
ldm r12, {r7, r8, r9, r10, r11}
b L_poly1305_arm32_blocks_start_1
L_poly1305_arm32_blocks_begin_neon:
vmov.i16 q15, #0xffff
vshr.u64 q15, q15, #38
vld1.64 {d0-d2}, [r12]
vshl.u64 d4, d2, #24
vsri.u64 d4, d1, #40
vshr.u64 d3, d1, #14
vshl.u64 d2, d1, #12
vsri.u64 d1, d0, #26
vsri.u64 d2, d0, #52
vand.u64 d0, d0, d31
vand.u64 d3, d3, d31
vand.u64 d2, d2, d31
vand.u64 d1, d1, d31
add r3, r0, #0x7c
vldm.32 r3, {d20-d24}
cmp r2, #0x40
bge L_poly1305_arm32_blocks_begin_4
vshl.u32 d6, d21, #2
vshl.u32 d7, d22, #2
vshl.u32 d8, d23, #2
vshl.u32 d9, d24, #2
vadd.u32 d6, d6, d21
vadd.u32 d7, d7, d22
vadd.u32 d8, d8, d23
vadd.u32 d9, d9, d24
b L_poly1305_arm32_blocks_start_2
L_poly1305_arm32_blocks_begin_4:
add r3, r0, #0xa4
vldm.32 r3, {d26-d30}
L_poly1305_arm32_blocks_start_4:
sub r2, #0x40
vld4.32 {d10-d13}, [r1]!
vshl.u32 d6, d27, #2
vshl.u32 d7, d28, #2
vshl.u32 d8, d29, #2
vshl.u32 d9, d30, #2
vadd.u32 d6, d6, d27
vadd.u32 d7, d7, d28
vadd.u32 d8, d8, d29
vadd.u32 d9, d9, d30
vshr.u32 d14, d13, #8
vshl.u32 d13, d13, #18
vorr.i32 d14, d14, #0x1000000
vsri.u32 d13, d12, #14
vshl.u32 d12, d12, #12
vand.i32 d13, d13, #0x3ffffff
vsri.u32 d12, d11, #20
vshl.u32 d11, d11, #6
vand.i32 d12, d12, #0x3ffffff
vsri.u32 d11, d10, #26
vand.i32 d10, d10, #0x3ffffff
vand.i32 d11, d11, #0x3ffffff
vadd.u32 d4, d4, d14
vadd.u32 q1, q1, q6
vadd.u32 q0, q0, q5
vmull.u32 q5, d0, d26
vmull.u32 q6, d0, d27
vmull.u32 q7, d0, d28
vmull.u32 q8, d0, d29
vmull.u32 q9, d0, d30
vmlal.u32 q5, d1, d9
vmlal.u32 q6, d1, d26
vmlal.u32 q7, d1, d27
vmlal.u32 q8, d1, d28
vmlal.u32 q9, d1, d29
vmlal.u32 q5, d2, d8
vmlal.u32 q6, d2, d9
vmlal.u32 q7, d2, d26
vmlal.u32 q8, d2, d27
vmlal.u32 q9, d2, d28
vmlal.u32 q5, d3, d7
vmlal.u32 q6, d3, d8
vmlal.u32 q7, d3, d9
vmlal.u32 q8, d3, d26
vmlal.u32 q9, d3, d27
vmlal.u32 q5, d4, d6
vmlal.u32 q6, d4, d7
vmlal.u32 q7, d4, d8
vmlal.u32 q8, d4, d9
vmlal.u32 q9, d4, d26
vld4.32 {d0-d3}, [r1]!
vshl.u32 d6, d21, #2
vshl.u32 d7, d22, #2
vshl.u32 d8, d23, #2
vshl.u32 d9, d24, #2
vadd.u32 d6, d6, d21
vadd.u32 d7, d7, d22
vadd.u32 d8, d8, d23
vadd.u32 d9, d9, d24
vshr.u32 d4, d3, #8
vshl.u32 d3, d3, #18
vorr.i32 d4, d4, #0x1000000
vsri.u32 d3, d2, #14
vshl.u32 d2, d2, #12
vand.i32 d3, d3, #0x3ffffff
vsri.u32 d2, d1, #20
vshl.u32 d1, d1, #6
vand.i32 d2, d2, #0x3ffffff
vsri.u32 d1, d0, #26
vand.i32 d0, d0, #0x3ffffff
vand.i32 d1, d1, #0x3ffffff
vmlal.u32 q5, d0, d20
vmlal.u32 q6, d0, d21
vmlal.u32 q7, d0, d22
vmlal.u32 q8, d0, d23
vmlal.u32 q9, d0, d24
vmlal.u32 q5, d1, d9
vmlal.u32 q6, d1, d20
vmlal.u32 q7, d1, d21
vmlal.u32 q8, d1, d22
vmlal.u32 q9, d1, d23
vmlal.u32 q5, d2, d8
vmlal.u32 q6, d2, d9
vmlal.u32 q7, d2, d20
vmlal.u32 q8, d2, d21
vmlal.u32 q9, d2, d22
vmlal.u32 q5, d3, d7
vmlal.u32 q6, d3, d8
vmlal.u32 q7, d3, d9
vmlal.u32 q8, d3, d20
vmlal.u32 q9, d3, d21
vmlal.u32 q5, d4, d6
vmlal.u32 q6, d4, d7
vmlal.u32 q7, d4, d8
vmlal.u32 q8, d4, d9
vmlal.u32 q9, d4, d20
vadd.u64 d0, d10, d11
vadd.u64 d1, d12, d13
vadd.u64 d2, d14, d15
vadd.u64 d3, d16, d17
vadd.u64 d4, d18, d19
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
vsra.u64 d2, d1, #26
vand.u64 d1, d1, d31
vsra.u64 d3, d2, #26
vand.u64 d2, d2, d31
vsra.u64 d4, d3, #26
vand.u64 d3, d3, d31
vshr.u64 d15, d4, #26
vand.u64 d4, d4, d31
vadd.u64 d0, d0, d15
vshl.u64 d15, d15, #2
vadd.u64 d0, d0, d15
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
cmp r2, #0x40
bge L_poly1305_arm32_blocks_start_4
cmp r2, #32
blt L_poly1305_arm32_blocks_done_neon
L_poly1305_arm32_blocks_start_2:
sub r2, #32
vld4.32 {d10-d13}, [r1]!
vshr.u32 d14, d13, #8
vshl.u32 d13, d13, #18
vorr.i32 d14, d14, #0x1000000
vsri.u32 d13, d12, #14
vshl.u32 d12, d12, #12
vand.i32 d13, d13, #0x3ffffff
vsri.u32 d12, d11, #20
vshl.u32 d11, d11, #6
vand.i32 d12, d12, #0x3ffffff
vsri.u32 d11, d10, #26
vand.i32 d10, d10, #0x3ffffff
vand.i32 d11, d11, #0x3ffffff
vadd.u32 d4, d4, d14
vadd.u32 q1, q1, q6
vadd.u32 q0, q0, q5
vmull.u32 q5, d0, d20
vmull.u32 q6, d0, d21
vmull.u32 q7, d0, d22
vmull.u32 q8, d0, d23
vmull.u32 q9, d0, d24
vmlal.u32 q5, d1, d9
vmlal.u32 q6, d1, d20
vmlal.u32 q7, d1, d21
vmlal.u32 q8, d1, d22
vmlal.u32 q9, d1, d23
vmlal.u32 q5, d2, d8
vmlal.u32 q6, d2, d9
vmlal.u32 q7, d2, d20
vmlal.u32 q8, d2, d21
vmlal.u32 q9, d2, d22
vmlal.u32 q5, d3, d7
vmlal.u32 q6, d3, d8
vmlal.u32 q7, d3, d9
vmlal.u32 q8, d3, d20
vmlal.u32 q9, d3, d21
vmlal.u32 q5, d4, d6
vmlal.u32 q6, d4, d7
vmlal.u32 q7, d4, d8
vmlal.u32 q8, d4, d9
vmlal.u32 q9, d4, d20
vadd.u64 d0, d10, d11
vadd.u64 d1, d12, d13
vadd.u64 d2, d14, d15
vadd.u64 d3, d16, d17
vadd.u64 d4, d18, d19
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
vsra.u64 d2, d1, #26
vand.u64 d1, d1, d31
vsra.u64 d3, d2, #26
vand.u64 d2, d2, d31
vsra.u64 d4, d3, #26
vand.u64 d3, d3, d31
vshr.u64 d5, d4, #26
vand.u64 d4, d4, d31
vadd.u64 d0, d0, d5
vshl.u64 d5, d5, #2
vadd.u64 d0, d0, d5
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
L_poly1305_arm32_blocks_done_neon:
cmp r2, #16
beq L_poly1305_arm32_blocks_begin_1
add r12, r0, #16
vsli.u64 d0, d1, #26
vsli.u64 d0, d2, #52
vshr.u64 d1, d2, #12
vsli.u64 d1, d3, #14
vsli.u64 d1, d4, #40
vshr.u64 d2, d4, #24
vst1.64 {d0-d2}, [r12]
b L_poly1305_arm32_blocks_done
L_poly1305_arm32_blocks_begin_1:
vsli.u64 d0, d1, #26
vsli.u64 d0, d2, #52
vshr.u64 d1, d2, #12
vsli.u64 d1, d3, #14
vsli.u64 d1, d4, #40
vshr.u64 d2, d4, #24
vmov r7, r8, d0
vmov r9, r10, d1
vmov r11, d2[0]
L_poly1305_arm32_blocks_start_1:
mov r12, #1
push {r2}
# Load message
ldm r1, {r2, r3, r4, r5}
# Add message
adds r7, r7, r2
adcs r8, r8, r3
adcs r9, r9, r4
adcs r10, r10, r5
adc r11, r11, r12
push {r0, r1}
add r1, r0, #0
add lr, r0, #16
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
stm lr, {r7, r8, r9, r10, r11}
#else
# h[0]-h[2] in r4-r6 for multiplication.
str r10, [lr, #12]
str r11, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
ldr r3, [r1]
eor r0, r0, r0
# r[0] * h[0]
# h[0] in r4
umull r7, r8, r3, r7
# r[0] * h[2]
# h[2] in r6
umull r9, r10, r3, r9
# r[0] * h[4]
# h[4] in r8
mul r11, r3, r11
# r[0] * h[1]
ldr r2, [lr, #4]
mov r12, r0
umlal r8, r12, r3, r2
# r[0] * h[3]
ldr r2, [lr, #12]
adds r9, r9, r12
adc r10, r10, r0
umlal r10, r11, r3, r2
# r[1] * h[0]
ldr r3, [r1, #4]
ldr r2, [lr]
mov r12, r0
umlal r8, r12, r3, r2
# r[1] * h[1]
ldr r2, [lr, #4]
adds r9, r9, r12
adc r12, r0, r0
umlal r9, r12, r3, r2
# r[1] * h[2]
ldr r2, [lr, #8]
adds r10, r10, r12
adc r12, r0, r0
umlal r10, r12, r3, r2
# r[1] * h[3]
ldr r2, [lr, #12]
adds r11, r11, r12
adc r4, r0, r0
umlal r11, r4, r3, r2
# r[1] * h[4]
ldr r2, [lr, #16]
mla r4, r3, r2, r4
# r[2] * h[0]
ldr r3, [r1, #8]
ldr r2, [lr]
mov r12, r0
umlal r9, r12, r3, r2
# r[2] * h[1]
ldr r2, [lr, #4]
adds r10, r10, r12
adc r12, r0, r0
umlal r10, r12, r3, r2
# r[2] * h[2]
ldr r2, [lr, #8]
adds r11, r11, r12
adc r12, r0, r0
umlal r11, r12, r3, r2
# r[2] * h[3]
ldr r2, [lr, #12]
adds r4, r4, r12
adc r5, r0, r0
umlal r4, r5, r3, r2
# r[2] * h[4]
ldr r2, [lr, #16]
mla r5, r3, r2, r5
# r[3] * h[0]
ldr r3, [r1, #12]
ldr r2, [lr]
mov r12, r0
umlal r10, r12, r3, r2
# r[3] * h[1]
ldr r2, [lr, #4]
adds r11, r11, r12
adc r12, r0, r0
umlal r11, r12, r3, r2
# r[3] * h[2]
ldr r2, [lr, #8]
adds r4, r4, r12
adc r5, r5, r0
umlal r4, r5, r3, r2
# r[3] * h[3]
ldr r2, [lr, #12]
mov r6, r0
umlal r5, r6, r3, r2
# r[3] * h[4]
ldr r2, [lr, #16]
mov r12, r0
mla r6, r3, r2, r6
#else
sub sp, sp, #12
ldm r1, {r0, r1, r2, r3}
# r[0] * h[0]
umull r5, r6, r0, r7
# r[1] * h[0]
umull r12, r10, r1, r7
# r[0] * h[1]
umaal r6, r12, r0, r8
# r[2] * h[0]
umull r11, r4, r2, r7
# r[1] * h[1]
umaal r12, r11, r1, r8
# r[0] * h[2]
umaal r12, r10, r0, r9
# r[3] * h[0]
umaal r11, r4, r3, r7
stm sp, {r5, r6, r12}
# r[2] * h[1]
umaal r10, r11, r2, r8
# Replace h[0] with h[3]
ldr r7, [lr, #12]
# r[1] * h[2]
umull r5, r6, r1, r9
# r[2] * h[2]
umaal r11, r4, r2, r9
# r[0] * h[3]
umaal r10, r5, r0, r7
# r[3] * h[1]
umaal r11, r6, r3, r8
# r[1] * h[3]
umaal r11, r5, r1, r7
# r[3] * h[2]
umaal r4, r6, r3, r9
# r[2] * h[3]
umaal r4, r5, r2, r7
# Replace h[1] with h[4]
ldr r8, [lr, #16]
# r[3] * h[3]
umaal r5, r6, r3, r7
mov r12, #0
# r[0] * h[4]
umaal r11, r12, r0, r8
# r[1] * h[4]
umaal r4, r12, r1, r8
# r[2] * h[4]
umaal r5, r12, r2, r8
# r[3] * h[4]
umaal r6, r12, r3, r8
# DONE
ldm sp, {r7, r8, r9}
add sp, sp, #12
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
# Reduce mod 2^130 - 5
bic r3, r11, #0x3
and r11, r11, #3
adds r7, r7, r3
lsr r3, r3, #2
adcs r8, r8, r4
orr r3, r3, r4, LSL #30
adcs r9, r9, r5
lsr r4, r4, #2
adcs r10, r10, r6
orr r4, r4, r5, LSL #30
adc r11, r11, r12
lsr r5, r5, #2
adds r7, r7, r3
orr r5, r5, r6, LSL #30
adcs r8, r8, r4
lsr r6, r6, #2
adcs r9, r9, r5
adcs r10, r10, r6
adc r11, r11, r12
pop {r0, r1}
pop {r2}
add r12, r0, #16
stm r12, {r7, r8, r9, r10, r11}
L_poly1305_arm32_blocks_done:
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_arm32_blocks,.-poly1305_arm32_blocks
.text
.type L_poly1305_arm32_clamp, %object
.size L_poly1305_arm32_clamp, 16
.align 4
L_poly1305_arm32_clamp:
.word 0xfffffff
.word 0xffffffc
.word 0xffffffc
.word 0xffffffc
.text
.align 4
.globl poly1305_set_key
.type poly1305_set_key, %function
poly1305_set_key:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vpush {d8-d15}
# Load mask.
adr lr, L_poly1305_arm32_clamp
ldm lr, {r6, r7, r8, r9}
# Load and cache padding.
ldr r2, [r1, #16]
ldr r3, [r1, #20]
ldr r4, [r1, #24]
ldr r5, [r1, #28]
add lr, r0, #40
stm lr, {r2, r3, r4, r5}
# Load, mask and store r.
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r4, [r1, #8]
ldr r5, [r1, #12]
and r2, r2, r6
and r3, r3, r7
and r4, r4, r8
and r5, r5, r9
add lr, r0, #0
stm lr, {r2, r3, r4, r5}
vmov.i16 q10, #0xffff
vshr.u64 q10, q10, #38
lsr r8, r2, #26
lsr r9, r3, #20
lsr r10, r4, #14
lsr r11, r5, #8
eor r8, r8, r3, lsl #6
eor r9, r9, r4, lsl #12
eor r10, r10, r5, lsl #18
and r7, r2, #0x3ffffff
and r8, r8, #0x3ffffff
and r9, r9, #0x3ffffff
and r10, r10, #0x3ffffff
vmov.i32 s1, r7
vmov.i32 s3, r8
vmov.i32 s5, r9
vmov.i32 s7, r10
vmov.i32 s9, r11
push {r0, r1}
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# Square r
umull r1, r6, r2, r3
mov r12, #0
umull r7, r8, r2, r5
mov lr, r12
umlal r6, lr, r2, r4
adds r7, r7, lr
adc lr, r12, r12
umlal r7, lr, r3, r4
mov r9, r12
umlal lr, r9, r3, r5
adds r8, r8, lr
adcs r9, r9, r12
adc r10, r12, r12
umlal r9, r10, r4, r5
adds r1, r1, r1
adcs r6, r6, r6
adcs r7, r7, r7
adcs r8, r8, r8
adcs r9, r9, r9
adcs r10, r10, r10
adc r11, r12, r12
umull r0, lr, r2, r2
adds r1, r1, lr
adcs r6, r6, r12
adc lr, r12, r12
umlal r6, lr, r3, r3
adds r7, r7, lr
adcs r8, r8, r12
adc lr, r12, r12
umlal r8, lr, r4, r4
adds r9, r9, lr
adcs r10, r10, r12
adc r11, r11, r12
umlal r10, r11, r5, r5
#else
umull r0, r1, r2, r2
umull r6, r7, r2, r3
adds r6, r6, r6
mov r12, #0
umaal r1, r6, r12, r12
mov r8, r12
umaal r8, r7, r2, r4
adcs r8, r8, r8
umaal r6, r8, r3, r3
umull r9, r10, r2, r5
umaal r7, r9, r3, r4
adcs r7, r7, r7
umaal r7, r8, r12, r12
umaal r10, r9, r3, r5
adcs r10, r10, r10
umaal r8, r10, r4, r4
mov r11, r12
umaal r9, r11, r4, r5
adcs r9, r9, r9
umaal r9, r10, r12, r12
adcs r11, r11, r11
umaal r10, r11, r5, r5
adc r11, r11, r12
#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */
# Reduce mod 2^130 - 5
bic r2, r8, #0x3
and r8, r8, #3
adds r0, r0, r2
lsr r2, r2, #2
adcs r1, r1, r9
orr r2, r2, r9, LSL #30
adcs r6, r6, r10
lsr r9, r9, #2
adcs r7, r7, r11
orr r9, r9, r10, LSL #30
adc r8, r8, r12
lsr r10, r10, #2
adds r0, r0, r2
orr r10, r10, r11, LSL #30
adcs r1, r1, r9
lsr r11, r11, #2
adcs r6, r6, r10
adcs r7, r7, r11
adc r8, r8, r12
lsr r3, r0, #26
lsr r4, r1, #20
lsr r5, r6, #14
lsr r10, r7, #8
eor r3, r3, r1, lsl #6
eor r4, r4, r6, lsl #12
eor r5, r5, r7, lsl #18
eor r10, r10, r8, lsl #24
and r2, r0, #0x3ffffff
and r3, r3, #0x3ffffff
and r4, r4, #0x3ffffff
and r5, r5, #0x3ffffff
vmov.i32 s0, r2
vmov.i32 s2, r3
vmov.i32 s4, r4
vmov.i32 s6, r5
vmov.i32 s8, r10
pop {r0, r1}
add lr, r0, #0x7c
vstm.32 lr, {d0-d4}
# Multiply r^2, r by r^2
vshl.u32 d6, d1, #2
vshl.u32 d7, d2, #2
vshl.u32 d8, d3, #2
vshl.u32 d9, d4, #2
vadd.u32 d6, d6, d1
vadd.u32 d7, d7, d2
vadd.u32 d8, d8, d3
vadd.u32 d9, d9, d4
vmull.u32 q5, d0, d0[0]
vmull.u32 q6, d0, d1[0]
vmull.u32 q7, d0, d2[0]
vmull.u32 q8, d0, d3[0]
vmull.u32 q9, d0, d4[0]
vmlal.u32 q5, d1, d9[0]
vmlal.u32 q6, d1, d0[0]
vmlal.u32 q7, d1, d1[0]
vmlal.u32 q8, d1, d2[0]
vmlal.u32 q9, d1, d3[0]
vmlal.u32 q5, d2, d8[0]
vmlal.u32 q6, d2, d9[0]
vmlal.u32 q7, d2, d0[0]
vmlal.u32 q8, d2, d1[0]
vmlal.u32 q9, d2, d2[0]
vmlal.u32 q5, d3, d7[0]
vmlal.u32 q6, d3, d8[0]
vmlal.u32 q7, d3, d9[0]
vmlal.u32 q8, d3, d0[0]
vmlal.u32 q9, d3, d1[0]
vmlal.u32 q5, d4, d6[0]
vmlal.u32 q6, d4, d7[0]
vmlal.u32 q7, d4, d8[0]
vmlal.u32 q8, d4, d9[0]
vmlal.u32 q9, d4, d0[0]
vsra.u64 q6, q5, #26
vand.u64 q5, q5, q10
vsra.u64 q7, q6, #26
vand.u64 q6, q6, q10
vsra.u64 q8, q7, #26
vand.u64 q7, q7, q10
vsra.u64 q9, q8, #26
vand.u64 q8, q8, q10
vshr.u64 q3, q9, #26
vand.u64 q9, q9, q10
vadd.u64 q5, q5, q3
vshl.u64 q3, q3, #2
vadd.u64 q5, q5, q3
vsra.u64 q6, q5, #26
vand.u64 q5, q5, q10
vmovn.i64 d10, q5
vmovn.i64 d11, q6
vmovn.i64 d12, q7
vmovn.i64 d13, q8
vmovn.i64 d14, q9
add lr, r0, #0xa4
vstm.32 lr, {d10-d14}
# h (accumulator) = 0
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
eor r9, r9, r9
add lr, r0, #16
eor r4, r4, r4
eor r5, r5, r5
stm lr, {r4, r5, r6, r7, r8, r9}
# Zero leftover
str r5, [r0, #56]
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_set_key,.-poly1305_set_key
.text
.align 4
.globl poly1305_final
.type poly1305_final, %function
poly1305_final:
push {r4, r5, r6, r7, r8, r9, lr}
add r9, r0, #16
ldm r9, {r4, r5, r6, r7, r8}
# Add 5 and check for h larger than p.
adds r2, r4, #5
adcs r2, r5, #0
adcs r2, r6, #0
adcs r2, r7, #0
adc r2, r8, #0
sub r2, r2, #4
lsr r2, r2, #31
sub r2, r2, #1
and r2, r2, #5
# Add 0/5 to h.
adds r4, r4, r2
adcs r5, r5, #0
adcs r6, r6, #0
adc r7, r7, #0
# Add padding
add r9, r0, #40
ldm r9, {r2, r3, r12, lr}
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r12
adc r7, r7, lr
# Store MAC
str r4, [r1]
str r5, [r1, #4]
str r6, [r1, #8]
str r7, [r1, #12]
# Zero out h.
eor r4, r4, r4
eor r5, r5, r5
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
add r9, r0, #16
stm r9, {r4, r5, r6, r7, r8}
# Zero out r.
add r9, r0, #0
stm r9, {r4, r5, r6, r7}
# Zero out padding.
add r9, r0, #40
stm r9, {r4, r5, r6, r7}
pop {r4, r5, r6, r7, r8, r9, pc}
.size poly1305_final,.-poly1305_final
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */
#endif /* WOLFSSL_ARMASM */
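
A note on the final reduction in poly1305_final above (my summary of the existing code, not part of the patch). With p = 2^130 - 5, s the cached pad, and h the partially reduced accumulator (h < 2p), the MAC is

    tag = ((h mod p) + s) mod 2^128

Because the result is taken mod 2^128 anyway, it suffices to add 5 to h exactly when h + 5 >= 2^130 and keep only the low 128 bits before adding s. The 0-or-5 selector is computed from the top limb of h + 5 (subtract 4, shift the sign bit down, subtract 1, mask with 5), so the selection is branch-free with respect to the secret accumulator.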


@@ -52,7 +52,8 @@
#ifdef HAVE_POLY1305
#include <wolfssl/wolfcrypt/poly1305.h>
void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
#ifdef WOLFSSL_ARMASM_NO_NEON
void poly1305_arm32_blocks_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
int notLast_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
@@ -383,6 +384,976 @@ void poly1305_final(Poly1305* ctx_p, byte* mac_p)
);
}
#else
void poly1305_arm32_blocks_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
int notLast_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const byte* m asm ("r1") = (const byte*)m_p;
register word32 len asm ("r2") = (word32)len_p;
register int notLast asm ("r3") = (int)notLast_p;
__asm__ __volatile__ (
"sub sp, sp, #28\n\t"
"cmp %[len], #0\n\t"
"beq L_poly1305_arm32_16_done_%=\n\t"
"add lr, sp, #12\n\t"
"stm lr, {r0, r1, r2, r3}\n\t"
/* Get h pointer */
"add lr, %[ctx], #16\n\t"
"ldm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_loop_%=: \n\t"
/* Add m to h */
"ldr %[m], [sp, #16]\n\t"
"ldr %[len], [%[m]]\n\t"
"ldr %[notLast], [%[m], #4]\n\t"
"ldr r9, [%[m], #8]\n\t"
"ldr r10, [%[m], #12]\n\t"
"ldr r11, [sp, #24]\n\t"
"adds r4, r4, %[len]\n\t"
"adcs r5, r5, %[notLast]\n\t"
"adcs r6, r6, r9\n\t"
"adcs r7, r7, r10\n\t"
"add %[m], %[m], #16\n\t"
"adc r8, r8, r11\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
"stm lr, {r4, r5, r6, r7, r8}\n\t"
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
"str r7, [lr, #12]\n\t"
"str r8, [lr, #16]\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
"str %[m], [sp, #16]\n\t"
"ldr %[m], [sp, #12]\n\t"
/* Multiply h by r */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
"ldr %[notLast], [%[m]]\n\t"
"eor %[ctx], %[ctx], %[ctx]\n\t"
/* r[0] * h[0] */
/* h[0] in r4 */
"umull r4, r5, %[notLast], r4\n\t"
/* r[0] * h[2] */
/* h[2] in r6 */
"umull r6, r7, %[notLast], r6\n\t"
/* r[0] * h[4] */
/* h[4] in r8 */
"mul r8, %[notLast], r8\n\t"
/* r[0] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[0] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r6, r6, r12\n\t"
"adc r7, r7, %[ctx]\n\t"
"umlal r7, r8, %[notLast], %[len]\n\t"
/* r[1] * h[0] */
"ldr %[notLast], [%[m], #4]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[1] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r6, r6, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[1] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[1] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r8, r8, r12\n\t"
"adc r9, %[ctx], %[ctx]\n\t"
"umlal r8, r9, %[notLast], %[len]\n\t"
/* r[1] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r9, %[notLast], %[len], r9\n\t"
/* r[2] * h[0] */
"ldr %[notLast], [%[m], #8]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[2] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[2] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[2] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, %[ctx], %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[2] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r10, %[notLast], %[len], r10\n\t"
/* r[3] * h[0] */
"ldr %[notLast], [%[m], #12]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[3] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[3] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, r10, %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[3] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"mov r11, %[ctx]\n\t"
"umlal r10, r11, %[notLast], %[len]\n\t"
/* r[3] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mov r12, %[ctx]\n\t"
"mla r11, %[notLast], %[len], r11\n\t"
#else
"ldm %[m], {r0, r1, r2, r3}\n\t"
/* r[0] * h[0] */
"umull r10, r11, %[ctx], r4\n\t"
/* r[1] * h[0] */
"umull r12, r7, %[m], r4\n\t"
/* r[0] * h[1] */
"umaal r11, r12, %[ctx], r5\n\t"
/* r[2] * h[0] */
"umull r8, r9, %[len], r4\n\t"
/* r[1] * h[1] */
"umaal r12, r8, %[m], r5\n\t"
/* r[0] * h[2] */
"umaal r12, r7, %[ctx], r6\n\t"
/* r[3] * h[0] */
"umaal r8, r9, %[notLast], r4\n\t"
"stm sp, {r10, r11, r12}\n\t"
/* r[2] * h[1] */
"umaal r7, r8, %[len], r5\n\t"
/* Replace h[0] with h[3] */
"ldr r4, [lr, #12]\n\t"
/* r[1] * h[2] */
"umull r10, r11, %[m], r6\n\t"
/* r[2] * h[2] */
"umaal r8, r9, %[len], r6\n\t"
/* r[0] * h[3] */
"umaal r7, r10, %[ctx], r4\n\t"
/* r[3] * h[1] */
"umaal r8, r11, %[notLast], r5\n\t"
/* r[1] * h[3] */
"umaal r8, r10, %[m], r4\n\t"
/* r[3] * h[2] */
"umaal r9, r11, %[notLast], r6\n\t"
/* r[2] * h[3] */
"umaal r9, r10, %[len], r4\n\t"
/* Replace h[1] with h[4] */
"ldr r5, [lr, #16]\n\t"
/* r[3] * h[3] */
"umaal r10, r11, %[notLast], r4\n\t"
"mov r12, #0\n\t"
/* r[0] * h[4] */
"umaal r8, r12, %[ctx], r5\n\t"
/* r[1] * h[4] */
"umaal r9, r12, %[m], r5\n\t"
/* r[2] * h[4] */
"umaal r10, r12, %[len], r5\n\t"
/* r[3] * h[4] */
"umaal r11, r12, %[notLast], r5\n\t"
/* DONE */
"ldm sp, {r4, r5, r6}\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
/* r12 will be zero because r is masked. */
/* Load length */
"ldr %[len], [sp, #20]\n\t"
/* Reduce mod 2^130 - 5 */
"bic %[notLast], r8, #0x3\n\t"
"and r8, r8, #3\n\t"
"adds r4, r4, %[notLast]\n\t"
"lsr %[notLast], %[notLast], #2\n\t"
"adcs r5, r5, r9\n\t"
"orr %[notLast], %[notLast], r9, LSL #30\n\t"
"adcs r6, r6, r10\n\t"
"lsr r9, r9, #2\n\t"
"adcs r7, r7, r11\n\t"
"orr r9, r9, r10, LSL #30\n\t"
"adc r8, r8, r12\n\t"
"lsr r10, r10, #2\n\t"
"adds r4, r4, %[notLast]\n\t"
"orr r10, r10, r11, LSL #30\n\t"
"adcs r5, r5, r9\n\t"
"lsr r11, r11, #2\n\t"
"adcs r6, r6, r10\n\t"
"adcs r7, r7, r11\n\t"
"adc r8, r8, r12\n\t"
/* Sub 16 from length. */
"subs %[len], %[len], #16\n\t"
/* Store length. */
"str %[len], [sp, #20]\n\t"
/* Loop again if more message to do. */
"bgt L_poly1305_arm32_16_loop_%=\n\t"
"stm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_done_%=: \n\t"
"add sp, sp, #28\n\t"
: [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len),
[notLast] "+r" (notLast)
:
: "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11"
);
}
void poly1305_arm32_blocks(Poly1305* ctx_p, const unsigned char* m_p,
size_t bytes_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const unsigned char* m asm ("r1") = (const unsigned char*)m_p;
register size_t bytes asm ("r2") = (size_t)bytes_p;
__asm__ __volatile__ (
"cmp %[bytes], #16\n\t"
"add r12, %[ctx], #16\n\t"
"bgt L_poly1305_arm32_blocks_begin_neon_%=\n\t"
"ldm r12, {r7, r8, r9, r10, r11}\n\t"
"b L_poly1305_arm32_blocks_start_1_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_begin_neon_%=: \n\t"
"vmov.i16 q15, #0xffff\n\t"
"vshr.u64 q15, q15, #38\n\t"
"vld1.64 {d0-d2}, [r12]\n\t"
"vshl.u64 d4, d2, #24\n\t"
"vsri.u64 d4, d1, #40\n\t"
"vshr.u64 d3, d1, #14\n\t"
"vshl.u64 d2, d1, #12\n\t"
"vsri.u64 d1, d0, #26\n\t"
"vsri.u64 d2, d0, #52\n\t"
"vand.u64 d0, d0, d31\n\t"
"vand.u64 d3, d3, d31\n\t"
"vand.u64 d2, d2, d31\n\t"
"vand.u64 d1, d1, d31\n\t"
"add r3, %[ctx], #0x7c\n\t"
"vldm.32 r3, {d20-d24}\n\t"
"cmp %[bytes], #0x40\n\t"
"bge L_poly1305_arm32_blocks_begin_4_%=\n\t"
"vshl.u32 d6, d21, #2\n\t"
"vshl.u32 d7, d22, #2\n\t"
"vshl.u32 d8, d23, #2\n\t"
"vshl.u32 d9, d24, #2\n\t"
"vadd.u32 d6, d6, d21\n\t"
"vadd.u32 d7, d7, d22\n\t"
"vadd.u32 d8, d8, d23\n\t"
"vadd.u32 d9, d9, d24\n\t"
"b L_poly1305_arm32_blocks_start_2_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_begin_4_%=: \n\t"
"add r3, %[ctx], #0xa4\n\t"
"vldm.32 r3, {d26-d30}\n\t"
"\n"
"L_poly1305_arm32_blocks_start_4_%=: \n\t"
"sub %[bytes], #0x40\n\t"
"vld4.32 {d10-d13}, [%[m]]!\n\t"
"vshl.u32 d6, d27, #2\n\t"
"vshl.u32 d7, d28, #2\n\t"
"vshl.u32 d8, d29, #2\n\t"
"vshl.u32 d9, d30, #2\n\t"
"vadd.u32 d6, d6, d27\n\t"
"vadd.u32 d7, d7, d28\n\t"
"vadd.u32 d8, d8, d29\n\t"
"vadd.u32 d9, d9, d30\n\t"
"vshr.u32 d14, d13, #8\n\t"
"vshl.u32 d13, d13, #18\n\t"
"vorr.i32 d14, d14, #0x1000000\n\t"
"vsri.u32 d13, d12, #14\n\t"
"vshl.u32 d12, d12, #12\n\t"
"vand.i32 d13, d13, #0x3ffffff\n\t"
"vsri.u32 d12, d11, #20\n\t"
"vshl.u32 d11, d11, #6\n\t"
"vand.i32 d12, d12, #0x3ffffff\n\t"
"vsri.u32 d11, d10, #26\n\t"
"vand.i32 d10, d10, #0x3ffffff\n\t"
"vand.i32 d11, d11, #0x3ffffff\n\t"
"vadd.u32 d4, d4, d14\n\t"
"vadd.u32 q1, q1, q6\n\t"
"vadd.u32 q0, q0, q5\n\t"
"vmull.u32 q5, d0, d26\n\t"
"vmull.u32 q6, d0, d27\n\t"
"vmull.u32 q7, d0, d28\n\t"
"vmull.u32 q8, d0, d29\n\t"
"vmull.u32 q9, d0, d30\n\t"
"vmlal.u32 q5, d1, d9\n\t"
"vmlal.u32 q6, d1, d26\n\t"
"vmlal.u32 q7, d1, d27\n\t"
"vmlal.u32 q8, d1, d28\n\t"
"vmlal.u32 q9, d1, d29\n\t"
"vmlal.u32 q5, d2, d8\n\t"
"vmlal.u32 q6, d2, d9\n\t"
"vmlal.u32 q7, d2, d26\n\t"
"vmlal.u32 q8, d2, d27\n\t"
"vmlal.u32 q9, d2, d28\n\t"
"vmlal.u32 q5, d3, d7\n\t"
"vmlal.u32 q6, d3, d8\n\t"
"vmlal.u32 q7, d3, d9\n\t"
"vmlal.u32 q8, d3, d26\n\t"
"vmlal.u32 q9, d3, d27\n\t"
"vmlal.u32 q5, d4, d6\n\t"
"vmlal.u32 q6, d4, d7\n\t"
"vmlal.u32 q7, d4, d8\n\t"
"vmlal.u32 q8, d4, d9\n\t"
"vmlal.u32 q9, d4, d26\n\t"
"vld4.32 {d0-d3}, [%[m]]!\n\t"
"vshl.u32 d6, d21, #2\n\t"
"vshl.u32 d7, d22, #2\n\t"
"vshl.u32 d8, d23, #2\n\t"
"vshl.u32 d9, d24, #2\n\t"
"vadd.u32 d6, d6, d21\n\t"
"vadd.u32 d7, d7, d22\n\t"
"vadd.u32 d8, d8, d23\n\t"
"vadd.u32 d9, d9, d24\n\t"
"vshr.u32 d4, d3, #8\n\t"
"vshl.u32 d3, d3, #18\n\t"
"vorr.i32 d4, d4, #0x1000000\n\t"
"vsri.u32 d3, d2, #14\n\t"
"vshl.u32 d2, d2, #12\n\t"
"vand.i32 d3, d3, #0x3ffffff\n\t"
"vsri.u32 d2, d1, #20\n\t"
"vshl.u32 d1, d1, #6\n\t"
"vand.i32 d2, d2, #0x3ffffff\n\t"
"vsri.u32 d1, d0, #26\n\t"
"vand.i32 d0, d0, #0x3ffffff\n\t"
"vand.i32 d1, d1, #0x3ffffff\n\t"
"vmlal.u32 q5, d0, d20\n\t"
"vmlal.u32 q6, d0, d21\n\t"
"vmlal.u32 q7, d0, d22\n\t"
"vmlal.u32 q8, d0, d23\n\t"
"vmlal.u32 q9, d0, d24\n\t"
"vmlal.u32 q5, d1, d9\n\t"
"vmlal.u32 q6, d1, d20\n\t"
"vmlal.u32 q7, d1, d21\n\t"
"vmlal.u32 q8, d1, d22\n\t"
"vmlal.u32 q9, d1, d23\n\t"
"vmlal.u32 q5, d2, d8\n\t"
"vmlal.u32 q6, d2, d9\n\t"
"vmlal.u32 q7, d2, d20\n\t"
"vmlal.u32 q8, d2, d21\n\t"
"vmlal.u32 q9, d2, d22\n\t"
"vmlal.u32 q5, d3, d7\n\t"
"vmlal.u32 q6, d3, d8\n\t"
"vmlal.u32 q7, d3, d9\n\t"
"vmlal.u32 q8, d3, d20\n\t"
"vmlal.u32 q9, d3, d21\n\t"
"vmlal.u32 q5, d4, d6\n\t"
"vmlal.u32 q6, d4, d7\n\t"
"vmlal.u32 q7, d4, d8\n\t"
"vmlal.u32 q8, d4, d9\n\t"
"vmlal.u32 q9, d4, d20\n\t"
"vadd.u64 d0, d10, d11\n\t"
"vadd.u64 d1, d12, d13\n\t"
"vadd.u64 d2, d14, d15\n\t"
"vadd.u64 d3, d16, d17\n\t"
"vadd.u64 d4, d18, d19\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"vsra.u64 d2, d1, #26\n\t"
"vand.u64 d1, d1, d31\n\t"
"vsra.u64 d3, d2, #26\n\t"
"vand.u64 d2, d2, d31\n\t"
"vsra.u64 d4, d3, #26\n\t"
"vand.u64 d3, d3, d31\n\t"
"vshr.u64 d15, d4, #26\n\t"
"vand.u64 d4, d4, d31\n\t"
"vadd.u64 d0, d0, d15\n\t"
"vshl.u64 d15, d15, #2\n\t"
"vadd.u64 d0, d0, d15\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"cmp %[bytes], #0x40\n\t"
"bge L_poly1305_arm32_blocks_start_4_%=\n\t"
"cmp %[bytes], #32\n\t"
"blt L_poly1305_arm32_blocks_done_neon_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_start_2_%=: \n\t"
"sub %[bytes], #32\n\t"
"vld4.32 {d10-d13}, [%[m]]!\n\t"
"vshr.u32 d14, d13, #8\n\t"
"vshl.u32 d13, d13, #18\n\t"
"vorr.i32 d14, d14, #0x1000000\n\t"
"vsri.u32 d13, d12, #14\n\t"
"vshl.u32 d12, d12, #12\n\t"
"vand.i32 d13, d13, #0x3ffffff\n\t"
"vsri.u32 d12, d11, #20\n\t"
"vshl.u32 d11, d11, #6\n\t"
"vand.i32 d12, d12, #0x3ffffff\n\t"
"vsri.u32 d11, d10, #26\n\t"
"vand.i32 d10, d10, #0x3ffffff\n\t"
"vand.i32 d11, d11, #0x3ffffff\n\t"
"vadd.u32 d4, d4, d14\n\t"
"vadd.u32 q1, q1, q6\n\t"
"vadd.u32 q0, q0, q5\n\t"
"vmull.u32 q5, d0, d20\n\t"
"vmull.u32 q6, d0, d21\n\t"
"vmull.u32 q7, d0, d22\n\t"
"vmull.u32 q8, d0, d23\n\t"
"vmull.u32 q9, d0, d24\n\t"
"vmlal.u32 q5, d1, d9\n\t"
"vmlal.u32 q6, d1, d20\n\t"
"vmlal.u32 q7, d1, d21\n\t"
"vmlal.u32 q8, d1, d22\n\t"
"vmlal.u32 q9, d1, d23\n\t"
"vmlal.u32 q5, d2, d8\n\t"
"vmlal.u32 q6, d2, d9\n\t"
"vmlal.u32 q7, d2, d20\n\t"
"vmlal.u32 q8, d2, d21\n\t"
"vmlal.u32 q9, d2, d22\n\t"
"vmlal.u32 q5, d3, d7\n\t"
"vmlal.u32 q6, d3, d8\n\t"
"vmlal.u32 q7, d3, d9\n\t"
"vmlal.u32 q8, d3, d20\n\t"
"vmlal.u32 q9, d3, d21\n\t"
"vmlal.u32 q5, d4, d6\n\t"
"vmlal.u32 q6, d4, d7\n\t"
"vmlal.u32 q7, d4, d8\n\t"
"vmlal.u32 q8, d4, d9\n\t"
"vmlal.u32 q9, d4, d20\n\t"
"vadd.u64 d0, d10, d11\n\t"
"vadd.u64 d1, d12, d13\n\t"
"vadd.u64 d2, d14, d15\n\t"
"vadd.u64 d3, d16, d17\n\t"
"vadd.u64 d4, d18, d19\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"vsra.u64 d2, d1, #26\n\t"
"vand.u64 d1, d1, d31\n\t"
"vsra.u64 d3, d2, #26\n\t"
"vand.u64 d2, d2, d31\n\t"
"vsra.u64 d4, d3, #26\n\t"
"vand.u64 d3, d3, d31\n\t"
"vshr.u64 d5, d4, #26\n\t"
"vand.u64 d4, d4, d31\n\t"
"vadd.u64 d0, d0, d5\n\t"
"vshl.u64 d5, d5, #2\n\t"
"vadd.u64 d0, d0, d5\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"\n"
"L_poly1305_arm32_blocks_done_neon_%=: \n\t"
"cmp %[bytes], #16\n\t"
"beq L_poly1305_arm32_blocks_begin_1_%=\n\t"
"add r12, %[ctx], #16\n\t"
"vsli.u64 d0, d1, #26\n\t"
"vsli.u64 d0, d2, #52\n\t"
"vshr.u64 d1, d2, #12\n\t"
"vsli.u64 d1, d3, #14\n\t"
"vsli.u64 d1, d4, #40\n\t"
"vshr.u64 d2, d4, #24\n\t"
"vst1.64 {d0-d2}, [r12]\n\t"
"b L_poly1305_arm32_blocks_done_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_begin_1_%=: \n\t"
"vsli.u64 d0, d1, #26\n\t"
"vsli.u64 d0, d2, #52\n\t"
"vshr.u64 d1, d2, #12\n\t"
"vsli.u64 d1, d3, #14\n\t"
"vsli.u64 d1, d4, #40\n\t"
"vshr.u64 d2, d4, #24\n\t"
"vmov r7, r8, d0\n\t"
"vmov r9, r10, d1\n\t"
"vmov r11, d2[0]\n\t"
"\n"
"L_poly1305_arm32_blocks_start_1_%=: \n\t"
"mov r12, #1\n\t"
"push {r2}\n\t"
/* Load message */
"ldm %[m], {r2, r3, r4, r5}\n\t"
/* Add message */
"adds r7, r7, %[bytes]\n\t"
"adcs r8, r8, r3\n\t"
"adcs r9, r9, r4\n\t"
"adcs r10, r10, r5\n\t"
"adc r11, r11, r12\n\t"
"push {r0-r1}\n\t"
"add %[m], %[ctx], #0\n\t"
"add lr, %[ctx], #16\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
"stm lr, {r7, r8, r9, r10, r11}\n\t"
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
"str r10, [lr, #12]\n\t"
"str r11, [lr, #16]\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
"ldr r3, [%[m]]\n\t"
"eor %[ctx], %[ctx], %[ctx]\n\t"
/* r[0] * h[0] */
/* h[0] in r4 */
"umull r7, r8, r3, r7\n\t"
/* r[0] * h[2] */
/* h[2] in r6 */
"umull r9, r10, r3, r9\n\t"
/* r[0] * h[4] */
/* h[4] in r8 */
"mul r11, r3, r11\n\t"
/* r[0] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r8, r12, r3, %[bytes]\n\t"
/* r[0] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, r10, %[ctx]\n\t"
"umlal r10, r11, r3, %[bytes]\n\t"
/* r[1] * h[0] */
"ldr r3, [%[m], #4]\n\t"
"ldr %[bytes], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r8, r12, r3, %[bytes]\n\t"
/* r[1] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"adds r9, r9, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r9, r12, r3, %[bytes]\n\t"
/* r[1] * h[2] */
"ldr %[bytes], [lr, #8]\n\t"
"adds r10, r10, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r10, r12, r3, %[bytes]\n\t"
/* r[1] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"adds r11, r11, r12\n\t"
"adc r4, %[ctx], %[ctx]\n\t"
"umlal r11, r4, r3, %[bytes]\n\t"
/* r[1] * h[4] */
"ldr %[bytes], [lr, #16]\n\t"
"mla r4, r3, %[bytes], r4\n\t"
/* r[2] * h[0] */
"ldr r3, [%[m], #8]\n\t"
"ldr %[bytes], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r9, r12, r3, %[bytes]\n\t"
/* r[2] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"adds r10, r10, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r10, r12, r3, %[bytes]\n\t"
/* r[2] * h[2] */
"ldr %[bytes], [lr, #8]\n\t"
"adds r11, r11, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r11, r12, r3, %[bytes]\n\t"
/* r[2] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"adds r4, r4, r12\n\t"
"adc r5, %[ctx], %[ctx]\n\t"
"umlal r4, r5, r3, %[bytes]\n\t"
/* r[2] * h[4] */
"ldr %[bytes], [lr, #16]\n\t"
"mla r5, r3, %[bytes], r5\n\t"
/* r[3] * h[0] */
"ldr r3, [%[m], #12]\n\t"
"ldr %[bytes], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r10, r12, r3, %[bytes]\n\t"
/* r[3] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"adds r11, r11, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r11, r12, r3, %[bytes]\n\t"
/* r[3] * h[2] */
"ldr %[bytes], [lr, #8]\n\t"
"adds r4, r4, r12\n\t"
"adc r5, r5, %[ctx]\n\t"
"umlal r4, r5, r3, %[bytes]\n\t"
/* r[3] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"mov r6, %[ctx]\n\t"
"umlal r5, r6, r3, %[bytes]\n\t"
/* r[3] * h[4] */
"ldr %[bytes], [lr, #16]\n\t"
"mov r12, %[ctx]\n\t"
"mla r6, r3, %[bytes], r6\n\t"
#else
"sub sp, sp, #12\n\t"
"ldm %[m], {r0, r1, r2, r3}\n\t"
/* r[0] * h[0] */
"umull r5, r6, %[ctx], r7\n\t"
/* r[1] * h[0] */
"umull r12, r10, %[m], r7\n\t"
/* r[0] * h[1] */
"umaal r6, r12, %[ctx], r8\n\t"
/* r[2] * h[0] */
"umull r11, r4, %[bytes], r7\n\t"
/* r[1] * h[1] */
"umaal r12, r11, %[m], r8\n\t"
/* r[0] * h[2] */
"umaal r12, r10, %[ctx], r9\n\t"
/* r[3] * h[0] */
"umaal r11, r4, r3, r7\n\t"
"stm sp, {r5, r6, r12}\n\t"
/* r[2] * h[1] */
"umaal r10, r11, %[bytes], r8\n\t"
/* Replace h[0] with h[3] */
"ldr r7, [lr, #12]\n\t"
/* r[1] * h[2] */
"umull r5, r6, %[m], r9\n\t"
/* r[2] * h[2] */
"umaal r11, r4, %[bytes], r9\n\t"
/* r[0] * h[3] */
"umaal r10, r5, %[ctx], r7\n\t"
/* r[3] * h[1] */
"umaal r11, r6, r3, r8\n\t"
/* r[1] * h[3] */
"umaal r11, r5, %[m], r7\n\t"
/* r[3] * h[2] */
"umaal r4, r6, r3, r9\n\t"
/* r[2] * h[3] */
"umaal r4, r5, %[bytes], r7\n\t"
/* Replace h[1] with h[4] */
"ldr r8, [lr, #16]\n\t"
/* r[3] * h[3] */
"umaal r5, r6, r3, r7\n\t"
"mov r12, #0\n\t"
/* r[0] * h[4] */
"umaal r11, r12, %[ctx], r8\n\t"
/* r[1] * h[4] */
"umaal r4, r12, %[m], r8\n\t"
/* r[2] * h[4] */
"umaal r5, r12, %[bytes], r8\n\t"
/* r[3] * h[4] */
"umaal r6, r12, r3, r8\n\t"
/* DONE */
"ldm sp, {r7, r8, r9}\n\t"
"add sp, sp, #12\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
/* Reduce mod 2^130 - 5 */
"bic r3, r11, #0x3\n\t"
"and r11, r11, #3\n\t"
"adds r7, r7, r3\n\t"
"lsr r3, r3, #2\n\t"
"adcs r8, r8, r4\n\t"
"orr r3, r3, r4, LSL #30\n\t"
"adcs r9, r9, r5\n\t"
"lsr r4, r4, #2\n\t"
"adcs r10, r10, r6\n\t"
"orr r4, r4, r5, LSL #30\n\t"
"adc r11, r11, r12\n\t"
"lsr r5, r5, #2\n\t"
"adds r7, r7, r3\n\t"
"orr r5, r5, r6, LSL #30\n\t"
"adcs r8, r8, r4\n\t"
"lsr r6, r6, #2\n\t"
"adcs r9, r9, r5\n\t"
"adcs r10, r10, r6\n\t"
"adc r11, r11, r12\n\t"
"pop {r0-r1}\n\t"
"pop {r2}\n\t"
"add r12, %[ctx], #16\n\t"
"stm r12, {r7, r8, r9, r10, r11}\n\t"
"\n"
"L_poly1305_arm32_blocks_done_%=: \n\t"
: [ctx] "+r" (ctx), [m] "+r" (m), [bytes] "+r" (bytes)
:
: "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
"d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
"d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31"
);
}
static const word32 L_poly1305_arm32_clamp[] = {
0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
};
void poly1305_set_key(Poly1305* ctx_p, const byte* key_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const byte* key asm ("r1") = (const byte*)key_p;
register word32* L_poly1305_arm32_clamp_c asm ("r2") =
(word32*)&L_poly1305_arm32_clamp;
__asm__ __volatile__ (
/* Load mask. */
"mov lr, %[L_poly1305_arm32_clamp]\n\t"
"ldm lr, {r6, r7, r8, r9}\n\t"
/* Load and cache padding. */
"ldr r2, [%[key], #16]\n\t"
"ldr r3, [%[key], #20]\n\t"
"ldr r4, [%[key], #24]\n\t"
"ldr r5, [%[key], #28]\n\t"
"add lr, %[ctx], #40\n\t"
"stm lr, {r2, r3, r4, r5}\n\t"
/* Load, mask and store r. */
"ldr r2, [%[key]]\n\t"
"ldr r3, [%[key], #4]\n\t"
"ldr r4, [%[key], #8]\n\t"
"ldr r5, [%[key], #12]\n\t"
"and r2, r2, r6\n\t"
"and r3, r3, r7\n\t"
"and r4, r4, r8\n\t"
"and r5, r5, r9\n\t"
"add lr, %[ctx], #0\n\t"
"stm lr, {r2, r3, r4, r5}\n\t"
"vmov.i16 q10, #0xffff\n\t"
"vshr.u64 q10, q10, #38\n\t"
"lsr r8, r2, #26\n\t"
"lsr r9, r3, #20\n\t"
"lsr r10, r4, #14\n\t"
"lsr r11, r5, #8\n\t"
"eor r8, r8, r3, lsl #6\n\t"
"eor r9, r9, r4, lsl #12\n\t"
"eor r10, r10, r5, lsl #18\n\t"
"and r7, r2, #0x3ffffff\n\t"
"and r8, r8, #0x3ffffff\n\t"
"and r9, r9, #0x3ffffff\n\t"
"and r10, r10, #0x3ffffff\n\t"
"vmov.i32 s1, r7\n\t"
"vmov.i32 s3, r8\n\t"
"vmov.i32 s5, r9\n\t"
"vmov.i32 s7, r10\n\t"
"vmov.i32 s9, r11\n\t"
"push {%[ctx]-%[key]}\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
/* Square r */
"umull %[key], r6, r2, r3\n\t"
"mov r12, #0\n\t"
"umull r7, r8, r2, r5\n\t"
"mov lr, r12\n\t"
"umlal r6, lr, r2, r4\n\t"
"adds r7, r7, lr\n\t"
"adc lr, r12, r12\n\t"
"umlal r7, lr, r3, r4\n\t"
"mov r9, r12\n\t"
"umlal lr, r9, r3, r5\n\t"
"adds r8, r8, lr\n\t"
"adcs r9, r9, r12\n\t"
"adc r10, r12, r12\n\t"
"umlal r9, r10, r4, r5\n\t"
"adds %[key], %[key], %[key]\n\t"
"adcs r6, r6, r6\n\t"
"adcs r7, r7, r7\n\t"
"adcs r8, r8, r8\n\t"
"adcs r9, r9, r9\n\t"
"adcs r10, r10, r10\n\t"
"adc r11, r12, r12\n\t"
"umull %[ctx], lr, r2, r2\n\t"
"adds %[key], %[key], lr\n\t"
"adcs r6, r6, r12\n\t"
"adc lr, r12, r12\n\t"
"umlal r6, lr, r3, r3\n\t"
"adds r7, r7, lr\n\t"
"adcs r8, r8, r12\n\t"
"adc lr, r12, r12\n\t"
"umlal r8, lr, r4, r4\n\t"
"adds r9, r9, lr\n\t"
"adcs r10, r10, r12\n\t"
"adc r11, r11, r12\n\t"
"umlal r10, r11, r5, r5\n\t"
#else
"umull %[ctx], %[key], r2, r2\n\t"
"umull r6, r7, r2, r3\n\t"
"adds r6, r6, r6\n\t"
"mov r12, #0\n\t"
"umaal %[key], r6, r12, r12\n\t"
"mov r8, r12\n\t"
"umaal r8, r7, r2, r4\n\t"
"adcs r8, r8, r8\n\t"
"umaal r6, r8, r3, r3\n\t"
"umull r9, r10, r2, r5\n\t"
"umaal r7, r9, r3, r4\n\t"
"adcs r7, r7, r7\n\t"
"umaal r7, r8, r12, r12\n\t"
"umaal r10, r9, r3, r5\n\t"
"adcs r10, r10, r10\n\t"
"umaal r8, r10, r4, r4\n\t"
"mov r11, r12\n\t"
"umaal r9, r11, r4, r5\n\t"
"adcs r9, r9, r9\n\t"
"umaal r9, r10, r12, r12\n\t"
"adcs r11, r11, r11\n\t"
"umaal r10, r11, r5, r5\n\t"
"adc r11, r11, r12\n\t"
#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */
/* Reduce mod 2^130 - 5 */
"bic r2, r8, #0x3\n\t"
"and r8, r8, #3\n\t"
"adds %[ctx], %[ctx], r2\n\t"
"lsr r2, r2, #2\n\t"
"adcs %[key], %[key], r9\n\t"
"orr r2, r2, r9, LSL #30\n\t"
"adcs r6, r6, r10\n\t"
"lsr r9, r9, #2\n\t"
"adcs r7, r7, r11\n\t"
"orr r9, r9, r10, LSL #30\n\t"
"adc r8, r8, r12\n\t"
"lsr r10, r10, #2\n\t"
"adds %[ctx], %[ctx], r2\n\t"
"orr r10, r10, r11, LSL #30\n\t"
"adcs %[key], %[key], r9\n\t"
"lsr r11, r11, #2\n\t"
"adcs r6, r6, r10\n\t"
"adcs r7, r7, r11\n\t"
"adc r8, r8, r12\n\t"
"lsr r3, %[ctx], #26\n\t"
"lsr r4, %[key], #20\n\t"
"lsr r5, r6, #14\n\t"
"lsr r10, r7, #8\n\t"
"eor r3, r3, %[key], lsl #6\n\t"
"eor r4, r4, r6, lsl #12\n\t"
"eor r5, r5, r7, lsl #18\n\t"
"eor r10, r10, r8, lsl #24\n\t"
"and r2, %[ctx], #0x3ffffff\n\t"
"and r3, r3, #0x3ffffff\n\t"
"and r4, r4, #0x3ffffff\n\t"
"and r5, r5, #0x3ffffff\n\t"
"vmov.i32 s0, r2\n\t"
"vmov.i32 s2, r3\n\t"
"vmov.i32 s4, r4\n\t"
"vmov.i32 s6, r5\n\t"
"vmov.i32 s8, r10\n\t"
"pop {%[ctx]-%[key]}\n\t"
"add lr, %[ctx], #0x7c\n\t"
"vstm.32 lr, {d0-d4}\n\t"
/* Multiply r^2, r by r^2 */
"vshl.u32 d6, d1, #2\n\t"
"vshl.u32 d7, d2, #2\n\t"
"vshl.u32 d8, d3, #2\n\t"
"vshl.u32 d9, d4, #2\n\t"
"vadd.u32 d6, d6, d1\n\t"
"vadd.u32 d7, d7, d2\n\t"
"vadd.u32 d8, d8, d3\n\t"
"vadd.u32 d9, d9, d4\n\t"
"vmull.u32 q5, d0, d0[0]\n\t"
"vmull.u32 q6, d0, d1[0]\n\t"
"vmull.u32 q7, d0, d2[0]\n\t"
"vmull.u32 q8, d0, d3[0]\n\t"
"vmull.u32 q9, d0, d4[0]\n\t"
"vmlal.u32 q5, d1, d9[0]\n\t"
"vmlal.u32 q6, d1, d0[0]\n\t"
"vmlal.u32 q7, d1, d1[0]\n\t"
"vmlal.u32 q8, d1, d2[0]\n\t"
"vmlal.u32 q9, d1, d3[0]\n\t"
"vmlal.u32 q5, d2, d8[0]\n\t"
"vmlal.u32 q6, d2, d9[0]\n\t"
"vmlal.u32 q7, d2, d0[0]\n\t"
"vmlal.u32 q8, d2, d1[0]\n\t"
"vmlal.u32 q9, d2, d2[0]\n\t"
"vmlal.u32 q5, d3, d7[0]\n\t"
"vmlal.u32 q6, d3, d8[0]\n\t"
"vmlal.u32 q7, d3, d9[0]\n\t"
"vmlal.u32 q8, d3, d0[0]\n\t"
"vmlal.u32 q9, d3, d1[0]\n\t"
"vmlal.u32 q5, d4, d6[0]\n\t"
"vmlal.u32 q6, d4, d7[0]\n\t"
"vmlal.u32 q7, d4, d8[0]\n\t"
"vmlal.u32 q8, d4, d9[0]\n\t"
"vmlal.u32 q9, d4, d0[0]\n\t"
"vsra.u64 q6, q5, #26\n\t"
"vand.u64 q5, q5, q10\n\t"
"vsra.u64 q7, q6, #26\n\t"
"vand.u64 q6, q6, q10\n\t"
"vsra.u64 q8, q7, #26\n\t"
"vand.u64 q7, q7, q10\n\t"
"vsra.u64 q9, q8, #26\n\t"
"vand.u64 q8, q8, q10\n\t"
"vshr.u64 q3, q9, #26\n\t"
"vand.u64 q9, q9, q10\n\t"
"vadd.u64 q5, q5, q3\n\t"
"vshl.u64 q3, q3, #2\n\t"
"vadd.u64 q5, q5, q3\n\t"
"vsra.u64 q6, q5, #26\n\t"
"vand.u64 q5, q5, q10\n\t"
"vmovn.i64 d10, q5\n\t"
"vmovn.i64 d11, q6\n\t"
"vmovn.i64 d12, q7\n\t"
"vmovn.i64 d13, q8\n\t"
"vmovn.i64 d14, q9\n\t"
"add lr, %[ctx], #0xa4\n\t"
"vstm.32 lr, {d10-d14}\n\t"
/* h (accumulator) = 0 */
"eor r6, r6, r6\n\t"
"eor r7, r7, r7\n\t"
"eor r8, r8, r8\n\t"
"eor r9, r9, r9\n\t"
"add lr, %[ctx], #16\n\t"
"eor r4, r4, r4\n\t"
"eor r5, r5, r5\n\t"
"stm lr, {r4, r5, r6, r7, r8, r9}\n\t"
/* Zero leftover */
"str r5, [%[ctx], #56]\n\t"
: [ctx] "+r" (ctx), [key] "+r" (key),
[L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c)
:
: "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
"d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
"d19", "d20", "d21"
);
}
void poly1305_final(Poly1305* ctx_p, byte* mac_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register byte* mac asm ("r1") = (byte*)mac_p;
__asm__ __volatile__ (
"add r9, %[ctx], #16\n\t"
"ldm r9, {r4, r5, r6, r7, r8}\n\t"
/* Add 5 and check for h larger than p. */
"adds r2, r4, #5\n\t"
"adcs r2, r5, #0\n\t"
"adcs r2, r6, #0\n\t"
"adcs r2, r7, #0\n\t"
"adc r2, r8, #0\n\t"
"sub r2, r2, #4\n\t"
"lsr r2, r2, #31\n\t"
"sub r2, r2, #1\n\t"
"and r2, r2, #5\n\t"
/* Add 0/5 to h. */
"adds r4, r4, r2\n\t"
"adcs r5, r5, #0\n\t"
"adcs r6, r6, #0\n\t"
"adc r7, r7, #0\n\t"
/* Add padding */
"add r9, %[ctx], #40\n\t"
"ldm r9, {r2, r3, r12, lr}\n\t"
"adds r4, r4, r2\n\t"
"adcs r5, r5, r3\n\t"
"adcs r6, r6, r12\n\t"
"adc r7, r7, lr\n\t"
/* Store MAC */
"str r4, [%[mac]]\n\t"
"str r5, [%[mac], #4]\n\t"
"str r6, [%[mac], #8]\n\t"
"str r7, [%[mac], #12]\n\t"
/* Zero out h. */
"eor r4, r4, r4\n\t"
"eor r5, r5, r5\n\t"
"eor r6, r6, r6\n\t"
"eor r7, r7, r7\n\t"
"eor r8, r8, r8\n\t"
"add r9, %[ctx], #16\n\t"
"stm r9, {r4, r5, r6, r7, r8}\n\t"
/* Zero out r. */
"add r9, %[ctx], #0\n\t"
"stm r9, {r4, r5, r6, r7}\n\t"
/* Zero out padding. */
"add r9, %[ctx], #40\n\t"
"stm r9, {r4, r5, r6, r7}\n\t"
: [ctx] "+r" (ctx), [mac] "+r" (mac)
:
: "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
"r9"
);
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */
#endif /* WOLFSSL_ARMASM */


@@ -1150,7 +1150,11 @@ void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m)
*/
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes)
{
poly1305_blocks_arm32_16(ctx, m, bytes, 1);
#ifndef WOLFSSL_ARMASM_NO_NEON
poly1305_arm32_blocks(ctx, m, bytes);
#else
poly1305_arm32_blocks_16(ctx, m, bytes, 1);
#endif
}
/* Process 16 bytes of message.
@@ -1160,7 +1164,7 @@ void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes)
*/
void poly1305_block_arm32(Poly1305* ctx, const unsigned char* m)
{
poly1305_blocks_arm32_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
poly1305_arm32_blocks_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
}
#endif
@@ -1219,6 +1223,16 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
/* Process the remaining partial block - last block. */
if (ret == 0) {
#if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON)
if (ctx->leftover >= POLY1305_BLOCK_SIZE) {
size_t len = ctx->leftover & (~(POLY1305_BLOCK_SIZE - 1));
poly1305_arm32_blocks(ctx, ctx->buffer, len);
ctx->leftover -= len;
if (ctx->leftover) {
XMEMCPY(ctx->buffer, ctx->buffer + len, ctx->leftover);
}
}
#endif
if (ctx->leftover) {
size_t i = ctx->leftover;
ctx->buffer[i++] = 1;
@@ -1229,7 +1243,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE,
0);
#else
poly1305_blocks_arm32_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0);
poly1305_arm32_blocks_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0);
#endif
}


@@ -8201,6 +8201,31 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void)
return WC_TEST_RET_ENC_I(i);
}
/* Testing multiple updates with various sizes works. */
for (i = 1; i < (int)sizeof(msg6); i++) {
int j;
ret = wc_Poly1305SetKey(&enc, key, 32);
if (ret != 0)
return WC_TEST_RET_ENC_I(i);
for (j = 0; j < (int)sizeof(msg6); j += i) {
int len = (int)sizeof(msg6) - j;
if (len > i)
len = i;
ret = wc_Poly1305Update(&enc, msg6 + j, len);
if (ret != 0)
return WC_TEST_RET_ENC_I(j);
}
ret = wc_Poly1305Final(&enc, tag);
if (ret != 0)
return WC_TEST_RET_ENC_I(i);
if (XMEMCMP(tag, correct6, sizeof(tag)))
return WC_TEST_RET_ENC_I(i);
}
/* Check TLS MAC function from 2.8.2 https://tools.ietf.org/html/rfc7539 */
XMEMSET(tag, 0, sizeof(tag));
ret = wc_Poly1305SetKey(&enc, key4, sizeof(key4));


@@ -98,7 +98,18 @@ typedef struct Poly1305 {
word64 leftover;
unsigned char buffer[POLY1305_BLOCK_SIZE];
unsigned char finished;
#elif defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_THUMB2) && \
!defined(WOLFSSL_ARMASM_NO_NEON)
/* NEON implementation for ARM32 */
word32 r[4];
word32 h[6];
word32 pad[4];
word32 leftover;
unsigned char buffer[4*POLY1305_BLOCK_SIZE];
word32 r_21[10];
word32 r_43[10];
#elif defined(WOLFSSL_ARMASM)
/* ARM32 (non-NEON) and Thumb2 */
word32 r[4];
word32 h[5];
word32 pad[4];
@@ -173,7 +184,8 @@ void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m,
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char *m, size_t bytes);
void poly1305_block_arm32(Poly1305* ctx, const unsigned char *m);
void poly1305_blocks_arm32_16(Poly1305* ctx, const unsigned char* m, word32 len,
void poly1305_arm32_blocks(Poly1305* ctx, const unsigned char* m, word32 len);
void poly1305_arm32_blocks_16(Poly1305* ctx, const unsigned char* m, word32 len,
int notLast);
#endif
void poly1305_set_key(Poly1305* ctx, const byte* key);