Poly1305 ARM32 NEON: add implementation

Add assembly for Poly1305 using the ARM32 NEON instruction set.
Poly1305 ARM32 base: change the name poly1305_blocks_arm32_16 to
poly1305_arm32_blocks_16.
poly1305.c: ARM32 NEON - buffer up to 4 blocks.
x86_64: only calculate powers of r once, after the key is set.
test.c: Poly1305 testing with multiple updates.
benchmark: ChaCha20-Poly1305 now uses AAD.
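For context, the multiple-update behavior the new test.c cases exercise is the usual streaming-MAC call pattern. A minimal sketch using the public wolfCrypt API, assuming POLY1305_DIGEST_SIZE (16) from poly1305.h; the key and chunk split here are illustrative, not the vectors from test.c:

#include <wolfssl/wolfcrypt/poly1305.h>

static int poly1305_multi_update_sketch(const byte key[32],
                                        const byte* msg, word32 msgSz)
{
    Poly1305 ctx;
    byte mac[POLY1305_DIGEST_SIZE];   /* 16-byte tag */
    int ret = wc_Poly1305SetKey(&ctx, key, 32);
    /* Feed the message in uneven pieces; the context buffers partial
     * blocks, so the tag must match a single full-length update. */
    if (ret == 0)
        ret = wc_Poly1305Update(&ctx, msg, msgSz / 2);
    if (ret == 0)
        ret = wc_Poly1305Update(&ctx, msg + msgSz / 2, msgSz - msgSz / 2);
    if (ret == 0)
        ret = wc_Poly1305Final(&ctx, mac);
    return ret;
}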
@@ -770,7 +770,8 @@
#define BENCH_RNG 0x00000001
#define BENCH_SCRYPT 0x00000002

#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) || \
(defined(HAVE_CHACHA) && defined(HAVE_POLY1305))
/* Define AES_AUTH_ADD_SZ already here, since it's used in the
 * static declaration of `bench_Usage_msg1`. */
#if !defined(AES_AUTH_ADD_SZ) && \
@@ -1945,10 +1946,13 @@ static const char* bench_result_words2[][5] = {
#define BENCH_MIN_RUNTIME_SEC 1.0F
#endif

#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) || \
(defined(HAVE_CHACHA) && defined(HAVE_POLY1305))
static word32 aesAuthAddSz = AES_AUTH_ADD_SZ;
#endif
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
#define AES_AUTH_TAG_SZ 16
#define BENCH_CIPHER_ADD AES_AUTH_TAG_SZ
static word32 aesAuthAddSz = AES_AUTH_ADD_SZ;
#if !defined(AES_AAD_OPTIONS_DEFAULT)
#if !defined(NO_MAIN_DRIVER)
#define AES_AAD_OPTIONS_DEFAULT 0x1U
@@ -6059,15 +6063,19 @@ void bench_chacha20_poly1305_aead(void)
int ret = 0, i, count;
DECLARE_MULTI_VALUE_STATS_VARS()

WC_DECLARE_VAR(bench_additional, byte, AES_AUTH_ADD_SZ, HEAP_HINT);
WC_DECLARE_VAR(authTag, byte, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE, HEAP_HINT);
WC_ALLOC_VAR(bench_additional, byte, AES_AUTH_ADD_SZ, HEAP_HINT);
WC_ALLOC_VAR(authTag, byte, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE, HEAP_HINT);
XMEMSET(bench_additional, 0, AES_AUTH_ADD_SZ);
XMEMSET(authTag, 0, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE);

bench_stats_start(&count, &start);
do {
for (i = 0; i < numBlocks; i++) {
ret = wc_ChaCha20Poly1305_Encrypt(bench_key, bench_iv, NULL, 0,
bench_plain, bench_size, bench_cipher, authTag);
ret = wc_ChaCha20Poly1305_Encrypt(bench_key, bench_iv,
bench_additional, aesAuthAddSz, bench_plain, bench_size,
bench_cipher, authTag);
if (ret < 0) {
printf("wc_ChaCha20Poly1305_Encrypt error: %d\n", ret);
goto exit;
@@ -6089,6 +6097,7 @@ void bench_chacha20_poly1305_aead(void)
exit:

WC_FREE_VAR(authTag, HEAP_HINT);
WC_FREE_VAR(bench_additional, HEAP_HINT);
}
#endif /* HAVE_CHACHA && HAVE_POLY1305 */
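The benchmark change above swaps the NULL/0 AAD arguments for a real buffer, so the measured cost now includes authenticating additional data. A sketch of the one-shot call shape (the argument order is visible in the diff itself; buffer names here are illustrative):

#include <wolfssl/wolfcrypt/chacha20_poly1305.h>

static int aead_with_aad_sketch(const byte* key, const byte* iv,
                                const byte* aad, word32 aadSz,
                                const byte* plain, word32 plainSz,
                                byte* cipher, byte* authTag)
{
    /* AAD is authenticated but not encrypted; NULL/0 (the old benchmark
     * arguments) skips that part of the Poly1305 work entirely. */
    return wc_ChaCha20Poly1305_Encrypt(key, iv, aad, aadSz,
                                       plain, plainSz, cipher, authTag);
}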
@@ -529,6 +529,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
#endif
poly1305_setkey_avx(ctx, key);
RESTORE_VECTOR_REGISTERS();
ctx->started = 0;
#elif defined(POLY130564)

/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
@@ -813,13 +814,49 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
printf("\n");
#endif

#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_THUMB2) && \
!defined(WOLFSSL_ARMASM_NO_NEON)
/* handle leftover */
if (ctx->leftover) {
size_t want = sizeof(ctx->buffer) - ctx->leftover;
if (want > bytes)
want = bytes;

for (i = 0; i < want; i++)
ctx->buffer[ctx->leftover + i] = m[i];
bytes -= (word32)want;
m += want;
ctx->leftover += want;
if (ctx->leftover < sizeof(ctx->buffer)) {
return 0;
}

poly1305_blocks(ctx, ctx->buffer, sizeof(ctx->buffer));
ctx->leftover = 0;
}

/* process full blocks */
if (bytes >= sizeof(ctx->buffer)) {
size_t want = bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1);

poly1305_blocks(ctx, m, want);
m += want;
bytes -= (word32)want;
}

/* store leftover */
if (bytes) {
for (i = 0; i < bytes; i++)
ctx->buffer[ctx->leftover + i] = m[i];
ctx->leftover += bytes;
}
#else
#ifdef USE_INTEL_POLY1305_SPEEDUP
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);

/* handle leftover */

if (ctx->leftover) {
size_t want = sizeof(ctx->buffer) - ctx->leftover;
if (want > bytes)
@@ -835,8 +872,10 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
return 0;
}

if (!ctx->started)
if (!ctx->started) {
poly1305_calc_powers_avx2(ctx);
ctx->started = 1;
}
poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
ctx->leftover = 0;
}
@@ -845,8 +884,10 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
if (bytes >= sizeof(ctx->buffer)) {
size_t want = bytes & ~(sizeof(ctx->buffer) - 1);

if (!ctx->started)
if (!ctx->started) {
poly1305_calc_powers_avx2(ctx);
ctx->started = 1;
}
poly1305_blocks_avx2(ctx, m, want);
m += want;
bytes -= (word32)want;
@@ -902,6 +943,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
ctx->leftover += bytes;
}
}
#endif

return 0;
}
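The new ARM32 NEON path above uses the standard three-phase streaming pattern: top up and flush the internal buffer, process whole blocks directly from the caller's data, then stash the tail. The mask expression relies on the block size being a power of two; a minimal model:

#include <stddef.h>

#define POLY1305_BLOCK_SIZE 16

/* Round a byte count down to a whole number of 16-byte blocks:
 * 0..15 -> 0, 37 -> 32, 64 -> 64. */
static size_t whole_blocks(size_t bytes)
{
    return bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1);
}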
@@ -34,11 +34,12 @@
#if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_POLY1305
#ifdef WOLFSSL_ARMASM_NO_NEON
.text
.align 4
.globl poly1305_blocks_arm32_16
.type poly1305_blocks_arm32_16, %function
poly1305_blocks_arm32_16:
.globl poly1305_arm32_blocks_16
.type poly1305_arm32_blocks_16, %function
poly1305_arm32_blocks_16:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #28
cmp r2, #0
@@ -247,7 +248,7 @@ L_poly1305_arm32_16_loop:
L_poly1305_arm32_16_done:
add sp, sp, #28
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_blocks_arm32_16,.-poly1305_blocks_arm32_16
.size poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16
.text
.type L_poly1305_arm32_clamp, %object
.size L_poly1305_arm32_clamp, 16
@@ -347,6 +348,941 @@ poly1305_final:
stm r9, {r4, r5, r6, r7}
pop {r4, r5, r6, r7, r8, r9, pc}
.size poly1305_final,.-poly1305_final
#else
.text
.align 4
.globl poly1305_arm32_blocks_16
.type poly1305_arm32_blocks_16, %function
poly1305_arm32_blocks_16:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #28
cmp r2, #0
beq L_poly1305_arm32_16_done
add lr, sp, #12
stm lr, {r0, r1, r2, r3}
# Get h pointer
add lr, r0, #16
ldm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_loop:
# Add m to h
ldr r1, [sp, #16]
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r9, [r1, #8]
ldr r10, [r1, #12]
ldr r11, [sp, #24]
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r9
adcs r7, r7, r10
add r1, r1, #16
adc r8, r8, r11
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
stm lr, {r4, r5, r6, r7, r8}
#else
# h[0]-h[2] in r4-r6 for multiplication.
str r7, [lr, #12]
str r8, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
str r1, [sp, #16]
ldr r1, [sp, #12]
# Multiply h by r
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
ldr r3, [r1]
eor r0, r0, r0
# r[0] * h[0]
# h[0] in r4
umull r4, r5, r3, r4
# r[0] * h[2]
# h[2] in r6
umull r6, r7, r3, r6
# r[0] * h[4]
# h[4] in r8
mul r8, r3, r8
# r[0] * h[1]
ldr r2, [lr, #4]
mov r12, r0
umlal r5, r12, r3, r2
# r[0] * h[3]
ldr r2, [lr, #12]
adds r6, r6, r12
adc r7, r7, r0
umlal r7, r8, r3, r2
# r[1] * h[0]
ldr r3, [r1, #4]
ldr r2, [lr]
mov r12, r0
umlal r5, r12, r3, r2
# r[1] * h[1]
ldr r2, [lr, #4]
adds r6, r6, r12
adc r12, r0, r0
umlal r6, r12, r3, r2
# r[1] * h[2]
ldr r2, [lr, #8]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[1] * h[3]
ldr r2, [lr, #12]
adds r8, r8, r12
adc r9, r0, r0
umlal r8, r9, r3, r2
# r[1] * h[4]
ldr r2, [lr, #16]
mla r9, r3, r2, r9
# r[2] * h[0]
ldr r3, [r1, #8]
ldr r2, [lr]
mov r12, r0
umlal r6, r12, r3, r2
# r[2] * h[1]
ldr r2, [lr, #4]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[2] * h[2]
ldr r2, [lr, #8]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[2] * h[3]
ldr r2, [lr, #12]
adds r9, r9, r12
adc r10, r0, r0
umlal r9, r10, r3, r2
# r[2] * h[4]
ldr r2, [lr, #16]
mla r10, r3, r2, r10
# r[3] * h[0]
ldr r3, [r1, #12]
ldr r2, [lr]
mov r12, r0
umlal r7, r12, r3, r2
# r[3] * h[1]
ldr r2, [lr, #4]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[3] * h[2]
ldr r2, [lr, #8]
adds r9, r9, r12
adc r10, r10, r0
umlal r9, r10, r3, r2
# r[3] * h[3]
ldr r2, [lr, #12]
mov r11, r0
umlal r10, r11, r3, r2
# r[3] * h[4]
ldr r2, [lr, #16]
mov r12, r0
mla r11, r3, r2, r11
#else
ldm r1, {r0, r1, r2, r3}
# r[0] * h[0]
umull r10, r11, r0, r4
# r[1] * h[0]
umull r12, r7, r1, r4
# r[0] * h[1]
umaal r11, r12, r0, r5
# r[2] * h[0]
umull r8, r9, r2, r4
# r[1] * h[1]
umaal r12, r8, r1, r5
# r[0] * h[2]
umaal r12, r7, r0, r6
# r[3] * h[0]
umaal r8, r9, r3, r4
stm sp, {r10, r11, r12}
# r[2] * h[1]
umaal r7, r8, r2, r5
# Replace h[0] with h[3]
ldr r4, [lr, #12]
# r[1] * h[2]
umull r10, r11, r1, r6
# r[2] * h[2]
umaal r8, r9, r2, r6
# r[0] * h[3]
umaal r7, r10, r0, r4
# r[3] * h[1]
umaal r8, r11, r3, r5
# r[1] * h[3]
umaal r8, r10, r1, r4
# r[3] * h[2]
umaal r9, r11, r3, r6
# r[2] * h[3]
umaal r9, r10, r2, r4
# Replace h[1] with h[4]
ldr r5, [lr, #16]
# r[3] * h[3]
umaal r10, r11, r3, r4
mov r12, #0
# r[0] * h[4]
umaal r8, r12, r0, r5
# r[1] * h[4]
umaal r9, r12, r1, r5
# r[2] * h[4]
umaal r10, r12, r2, r5
# r[3] * h[4]
umaal r11, r12, r3, r5
# DONE
ldm sp, {r4, r5, r6}
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
# r12 will be zero because r is masked.
# Load length
ldr r2, [sp, #20]
# Reduce mod 2^130 - 5
bic r3, r8, #0x3
and r8, r8, #3
adds r4, r4, r3
lsr r3, r3, #2
adcs r5, r5, r9
orr r3, r3, r9, LSL #30
adcs r6, r6, r10
lsr r9, r9, #2
adcs r7, r7, r11
orr r9, r9, r10, LSL #30
adc r8, r8, r12
lsr r10, r10, #2
adds r4, r4, r3
orr r10, r10, r11, LSL #30
adcs r5, r5, r9
lsr r11, r11, #2
adcs r6, r6, r10
adcs r7, r7, r11
adc r8, r8, r12
# Sub 16 from length.
subs r2, r2, #16
# Store length.
str r2, [sp, #20]
# Loop again if more message to do.
bgt L_poly1305_arm32_16_loop
stm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_done:
add sp, sp, #28
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16
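The "Reduce mod 2^130 - 5" sequence above relies on 2^130 = 5 (mod p): the bits at and above 2^130 fold back in multiplied by 5, computed as 4t + t (the bic extracts 4t within the top limb; the lsr/orr chain forms t across the limbs). A scalar model of the same fold on a five-limb value, as a sketch only (the assembly applies the identity to the wider h*r product and runs the fold twice per block):

#include <stdint.h>

/* h: five 32-bit limbs, little-endian; partially reduce below ~2^131. */
static void fold_2_130_5(uint32_t h[5])
{
    uint32_t t = h[4] >> 2;                          /* bits >= 2^130 */
    uint64_t c = (uint64_t)h[0] + (uint64_t)t * 5;   /* add 5*t = 4*t + t */
    h[4] &= 3;                                       /* keep h mod 2^130 */
    h[0] = (uint32_t)c;
    c = (uint64_t)h[1] + (c >> 32); h[1] = (uint32_t)c;
    c = (uint64_t)h[2] + (c >> 32); h[2] = (uint32_t)c;
    c = (uint64_t)h[3] + (c >> 32); h[3] = (uint32_t)c;
    h[4] += (uint32_t)(c >> 32);
}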
.text
.align 4
.globl poly1305_arm32_blocks
.type poly1305_arm32_blocks, %function
poly1305_arm32_blocks:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vpush {d8-d15}
cmp r2, #16
add r12, r0, #16
bgt L_poly1305_arm32_blocks_begin_neon
ldm r12, {r7, r8, r9, r10, r11}
b L_poly1305_arm32_blocks_start_1
L_poly1305_arm32_blocks_begin_neon:
vmov.i16 q15, #0xffff
vshr.u64 q15, q15, #38
vld1.64 {d0-d2}, [r12]
vshl.u64 d4, d2, #24
vsri.u64 d4, d1, #40
vshr.u64 d3, d1, #14
vshl.u64 d2, d1, #12
vsri.u64 d1, d0, #26
vsri.u64 d2, d0, #52
vand.u64 d0, d0, d31
vand.u64 d3, d3, d31
vand.u64 d2, d2, d31
vand.u64 d1, d1, d31
add r3, r0, #0x7c
vldm.32 r3, {d20-d24}
cmp r2, #0x40
bge L_poly1305_arm32_blocks_begin_4
vshl.u32 d6, d21, #2
vshl.u32 d7, d22, #2
vshl.u32 d8, d23, #2
vshl.u32 d9, d24, #2
vadd.u32 d6, d6, d21
vadd.u32 d7, d7, d22
vadd.u32 d8, d8, d23
vadd.u32 d9, d9, d24
b L_poly1305_arm32_blocks_start_2
L_poly1305_arm32_blocks_begin_4:
add r3, r0, #0xa4
vldm.32 r3, {d26-d30}
L_poly1305_arm32_blocks_start_4:
sub r2, #0x40
vld4.32 {d10-d13}, [r1]!
vshl.u32 d6, d27, #2
vshl.u32 d7, d28, #2
vshl.u32 d8, d29, #2
vshl.u32 d9, d30, #2
vadd.u32 d6, d6, d27
vadd.u32 d7, d7, d28
vadd.u32 d8, d8, d29
vadd.u32 d9, d9, d30
vshr.u32 d14, d13, #8
vshl.u32 d13, d13, #18
vorr.i32 d14, d14, #0x1000000
vsri.u32 d13, d12, #14
vshl.u32 d12, d12, #12
vand.i32 d13, d13, #0x3ffffff
vsri.u32 d12, d11, #20
vshl.u32 d11, d11, #6
vand.i32 d12, d12, #0x3ffffff
vsri.u32 d11, d10, #26
vand.i32 d10, d10, #0x3ffffff
vand.i32 d11, d11, #0x3ffffff
vadd.u32 d4, d4, d14
vadd.u32 q1, q1, q6
vadd.u32 q0, q0, q5
vmull.u32 q5, d0, d26
vmull.u32 q6, d0, d27
vmull.u32 q7, d0, d28
vmull.u32 q8, d0, d29
vmull.u32 q9, d0, d30
vmlal.u32 q5, d1, d9
vmlal.u32 q6, d1, d26
vmlal.u32 q7, d1, d27
vmlal.u32 q8, d1, d28
vmlal.u32 q9, d1, d29
vmlal.u32 q5, d2, d8
vmlal.u32 q6, d2, d9
vmlal.u32 q7, d2, d26
vmlal.u32 q8, d2, d27
vmlal.u32 q9, d2, d28
vmlal.u32 q5, d3, d7
vmlal.u32 q6, d3, d8
vmlal.u32 q7, d3, d9
vmlal.u32 q8, d3, d26
vmlal.u32 q9, d3, d27
vmlal.u32 q5, d4, d6
vmlal.u32 q6, d4, d7
vmlal.u32 q7, d4, d8
vmlal.u32 q8, d4, d9
vmlal.u32 q9, d4, d26
vld4.32 {d0-d3}, [r1]!
vshl.u32 d6, d21, #2
vshl.u32 d7, d22, #2
vshl.u32 d8, d23, #2
vshl.u32 d9, d24, #2
vadd.u32 d6, d6, d21
vadd.u32 d7, d7, d22
vadd.u32 d8, d8, d23
vadd.u32 d9, d9, d24
vshr.u32 d4, d3, #8
vshl.u32 d3, d3, #18
vorr.i32 d4, d4, #0x1000000
vsri.u32 d3, d2, #14
vshl.u32 d2, d2, #12
vand.i32 d3, d3, #0x3ffffff
vsri.u32 d2, d1, #20
vshl.u32 d1, d1, #6
vand.i32 d2, d2, #0x3ffffff
vsri.u32 d1, d0, #26
vand.i32 d0, d0, #0x3ffffff
vand.i32 d1, d1, #0x3ffffff
vmlal.u32 q5, d0, d20
vmlal.u32 q6, d0, d21
vmlal.u32 q7, d0, d22
vmlal.u32 q8, d0, d23
vmlal.u32 q9, d0, d24
vmlal.u32 q5, d1, d9
vmlal.u32 q6, d1, d20
vmlal.u32 q7, d1, d21
vmlal.u32 q8, d1, d22
vmlal.u32 q9, d1, d23
vmlal.u32 q5, d2, d8
vmlal.u32 q6, d2, d9
vmlal.u32 q7, d2, d20
vmlal.u32 q8, d2, d21
vmlal.u32 q9, d2, d22
vmlal.u32 q5, d3, d7
vmlal.u32 q6, d3, d8
vmlal.u32 q7, d3, d9
vmlal.u32 q8, d3, d20
vmlal.u32 q9, d3, d21
vmlal.u32 q5, d4, d6
vmlal.u32 q6, d4, d7
vmlal.u32 q7, d4, d8
vmlal.u32 q8, d4, d9
vmlal.u32 q9, d4, d20
vadd.u64 d0, d10, d11
vadd.u64 d1, d12, d13
vadd.u64 d2, d14, d15
vadd.u64 d3, d16, d17
vadd.u64 d4, d18, d19
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
vsra.u64 d2, d1, #26
vand.u64 d1, d1, d31
vsra.u64 d3, d2, #26
vand.u64 d2, d2, d31
vsra.u64 d4, d3, #26
vand.u64 d3, d3, d31
vshr.u64 d15, d4, #26
vand.u64 d4, d4, d31
vadd.u64 d0, d0, d15
vshl.u64 d15, d15, #2
vadd.u64 d0, d0, d15
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
cmp r2, #0x40
bge L_poly1305_arm32_blocks_start_4
cmp r2, #32
blt L_poly1305_arm32_blocks_done_neon
L_poly1305_arm32_blocks_start_2:
sub r2, #32
vld4.32 {d10-d13}, [r1]!
vshr.u32 d14, d13, #8
vshl.u32 d13, d13, #18
vorr.i32 d14, d14, #0x1000000
vsri.u32 d13, d12, #14
vshl.u32 d12, d12, #12
vand.i32 d13, d13, #0x3ffffff
vsri.u32 d12, d11, #20
vshl.u32 d11, d11, #6
vand.i32 d12, d12, #0x3ffffff
vsri.u32 d11, d10, #26
vand.i32 d10, d10, #0x3ffffff
vand.i32 d11, d11, #0x3ffffff
vadd.u32 d4, d4, d14
vadd.u32 q1, q1, q6
vadd.u32 q0, q0, q5
vmull.u32 q5, d0, d20
vmull.u32 q6, d0, d21
vmull.u32 q7, d0, d22
vmull.u32 q8, d0, d23
vmull.u32 q9, d0, d24
vmlal.u32 q5, d1, d9
vmlal.u32 q6, d1, d20
vmlal.u32 q7, d1, d21
vmlal.u32 q8, d1, d22
vmlal.u32 q9, d1, d23
vmlal.u32 q5, d2, d8
vmlal.u32 q6, d2, d9
vmlal.u32 q7, d2, d20
vmlal.u32 q8, d2, d21
vmlal.u32 q9, d2, d22
vmlal.u32 q5, d3, d7
vmlal.u32 q6, d3, d8
vmlal.u32 q7, d3, d9
vmlal.u32 q8, d3, d20
vmlal.u32 q9, d3, d21
vmlal.u32 q5, d4, d6
vmlal.u32 q6, d4, d7
vmlal.u32 q7, d4, d8
vmlal.u32 q8, d4, d9
vmlal.u32 q9, d4, d20
vadd.u64 d0, d10, d11
vadd.u64 d1, d12, d13
vadd.u64 d2, d14, d15
vadd.u64 d3, d16, d17
vadd.u64 d4, d18, d19
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
vsra.u64 d2, d1, #26
vand.u64 d1, d1, d31
vsra.u64 d3, d2, #26
vand.u64 d2, d2, d31
vsra.u64 d4, d3, #26
vand.u64 d3, d3, d31
vshr.u64 d5, d4, #26
vand.u64 d4, d4, d31
vadd.u64 d0, d0, d5
vshl.u64 d5, d5, #2
vadd.u64 d0, d0, d5
vsra.u64 d1, d0, #26
vand.u64 d0, d0, d31
L_poly1305_arm32_blocks_done_neon:
cmp r2, #16
beq L_poly1305_arm32_blocks_begin_1
add r12, r0, #16
vsli.u64 d0, d1, #26
vsli.u64 d0, d2, #52
vshr.u64 d1, d2, #12
vsli.u64 d1, d3, #14
vsli.u64 d1, d4, #40
vshr.u64 d2, d4, #24
vst1.64 {d0-d2}, [r12]
b L_poly1305_arm32_blocks_done
L_poly1305_arm32_blocks_begin_1:
vsli.u64 d0, d1, #26
vsli.u64 d0, d2, #52
vshr.u64 d1, d2, #12
vsli.u64 d1, d3, #14
vsli.u64 d1, d4, #40
vshr.u64 d2, d4, #24
vmov r7, r8, d0
vmov r9, r10, d1
vmov r11, d2[0]
L_poly1305_arm32_blocks_start_1:
mov r12, #1
push {r2}
# Load message
ldm r1, {r2, r3, r4, r5}
# Add message
adds r7, r7, r2
adcs r8, r8, r3
adcs r9, r9, r4
adcs r10, r10, r5
adc r11, r11, r12
push {r0, r1}
add r1, r0, #0
add lr, r0, #16
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
stm lr, {r7, r8, r9, r10, r11}
#else
# h[0]-h[2] in r4-r6 for multiplication.
str r10, [lr, #12]
str r11, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
ldr r3, [r1]
eor r0, r0, r0
# r[0] * h[0]
# h[0] in r4
umull r7, r8, r3, r7
# r[0] * h[2]
# h[2] in r6
umull r9, r10, r3, r9
# r[0] * h[4]
# h[4] in r8
mul r11, r3, r11
# r[0] * h[1]
ldr r2, [lr, #4]
mov r12, r0
umlal r8, r12, r3, r2
# r[0] * h[3]
ldr r2, [lr, #12]
adds r9, r9, r12
adc r10, r10, r0
umlal r10, r11, r3, r2
# r[1] * h[0]
ldr r3, [r1, #4]
ldr r2, [lr]
mov r12, r0
umlal r8, r12, r3, r2
# r[1] * h[1]
ldr r2, [lr, #4]
adds r9, r9, r12
adc r12, r0, r0
umlal r9, r12, r3, r2
# r[1] * h[2]
ldr r2, [lr, #8]
adds r10, r10, r12
adc r12, r0, r0
umlal r10, r12, r3, r2
# r[1] * h[3]
ldr r2, [lr, #12]
adds r11, r11, r12
adc r4, r0, r0
umlal r11, r4, r3, r2
# r[1] * h[4]
ldr r2, [lr, #16]
mla r4, r3, r2, r4
# r[2] * h[0]
ldr r3, [r1, #8]
ldr r2, [lr]
mov r12, r0
umlal r9, r12, r3, r2
# r[2] * h[1]
ldr r2, [lr, #4]
adds r10, r10, r12
adc r12, r0, r0
umlal r10, r12, r3, r2
# r[2] * h[2]
ldr r2, [lr, #8]
adds r11, r11, r12
adc r12, r0, r0
umlal r11, r12, r3, r2
# r[2] * h[3]
ldr r2, [lr, #12]
adds r4, r4, r12
adc r5, r0, r0
umlal r4, r5, r3, r2
# r[2] * h[4]
ldr r2, [lr, #16]
mla r5, r3, r2, r5
# r[3] * h[0]
ldr r3, [r1, #12]
ldr r2, [lr]
mov r12, r0
umlal r10, r12, r3, r2
# r[3] * h[1]
ldr r2, [lr, #4]
adds r11, r11, r12
adc r12, r0, r0
umlal r11, r12, r3, r2
# r[3] * h[2]
ldr r2, [lr, #8]
adds r4, r4, r12
adc r5, r5, r0
umlal r4, r5, r3, r2
# r[3] * h[3]
ldr r2, [lr, #12]
mov r6, r0
umlal r5, r6, r3, r2
# r[3] * h[4]
ldr r2, [lr, #16]
mov r12, r0
mla r6, r3, r2, r6
#else
sub sp, sp, #12
ldm r1, {r0, r1, r2, r3}
# r[0] * h[0]
umull r5, r6, r0, r7
# r[1] * h[0]
umull r12, r10, r1, r7
# r[0] * h[1]
umaal r6, r12, r0, r8
# r[2] * h[0]
umull r11, r4, r2, r7
# r[1] * h[1]
umaal r12, r11, r1, r8
# r[0] * h[2]
umaal r12, r10, r0, r9
# r[3] * h[0]
umaal r11, r4, r3, r7
stm sp, {r5, r6, r12}
# r[2] * h[1]
umaal r10, r11, r2, r8
# Replace h[0] with h[3]
ldr r7, [lr, #12]
# r[1] * h[2]
umull r5, r6, r1, r9
# r[2] * h[2]
umaal r11, r4, r2, r9
# r[0] * h[3]
umaal r10, r5, r0, r7
# r[3] * h[1]
umaal r11, r6, r3, r8
# r[1] * h[3]
umaal r11, r5, r1, r7
# r[3] * h[2]
umaal r4, r6, r3, r9
# r[2] * h[3]
umaal r4, r5, r2, r7
# Replace h[1] with h[4]
ldr r8, [lr, #16]
# r[3] * h[3]
umaal r5, r6, r3, r7
mov r12, #0
# r[0] * h[4]
umaal r11, r12, r0, r8
# r[1] * h[4]
umaal r4, r12, r1, r8
# r[2] * h[4]
umaal r5, r12, r2, r8
# r[3] * h[4]
umaal r6, r12, r3, r8
# DONE
ldm sp, {r7, r8, r9}
add sp, sp, #12
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
# Reduce mod 2^130 - 5
bic r3, r11, #0x3
and r11, r11, #3
adds r7, r7, r3
lsr r3, r3, #2
adcs r8, r8, r4
orr r3, r3, r4, LSL #30
adcs r9, r9, r5
lsr r4, r4, #2
adcs r10, r10, r6
orr r4, r4, r5, LSL #30
adc r11, r11, r12
lsr r5, r5, #2
adds r7, r7, r3
orr r5, r5, r6, LSL #30
adcs r8, r8, r4
lsr r6, r6, #2
adcs r9, r9, r5
adcs r10, r10, r6
adc r11, r11, r12
pop {r0, r1}
pop {r2}
add r12, r0, #16
stm r12, {r7, r8, r9, r10, r11}
L_poly1305_arm32_blocks_done:
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_arm32_blocks,.-poly1305_arm32_blocks
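Why poly1305_arm32_blocks multiplies by cached powers of r: unrolling Horner's rule lets independent NEON lanes each multiply by a fixed power of r, and the partial sums are recombined at the end. For accumulator h and 2^128-extended message blocks c_i (all mod 2^130 - 5):

  h' = ((h + c_1) r + c_2) r = (h + c_1) r^2 + c_2 r
  h' = (h + c_1) r^4 + c_2 r^3 + c_3 r^2 + c_4 r

Reading the load sites above together with poly1305_set_key below, the table at ctx offset 0x7c appears to hold the {r^2, r} lane pair and the one at 0xa4 the {r^4, r^3} pair, in radix-2^26 limbs; that assignment is inferred, not stated in the diff.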
.text
.type L_poly1305_arm32_clamp, %object
.size L_poly1305_arm32_clamp, 16
.align 4
L_poly1305_arm32_clamp:
.word 0xfffffff
.word 0xffffffc
.word 0xffffffc
.word 0xffffffc
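The table above is the standard RFC 8439 clamp applied word-by-word to the little-endian r; equivalently, in C:

#include <stdint.h>

static void poly1305_clamp(uint32_t r[4])
{
    r[0] &= 0x0fffffff;   /* clear the top 4 bits */
    r[1] &= 0x0ffffffc;   /* clear the top 4 and bottom 2 bits */
    r[2] &= 0x0ffffffc;
    r[3] &= 0x0ffffffc;
}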
.text
.align 4
.globl poly1305_set_key
.type poly1305_set_key, %function
poly1305_set_key:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vpush {d8-d15}
# Load mask.
adr lr, L_poly1305_arm32_clamp
ldm lr, {r6, r7, r8, r9}
# Load and cache padding.
ldr r2, [r1, #16]
ldr r3, [r1, #20]
ldr r4, [r1, #24]
ldr r5, [r1, #28]
add lr, r0, #40
stm lr, {r2, r3, r4, r5}
# Load, mask and store r.
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r4, [r1, #8]
ldr r5, [r1, #12]
and r2, r2, r6
and r3, r3, r7
and r4, r4, r8
and r5, r5, r9
add lr, r0, #0
stm lr, {r2, r3, r4, r5}
vmov.i16 q10, #0xffff
vshr.u64 q10, q10, #38
lsr r8, r2, #26
lsr r9, r3, #20
lsr r10, r4, #14
lsr r11, r5, #8
eor r8, r8, r3, lsl #6
eor r9, r9, r4, lsl #12
eor r10, r10, r5, lsl #18
and r7, r2, #0x3ffffff
and r8, r8, #0x3ffffff
and r9, r9, #0x3ffffff
and r10, r10, #0x3ffffff
vmov.i32 s1, r7
vmov.i32 s3, r8
vmov.i32 s5, r9
vmov.i32 s7, r10
vmov.i32 s9, r11
push {r0, r1}
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# Square r
umull r1, r6, r2, r3
mov r12, #0
umull r7, r8, r2, r5
mov lr, r12
umlal r6, lr, r2, r4
adds r7, r7, lr
adc lr, r12, r12
umlal r7, lr, r3, r4
mov r9, r12
umlal lr, r9, r3, r5
adds r8, r8, lr
adcs r9, r9, r12
adc r10, r12, r12
umlal r9, r10, r4, r5
adds r1, r1, r1
adcs r6, r6, r6
adcs r7, r7, r7
adcs r8, r8, r8
adcs r9, r9, r9
adcs r10, r10, r10
adc r11, r12, r12
umull r0, lr, r2, r2
adds r1, r1, lr
adcs r6, r6, r12
adc lr, r12, r12
umlal r6, lr, r3, r3
adds r7, r7, lr
adcs r8, r8, r12
adc lr, r12, r12
umlal r8, lr, r4, r4
adds r9, r9, lr
adcs r10, r10, r12
adc r11, r11, r12
umlal r10, r11, r5, r5
#else
umull r0, r1, r2, r2
umull r6, r7, r2, r3
adds r6, r6, r6
mov r12, #0
umaal r1, r6, r12, r12
mov r8, r12
umaal r8, r7, r2, r4
adcs r8, r8, r8
umaal r6, r8, r3, r3
umull r9, r10, r2, r5
umaal r7, r9, r3, r4
adcs r7, r7, r7
umaal r7, r8, r12, r12
umaal r10, r9, r3, r5
adcs r10, r10, r10
umaal r8, r10, r4, r4
mov r11, r12
umaal r9, r11, r4, r5
adcs r9, r9, r9
umaal r9, r10, r12, r12
adcs r11, r11, r11
umaal r10, r11, r5, r5
adc r11, r11, r12
#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */
# Reduce mod 2^130 - 5
bic r2, r8, #0x3
and r8, r8, #3
adds r0, r0, r2
lsr r2, r2, #2
adcs r1, r1, r9
orr r2, r2, r9, LSL #30
adcs r6, r6, r10
lsr r9, r9, #2
adcs r7, r7, r11
orr r9, r9, r10, LSL #30
adc r8, r8, r12
lsr r10, r10, #2
adds r0, r0, r2
orr r10, r10, r11, LSL #30
adcs r1, r1, r9
lsr r11, r11, #2
adcs r6, r6, r10
adcs r7, r7, r11
adc r8, r8, r12
lsr r3, r0, #26
lsr r4, r1, #20
lsr r5, r6, #14
lsr r10, r7, #8
eor r3, r3, r1, lsl #6
eor r4, r4, r6, lsl #12
eor r5, r5, r7, lsl #18
eor r10, r10, r8, lsl #24
and r2, r0, #0x3ffffff
and r3, r3, #0x3ffffff
and r4, r4, #0x3ffffff
and r5, r5, #0x3ffffff
vmov.i32 s0, r2
vmov.i32 s2, r3
vmov.i32 s4, r4
vmov.i32 s6, r5
vmov.i32 s8, r10
pop {r0, r1}
add lr, r0, #0x7c
vstm.32 lr, {d0-d4}
# Multiply r^2, r by r^2
vshl.u32 d6, d1, #2
vshl.u32 d7, d2, #2
vshl.u32 d8, d3, #2
vshl.u32 d9, d4, #2
vadd.u32 d6, d6, d1
vadd.u32 d7, d7, d2
vadd.u32 d8, d8, d3
vadd.u32 d9, d9, d4
vmull.u32 q5, d0, d0[0]
vmull.u32 q6, d0, d1[0]
vmull.u32 q7, d0, d2[0]
vmull.u32 q8, d0, d3[0]
vmull.u32 q9, d0, d4[0]
vmlal.u32 q5, d1, d9[0]
vmlal.u32 q6, d1, d0[0]
vmlal.u32 q7, d1, d1[0]
vmlal.u32 q8, d1, d2[0]
vmlal.u32 q9, d1, d3[0]
vmlal.u32 q5, d2, d8[0]
vmlal.u32 q6, d2, d9[0]
vmlal.u32 q7, d2, d0[0]
vmlal.u32 q8, d2, d1[0]
vmlal.u32 q9, d2, d2[0]
vmlal.u32 q5, d3, d7[0]
vmlal.u32 q6, d3, d8[0]
vmlal.u32 q7, d3, d9[0]
vmlal.u32 q8, d3, d0[0]
vmlal.u32 q9, d3, d1[0]
vmlal.u32 q5, d4, d6[0]
vmlal.u32 q6, d4, d7[0]
vmlal.u32 q7, d4, d8[0]
vmlal.u32 q8, d4, d9[0]
vmlal.u32 q9, d4, d0[0]
vsra.u64 q6, q5, #26
vand.u64 q5, q5, q10
vsra.u64 q7, q6, #26
vand.u64 q6, q6, q10
vsra.u64 q8, q7, #26
vand.u64 q7, q7, q10
vsra.u64 q9, q8, #26
vand.u64 q8, q8, q10
vshr.u64 q3, q9, #26
vand.u64 q9, q9, q10
vadd.u64 q5, q5, q3
vshl.u64 q3, q3, #2
vadd.u64 q5, q5, q3
vsra.u64 q6, q5, #26
vand.u64 q5, q5, q10
vmovn.i64 d10, q5
vmovn.i64 d11, q6
vmovn.i64 d12, q7
vmovn.i64 d13, q8
vmovn.i64 d14, q9
add lr, r0, #0xa4
vstm.32 lr, {d10-d14}
# h (accumulator) = 0
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
eor r9, r9, r9
add lr, r0, #16
eor r4, r4, r4
eor r5, r5, r5
stm lr, {r4, r5, r6, r7, r8, r9}
# Zero leftover
str r5, [r0, #56]
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_set_key,.-poly1305_set_key
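The lsr/eor/and sequence in poly1305_set_key re-expresses the clamped 128-bit r as five 26-bit limbs, the radix the NEON multiply uses (5 x 26 = 130 bits). A scalar model of the split, matching the shift amounts above:

#include <stdint.h>

/* r: clamped key as four little-endian 32-bit words. */
static void to_radix_26(uint32_t r26[5], const uint32_t r[4])
{
    r26[0] =   r[0]                        & 0x3ffffff;
    r26[1] = ((r[0] >> 26) | (r[1] <<  6)) & 0x3ffffff;
    r26[2] = ((r[1] >> 20) | (r[2] << 12)) & 0x3ffffff;
    r26[3] = ((r[2] >> 14) | (r[3] << 18)) & 0x3ffffff;
    r26[4] =   r[3] >>  8;                 /* top limb, at most 24 bits */
}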
.text
.align 4
.globl poly1305_final
.type poly1305_final, %function
poly1305_final:
push {r4, r5, r6, r7, r8, r9, lr}
add r9, r0, #16
ldm r9, {r4, r5, r6, r7, r8}
# Add 5 and check for h larger than p.
adds r2, r4, #5
adcs r2, r5, #0
adcs r2, r6, #0
adcs r2, r7, #0
adc r2, r8, #0
sub r2, r2, #4
lsr r2, r2, #31
sub r2, r2, #1
and r2, r2, #5
# Add 0/5 to h.
adds r4, r4, r2
adcs r5, r5, #0
adcs r6, r6, #0
adc r7, r7, #0
# Add padding
add r9, r0, #40
ldm r9, {r2, r3, r12, lr}
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r12
adc r7, r7, lr
# Store MAC
str r4, [r1]
str r5, [r1, #4]
str r6, [r1, #8]
str r7, [r1, #12]
# Zero out h.
eor r4, r4, r4
eor r5, r5, r5
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
add r9, r0, #16
stm r9, {r4, r5, r6, r7, r8}
# Zero out r.
add r9, r0, #0
stm r9, {r4, r5, r6, r7}
# Zero out padding.
add r9, r0, #40
stm r9, {r4, r5, r6, r7}
pop {r4, r5, r6, r7, r8, r9, pc}
.size poly1305_final,.-poly1305_final
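The add-5/shift sequence in poly1305_final is a constant-time conditional subtraction of p = 2^130 - 5: compute h + 5 and inspect the limb holding bits 128 and up; if it reached 4 then h >= p, so committing the +5 makes the low 128 bits equal h - p. A C model of the branch-free mask derivation:

#include <stdint.h>

/* top: bits 128.. of (h + 5). Returns 5 when h >= 2^130 - 5, else 0. */
static uint32_t final_add_amount(uint32_t top)
{
    /* (top - 4) has bit 31 set iff top < 4; map 1 -> 0 and 0 -> 5. */
    return (((top - 4) >> 31) - 1u) & 5u;
}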
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */
#endif /* WOLFSSL_ARMASM */
@@ -52,7 +52,8 @@
#ifdef HAVE_POLY1305
#include <wolfssl/wolfcrypt/poly1305.h>

void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
#ifdef WOLFSSL_ARMASM_NO_NEON
void poly1305_arm32_blocks_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
int notLast_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
@@ -383,6 +384,976 @@ void poly1305_final(Poly1305* ctx_p, byte* mac_p)
);
}

#else
void poly1305_arm32_blocks_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
int notLast_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const byte* m asm ("r1") = (const byte*)m_p;
register word32 len asm ("r2") = (word32)len_p;
register int notLast asm ("r3") = (int)notLast_p;

__asm__ __volatile__ (
"sub sp, sp, #28\n\t"
"cmp %[len], #0\n\t"
"beq L_poly1305_arm32_16_done_%=\n\t"
"add lr, sp, #12\n\t"
"stm lr, {r0, r1, r2, r3}\n\t"
/* Get h pointer */
"add lr, %[ctx], #16\n\t"
"ldm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_loop_%=: \n\t"
/* Add m to h */
"ldr %[m], [sp, #16]\n\t"
"ldr %[len], [%[m]]\n\t"
"ldr %[notLast], [%[m], #4]\n\t"
"ldr r9, [%[m], #8]\n\t"
"ldr r10, [%[m], #12]\n\t"
"ldr r11, [sp, #24]\n\t"
"adds r4, r4, %[len]\n\t"
"adcs r5, r5, %[notLast]\n\t"
"adcs r6, r6, r9\n\t"
"adcs r7, r7, r10\n\t"
"add %[m], %[m], #16\n\t"
"adc r8, r8, r11\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
"stm lr, {r4, r5, r6, r7, r8}\n\t"
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
"str r7, [lr, #12]\n\t"
"str r8, [lr, #16]\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
"str %[m], [sp, #16]\n\t"
"ldr %[m], [sp, #12]\n\t"
/* Multiply h by r */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
"ldr %[notLast], [%[m]]\n\t"
"eor %[ctx], %[ctx], %[ctx]\n\t"
/* r[0] * h[0] */
/* h[0] in r4 */
"umull r4, r5, %[notLast], r4\n\t"
/* r[0] * h[2] */
/* h[2] in r6 */
"umull r6, r7, %[notLast], r6\n\t"
/* r[0] * h[4] */
/* h[4] in r8 */
"mul r8, %[notLast], r8\n\t"
/* r[0] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[0] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r6, r6, r12\n\t"
"adc r7, r7, %[ctx]\n\t"
"umlal r7, r8, %[notLast], %[len]\n\t"
/* r[1] * h[0] */
"ldr %[notLast], [%[m], #4]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[1] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r6, r6, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[1] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[1] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r8, r8, r12\n\t"
"adc r9, %[ctx], %[ctx]\n\t"
"umlal r8, r9, %[notLast], %[len]\n\t"
/* r[1] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r9, %[notLast], %[len], r9\n\t"
/* r[2] * h[0] */
"ldr %[notLast], [%[m], #8]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[2] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[2] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[2] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, %[ctx], %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[2] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r10, %[notLast], %[len], r10\n\t"
/* r[3] * h[0] */
"ldr %[notLast], [%[m], #12]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[3] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[3] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, r10, %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[3] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"mov r11, %[ctx]\n\t"
"umlal r10, r11, %[notLast], %[len]\n\t"
/* r[3] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mov r12, %[ctx]\n\t"
"mla r11, %[notLast], %[len], r11\n\t"
#else
"ldm %[m], {r0, r1, r2, r3}\n\t"
/* r[0] * h[0] */
"umull r10, r11, %[ctx], r4\n\t"
/* r[1] * h[0] */
"umull r12, r7, %[m], r4\n\t"
/* r[0] * h[1] */
"umaal r11, r12, %[ctx], r5\n\t"
/* r[2] * h[0] */
"umull r8, r9, %[len], r4\n\t"
/* r[1] * h[1] */
"umaal r12, r8, %[m], r5\n\t"
/* r[0] * h[2] */
"umaal r12, r7, %[ctx], r6\n\t"
/* r[3] * h[0] */
"umaal r8, r9, %[notLast], r4\n\t"
"stm sp, {r10, r11, r12}\n\t"
/* r[2] * h[1] */
"umaal r7, r8, %[len], r5\n\t"
/* Replace h[0] with h[3] */
"ldr r4, [lr, #12]\n\t"
/* r[1] * h[2] */
"umull r10, r11, %[m], r6\n\t"
/* r[2] * h[2] */
"umaal r8, r9, %[len], r6\n\t"
/* r[0] * h[3] */
"umaal r7, r10, %[ctx], r4\n\t"
/* r[3] * h[1] */
"umaal r8, r11, %[notLast], r5\n\t"
/* r[1] * h[3] */
"umaal r8, r10, %[m], r4\n\t"
/* r[3] * h[2] */
"umaal r9, r11, %[notLast], r6\n\t"
/* r[2] * h[3] */
"umaal r9, r10, %[len], r4\n\t"
/* Replace h[1] with h[4] */
"ldr r5, [lr, #16]\n\t"
/* r[3] * h[3] */
"umaal r10, r11, %[notLast], r4\n\t"
"mov r12, #0\n\t"
/* r[0] * h[4] */
"umaal r8, r12, %[ctx], r5\n\t"
/* r[1] * h[4] */
"umaal r9, r12, %[m], r5\n\t"
/* r[2] * h[4] */
"umaal r10, r12, %[len], r5\n\t"
/* r[3] * h[4] */
"umaal r11, r12, %[notLast], r5\n\t"
/* DONE */
"ldm sp, {r4, r5, r6}\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
/* r12 will be zero because r is masked. */
/* Load length */
"ldr %[len], [sp, #20]\n\t"
/* Reduce mod 2^130 - 5 */
"bic %[notLast], r8, #0x3\n\t"
"and r8, r8, #3\n\t"
"adds r4, r4, %[notLast]\n\t"
"lsr %[notLast], %[notLast], #2\n\t"
"adcs r5, r5, r9\n\t"
"orr %[notLast], %[notLast], r9, LSL #30\n\t"
"adcs r6, r6, r10\n\t"
"lsr r9, r9, #2\n\t"
"adcs r7, r7, r11\n\t"
"orr r9, r9, r10, LSL #30\n\t"
"adc r8, r8, r12\n\t"
"lsr r10, r10, #2\n\t"
"adds r4, r4, %[notLast]\n\t"
"orr r10, r10, r11, LSL #30\n\t"
"adcs r5, r5, r9\n\t"
"lsr r11, r11, #2\n\t"
"adcs r6, r6, r10\n\t"
"adcs r7, r7, r11\n\t"
"adc r8, r8, r12\n\t"
/* Sub 16 from length. */
"subs %[len], %[len], #16\n\t"
/* Store length. */
"str %[len], [sp, #20]\n\t"
/* Loop again if more message to do. */
"bgt L_poly1305_arm32_16_loop_%=\n\t"
"stm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_done_%=: \n\t"
"add sp, sp, #28\n\t"
: [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len),
[notLast] "+r" (notLast)
:
: "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11"
);
}
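In the scalar path above, each 16-byte block enters the accumulator as a 129-bit addition: the four little-endian message words add into h[0..3] and notLast (0 or 1) supplies the 2^128 bit that Poly1305 appends to every full block (by convention the final padded block carries its 1 byte inside the data and passes 0; that reading of the parameter is inferred from the code, not stated in the diff). A scalar model, as a sketch:

#include <stdint.h>

static void add_block(uint32_t h[5], const uint8_t m[16], uint32_t notLast)
{
    uint64_t c = 0;
    int i;
    for (i = 0; i < 4; i++) {
        uint32_t w = (uint32_t)m[4*i] |
                     ((uint32_t)m[4*i + 1] << 8) |
                     ((uint32_t)m[4*i + 2] << 16) |
                     ((uint32_t)m[4*i + 3] << 24);
        c += (uint64_t)h[i] + w;
        h[i] = (uint32_t)c;
        c >>= 32;
    }
    h[4] += (uint32_t)c + notLast;   /* appended 1 bit at 2^128 */
}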

void poly1305_arm32_blocks(Poly1305* ctx_p, const unsigned char* m_p,
size_t bytes_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const unsigned char* m asm ("r1") = (const unsigned char*)m_p;
register size_t bytes asm ("r2") = (size_t)bytes_p;

__asm__ __volatile__ (
"cmp %[bytes], #16\n\t"
"add r12, %[ctx], #16\n\t"
"bgt L_poly1305_arm32_blocks_begin_neon_%=\n\t"
"ldm r12, {r7, r8, r9, r10, r11}\n\t"
"b L_poly1305_arm32_blocks_start_1_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_begin_neon_%=: \n\t"
"vmov.i16 q15, #0xffff\n\t"
"vshr.u64 q15, q15, #38\n\t"
"vld1.64 {d0-d2}, [r12]\n\t"
"vshl.u64 d4, d2, #24\n\t"
"vsri.u64 d4, d1, #40\n\t"
"vshr.u64 d3, d1, #14\n\t"
"vshl.u64 d2, d1, #12\n\t"
"vsri.u64 d1, d0, #26\n\t"
"vsri.u64 d2, d0, #52\n\t"
"vand.u64 d0, d0, d31\n\t"
"vand.u64 d3, d3, d31\n\t"
"vand.u64 d2, d2, d31\n\t"
"vand.u64 d1, d1, d31\n\t"
"add r3, %[ctx], #0x7c\n\t"
"vldm.32 r3, {d20-d24}\n\t"
"cmp %[bytes], #0x40\n\t"
"bge L_poly1305_arm32_blocks_begin_4_%=\n\t"
"vshl.u32 d6, d21, #2\n\t"
"vshl.u32 d7, d22, #2\n\t"
"vshl.u32 d8, d23, #2\n\t"
"vshl.u32 d9, d24, #2\n\t"
"vadd.u32 d6, d6, d21\n\t"
"vadd.u32 d7, d7, d22\n\t"
"vadd.u32 d8, d8, d23\n\t"
"vadd.u32 d9, d9, d24\n\t"
"b L_poly1305_arm32_blocks_start_2_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_begin_4_%=: \n\t"
"add r3, %[ctx], #0xa4\n\t"
"vldm.32 r3, {d26-d30}\n\t"
"\n"
"L_poly1305_arm32_blocks_start_4_%=: \n\t"
"sub %[bytes], #0x40\n\t"
"vld4.32 {d10-d13}, [%[m]]!\n\t"
"vshl.u32 d6, d27, #2\n\t"
"vshl.u32 d7, d28, #2\n\t"
"vshl.u32 d8, d29, #2\n\t"
"vshl.u32 d9, d30, #2\n\t"
"vadd.u32 d6, d6, d27\n\t"
"vadd.u32 d7, d7, d28\n\t"
"vadd.u32 d8, d8, d29\n\t"
"vadd.u32 d9, d9, d30\n\t"
"vshr.u32 d14, d13, #8\n\t"
"vshl.u32 d13, d13, #18\n\t"
"vorr.i32 d14, d14, #0x1000000\n\t"
"vsri.u32 d13, d12, #14\n\t"
"vshl.u32 d12, d12, #12\n\t"
"vand.i32 d13, d13, #0x3ffffff\n\t"
"vsri.u32 d12, d11, #20\n\t"
"vshl.u32 d11, d11, #6\n\t"
"vand.i32 d12, d12, #0x3ffffff\n\t"
"vsri.u32 d11, d10, #26\n\t"
"vand.i32 d10, d10, #0x3ffffff\n\t"
"vand.i32 d11, d11, #0x3ffffff\n\t"
"vadd.u32 d4, d4, d14\n\t"
"vadd.u32 q1, q1, q6\n\t"
"vadd.u32 q0, q0, q5\n\t"
"vmull.u32 q5, d0, d26\n\t"
"vmull.u32 q6, d0, d27\n\t"
"vmull.u32 q7, d0, d28\n\t"
"vmull.u32 q8, d0, d29\n\t"
"vmull.u32 q9, d0, d30\n\t"
"vmlal.u32 q5, d1, d9\n\t"
"vmlal.u32 q6, d1, d26\n\t"
"vmlal.u32 q7, d1, d27\n\t"
"vmlal.u32 q8, d1, d28\n\t"
"vmlal.u32 q9, d1, d29\n\t"
"vmlal.u32 q5, d2, d8\n\t"
"vmlal.u32 q6, d2, d9\n\t"
"vmlal.u32 q7, d2, d26\n\t"
"vmlal.u32 q8, d2, d27\n\t"
"vmlal.u32 q9, d2, d28\n\t"
"vmlal.u32 q5, d3, d7\n\t"
"vmlal.u32 q6, d3, d8\n\t"
"vmlal.u32 q7, d3, d9\n\t"
"vmlal.u32 q8, d3, d26\n\t"
"vmlal.u32 q9, d3, d27\n\t"
"vmlal.u32 q5, d4, d6\n\t"
"vmlal.u32 q6, d4, d7\n\t"
"vmlal.u32 q7, d4, d8\n\t"
"vmlal.u32 q8, d4, d9\n\t"
"vmlal.u32 q9, d4, d26\n\t"
"vld4.32 {d0-d3}, [%[m]]!\n\t"
"vshl.u32 d6, d21, #2\n\t"
"vshl.u32 d7, d22, #2\n\t"
"vshl.u32 d8, d23, #2\n\t"
"vshl.u32 d9, d24, #2\n\t"
"vadd.u32 d6, d6, d21\n\t"
"vadd.u32 d7, d7, d22\n\t"
"vadd.u32 d8, d8, d23\n\t"
"vadd.u32 d9, d9, d24\n\t"
"vshr.u32 d4, d3, #8\n\t"
"vshl.u32 d3, d3, #18\n\t"
"vorr.i32 d4, d4, #0x1000000\n\t"
"vsri.u32 d3, d2, #14\n\t"
"vshl.u32 d2, d2, #12\n\t"
"vand.i32 d3, d3, #0x3ffffff\n\t"
"vsri.u32 d2, d1, #20\n\t"
"vshl.u32 d1, d1, #6\n\t"
"vand.i32 d2, d2, #0x3ffffff\n\t"
"vsri.u32 d1, d0, #26\n\t"
"vand.i32 d0, d0, #0x3ffffff\n\t"
"vand.i32 d1, d1, #0x3ffffff\n\t"
"vmlal.u32 q5, d0, d20\n\t"
"vmlal.u32 q6, d0, d21\n\t"
"vmlal.u32 q7, d0, d22\n\t"
"vmlal.u32 q8, d0, d23\n\t"
"vmlal.u32 q9, d0, d24\n\t"
"vmlal.u32 q5, d1, d9\n\t"
"vmlal.u32 q6, d1, d20\n\t"
"vmlal.u32 q7, d1, d21\n\t"
"vmlal.u32 q8, d1, d22\n\t"
"vmlal.u32 q9, d1, d23\n\t"
"vmlal.u32 q5, d2, d8\n\t"
"vmlal.u32 q6, d2, d9\n\t"
"vmlal.u32 q7, d2, d20\n\t"
"vmlal.u32 q8, d2, d21\n\t"
"vmlal.u32 q9, d2, d22\n\t"
"vmlal.u32 q5, d3, d7\n\t"
"vmlal.u32 q6, d3, d8\n\t"
"vmlal.u32 q7, d3, d9\n\t"
"vmlal.u32 q8, d3, d20\n\t"
"vmlal.u32 q9, d3, d21\n\t"
"vmlal.u32 q5, d4, d6\n\t"
"vmlal.u32 q6, d4, d7\n\t"
"vmlal.u32 q7, d4, d8\n\t"
"vmlal.u32 q8, d4, d9\n\t"
"vmlal.u32 q9, d4, d20\n\t"
"vadd.u64 d0, d10, d11\n\t"
"vadd.u64 d1, d12, d13\n\t"
"vadd.u64 d2, d14, d15\n\t"
"vadd.u64 d3, d16, d17\n\t"
"vadd.u64 d4, d18, d19\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"vsra.u64 d2, d1, #26\n\t"
"vand.u64 d1, d1, d31\n\t"
"vsra.u64 d3, d2, #26\n\t"
"vand.u64 d2, d2, d31\n\t"
"vsra.u64 d4, d3, #26\n\t"
"vand.u64 d3, d3, d31\n\t"
"vshr.u64 d15, d4, #26\n\t"
"vand.u64 d4, d4, d31\n\t"
"vadd.u64 d0, d0, d15\n\t"
"vshl.u64 d15, d15, #2\n\t"
"vadd.u64 d0, d0, d15\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"cmp %[bytes], #0x40\n\t"
"bge L_poly1305_arm32_blocks_start_4_%=\n\t"
"cmp %[bytes], #32\n\t"
"blt L_poly1305_arm32_blocks_done_neon_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_start_2_%=: \n\t"
"sub %[bytes], #32\n\t"
"vld4.32 {d10-d13}, [%[m]]!\n\t"
"vshr.u32 d14, d13, #8\n\t"
"vshl.u32 d13, d13, #18\n\t"
"vorr.i32 d14, d14, #0x1000000\n\t"
"vsri.u32 d13, d12, #14\n\t"
"vshl.u32 d12, d12, #12\n\t"
"vand.i32 d13, d13, #0x3ffffff\n\t"
"vsri.u32 d12, d11, #20\n\t"
"vshl.u32 d11, d11, #6\n\t"
"vand.i32 d12, d12, #0x3ffffff\n\t"
"vsri.u32 d11, d10, #26\n\t"
"vand.i32 d10, d10, #0x3ffffff\n\t"
"vand.i32 d11, d11, #0x3ffffff\n\t"
"vadd.u32 d4, d4, d14\n\t"
"vadd.u32 q1, q1, q6\n\t"
"vadd.u32 q0, q0, q5\n\t"
"vmull.u32 q5, d0, d20\n\t"
"vmull.u32 q6, d0, d21\n\t"
"vmull.u32 q7, d0, d22\n\t"
"vmull.u32 q8, d0, d23\n\t"
"vmull.u32 q9, d0, d24\n\t"
"vmlal.u32 q5, d1, d9\n\t"
"vmlal.u32 q6, d1, d20\n\t"
"vmlal.u32 q7, d1, d21\n\t"
"vmlal.u32 q8, d1, d22\n\t"
"vmlal.u32 q9, d1, d23\n\t"
"vmlal.u32 q5, d2, d8\n\t"
"vmlal.u32 q6, d2, d9\n\t"
"vmlal.u32 q7, d2, d20\n\t"
"vmlal.u32 q8, d2, d21\n\t"
"vmlal.u32 q9, d2, d22\n\t"
"vmlal.u32 q5, d3, d7\n\t"
"vmlal.u32 q6, d3, d8\n\t"
"vmlal.u32 q7, d3, d9\n\t"
"vmlal.u32 q8, d3, d20\n\t"
"vmlal.u32 q9, d3, d21\n\t"
"vmlal.u32 q5, d4, d6\n\t"
"vmlal.u32 q6, d4, d7\n\t"
"vmlal.u32 q7, d4, d8\n\t"
"vmlal.u32 q8, d4, d9\n\t"
"vmlal.u32 q9, d4, d20\n\t"
"vadd.u64 d0, d10, d11\n\t"
"vadd.u64 d1, d12, d13\n\t"
"vadd.u64 d2, d14, d15\n\t"
"vadd.u64 d3, d16, d17\n\t"
"vadd.u64 d4, d18, d19\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"vsra.u64 d2, d1, #26\n\t"
"vand.u64 d1, d1, d31\n\t"
"vsra.u64 d3, d2, #26\n\t"
"vand.u64 d2, d2, d31\n\t"
"vsra.u64 d4, d3, #26\n\t"
"vand.u64 d3, d3, d31\n\t"
"vshr.u64 d5, d4, #26\n\t"
"vand.u64 d4, d4, d31\n\t"
"vadd.u64 d0, d0, d5\n\t"
"vshl.u64 d5, d5, #2\n\t"
"vadd.u64 d0, d0, d5\n\t"
"vsra.u64 d1, d0, #26\n\t"
"vand.u64 d0, d0, d31\n\t"
"\n"
"L_poly1305_arm32_blocks_done_neon_%=: \n\t"
"cmp %[bytes], #16\n\t"
"beq L_poly1305_arm32_blocks_begin_1_%=\n\t"
"add r12, %[ctx], #16\n\t"
"vsli.u64 d0, d1, #26\n\t"
"vsli.u64 d0, d2, #52\n\t"
"vshr.u64 d1, d2, #12\n\t"
"vsli.u64 d1, d3, #14\n\t"
"vsli.u64 d1, d4, #40\n\t"
"vshr.u64 d2, d4, #24\n\t"
"vst1.64 {d0-d2}, [r12]\n\t"
"b L_poly1305_arm32_blocks_done_%=\n\t"
"\n"
"L_poly1305_arm32_blocks_begin_1_%=: \n\t"
"vsli.u64 d0, d1, #26\n\t"
"vsli.u64 d0, d2, #52\n\t"
"vshr.u64 d1, d2, #12\n\t"
"vsli.u64 d1, d3, #14\n\t"
"vsli.u64 d1, d4, #40\n\t"
"vshr.u64 d2, d4, #24\n\t"
"vmov r7, r8, d0\n\t"
"vmov r9, r10, d1\n\t"
"vmov r11, d2[0]\n\t"
"\n"
"L_poly1305_arm32_blocks_start_1_%=: \n\t"
"mov r12, #1\n\t"
"push {r2}\n\t"
/* Load message */
"ldm %[m], {r2, r3, r4, r5}\n\t"
/* Add message */
"adds r7, r7, %[bytes]\n\t"
"adcs r8, r8, r3\n\t"
"adcs r9, r9, r4\n\t"
"adcs r10, r10, r5\n\t"
"adc r11, r11, r12\n\t"
"push {r0-r1}\n\t"
"add %[m], %[ctx], #0\n\t"
"add lr, %[ctx], #16\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
"stm lr, {r7, r8, r9, r10, r11}\n\t"
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
"str r10, [lr, #12]\n\t"
"str r11, [lr, #16]\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
"ldr r3, [%[m]]\n\t"
"eor %[ctx], %[ctx], %[ctx]\n\t"
/* r[0] * h[0] */
/* h[0] in r4 */
"umull r7, r8, r3, r7\n\t"
/* r[0] * h[2] */
/* h[2] in r6 */
"umull r9, r10, r3, r9\n\t"
/* r[0] * h[4] */
/* h[4] in r8 */
"mul r11, r3, r11\n\t"
/* r[0] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r8, r12, r3, %[bytes]\n\t"
/* r[0] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, r10, %[ctx]\n\t"
"umlal r10, r11, r3, %[bytes]\n\t"
/* r[1] * h[0] */
"ldr r3, [%[m], #4]\n\t"
"ldr %[bytes], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r8, r12, r3, %[bytes]\n\t"
/* r[1] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"adds r9, r9, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r9, r12, r3, %[bytes]\n\t"
/* r[1] * h[2] */
"ldr %[bytes], [lr, #8]\n\t"
"adds r10, r10, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r10, r12, r3, %[bytes]\n\t"
/* r[1] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"adds r11, r11, r12\n\t"
"adc r4, %[ctx], %[ctx]\n\t"
"umlal r11, r4, r3, %[bytes]\n\t"
/* r[1] * h[4] */
"ldr %[bytes], [lr, #16]\n\t"
"mla r4, r3, %[bytes], r4\n\t"
/* r[2] * h[0] */
"ldr r3, [%[m], #8]\n\t"
"ldr %[bytes], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r9, r12, r3, %[bytes]\n\t"
/* r[2] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"adds r10, r10, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r10, r12, r3, %[bytes]\n\t"
/* r[2] * h[2] */
"ldr %[bytes], [lr, #8]\n\t"
"adds r11, r11, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r11, r12, r3, %[bytes]\n\t"
/* r[2] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"adds r4, r4, r12\n\t"
"adc r5, %[ctx], %[ctx]\n\t"
"umlal r4, r5, r3, %[bytes]\n\t"
/* r[2] * h[4] */
"ldr %[bytes], [lr, #16]\n\t"
"mla r5, r3, %[bytes], r5\n\t"
/* r[3] * h[0] */
"ldr r3, [%[m], #12]\n\t"
"ldr %[bytes], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r10, r12, r3, %[bytes]\n\t"
/* r[3] * h[1] */
"ldr %[bytes], [lr, #4]\n\t"
"adds r11, r11, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r11, r12, r3, %[bytes]\n\t"
/* r[3] * h[2] */
"ldr %[bytes], [lr, #8]\n\t"
"adds r4, r4, r12\n\t"
"adc r5, r5, %[ctx]\n\t"
"umlal r4, r5, r3, %[bytes]\n\t"
/* r[3] * h[3] */
"ldr %[bytes], [lr, #12]\n\t"
"mov r6, %[ctx]\n\t"
"umlal r5, r6, r3, %[bytes]\n\t"
/* r[3] * h[4] */
"ldr %[bytes], [lr, #16]\n\t"
"mov r12, %[ctx]\n\t"
"mla r6, r3, %[bytes], r6\n\t"
#else
"sub sp, sp, #12\n\t"
"ldm %[m], {r0, r1, r2, r3}\n\t"
/* r[0] * h[0] */
"umull r5, r6, %[ctx], r7\n\t"
/* r[1] * h[0] */
"umull r12, r10, %[m], r7\n\t"
/* r[0] * h[1] */
"umaal r6, r12, %[ctx], r8\n\t"
/* r[2] * h[0] */
"umull r11, r4, %[bytes], r7\n\t"
/* r[1] * h[1] */
"umaal r12, r11, %[m], r8\n\t"
/* r[0] * h[2] */
"umaal r12, r10, %[ctx], r9\n\t"
/* r[3] * h[0] */
"umaal r11, r4, r3, r7\n\t"
"stm sp, {r5, r6, r12}\n\t"
/* r[2] * h[1] */
"umaal r10, r11, %[bytes], r8\n\t"
/* Replace h[0] with h[3] */
"ldr r7, [lr, #12]\n\t"
/* r[1] * h[2] */
"umull r5, r6, %[m], r9\n\t"
/* r[2] * h[2] */
"umaal r11, r4, %[bytes], r9\n\t"
/* r[0] * h[3] */
"umaal r10, r5, %[ctx], r7\n\t"
/* r[3] * h[1] */
"umaal r11, r6, r3, r8\n\t"
/* r[1] * h[3] */
"umaal r11, r5, %[m], r7\n\t"
/* r[3] * h[2] */
"umaal r4, r6, r3, r9\n\t"
/* r[2] * h[3] */
"umaal r4, r5, %[bytes], r7\n\t"
/* Replace h[1] with h[4] */
"ldr r8, [lr, #16]\n\t"
/* r[3] * h[3] */
"umaal r5, r6, r3, r7\n\t"
"mov r12, #0\n\t"
/* r[0] * h[4] */
"umaal r11, r12, %[ctx], r8\n\t"
/* r[1] * h[4] */
"umaal r4, r12, %[m], r8\n\t"
/* r[2] * h[4] */
"umaal r5, r12, %[bytes], r8\n\t"
/* r[3] * h[4] */
"umaal r6, r12, r3, r8\n\t"
/* DONE */
"ldm sp, {r7, r8, r9}\n\t"
"add sp, sp, #12\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
/* Reduce mod 2^130 - 5 */
"bic r3, r11, #0x3\n\t"
"and r11, r11, #3\n\t"
"adds r7, r7, r3\n\t"
"lsr r3, r3, #2\n\t"
"adcs r8, r8, r4\n\t"
"orr r3, r3, r4, LSL #30\n\t"
"adcs r9, r9, r5\n\t"
"lsr r4, r4, #2\n\t"
"adcs r10, r10, r6\n\t"
"orr r4, r4, r5, LSL #30\n\t"
"adc r11, r11, r12\n\t"
"lsr r5, r5, #2\n\t"
"adds r7, r7, r3\n\t"
"orr r5, r5, r6, LSL #30\n\t"
"adcs r8, r8, r4\n\t"
"lsr r6, r6, #2\n\t"
"adcs r9, r9, r5\n\t"
"adcs r10, r10, r6\n\t"
"adc r11, r11, r12\n\t"
"pop {r0-r1}\n\t"
"pop {r2}\n\t"
"add r12, %[ctx], #16\n\t"
"stm r12, {r7, r8, r9, r10, r11}\n\t"
"\n"
"L_poly1305_arm32_blocks_done_%=: \n\t"
: [ctx] "+r" (ctx), [m] "+r" (m), [bytes] "+r" (bytes)
:
: "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
"r10", "r11", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
"d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
"d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31"
);
}
|
||||
|
||||
static const word32 L_poly1305_arm32_clamp[] = {
|
||||
0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
|
||||
};
|
||||
|
||||
void poly1305_set_key(Poly1305* ctx_p, const byte* key_p)
{
    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
    register const byte* key asm ("r1") = (const byte*)key_p;
    register word32* L_poly1305_arm32_clamp_c asm ("r2") =
        (word32*)&L_poly1305_arm32_clamp;

    __asm__ __volatile__ (
        /* Load mask. */
        "mov lr, %[L_poly1305_arm32_clamp]\n\t"
        "ldm lr, {r6, r7, r8, r9}\n\t"
        /* Load and cache padding. */
        "ldr r2, [%[key], #16]\n\t"
        "ldr r3, [%[key], #20]\n\t"
        "ldr r4, [%[key], #24]\n\t"
        "ldr r5, [%[key], #28]\n\t"
        "add lr, %[ctx], #40\n\t"
        "stm lr, {r2, r3, r4, r5}\n\t"
        /* Load, mask and store r. */
        "ldr r2, [%[key]]\n\t"
        "ldr r3, [%[key], #4]\n\t"
        "ldr r4, [%[key], #8]\n\t"
        "ldr r5, [%[key], #12]\n\t"
        "and r2, r2, r6\n\t"
        "and r3, r3, r7\n\t"
        "and r4, r4, r8\n\t"
        "and r5, r5, r9\n\t"
        "add lr, %[ctx], #0\n\t"
        "stm lr, {r2, r3, r4, r5}\n\t"
        "vmov.i16 q10, #0xffff\n\t"
        "vshr.u64 q10, q10, #38\n\t"
        "lsr r8, r2, #26\n\t"
        "lsr r9, r3, #20\n\t"
        "lsr r10, r4, #14\n\t"
        "lsr r11, r5, #8\n\t"
        "eor r8, r8, r3, lsl #6\n\t"
        "eor r9, r9, r4, lsl #12\n\t"
        "eor r10, r10, r5, lsl #18\n\t"
        "and r7, r2, #0x3ffffff\n\t"
        "and r8, r8, #0x3ffffff\n\t"
        "and r9, r9, #0x3ffffff\n\t"
        "and r10, r10, #0x3ffffff\n\t"
        "vmov.i32 s1, r7\n\t"
        "vmov.i32 s3, r8\n\t"
        "vmov.i32 s5, r9\n\t"
        "vmov.i32 s7, r10\n\t"
        "vmov.i32 s9, r11\n\t"
        "push {%[ctx]-%[key]}\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
        /* Square r */
        "umull %[key], r6, r2, r3\n\t"
        "mov r12, #0\n\t"
        "umull r7, r8, r2, r5\n\t"
        "mov lr, r12\n\t"
        "umlal r6, lr, r2, r4\n\t"
        "adds r7, r7, lr\n\t"
        "adc lr, r12, r12\n\t"
        "umlal r7, lr, r3, r4\n\t"
        "mov r9, r12\n\t"
        "umlal lr, r9, r3, r5\n\t"
        "adds r8, r8, lr\n\t"
        "adcs r9, r9, r12\n\t"
        "adc r10, r12, r12\n\t"
        "umlal r9, r10, r4, r5\n\t"
        "adds %[key], %[key], %[key]\n\t"
        "adcs r6, r6, r6\n\t"
        "adcs r7, r7, r7\n\t"
        "adcs r8, r8, r8\n\t"
        "adcs r9, r9, r9\n\t"
        "adcs r10, r10, r10\n\t"
        "adc r11, r12, r12\n\t"
        "umull %[ctx], lr, r2, r2\n\t"
        "adds %[key], %[key], lr\n\t"
        "adcs r6, r6, r12\n\t"
        "adc lr, r12, r12\n\t"
        "umlal r6, lr, r3, r3\n\t"
        "adds r7, r7, lr\n\t"
        "adcs r8, r8, r12\n\t"
        "adc lr, r12, r12\n\t"
        "umlal r8, lr, r4, r4\n\t"
        "adds r9, r9, lr\n\t"
        "adcs r10, r10, r12\n\t"
        "adc r11, r11, r12\n\t"
        "umlal r10, r11, r5, r5\n\t"
#else
        "umull %[ctx], %[key], r2, r2\n\t"
        "umull r6, r7, r2, r3\n\t"
        "adds r6, r6, r6\n\t"
        "mov r12, #0\n\t"
        "umaal %[key], r6, r12, r12\n\t"
        "mov r8, r12\n\t"
        "umaal r8, r7, r2, r4\n\t"
        "adcs r8, r8, r8\n\t"
        "umaal r6, r8, r3, r3\n\t"
        "umull r9, r10, r2, r5\n\t"
        "umaal r7, r9, r3, r4\n\t"
        "adcs r7, r7, r7\n\t"
        "umaal r7, r8, r12, r12\n\t"
        "umaal r10, r9, r3, r5\n\t"
        "adcs r10, r10, r10\n\t"
        "umaal r8, r10, r4, r4\n\t"
        "mov r11, r12\n\t"
        "umaal r9, r11, r4, r5\n\t"
        "adcs r9, r9, r9\n\t"
        "umaal r9, r10, r12, r12\n\t"
        "adcs r11, r11, r11\n\t"
        "umaal r10, r11, r5, r5\n\t"
        "adc r11, r11, r12\n\t"
#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */
        /* Reduce mod 2^130 - 5 */
        "bic r2, r8, #0x3\n\t"
        "and r8, r8, #3\n\t"
        "adds %[ctx], %[ctx], r2\n\t"
        "lsr r2, r2, #2\n\t"
        "adcs %[key], %[key], r9\n\t"
        "orr r2, r2, r9, LSL #30\n\t"
        "adcs r6, r6, r10\n\t"
        "lsr r9, r9, #2\n\t"
        "adcs r7, r7, r11\n\t"
        "orr r9, r9, r10, LSL #30\n\t"
        "adc r8, r8, r12\n\t"
        "lsr r10, r10, #2\n\t"
        "adds %[ctx], %[ctx], r2\n\t"
        "orr r10, r10, r11, LSL #30\n\t"
        "adcs %[key], %[key], r9\n\t"
        "lsr r11, r11, #2\n\t"
        "adcs r6, r6, r10\n\t"
        "adcs r7, r7, r11\n\t"
        "adc r8, r8, r12\n\t"
        "lsr r3, %[ctx], #26\n\t"
        "lsr r4, %[key], #20\n\t"
        "lsr r5, r6, #14\n\t"
        "lsr r10, r7, #8\n\t"
        "eor r3, r3, %[key], lsl #6\n\t"
        "eor r4, r4, r6, lsl #12\n\t"
        "eor r5, r5, r7, lsl #18\n\t"
        "eor r10, r10, r8, lsl #24\n\t"
        "and r2, %[ctx], #0x3ffffff\n\t"
        "and r3, r3, #0x3ffffff\n\t"
        "and r4, r4, #0x3ffffff\n\t"
        "and r5, r5, #0x3ffffff\n\t"
        "vmov.i32 s0, r2\n\t"
        "vmov.i32 s2, r3\n\t"
        "vmov.i32 s4, r4\n\t"
        "vmov.i32 s6, r5\n\t"
        "vmov.i32 s8, r10\n\t"
        "pop {%[ctx]-%[key]}\n\t"
        "add lr, %[ctx], #0x7c\n\t"
        "vstm.32 lr, {d0-d4}\n\t"
        /* Multiply r^2, r by r^2 */
        "vshl.u32 d6, d1, #2\n\t"
        "vshl.u32 d7, d2, #2\n\t"
        "vshl.u32 d8, d3, #2\n\t"
        "vshl.u32 d9, d4, #2\n\t"
        "vadd.u32 d6, d6, d1\n\t"
        "vadd.u32 d7, d7, d2\n\t"
        "vadd.u32 d8, d8, d3\n\t"
        "vadd.u32 d9, d9, d4\n\t"
        "vmull.u32 q5, d0, d0[0]\n\t"
        "vmull.u32 q6, d0, d1[0]\n\t"
        "vmull.u32 q7, d0, d2[0]\n\t"
        "vmull.u32 q8, d0, d3[0]\n\t"
        "vmull.u32 q9, d0, d4[0]\n\t"
        "vmlal.u32 q5, d1, d9[0]\n\t"
        "vmlal.u32 q6, d1, d0[0]\n\t"
        "vmlal.u32 q7, d1, d1[0]\n\t"
        "vmlal.u32 q8, d1, d2[0]\n\t"
        "vmlal.u32 q9, d1, d3[0]\n\t"
        "vmlal.u32 q5, d2, d8[0]\n\t"
        "vmlal.u32 q6, d2, d9[0]\n\t"
        "vmlal.u32 q7, d2, d0[0]\n\t"
        "vmlal.u32 q8, d2, d1[0]\n\t"
        "vmlal.u32 q9, d2, d2[0]\n\t"
        "vmlal.u32 q5, d3, d7[0]\n\t"
        "vmlal.u32 q6, d3, d8[0]\n\t"
        "vmlal.u32 q7, d3, d9[0]\n\t"
        "vmlal.u32 q8, d3, d0[0]\n\t"
        "vmlal.u32 q9, d3, d1[0]\n\t"
        "vmlal.u32 q5, d4, d6[0]\n\t"
        "vmlal.u32 q6, d4, d7[0]\n\t"
        "vmlal.u32 q7, d4, d8[0]\n\t"
        "vmlal.u32 q8, d4, d9[0]\n\t"
        "vmlal.u32 q9, d4, d0[0]\n\t"
        "vsra.u64 q6, q5, #26\n\t"
        "vand.u64 q5, q5, q10\n\t"
        "vsra.u64 q7, q6, #26\n\t"
        "vand.u64 q6, q6, q10\n\t"
        "vsra.u64 q8, q7, #26\n\t"
        "vand.u64 q7, q7, q10\n\t"
        "vsra.u64 q9, q8, #26\n\t"
        "vand.u64 q8, q8, q10\n\t"
        "vshr.u64 q3, q9, #26\n\t"
        "vand.u64 q9, q9, q10\n\t"
        "vadd.u64 q5, q5, q3\n\t"
        "vshl.u64 q3, q3, #2\n\t"
        "vadd.u64 q5, q5, q3\n\t"
        "vsra.u64 q6, q5, #26\n\t"
        "vand.u64 q5, q5, q10\n\t"
        "vmovn.i64 d10, q5\n\t"
        "vmovn.i64 d11, q6\n\t"
        "vmovn.i64 d12, q7\n\t"
        "vmovn.i64 d13, q8\n\t"
        "vmovn.i64 d14, q9\n\t"
        "add lr, %[ctx], #0xa4\n\t"
        "vstm.32 lr, {d10-d14}\n\t"
        /* h (accumulator) = 0 */
        "eor r6, r6, r6\n\t"
        "eor r7, r7, r7\n\t"
        "eor r8, r8, r8\n\t"
        "eor r9, r9, r9\n\t"
        "add lr, %[ctx], #16\n\t"
        "eor r4, r4, r4\n\t"
        "eor r5, r5, r5\n\t"
        "stm lr, {r4, r5, r6, r7, r8, r9}\n\t"
        /* Zero leftover */
        "str r5, [%[ctx], #56]\n\t"
        : [ctx] "+r" (ctx), [key] "+r" (key),
          [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c)
        :
        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
          "r10", "r11", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
          "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
          "d19", "d20", "d21"
    );
}

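The lsr/eor/and runs in poly1305_set_key repack a 128-bit value (r, then r^2 after the squaring) from four 32-bit words into the five base-2^26 limbs the NEON lanes work on. The same repacking in C -- a sketch with hypothetical names; eor and orr agree here because the shifted bit ranges are disjoint:

#include <stdint.h>

/* Sketch: split four 32-bit words into five 26-bit limbs. */
static void to_base26_sketch(uint32_t out[5], const uint32_t in[4])
{
    out[0] =   in[0]                         & 0x3ffffff;
    out[1] = ((in[0] >> 26) | (in[1] <<  6)) & 0x3ffffff;
    out[2] = ((in[1] >> 20) | (in[2] << 12)) & 0x3ffffff;
    out[3] = ((in[2] >> 14) | (in[3] << 18)) & 0x3ffffff;
    out[4] =  (in[3] >>  8);                 /* top limb, < 2^24 */
}
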
void poly1305_final(Poly1305* ctx_p, byte* mac_p)
{
    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
    register byte* mac asm ("r1") = (byte*)mac_p;

    __asm__ __volatile__ (
        "add r9, %[ctx], #16\n\t"
        "ldm r9, {r4, r5, r6, r7, r8}\n\t"
        /* Add 5 and check for h larger than p. */
        "adds r2, r4, #5\n\t"
        "adcs r2, r5, #0\n\t"
        "adcs r2, r6, #0\n\t"
        "adcs r2, r7, #0\n\t"
        "adc r2, r8, #0\n\t"
        "sub r2, r2, #4\n\t"
        "lsr r2, r2, #31\n\t"
        "sub r2, r2, #1\n\t"
        "and r2, r2, #5\n\t"
        /* Add 0/5 to h. */
        "adds r4, r4, r2\n\t"
        "adcs r5, r5, #0\n\t"
        "adcs r6, r6, #0\n\t"
        "adc r7, r7, #0\n\t"
        /* Add padding */
        "add r9, %[ctx], #40\n\t"
        "ldm r9, {r2, r3, r12, lr}\n\t"
        "adds r4, r4, r2\n\t"
        "adcs r5, r5, r3\n\t"
        "adcs r6, r6, r12\n\t"
        "adc r7, r7, lr\n\t"
        /* Store MAC */
        "str r4, [%[mac]]\n\t"
        "str r5, [%[mac], #4]\n\t"
        "str r6, [%[mac], #8]\n\t"
        "str r7, [%[mac], #12]\n\t"
        /* Zero out h. */
        "eor r4, r4, r4\n\t"
        "eor r5, r5, r5\n\t"
        "eor r6, r6, r6\n\t"
        "eor r7, r7, r7\n\t"
        "eor r8, r8, r8\n\t"
        "add r9, %[ctx], #16\n\t"
        "stm r9, {r4, r5, r6, r7, r8}\n\t"
        /* Zero out r. */
        "add r9, %[ctx], #0\n\t"
        "stm r9, {r4, r5, r6, r7}\n\t"
        /* Zero out padding. */
        "add r9, %[ctx], #40\n\t"
        "stm r9, {r4, r5, r6, r7}\n\t"
        : [ctx] "+r" (ctx), [mac] "+r" (mac)
        :
        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
          "r9"
    );
}

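poly1305_final selects between h and h + 5 - 2^130 without branching: it computes h + 5, turns bit 130 of the result into a 0/5 mask (the sub/lsr/sub/and run), adds the mask to h, and then adds the pad modulo 2^128. A C sketch of the same selection, assuming h < 2^130 on entry (which the block reduction guarantees), with a hypothetical helper name:

#include <stdint.h>
#include <string.h>

/* Sketch: constant-time final reduction and MAC output. */
static void poly1305_final_sketch(unsigned char mac[16],
    uint32_t h[5], const uint32_t pad[4])
{
    uint64_t c = 5;
    uint32_t g[5];
    uint32_t mask;
    int i;

    for (i = 0; i < 5; i++) {   /* g = h + 5 */
        c += h[i];
        g[i] = (uint32_t)c;
        c >>= 32;
    }
    /* If bit 130 of g is set then h >= p, and g mod 2^128 is h - p. */
    mask = (uint32_t)0 - ((g[4] >> 2) & 1);
    c = 0;
    for (i = 0; i < 4; i++) {   /* add the pad, dropping the last carry */
        c += (uint64_t)(h[i] ^ (mask & (h[i] ^ g[i]))) + pad[i];
        h[i] = (uint32_t)c;
        c >>= 32;
    }
    memcpy(mac, h, 16);         /* little-endian store, as on ARM32 */
}
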
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */
#endif /* WOLFSSL_ARMASM */

@ -1150,7 +1150,11 @@ void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m)
 */
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes)
{
    poly1305_blocks_arm32_16(ctx, m, bytes, 1);
#ifndef WOLFSSL_ARMASM_NO_NEON
    poly1305_arm32_blocks(ctx, m, bytes);
#else
    poly1305_arm32_blocks_16(ctx, m, bytes, 1);
#endif
}

/* Process 16 bytes of message.
@ -1160,7 +1164,7 @@ void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes)
 */
void poly1305_block_arm32(Poly1305* ctx, const unsigned char* m)
{
    poly1305_blocks_arm32_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
    poly1305_arm32_blocks_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
}
#endif

@ -1219,6 +1223,16 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)

    /* Process the remaining partial block - last block. */
    if (ret == 0) {
#if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON)
        if (ctx->leftover >= POLY1305_BLOCK_SIZE) {
            size_t len = ctx->leftover & (~(POLY1305_BLOCK_SIZE - 1));
            poly1305_arm32_blocks(ctx, ctx->buffer, len);
            ctx->leftover -= len;
            if (ctx->leftover) {
                XMEMCPY(ctx->buffer, ctx->buffer + len, ctx->leftover);
            }
        }
#endif
        if (ctx->leftover) {
            size_t i = ctx->leftover;
            ctx->buffer[i++] = 1;
@ -1229,7 +1243,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
            poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE,
                0);
#else
            poly1305_blocks_arm32_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0);
            poly1305_arm32_blocks_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0);
#endif
        }

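Between the two hunks above, wc_Poly1305Final pads the partial block in the usual Poly1305 way: append a 0x01 byte, zero-fill to 16 bytes, then process it with the final-block flag (the 0 passed as notLast, so no implicit 2^128 bit is added). As a standalone sketch -- a hypothetical helper; the zero-fill itself sits in the elided context between the hunks:

#include <string.h>

/* Sketch: pad the final partial block before the last 16-byte round. */
static void pad_last_block_sketch(unsigned char* buf, size_t used,
    size_t blockSz)
{
    buf[used++] = 1;                        /* append the 0x01 byte */
    memset(buf + used, 0, blockSz - used);  /* zero-fill the rest */
}
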
@ -8201,6 +8201,31 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void)
            return WC_TEST_RET_ENC_I(i);
    }

    /* Testing multiple updates with various sizes works. */
    for (i = 1; i < (int)sizeof(msg6); i++) {
        int j;

        ret = wc_Poly1305SetKey(&enc, key, 32);
        if (ret != 0)
            return WC_TEST_RET_ENC_I(i);

        for (j = 0; j < (int)sizeof(msg6); j += i) {
            int len = (int)sizeof(msg6) - j;
            if (len > i)
                len = i;
            ret = wc_Poly1305Update(&enc, msg6 + j, len);
            if (ret != 0)
                return WC_TEST_RET_ENC_I(j);
        }

        ret = wc_Poly1305Final(&enc, tag);
        if (ret != 0)
            return WC_TEST_RET_ENC_I(i);

        if (XMEMCMP(tag, correct6, sizeof(tag)))
            return WC_TEST_RET_ENC_I(i);
    }

    /* Check TLS MAC function from 2.8.2 https://tools.ietf.org/html/rfc7539 */
    XMEMSET(tag, 0, sizeof(tag));
    ret = wc_Poly1305SetKey(&enc, key4, sizeof(key4));

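The loop above feeds msg6 in every chunk size from 1 byte up, so the new NEON buffering (leftover carried across updates) is exercised for all alignments and must match the one-shot tag correct6. The same pattern as a standalone helper -- a sketch using the public API:

#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/poly1305.h>

/* Sketch: chunked MAC; must agree with a single update for any chunk. */
static int mac_in_chunks(byte* tag, const byte* key,
    const byte* msg, word32 msgSz, word32 chunk)
{
    Poly1305 enc;
    word32 i;
    int ret = wc_Poly1305SetKey(&enc, key, 32);

    for (i = 0; ret == 0 && i < msgSz; i += chunk) {
        word32 len = (msgSz - i < chunk) ? (msgSz - i) : chunk;
        ret = wc_Poly1305Update(&enc, msg + i, len);
    }
    if (ret == 0)
        ret = wc_Poly1305Final(&enc, tag);  /* writes a 16-byte tag */
    return ret;
}
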
@ -98,7 +98,18 @@ typedef struct Poly1305 {
    word64 leftover;
    unsigned char buffer[POLY1305_BLOCK_SIZE];
    unsigned char finished;
#elif defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_THUMB2) && \
    !defined(WOLFSSL_ARMASM_NO_NEON)
    /* NEON implementation for ARM32 */
    word32 r[4];
    word32 h[6];
    word32 pad[4];
    word32 leftover;
    unsigned char buffer[4*POLY1305_BLOCK_SIZE];
    word32 r_21[10];
    word32 r_43[10];
#elif defined(WOLFSSL_ARMASM)
    /* ARM32 (non-NEON) and Thumb2 */
    word32 r[4];
    word32 h[5];
    word32 pad[4];
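On the NEON path the context grows: buffer now holds up to four blocks, and r_21/r_43 each hold ten words -- judging by the stores at ctx + 0x7c and ctx + 0xa4 in poly1305_set_key above, these appear to be the base-2^26 limbs of r^2 interleaved with r, and of r^4 interleaved with r^3 (five limb pairs each). Caching these powers is what lets the block routine fold four blocks per iteration with the usual 4-way Poly1305 schedule, h = ((h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r) mod (2^130 - 5).
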
@ -173,7 +184,8 @@ void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m,
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char *m, size_t bytes);
void poly1305_block_arm32(Poly1305* ctx, const unsigned char *m);

void poly1305_blocks_arm32_16(Poly1305* ctx, const unsigned char* m, word32 len,
void poly1305_arm32_blocks(Poly1305* ctx, const unsigned char* m, word32 len);
void poly1305_arm32_blocks_16(Poly1305* ctx, const unsigned char* m, word32 len,
    int notLast);
#endif
void poly1305_set_key(Poly1305* ctx, const byte* key);
