ARM ASM: ARMv7a with NEON instructions

Change to build assembly code for ARMv7a with NEON instruction set.
./configure -host=armv7a --enable-armasm
Added ARM32 SHA-256 NEON only implementation.
This commit is contained in:
Sean Parkinson
2022-05-18 16:23:48 +10:00
parent 602116c3f2
commit 805b0eb606
9 changed files with 5159 additions and 26 deletions

View File

@@ -2061,6 +2061,7 @@ then
esac
# Include options.h
AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN"
ENABLED_ARMASM_CRYPTO=yes
# Check for and set -mstrict-align compiler flag
# Used to set assumption that Aarch64 systems will not handle
@@ -2077,12 +2078,22 @@ then
AM_CPPFLAGS="$AM_CPPFLAGS -mstrict-align"
AC_MSG_NOTICE([64bit ARMv8, setting -mstrict-align]);;
esac
AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto]);;
AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto])
;;
armv7a)
# ARMv7-a host: compile for NEON and define WOLFSSL_ARMASM_NO_CRYPTO so that
# sources guarded by that macro leave out the crypto-instruction code paths.
AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon-vfpv3 -DWOLFSSL_ARMASM_NO_CRYPTO"
# Include options.h
AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN"
# Tested later by the BUILD_ARMASM_CRYPTO automake conditional, which is
# therefore false for this host.
ENABLED_ARMASM_CRYPTO=no
AC_MSG_NOTICE([32bit ARMv7-a found, setting mfpu to neon-vfpv3])
;;
*)
AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8"
# Include options.h
AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN"
AC_MSG_NOTICE([32bit ARMv8 found, setting mfpu to crypto-neon-fp-armv8]);;
ENABLED_ARMASM_CRYPTO=yes
AC_MSG_NOTICE([32bit ARMv8 found, setting mfpu to crypto-neon-fp-armv8])
;;
esac
esac
fi
@@ -7998,6 +8009,7 @@ AM_CONDITIONAL([BUILD_AESGCM],[test "x$ENABLED_AESGCM" = "xyes" || test "x$ENABL
AM_CONDITIONAL([BUILD_AESCCM],[test "x$ENABLED_AESCCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"])
AM_CONDITIONAL([BUILD_ARMASM],[test "x$ENABLED_ARMASM" = "xyes"])
AM_CONDITIONAL([BUILD_ARMASM_INLINE],[test "x$ENABLED_ARMASM_INLINE" = "xyes"])
AM_CONDITIONAL([BUILD_ARMASM_CRYPTO],[test "x$ENABLED_ARMASM_CRYPTO" = "xyes"])
AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"])
AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"])
AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"])

View File

@@ -187,7 +187,7 @@ endif
if BUILD_AES
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes.c
if BUILD_ARMASM
if BUILD_ARMASM_CRYPTO
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c
endif
endif
@@ -203,6 +203,11 @@ endif
if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
endif
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c
if BUILD_INTELASM
@@ -300,10 +305,15 @@ endif
endif !BUILD_FIPS_CURRENT
if !BUILD_FIPS_CURRENT
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c
if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
endif
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256_asm.S
endif
@@ -383,7 +393,7 @@ endif
if !BUILD_FIPS_CURRENT
if BUILD_AES
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes.c
if BUILD_ARMASM
if BUILD_ARMASM_CRYPTO
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c
endif
if BUILD_AFALG

View File

@@ -306,7 +306,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits
#include <wolfcrypt/src/misc.c>
#endif
#if !defined(WOLFSSL_ARMASM)
#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARMASM_NO_CRYPTO)
#ifdef WOLFSSL_IMX6_CAAM_BLOB
/* case of possibly not using hardware acceleration for AES but using key
@@ -4601,7 +4601,7 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz)
#endif
#ifdef WOLFSSL_ARMASM
#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_CRYPTO)
/* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */
#elif defined(WOLFSSL_AFALG)
@@ -9933,7 +9933,7 @@ int wc_AesCcmCheckTagSize(int sz)
return 0;
}
#ifdef WOLFSSL_ARMASM
#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_CRYPTO)
/* implementation located in wolfcrypt/src/port/arm/armv8-aes.c */
#elif defined(HAVE_COLDFIRE_SEC)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -225,7 +225,7 @@ Transform_Sha512_Len:
strd r6, r7, [sp, #176]
strd r8, r9, [sp, #184]
# Start of loop processing a block
L_sha512_len_neon_begin:
L_SHA512_transform_len_begin:
# Load, Reverse and Store W
ldr r12, [r1]
ldr lr, [r1, #4]
@@ -319,7 +319,7 @@ L_sha512_len_neon_begin:
eor r9, r9, lr
mov r10, #4
# Start of 16 rounds
L_sha512_len_neon_start:
L_SHA512_transform_len_start:
# Round 0
ldr r12, [r0, #32]
ldr lr, [r0, #36]
@@ -2546,7 +2546,7 @@ L_sha512_len_neon_start:
str lr, [sp, #124]
add r3, r3, #0x80
subs r10, r10, #1
bne L_sha512_len_neon_start
bne L_SHA512_transform_len_start
# Round 0
ldr r12, [r0, #32]
ldr lr, [r0, #36]
@@ -4035,7 +4035,7 @@ L_sha512_len_neon_start:
subs r2, r2, #0x80
sub r3, r3, #0x200
add r1, r1, #0x80
bne L_sha512_len_neon_begin
bne L_SHA512_transform_len_begin
eor r0, r0, r0
add sp, sp, #0xc0
pop {r4, r5, r6, r7, r8, r9, r10, pc}
@@ -4216,7 +4216,7 @@ Transform_Sha512_Len:
# Load digest into working vars
vldm.64 r0, {d0-d7}
# Start of loop processing a block
L_sha512_len_neon_begin:
L_SHA512_transform_neon_len_begin:
# Load W
vldm.64 r1!, {d16-d31}
vrev64.8 q8, q8
@@ -4230,7 +4230,7 @@ L_sha512_len_neon_begin:
adr r3, L_SHA512_transform_neon_len_k
mov r12, #4
# Start of 16 rounds
L_sha512_len_neon_start:
L_SHA512_transform_neon_len_start:
# Round 0
vld1.64 {d12}, [r3:64]!
vshl.u64 d8, d4, #50
@@ -4856,7 +4856,7 @@ L_sha512_len_neon_start:
veor q5, q6
vadd.i64 q15, q5
subs r12, r12, #1
bne L_sha512_len_neon_start
bne L_SHA512_transform_neon_len_start
# Round 0
vld1.64 {d12}, [r3:64]!
vshl.u64 d8, d4, #50
@@ -5329,7 +5329,7 @@ L_sha512_len_neon_start:
vadd.i64 q3, q3, q7
vstm.64 r0, {d0-d7}
subs r2, r2, #0x80
bne L_sha512_len_neon_begin
bne L_SHA512_transform_neon_len_begin
vpop {d8-d15}
bx lr
.size Transform_Sha512_Len,.-Transform_Sha512_Len

View File

@@ -120,7 +120,7 @@ static const uint64_t L_SHA512_transform_len_k[] = {
0x6c44198c4a475817UL,
};
void Transform_Sha512_Len();
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
{
__asm__ __volatile__ (
@@ -145,7 +145,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"strd r8, r9, [sp, #184]\n\t"
/* Start of loop processing a block */
"\n"
"L_sha512_len_neon_begin_%=: \n\t"
"L_SHA512_transform_len_begin_%=: \n\t"
/* Load, Reverse and Store W */
"ldrd r12, lr, [%[data]]\n\t"
"ldrd r4, r5, [%[data], #8]\n\t"
@@ -235,7 +235,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"mov r10, #4\n\t"
/* Start of 16 rounds */
"\n"
"L_sha512_len_neon_start_%=: \n\t"
"L_SHA512_transform_len_start_%=: \n\t"
/* Round 0 */
"ldrd r12, lr, [%[sha512], #32]\n\t"
"lsrs r4, r12, #14\n\t"
@@ -2222,7 +2222,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"strd r12, lr, [sp, #120]\n\t"
"add r3, r3, #0x80\n\t"
"subs r10, r10, #1\n\t"
"bne L_sha512_len_neon_start_%=\n\t"
"bne L_SHA512_transform_len_start_%=\n\t"
/* Round 0 */
"ldrd r12, lr, [%[sha512], #32]\n\t"
"lsrs r4, r12, #14\n\t"
@@ -3555,7 +3555,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"subs %[len], %[len], #0x80\n\t"
"sub r3, r3, #0x200\n\t"
"add %[data], %[data], #0x80\n\t"
"bne L_sha512_len_neon_begin_%=\n\t"
"bne L_SHA512_transform_len_begin_%=\n\t"
"eor r0, r0, r0\n\t"
"add sp, sp, #0xc0\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
@@ -3659,7 +3659,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"vldm.64 %[sha512], {d0-d7}\n\t"
/* Start of loop processing a block */
"\n"
"L_sha512_len_neon_begin_%=: \n\t"
"L_SHA512_transform_neon_len_begin_%=: \n\t"
/* Load W */
"vldm.64 %[data]!, {d16-d31}\n\t"
"vrev64.8 q8, q8\n\t"
@@ -3674,7 +3674,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"mov r12, #4\n\t"
/* Start of 16 rounds */
"\n"
"L_sha512_len_neon_start_%=: \n\t"
"L_SHA512_transform_neon_len_start_%=: \n\t"
/* Round 0 */
"vld1.64 {d12}, [r3]!\n\t"
"vshl.u64 d8, d4, #50\n\t"
@@ -4300,7 +4300,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"veor q5, q6\n\t"
"vadd.i64 q15, q5\n\t"
"subs r12, r12, #1\n\t"
"bne L_sha512_len_neon_start_%=\n\t"
"bne L_SHA512_transform_neon_len_start_%=\n\t"
/* Round 0 */
"vld1.64 {d12}, [r3]!\n\t"
"vshl.u64 d8, d4, #50\n\t"
@@ -4773,7 +4773,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"vadd.i64 q3, q3, q7\n\t"
"vstm.64 %[sha512], {d0-d7}\n\t"
"subs %[len], %[len], #0x80\n\t"
"bne L_sha512_len_neon_begin_%=\n\t"
"bne L_SHA512_transform_neon_len_begin_%=\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
: [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k)
: "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

View File

@@ -32,7 +32,8 @@
#include <wolfssl/wolfcrypt/settings.h>
#if !defined(NO_AES) && defined(WOLFSSL_ARMASM)
#if !defined(NO_AES) && defined(WOLFSSL_ARMASM) && \
!defined(WOLFSSL_ARMASM_NO_CRYPTO)
#ifdef HAVE_FIPS
#undef HAVE_FIPS

View File

@@ -45,6 +45,7 @@
#endif
#ifndef WOLFSSL_ARMASM_NO_CRYPTO
static const ALIGN32 word32 K[64] = {
0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
@@ -60,6 +61,7 @@ static const ALIGN32 word32 K[64] = {
0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
};
#endif
static int InitSha256(wc_Sha256* sha256)
@@ -94,6 +96,8 @@ static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len)
}
#ifndef WOLFSSL_ARMASM_NO_CRYPTO
#ifdef __aarch64__
/* First block is in sha256->buffer and rest in data. */
@@ -1306,6 +1310,109 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
#endif /* __aarch64__ */
#else
/* Block transform defined outside this file (declared extern here).
 * Every call in this file passes the hash state, a pointer to the input
 * bytes and a length that is a non-zero multiple of WC_SHA256_BLOCK_SIZE.
 * NOTE(review): presumably it folds those blocks into sha256->digest -
 * confirm against the implementation. */
extern void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
word32 len);
/* Aarch32 without ARMv8 crypto instructions: blocks are processed by Transform_Sha256_Len */
/* Absorb len bytes from data into the running SHA-256 state.
 *
 * Bytes already waiting in sha256->buffer are topped up first; once a full
 * block is available it is passed to Transform_Sha256_Len. Whole blocks still
 * remaining in data are then processed directly from the caller's memory, and
 * any tail shorter than a block is kept in sha256->buffer for the next call.
 *
 * sha256  Hash state. Not checked for NULL here.
 * data    Input bytes.
 * len     Number of input bytes.
 * returns BUFFER_E when the stored buffLen is not below the block size,
 *         otherwise 0.
 */
static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
{
    byte*  pending = (byte*)sha256->buffer;
    word32 whole;

    /* A partial-block count of a full block or more means corrupt state. */
    if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE) {
        return BUFFER_E;
    }

    AddLength(sha256, len);

    /* Top up a partially filled buffer before touching whole blocks. */
    if (sha256->buffLen > 0) {
        word32 room = WC_SHA256_BLOCK_SIZE - sha256->buffLen;
        word32 take = (len < room) ? len : room;

        if (take > 0) {
            XMEMCPY(pending + sha256->buffLen, data, take);
            sha256->buffLen += take;
            data            += take;
            len             -= take;
        }

        if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
            Transform_Sha256_Len(sha256, (const byte*)pending,
                                 WC_SHA256_BLOCK_SIZE);
            sha256->buffLen = 0;
        }
    }

    /* Largest multiple of the block size that fits in the remaining input. */
    whole = len & ~(WC_SHA256_BLOCK_SIZE-1);
    if (whole > 0) {
        /* Byte reversal performed in function if required. */
        Transform_Sha256_Len(sha256, data, whole);
        data += whole;
        len  -= whole;
    }

    /* Keep the sub-block tail for a later update or the final call. */
    if (len > 0) {
        XMEMCPY(pending, data, len);
        sha256->buffLen = len;
    }

    return 0;
}
/* Complete the SHA-256 computation and write the digest to hash.
 *
 * Appends the 0x80 marker, zero-pads to WC_SHA256_PAD_SIZE (processing an
 * extra block first when the marker leaves no room for the length), stores
 * the message length in bits in the last two words of the block and runs the
 * final transform.
 *
 * sha256  Hash state; its buffer, buffLen, hiLen and loLen are modified.
 * hash    Output buffer receiving WC_SHA256_DIGEST_SIZE bytes. Not checked
 *         for NULL here.
 * returns BAD_FUNC_ARG when sha256 is NULL, otherwise 0.
 */
static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
{
    byte* local;

    if (sha256 == NULL) {
        return BAD_FUNC_ARG;
    }

    /* Form the buffer pointer only after sha256 is known to be non-NULL;
     * doing it in the declaration would use the pointer before the check. */
    local = (byte*)sha256->buffer;

    local[sha256->buffLen++] = 0x80; /* add 1 */

    /* pad with zeros */
    if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
        /* No room left for the length: finish this block and start another. */
        XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE -
                                            sha256->buffLen);
        sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
        Transform_Sha256_Len(sha256, (const byte*)sha256->buffer,
                             WC_SHA256_BLOCK_SIZE);

        sha256->buffLen = 0;
    }
    XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen);

    /* put lengths in bits */
    sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
                                                         (sha256->hiLen << 3);
    sha256->loLen = sha256->loLen << 3;

    /* store lengths */
    /* ! length ordering dependent on digest endian type ! */
    sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 2] = sha256->hiLen;
    sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 1] = sha256->loLen;

    /* NOTE(review): the two length words are byte-reversed unconditionally,
     * while the digest copy below depends on LITTLE_ENDIAN_ORDER - confirm
     * this path is only built for little-endian targets. */
    ByteReverseWords(
                   &(sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 2]),
                   &(sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 2]),
                   WC_SHA256_BLOCK_SIZE - WC_SHA256_PAD_SIZE);
    Transform_Sha256_Len(sha256, (const byte*)sha256->buffer,
                                                          WC_SHA256_BLOCK_SIZE);

#ifdef LITTLE_ENDIAN_ORDER
    ByteReverseWords((word32*)hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
#else
    XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
#endif

    return 0;
}
#endif /* !WOLFSSL_ARMASM_NO_CRYPTO */
#ifndef NO_SHA256