Merge pull request #8314 from SparkiDev/aarch64_no_crypto_fallback

Aarch64 ASM: check CPU features before hw crypto instr use
JacobBarthelmeh committed on 2024-12-24 10:15:23 -07:00 (via GitHub)
9 changed files with 628 additions and 525 deletions
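
The core change: instead of assuming the Armv8 crypto extensions at build time, each hash module now reads the CPU feature flags once at init (cpuid_get_flags(), cached in cpuid_flags/cpuid_flags_set) and picks the hardware-crypto transform or a software fallback on every block. A minimal, standalone sketch of that detection step on Linux/AArch64, using getauxval() directly rather than wolfSSL's cpuid wrapper; the have_sha2() helper is illustrative and not part of the PR:

/* Runtime check for the Armv8 SHA-2 instructions (Linux/AArch64 only).
 * Mirrors the check-once-then-dispatch pattern added in this PR; the
 * function name is hypothetical, not a wolfSSL API. */
#include <stdio.h>
#include <sys/auxv.h>    /* getauxval(), AT_HWCAP */
#include <asm/hwcap.h>   /* HWCAP_SHA2 (HWCAP_SHA512/HWCAP_SHA3 also exist) */

static unsigned long hwcaps;
static int hwcaps_set;

static int have_sha2(void)
{
    if (!hwcaps_set) {            /* query the kernel-provided bits once */
        hwcaps = getauxval(AT_HWCAP);
        hwcaps_set = 1;
    }
    return (hwcaps & HWCAP_SHA2) != 0;
}

int main(void)
{
    if (have_sha2())
        printf("SHA-2 instructions present: take the hw-crypto path\n");
    else
        printf("no SHA-2 instructions: take the C/NEON fallback\n");
    return 0;
}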

View File

@@ -49,6 +49,7 @@
#endif
#include <wolfssl/wolfcrypt/logging.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef NO_INLINE
#include <wolfssl/wolfcrypt/misc.h>
@@ -69,8 +70,8 @@
#endif
#endif
#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
static const ALIGN32 word32 K[64] = {
#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
static const FLASH_QUALIFIER ALIGN32 word32 K[64] = {
0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
@@ -88,6 +89,202 @@ static const ALIGN32 word32 K[64] = {
#endif
#if defined(__aarch64__)
/* Both versions of Ch and Maj are logically the same, but with the second set
the compilers can recognize them better for optimization */
#ifdef WOLFSSL_SHA256_BY_SPEC
/* SHA256 math based on specification */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#else
/* SHA256 math reworked for easier compiler optimization */
#define Ch(x,y,z) ((((y) ^ (z)) & (x)) ^ (z))
#define Maj(x,y,z) ((((x) ^ (y)) & ((y) ^ (z))) ^ (y))
#endif
#define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
#define S(x, n) rotrFixed(x, n)
#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
#define a(i) S[(0-(i)) & 7]
#define b(i) S[(1-(i)) & 7]
#define c(i) S[(2-(i)) & 7]
#define d(i) S[(3-(i)) & 7]
#define e(i) S[(4-(i)) & 7]
#define f(i) S[(5-(i)) & 7]
#define g(i) S[(6-(i)) & 7]
#define h(i) S[(7-(i)) & 7]
#ifndef XTRANSFORM
#define XTRANSFORM(S, D) Transform_Sha256((S),(D))
#endif
#ifndef SHA256_MANY_REGISTERS
#define RND(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+(j)] + \
W[i+(j)]; \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef WOLFSSL_SMALL_STACK_CACHE
word32* W = sha256->W;
if (W == NULL) {
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_DIGEST);
if (W == NULL)
return MEMORY_E;
sha256->W = W;
}
#elif defined(WOLFSSL_SMALL_STACK)
word32* W;
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_TMP_BUFFER);
if (W == NULL)
return MEMORY_E;
#else
word32 W[WC_SHA256_BLOCK_SIZE];
#endif
/* Copy context->state[] to working vars */
for (i = 0; i < 8; i++)
S[i] = sha256->digest[i];
for (i = 0; i < 16; i++)
W[i] = *((const word32*)&data[i*(int)sizeof(word32)]);
for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
#ifdef USE_SLOW_SHA256
/* not unrolled - ~2k smaller and ~25% slower */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
int j;
for (j = 0; j < 8; j++) { /* braces needed here for macros {} */
RND(j);
}
}
#else
/* partially loop unrolled */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
RND(0); RND(1); RND(2); RND(3);
RND(4); RND(5); RND(6); RND(7);
}
#endif /* USE_SLOW_SHA256 */
/* Add the working vars back into digest state[] */
for (i = 0; i < 8; i++) {
sha256->digest[i] += S[i];
}
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
ForceZero(W, sizeof(word32) * WC_SHA256_BLOCK_SIZE);
XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif
}
#else
/* SHA256 version that keeps all data in registers */
#define SCHED1(j) (W[j] = *((word32*)&data[j*sizeof(word32)]))
#define SCHED(j) ( \
W[ j & 15] += \
Gamma1(W[(j-2) & 15])+ \
W[(j-7) & 15] + \
Gamma0(W[(j-15) & 15]) \
)
#define RND1(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED1(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
#define RNDN(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef USE_SLOW_SHA256
int j;
#endif
word32 W[WC_SHA256_BLOCK_SIZE/sizeof(word32)];
/* Copy digest to working vars */
S[0] = sha256->digest[0];
S[1] = sha256->digest[1];
S[2] = sha256->digest[2];
S[3] = sha256->digest[3];
S[4] = sha256->digest[4];
S[5] = sha256->digest[5];
S[6] = sha256->digest[6];
S[7] = sha256->digest[7];
i = 0;
#ifdef USE_SLOW_SHA256
for (j = 0; j < 16; j++) {
RND1(j);
}
for (i = 16; i < 64; i += 16) {
for (j = 0; j < 16; j++) {
RNDN(j);
}
}
#else
RND1( 0); RND1( 1); RND1( 2); RND1( 3);
RND1( 4); RND1( 5); RND1( 6); RND1( 7);
RND1( 8); RND1( 9); RND1(10); RND1(11);
RND1(12); RND1(13); RND1(14); RND1(15);
/* 64 operations, partially loop unrolled */
for (i = 16; i < 64; i += 16) {
RNDN( 0); RNDN( 1); RNDN( 2); RNDN( 3);
RNDN( 4); RNDN( 5); RNDN( 6); RNDN( 7);
RNDN( 8); RNDN( 9); RNDN(10); RNDN(11);
RNDN(12); RNDN(13); RNDN(14); RNDN(15);
}
#endif
/* Add the working vars back into digest */
sha256->digest[0] += S[0];
sha256->digest[1] += S[1];
sha256->digest[2] += S[2];
sha256->digest[3] += S[3];
sha256->digest[4] += S[4];
sha256->digest[5] += S[5];
sha256->digest[6] += S[6];
sha256->digest[7] += S[7];
}
#endif /* SHA256_MANY_REGISTERS */
static void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
word32 len)
{
while (len > 0) {
byte tmp[WC_SHA256_BLOCK_SIZE];
ByteReverseWords((word32*)tmp, (const word32*)data,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, tmp);
data += WC_SHA256_BLOCK_SIZE;
len -= WC_SHA256_BLOCK_SIZE;
}
}
#endif
#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
static word32 cpuid_flags = 0;
static int cpuid_flags_set = 0;
#endif
static int InitSha256(wc_Sha256* sha256)
{
int ret = 0;
@@ -340,16 +537,30 @@ static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data,
data += add;
len -= add;
if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
if (IS_AARCH64_SHA256(cpuid_flags)) {
Sha256Transform(sha256, (byte*)sha256->buffer, 1);
}
else {
ByteReverseWords(sha256->buffer, sha256->buffer,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, (const byte*)sha256->buffer);
}
sha256->buffLen = 0;
}
}
/* number of blocks in a row to complete */
numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE;
numBlocks = (len + sha256->buffLen) / WC_SHA256_BLOCK_SIZE;
if (numBlocks > 0) {
if (IS_AARCH64_SHA256(cpuid_flags)) {
Sha256Transform(sha256, data, numBlocks);
}
else {
Transform_Sha256_Len(sha256, data,
numBlocks * WC_SHA256_BLOCK_SIZE);
}
data += numBlocks * WC_SHA256_BLOCK_SIZE;
len -= numBlocks * WC_SHA256_BLOCK_SIZE;
}
@@ -379,9 +590,10 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
/* pad with zeros */
if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE -
sha256->buffLen);
sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
if (IS_AARCH64_SHA256(cpuid_flags)) {
k = K;
__asm__ volatile (
"LD1 {v4.2d-v7.2d}, %[buffer] \n"
@@ -527,10 +739,17 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
: [out] "=m" (sha256->digest), [k] "+r" (k)
: [digest] "m" (sha256->digest),
[buffer] "m" (sha256->buffer)
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11"
, "v12", "v13", "v14", "v15", "v16", "v17", "v18"
, "v19", "v20", "v21", "v22", "v23", "v24", "v25"
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10"
, "v11" , "v12", "v13", "v14", "v15", "v16"
, "v17", "v18" , "v19", "v20", "v21", "v22"
, "v23", "v24", "v25"
);
}
else {
ByteReverseWords(sha256->buffer, sha256->buffer,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, (const byte*)sha256->buffer);
}
sha256->buffLen = 0;
}
@@ -560,6 +779,7 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
sizeof(word32));
if (IS_AARCH64_SHA256(cpuid_flags)) {
k = K;
__asm__ volatile (
"#load in message and schedule updates \n"
@@ -712,6 +932,16 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
"v15", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25"
);
}
else {
Transform_Sha256(sha256, (const byte*)sha256->buffer);
#ifdef LITTLE_ENDIAN_ORDER
ByteReverseWords((word32*)hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
#else
XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
#endif
}
return 0;
}
@@ -1407,214 +1637,7 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
return ret;
}
#elif defined(__aarch64__)
static const FLASH_QUALIFIER ALIGN32 word32 K[64] = {
0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
};
/* Both versions of Ch and Maj are logically the same, but with the second set
the compilers can recognize them better for optimization */
#ifdef WOLFSSL_SHA256_BY_SPEC
/* SHA256 math based on specification */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#else
/* SHA256 math reworked for easier compiler optimization */
#define Ch(x,y,z) ((((y) ^ (z)) & (x)) ^ (z))
#define Maj(x,y,z) ((((x) ^ (y)) & ((y) ^ (z))) ^ (y))
#endif
#define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
#define S(x, n) rotrFixed(x, n)
#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
#define a(i) S[(0-(i)) & 7]
#define b(i) S[(1-(i)) & 7]
#define c(i) S[(2-(i)) & 7]
#define d(i) S[(3-(i)) & 7]
#define e(i) S[(4-(i)) & 7]
#define f(i) S[(5-(i)) & 7]
#define g(i) S[(6-(i)) & 7]
#define h(i) S[(7-(i)) & 7]
#ifndef XTRANSFORM
#define XTRANSFORM(S, D) Transform_Sha256((S),(D))
#endif
#ifndef SHA256_MANY_REGISTERS
#define RND(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+(j)] + \
W[i+(j)]; \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef WOLFSSL_SMALL_STACK_CACHE
word32* W = sha256->W;
if (W == NULL) {
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_DIGEST);
if (W == NULL)
return MEMORY_E;
sha256->W = W;
}
#elif defined(WOLFSSL_SMALL_STACK)
word32* W;
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_TMP_BUFFER);
if (W == NULL)
return MEMORY_E;
#else
word32 W[WC_SHA256_BLOCK_SIZE];
#endif
/* Copy context->state[] to working vars */
for (i = 0; i < 8; i++)
S[i] = sha256->digest[i];
for (i = 0; i < 16; i++)
W[i] = *((const word32*)&data[i*(int)sizeof(word32)]);
for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
#ifdef USE_SLOW_SHA256
/* not unrolled - ~2k smaller and ~25% slower */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
int j;
for (j = 0; j < 8; j++) { /* braces needed here for macros {} */
RND(j);
}
}
#else
/* partially loop unrolled */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
RND(0); RND(1); RND(2); RND(3);
RND(4); RND(5); RND(6); RND(7);
}
#endif /* USE_SLOW_SHA256 */
/* Add the working vars back into digest state[] */
for (i = 0; i < 8; i++) {
sha256->digest[i] += S[i];
}
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
ForceZero(W, sizeof(word32) * WC_SHA256_BLOCK_SIZE);
XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif
}
#else
/* SHA256 version that keeps all data in registers */
#define SCHED1(j) (W[j] = *((word32*)&data[j*sizeof(word32)]))
#define SCHED(j) ( \
W[ j & 15] += \
Gamma1(W[(j-2) & 15])+ \
W[(j-7) & 15] + \
Gamma0(W[(j-15) & 15]) \
)
#define RND1(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED1(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
#define RNDN(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef USE_SLOW_SHA256
int j;
#endif
word32 W[WC_SHA256_BLOCK_SIZE/sizeof(word32)];
/* Copy digest to working vars */
S[0] = sha256->digest[0];
S[1] = sha256->digest[1];
S[2] = sha256->digest[2];
S[3] = sha256->digest[3];
S[4] = sha256->digest[4];
S[5] = sha256->digest[5];
S[6] = sha256->digest[6];
S[7] = sha256->digest[7];
i = 0;
#ifdef USE_SLOW_SHA256
for (j = 0; j < 16; j++) {
RND1(j);
}
for (i = 16; i < 64; i += 16) {
for (j = 0; j < 16; j++) {
RNDN(j);
}
}
#else
RND1( 0); RND1( 1); RND1( 2); RND1( 3);
RND1( 4); RND1( 5); RND1( 6); RND1( 7);
RND1( 8); RND1( 9); RND1(10); RND1(11);
RND1(12); RND1(13); RND1(14); RND1(15);
/* 64 operations, partially loop unrolled */
for (i = 16; i < 64; i += 16) {
RNDN( 0); RNDN( 1); RNDN( 2); RNDN( 3);
RNDN( 4); RNDN( 5); RNDN( 6); RNDN( 7);
RNDN( 8); RNDN( 9); RNDN(10); RNDN(11);
RNDN(12); RNDN(13); RNDN(14); RNDN(15);
}
#endif
/* Add the working vars back into digest */
sha256->digest[0] += S[0];
sha256->digest[1] += S[1];
sha256->digest[2] += S[2];
sha256->digest[3] += S[3];
sha256->digest[4] += S[4];
sha256->digest[5] += S[5];
sha256->digest[6] += S[6];
sha256->digest[7] += S[7];
}
#endif /* SHA256_MANY_REGISTERS */
static void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
word32 len)
{
while (len > 0) {
byte tmp[WC_SHA256_BLOCK_SIZE];
ByteReverseWords((word32*)tmp, (const word32*)data,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, tmp);
data += WC_SHA256_BLOCK_SIZE;
len -= WC_SHA256_BLOCK_SIZE;
}
}
#else
#elif !defined(__aarch64__)
extern void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
word32 len);
@@ -1743,7 +1766,16 @@ int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
return ret;
}
#endif
#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return ret;
}
@@ -2015,6 +2047,14 @@ int wc_Sha256HashBlock(wc_Sha256* sha256, const unsigned char* data,
return BAD_FUNC_ARG;
sha224->heap = heap;
#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return InitSha224(sha224);
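
Earlier in this file, the comment above the Ch/Maj macros notes that the "by spec" and "reworked" forms are logically the same, only written so compilers optimize them better. Since both forms are pure bitwise functions, checking the eight single-bit input combinations proves the equivalence; a quick standalone check (not part of the PR, macros copied from the diff above):

#include <stdio.h>

/* Macros copied from the diff; equivalence holds per bit, so testing the
 * eight 1-bit input combinations is exhaustive. */
#define Ch_spec(x,y,z)  ((z) ^ ((x) & ((y) ^ (z))))
#define Ch_opt(x,y,z)   ((((y) ^ (z)) & (x)) ^ (z))
#define Maj_spec(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#define Maj_opt(x,y,z)  ((((x) ^ (y)) & ((y) ^ (z))) ^ (y))

int main(void)
{
    unsigned x, y, z, ok = 1;
    for (x = 0; x < 2; x++)
        for (y = 0; y < 2; y++)
            for (z = 0; z < 2; z++) {
                ok &= (Ch_spec(x, y, z)  == Ch_opt(x, y, z));
                ok &= (Maj_spec(x, y, z) == Maj_opt(x, y, z));
            }
    printf(ok ? "Ch/Maj forms are equivalent\n" : "mismatch\n");
    return 0;
}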

View File

@@ -73,15 +73,15 @@ L_SHA3_transform_crypto_r:
.xword 0x8000000080008008
#ifndef __APPLE__
.text
.globl BlockSha3
.type BlockSha3,@function
.globl BlockSha3_crypto
.type BlockSha3_crypto,@function
.align 2
BlockSha3:
BlockSha3_crypto:
#else
.section __TEXT,__text
.globl _BlockSha3
.globl _BlockSha3_crypto
.p2align 2
_BlockSha3:
_BlockSha3_crypto:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
@@ -204,9 +204,9 @@ L_sha3_crypto_begin:
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size BlockSha3,.-BlockSha3
.size BlockSha3_crypto,.-BlockSha3_crypto
#endif /* __APPLE__ */
#else
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#ifndef __APPLE__
.text
.type L_SHA3_transform_base_r, %object
@@ -247,15 +247,15 @@ L_SHA3_transform_base_r:
.xword 0x8000000080008008
#ifndef __APPLE__
.text
.globl BlockSha3
.type BlockSha3,@function
.globl BlockSha3_base
.type BlockSha3_base,@function
.align 2
BlockSha3:
BlockSha3_base:
#else
.section __TEXT,__text
.globl _BlockSha3
.globl _BlockSha3_base
.p2align 2
_BlockSha3:
_BlockSha3_base:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-160]!
add x29, sp, #0
@@ -449,9 +449,8 @@ L_SHA3_transform_base_begin:
ldp x29, x30, [sp], #0xa0
ret
#ifndef __APPLE__
.size BlockSha3,.-BlockSha3
.size BlockSha3_base,.-BlockSha3_base
#endif /* __APPLE__ */
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */

View File

@@ -63,7 +63,7 @@ static const word64 L_SHA3_transform_crypto_r[] = {
0x8000000080008008UL,
};
void BlockSha3(word64* state)
void BlockSha3_crypto(word64* state)
{
__asm__ __volatile__ (
#ifdef __APPLE__
@@ -181,7 +181,7 @@ void BlockSha3(word64* state)
);
}
#else
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
static const word64 L_SHA3_transform_base_r[] = {
0x1UL,
0x8082UL,
@@ -209,7 +209,7 @@ static const word64 L_SHA3_transform_base_r[] = {
0x8000000080008008UL,
};
void BlockSha3(word64* state)
void BlockSha3_base(word64* state)
{
__asm__ __volatile__ (
"stp x29, x30, [sp, #-64]!\n\t"
@@ -397,7 +397,6 @@ void BlockSha3(word64* state)
);
}
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */

View File

@@ -32,7 +32,6 @@
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_SHA512
#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512
#ifndef __APPLE__
.text
.type L_SHA512_transform_neon_len_k, %object
@@ -1093,7 +1092,7 @@ L_sha512_len_neon_start:
#ifndef __APPLE__
.size Transform_Sha512_Len_neon,.-Transform_Sha512_Len_neon
#endif /* __APPLE__ */
#else
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
#ifndef __APPLE__
.text
.type L_SHA512_transform_crypto_len_k, %object

View File

@@ -35,7 +35,6 @@
#include <wolfssl/wolfcrypt/sha512.h>
#ifdef WOLFSSL_SHA512
#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512
static const word64 L_SHA512_transform_neon_len_k[] = {
0x428a2f98d728ae22UL,
0x7137449123ef65cdUL,
@@ -1053,7 +1052,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
);
}
#else
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
static const word64 L_SHA512_transform_crypto_len_k[] = {
0x428a2f98d728ae22UL,
0x7137449123ef65cdUL,

View File

@@ -48,6 +48,7 @@
}
#endif
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#include <wolfssl/wolfcrypt/hash.h>
#include <wolfssl/wolfcrypt/logging.h>
@@ -62,6 +63,11 @@
#include <wolfssl/wolfcrypt/cryptocb.h>
#endif
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512)
static word32 cpuid_flags = 0;
static int cpuid_flags_set = 0;
#endif
#ifdef WOLFSSL_SHA512
static int InitSha512(wc_Sha512* sha512)
@@ -198,6 +204,13 @@ static int InitSha512_Family(wc_Sha512* sha512, void* heap, int devId,
if (ret != 0)
return ret;
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return ret;
@@ -432,6 +445,22 @@ static void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len
}
#undef DATA
#elif defined(__aarch64__)
static WC_INLINE void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data,
word32 len)
{
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
if (IS_AARCH64_SHA512(cpuid_flags)) {
Transform_Sha512_Len_crypto(sha512, data, len);
}
else
#endif
{
Transform_Sha512_Len_neon(sha512, data, len);
}
}
#endif
@@ -855,6 +884,14 @@ int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
return ret;
}
#endif
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return ret;

View File

@@ -62,9 +62,9 @@
}
#endif
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
#ifdef USE_INTEL_SPEEDUP
#if defined(USE_INTEL_SPEEDUP) || (defined(__aarch64__) && \
defined(WOLFSSL_ARMASM))
#include <wolfssl/wolfcrypt/cpuid.h>
word32 cpuid_flags;
@@ -81,6 +81,8 @@
#endif
#endif
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
#ifdef WOLFSSL_SHA3_SMALL
/* Rotate a 64-bit value left.
*
@@ -659,11 +661,37 @@ static int InitSha3(wc_Sha3* sha3)
SHA3_BLOCK_N = NULL;
}
}
#define SHA3_FUNC_PTR
#endif
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
if (IS_AARCH64_SHA3(cpuid_flags)) {
SHA3_BLOCK = BlockSha3_crypto;
SHA3_BLOCK_N = NULL;
}
else
#endif
{
SHA3_BLOCK = BlockSha3_base;
SHA3_BLOCK_N = NULL;
}
}
#define SHA3_FUNC_PTR
#endif
return 0;
}
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM)
void BlockSha3(word64* s)
{
(*SHA3_BLOCK)(s);
}
#endif
/* Update the SHA-3 hash state with message data.
*
* sha3 wc_Sha3 object holding state.
@@ -700,7 +728,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
for (i = 0; i < p; i++) {
sha3->s[i] ^= Load64BitBigEndian(sha3->t + 8 * i);
}
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -709,7 +737,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
}
}
blocks = len / (p * 8U);
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
if ((SHA3_BLOCK_N != NULL) && (blocks > 0)) {
(*SHA3_BLOCK_N)(sha3->s, data, blocks, p * 8U);
len -= blocks * (p * 8U);
@@ -721,7 +749,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
for (i = 0; i < p; i++) {
sha3->s[i] ^= Load64Unaligned(data + 8 * i);
}
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -773,7 +801,7 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
#endif
for (j = 0; l - j >= rate; j += rate) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -785,7 +813,7 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
#endif
}
if (j != l) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -1503,7 +1531,7 @@ int wc_Shake128_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#endif
for (; (blockCnt > 0); blockCnt--) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(shake->s);
#else
BlockSha3(shake->s);
@@ -1641,7 +1669,7 @@ int wc_Shake256_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#endif
for (; (blockCnt > 0); blockCnt--) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(shake->s);
#else
BlockSha3(shake->s);
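
The sha3.c changes above route every block through SHA3_BLOCK, a pointer selected once in InitSha3() from the cpuid flags, so the exported BlockSha3() name stays stable whichever of BlockSha3_crypto/BlockSha3_base is used. A self-contained sketch of that dispatch shape; the block bodies below are dummies, not the real Keccak routines:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for BlockSha3_crypto/BlockSha3_base; bodies are placeholders. */
typedef void (*block_fn)(uint64_t* s);
static void block_crypto(uint64_t* s) { s[0] ^= 1; /* would use SHA-3 instrs */ }
static void block_base(uint64_t* s)   { s[0] ^= 2; /* plain 64-bit code */ }

static block_fn sha3_block;              /* like SHA3_BLOCK, set once at init */

static void init_sha3(int cpu_has_sha3)
{
    sha3_block = cpu_has_sha3 ? block_crypto : block_base;
}

static void block_sha3(uint64_t* s)      /* analogue of the BlockSha3() wrapper */
{
    (*sha3_block)(s);
}

int main(void)
{
    uint64_t state[25] = {0};             /* Keccak state is 25 lanes */
    init_sha3(0);                          /* pretend the SHA-3 extension is absent */
    block_sha3(state);
    printf("state[0] = %llu\n", (unsigned long long)state[0]);
    return 0;
}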

View File

@@ -226,8 +226,13 @@ WOLFSSL_LOCAL void sha3_block_n_bmi2(word64* s, const byte* data, word32 n,
WOLFSSL_LOCAL void sha3_block_bmi2(word64* s);
WOLFSSL_LOCAL void sha3_block_avx2(word64* s);
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM)
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
WOLFSSL_LOCAL void BlockSha3_crypto(word64 *s);
#endif
#if defined(WOLFSSL_ARMASM) || defined(WOLFSSL_RISCV_ASM)
WOLFSSL_LOCAL void BlockSha3_base(word64 *s);
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#elif defined(WOLFSSL_ARMASM) || defined(WOLFSSL_RISCV_ASM)
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#endif

View File

@@ -228,14 +228,11 @@ struct wc_Sha512 {
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512
void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data,
word32 len);
#define Transform_Sha512_Len Transform_Sha512_Len_neon
#else
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
word32 len);
#define Transform_Sha512_Len Transform_Sha512_Len_crypto
#endif
#else
extern void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data,