Merge pull request #8314 from SparkiDev/aarch64_no_crypto_fallback

Aarch64 ASM: check CPU features before hw crypto instr use
JacobBarthelmeh committed on 2024-12-24 10:15:23 -07:00 (via GitHub)
9 changed files with 628 additions and 525 deletions
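
The core change: instead of assuming the Armv8 crypto extensions at build time, each hash module now reads the CPU feature flags once at init (cpuid_get_flags(), cached in cpuid_flags/cpuid_flags_set) and picks the hardware-crypto transform or a software fallback on every block. A minimal, standalone sketch of that detection step on Linux/AArch64, using getauxval() directly rather than wolfSSL's cpuid wrapper; the have_sha2() helper is illustrative and not part of the PR:

/* Runtime check for the Armv8 SHA-2 instructions (Linux/AArch64 only).
 * Mirrors the check-once-then-dispatch pattern added in this PR; the
 * function name is hypothetical, not a wolfSSL API. */
#include <stdio.h>
#include <sys/auxv.h>    /* getauxval(), AT_HWCAP */
#include <asm/hwcap.h>   /* HWCAP_SHA2 (HWCAP_SHA512/HWCAP_SHA3 also exist) */

static unsigned long hwcaps;
static int hwcaps_set;

static int have_sha2(void)
{
    if (!hwcaps_set) {            /* query the kernel-provided bits once */
        hwcaps = getauxval(AT_HWCAP);
        hwcaps_set = 1;
    }
    return (hwcaps & HWCAP_SHA2) != 0;
}

int main(void)
{
    if (have_sha2())
        printf("SHA-2 instructions present: take the hw-crypto path\n");
    else
        printf("no SHA-2 instructions: take the C/NEON fallback\n");
    return 0;
}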

View File

@@ -49,6 +49,7 @@
#endif
#include <wolfssl/wolfcrypt/logging.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef NO_INLINE
#include <wolfssl/wolfcrypt/misc.h>
@@ -69,8 +70,8 @@
#endif
#endif
#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
static const ALIGN32 word32 K[64] = {
#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
static const FLASH_QUALIFIER ALIGN32 word32 K[64] = {
0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
@@ -88,6 +89,202 @@ static const ALIGN32 word32 K[64] = {
#endif
#if defined(__aarch64__)
/* Both versions of Ch and Maj are logically the same, but with the second set
the compilers can recognize them better for optimization */
#ifdef WOLFSSL_SHA256_BY_SPEC
/* SHA256 math based on specification */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#else
/* SHA256 math reworked for easier compiler optimization */
#define Ch(x,y,z) ((((y) ^ (z)) & (x)) ^ (z))
#define Maj(x,y,z) ((((x) ^ (y)) & ((y) ^ (z))) ^ (y))
#endif
#define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
#define S(x, n) rotrFixed(x, n)
#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
#define a(i) S[(0-(i)) & 7]
#define b(i) S[(1-(i)) & 7]
#define c(i) S[(2-(i)) & 7]
#define d(i) S[(3-(i)) & 7]
#define e(i) S[(4-(i)) & 7]
#define f(i) S[(5-(i)) & 7]
#define g(i) S[(6-(i)) & 7]
#define h(i) S[(7-(i)) & 7]
#ifndef XTRANSFORM
#define XTRANSFORM(S, D) Transform_Sha256((S),(D))
#endif
#ifndef SHA256_MANY_REGISTERS
#define RND(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+(j)] + \
W[i+(j)]; \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef WOLFSSL_SMALL_STACK_CACHE
word32* W = sha256->W;
if (W == NULL) {
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_DIGEST);
if (W == NULL)
return MEMORY_E;
sha256->W = W;
}
#elif defined(WOLFSSL_SMALL_STACK)
word32* W;
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_TMP_BUFFER);
if (W == NULL)
return MEMORY_E;
#else
word32 W[WC_SHA256_BLOCK_SIZE];
#endif
/* Copy context->state[] to working vars */
for (i = 0; i < 8; i++)
S[i] = sha256->digest[i];
for (i = 0; i < 16; i++)
W[i] = *((const word32*)&data[i*(int)sizeof(word32)]);
for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
#ifdef USE_SLOW_SHA256
/* not unrolled - ~2k smaller and ~25% slower */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
int j;
for (j = 0; j < 8; j++) { /* braces needed here for macros {} */
RND(j);
}
}
#else
/* partially loop unrolled */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
RND(0); RND(1); RND(2); RND(3);
RND(4); RND(5); RND(6); RND(7);
}
#endif /* USE_SLOW_SHA256 */
/* Add the working vars back into digest state[] */
for (i = 0; i < 8; i++) {
sha256->digest[i] += S[i];
}
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
ForceZero(W, sizeof(word32) * WC_SHA256_BLOCK_SIZE);
XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif
}
#else
/* SHA256 version that keeps all data in registers */
#define SCHED1(j) (W[j] = *((word32*)&data[j*sizeof(word32)]))
#define SCHED(j) ( \
W[ j & 15] += \
Gamma1(W[(j-2) & 15])+ \
W[(j-7) & 15] + \
Gamma0(W[(j-15) & 15]) \
)
#define RND1(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED1(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
#define RNDN(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef USE_SLOW_SHA256
int j;
#endif
word32 W[WC_SHA256_BLOCK_SIZE/sizeof(word32)];
/* Copy digest to working vars */
S[0] = sha256->digest[0];
S[1] = sha256->digest[1];
S[2] = sha256->digest[2];
S[3] = sha256->digest[3];
S[4] = sha256->digest[4];
S[5] = sha256->digest[5];
S[6] = sha256->digest[6];
S[7] = sha256->digest[7];
i = 0;
#ifdef USE_SLOW_SHA256
for (j = 0; j < 16; j++) {
RND1(j);
}
for (i = 16; i < 64; i += 16) {
for (j = 0; j < 16; j++) {
RNDN(j);
}
}
#else
RND1( 0); RND1( 1); RND1( 2); RND1( 3);
RND1( 4); RND1( 5); RND1( 6); RND1( 7);
RND1( 8); RND1( 9); RND1(10); RND1(11);
RND1(12); RND1(13); RND1(14); RND1(15);
/* 64 operations, partially loop unrolled */
for (i = 16; i < 64; i += 16) {
RNDN( 0); RNDN( 1); RNDN( 2); RNDN( 3);
RNDN( 4); RNDN( 5); RNDN( 6); RNDN( 7);
RNDN( 8); RNDN( 9); RNDN(10); RNDN(11);
RNDN(12); RNDN(13); RNDN(14); RNDN(15);
}
#endif
/* Add the working vars back into digest */
sha256->digest[0] += S[0];
sha256->digest[1] += S[1];
sha256->digest[2] += S[2];
sha256->digest[3] += S[3];
sha256->digest[4] += S[4];
sha256->digest[5] += S[5];
sha256->digest[6] += S[6];
sha256->digest[7] += S[7];
}
#endif /* SHA256_MANY_REGISTERS */
static void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
word32 len)
{
while (len > 0) {
byte tmp[WC_SHA256_BLOCK_SIZE];
ByteReverseWords((word32*)tmp, (const word32*)data,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, tmp);
data += WC_SHA256_BLOCK_SIZE;
len -= WC_SHA256_BLOCK_SIZE;
}
}
#endif
#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
static word32 cpuid_flags = 0;
static int cpuid_flags_set = 0;
#endif
static int InitSha256(wc_Sha256* sha256)
{
int ret = 0;
@@ -340,16 +537,30 @@ static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data,
data += add;
len -= add;
if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
if (IS_AARCH64_SHA256(cpuid_flags)) {
Sha256Transform(sha256, (byte*)sha256->buffer, 1);
}
else {
ByteReverseWords(sha256->buffer, sha256->buffer,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, (const byte*)sha256->buffer);
}
sha256->buffLen = 0;
}
}
/* number of blocks in a row to complete */
numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE;
numBlocks = (len + sha256->buffLen) / WC_SHA256_BLOCK_SIZE;
if (numBlocks > 0) {
if (IS_AARCH64_SHA256(cpuid_flags)) {
Sha256Transform(sha256, data, numBlocks);
}
else {
Transform_Sha256_Len(sha256, data,
numBlocks * WC_SHA256_BLOCK_SIZE);
}
data += numBlocks * WC_SHA256_BLOCK_SIZE;
len -= numBlocks * WC_SHA256_BLOCK_SIZE;
}
@@ -379,9 +590,10 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
/* pad with zeros */
if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE -
sha256->buffLen);
sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
if (IS_AARCH64_SHA256(cpuid_flags)) {
k = K;
__asm__ volatile (
"LD1 {v4.2d-v7.2d}, %[buffer] \n"
@@ -527,10 +739,17 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
: [out] "=m" (sha256->digest), [k] "+r" (k)
: [digest] "m" (sha256->digest),
[buffer] "m" (sha256->buffer)
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11"
, "v12", "v13", "v14", "v15", "v16", "v17", "v18"
, "v19", "v20", "v21", "v22", "v23", "v24", "v25"
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10"
, "v11" , "v12", "v13", "v14", "v15", "v16"
, "v17", "v18" , "v19", "v20", "v21", "v22"
, "v23", "v24", "v25"
);
}
else {
ByteReverseWords(sha256->buffer, sha256->buffer,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, (const byte*)sha256->buffer);
}
sha256->buffLen = 0;
}
@@ -560,6 +779,7 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
sizeof(word32));
if (IS_AARCH64_SHA256(cpuid_flags)) {
k = K;
__asm__ volatile (
"#load in message and schedule updates \n"
@@ -712,6 +932,16 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
"v15", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25"
);
}
else {
Transform_Sha256(sha256, (const byte*)sha256->buffer);
#ifdef LITTLE_ENDIAN_ORDER
ByteReverseWords((word32*)hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
#else
XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
#endif
}
return 0;
}
@@ -1407,214 +1637,7 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
return ret;
}
#elif defined(__aarch64__)
static const FLASH_QUALIFIER ALIGN32 word32 K[64] = {
0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
};
/* Both versions of Ch and Maj are logically the same, but with the second set
the compilers can recognize them better for optimization */
#ifdef WOLFSSL_SHA256_BY_SPEC
/* SHA256 math based on specification */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#else
/* SHA256 math reworked for easier compiler optimization */
#define Ch(x,y,z) ((((y) ^ (z)) & (x)) ^ (z))
#define Maj(x,y,z) ((((x) ^ (y)) & ((y) ^ (z))) ^ (y))
#endif
#define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
#define S(x, n) rotrFixed(x, n)
#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
#define a(i) S[(0-(i)) & 7]
#define b(i) S[(1-(i)) & 7]
#define c(i) S[(2-(i)) & 7]
#define d(i) S[(3-(i)) & 7]
#define e(i) S[(4-(i)) & 7]
#define f(i) S[(5-(i)) & 7]
#define g(i) S[(6-(i)) & 7]
#define h(i) S[(7-(i)) & 7]
#ifndef XTRANSFORM
#define XTRANSFORM(S, D) Transform_Sha256((S),(D))
#endif
#ifndef SHA256_MANY_REGISTERS
#define RND(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+(j)] + \
W[i+(j)]; \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef WOLFSSL_SMALL_STACK_CACHE
word32* W = sha256->W;
if (W == NULL) {
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_DIGEST);
if (W == NULL)
return MEMORY_E;
sha256->W = W;
}
#elif defined(WOLFSSL_SMALL_STACK)
word32* W;
W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
DYNAMIC_TYPE_TMP_BUFFER);
if (W == NULL)
return MEMORY_E;
#else
word32 W[WC_SHA256_BLOCK_SIZE];
#endif
/* Copy context->state[] to working vars */
for (i = 0; i < 8; i++)
S[i] = sha256->digest[i];
for (i = 0; i < 16; i++)
W[i] = *((const word32*)&data[i*(int)sizeof(word32)]);
for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
#ifdef USE_SLOW_SHA256
/* not unrolled - ~2k smaller and ~25% slower */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
int j;
for (j = 0; j < 8; j++) { /* braces needed here for macros {} */
RND(j);
}
}
#else
/* partially loop unrolled */
for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
RND(0); RND(1); RND(2); RND(3);
RND(4); RND(5); RND(6); RND(7);
}
#endif /* USE_SLOW_SHA256 */
/* Add the working vars back into digest state[] */
for (i = 0; i < 8; i++) {
sha256->digest[i] += S[i];
}
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
ForceZero(W, sizeof(word32) * WC_SHA256_BLOCK_SIZE);
XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif
}
#else
/* SHA256 version that keeps all data in registers */
#define SCHED1(j) (W[j] = *((word32*)&data[j*sizeof(word32)]))
#define SCHED(j) ( \
W[ j & 15] += \
Gamma1(W[(j-2) & 15])+ \
W[(j-7) & 15] + \
Gamma0(W[(j-15) & 15]) \
)
#define RND1(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED1(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
#define RNDN(j) \
t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED(j); \
t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
d(j) += t0; \
h(j) = t0 + t1
static void Transform_Sha256(wc_Sha256* sha256, const byte* data)
{
word32 S[8], t0, t1;
int i;
#ifdef USE_SLOW_SHA256
int j;
#endif
word32 W[WC_SHA256_BLOCK_SIZE/sizeof(word32)];
/* Copy digest to working vars */
S[0] = sha256->digest[0];
S[1] = sha256->digest[1];
S[2] = sha256->digest[2];
S[3] = sha256->digest[3];
S[4] = sha256->digest[4];
S[5] = sha256->digest[5];
S[6] = sha256->digest[6];
S[7] = sha256->digest[7];
i = 0;
#ifdef USE_SLOW_SHA256
for (j = 0; j < 16; j++) {
RND1(j);
}
for (i = 16; i < 64; i += 16) {
for (j = 0; j < 16; j++) {
RNDN(j);
}
}
#else
RND1( 0); RND1( 1); RND1( 2); RND1( 3);
RND1( 4); RND1( 5); RND1( 6); RND1( 7);
RND1( 8); RND1( 9); RND1(10); RND1(11);
RND1(12); RND1(13); RND1(14); RND1(15);
/* 64 operations, partially loop unrolled */
for (i = 16; i < 64; i += 16) {
RNDN( 0); RNDN( 1); RNDN( 2); RNDN( 3);
RNDN( 4); RNDN( 5); RNDN( 6); RNDN( 7);
RNDN( 8); RNDN( 9); RNDN(10); RNDN(11);
RNDN(12); RNDN(13); RNDN(14); RNDN(15);
}
#endif
/* Add the working vars back into digest */
sha256->digest[0] += S[0];
sha256->digest[1] += S[1];
sha256->digest[2] += S[2];
sha256->digest[3] += S[3];
sha256->digest[4] += S[4];
sha256->digest[5] += S[5];
sha256->digest[6] += S[6];
sha256->digest[7] += S[7];
}
#endif /* SHA256_MANY_REGISTERS */
static void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
word32 len)
{
while (len > 0) {
byte tmp[WC_SHA256_BLOCK_SIZE];
ByteReverseWords((word32*)tmp, (const word32*)data,
WC_SHA256_BLOCK_SIZE);
Transform_Sha256(sha256, tmp);
data += WC_SHA256_BLOCK_SIZE;
len -= WC_SHA256_BLOCK_SIZE;
}
}
#else
#elif !defined(__aarch64__)
extern void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
word32 len);
@@ -1743,7 +1766,16 @@ int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
return ret;
}
#endif
#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return ret;
}
@@ -2015,6 +2047,14 @@ int wc_Sha256HashBlock(wc_Sha256* sha256, const unsigned char* data,
return BAD_FUNC_ARG;
sha224->heap = heap;
#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return InitSha224(sha224);
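
Earlier in this file, the comment above the Ch/Maj macros notes that the "by spec" and "reworked" forms are logically the same, only written so compilers optimize them better. Since both forms are pure bitwise functions, checking the eight single-bit input combinations proves the equivalence; a quick standalone check (not part of the PR, macros copied from the diff above):

#include <stdio.h>

/* Macros copied from the diff; equivalence holds per bit, so testing the
 * eight 1-bit input combinations is exhaustive. */
#define Ch_spec(x,y,z)  ((z) ^ ((x) & ((y) ^ (z))))
#define Ch_opt(x,y,z)   ((((y) ^ (z)) & (x)) ^ (z))
#define Maj_spec(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#define Maj_opt(x,y,z)  ((((x) ^ (y)) & ((y) ^ (z))) ^ (y))

int main(void)
{
    unsigned x, y, z, ok = 1;
    for (x = 0; x < 2; x++)
        for (y = 0; y < 2; y++)
            for (z = 0; z < 2; z++) {
                ok &= (Ch_spec(x, y, z)  == Ch_opt(x, y, z));
                ok &= (Maj_spec(x, y, z) == Maj_opt(x, y, z));
            }
    printf(ok ? "Ch/Maj forms are equivalent\n" : "mismatch\n");
    return 0;
}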

View File

@@ -73,15 +73,15 @@ L_SHA3_transform_crypto_r:
.xword 0x8000000080008008
#ifndef __APPLE__
.text
.globl BlockSha3
.type BlockSha3,@function
.globl BlockSha3_crypto
.type BlockSha3_crypto,@function
.align 2
BlockSha3:
BlockSha3_crypto:
#else
.section __TEXT,__text
.globl _BlockSha3
.globl _BlockSha3_crypto
.p2align 2
_BlockSha3:
_BlockSha3_crypto:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
@@ -204,9 +204,9 @@ L_sha3_crypto_begin:
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size BlockSha3,.-BlockSha3
.size BlockSha3_crypto,.-BlockSha3_crypto
#endif /* __APPLE__ */
#else
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#ifndef __APPLE__
.text
.type L_SHA3_transform_base_r, %object
@@ -247,15 +247,15 @@ L_SHA3_transform_base_r:
.xword 0x8000000080008008
#ifndef __APPLE__
.text
.globl BlockSha3
.type BlockSha3,@function
.globl BlockSha3_base
.type BlockSha3_base,@function
.align 2
BlockSha3:
BlockSha3_base:
#else
.section __TEXT,__text
.globl _BlockSha3
.globl _BlockSha3_base
.p2align 2
_BlockSha3:
_BlockSha3_base:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-160]!
add x29, sp, #0
@@ -449,9 +449,8 @@ L_SHA3_transform_base_begin:
ldp x29, x30, [sp], #0xa0
ret
#ifndef __APPLE__
.size BlockSha3,.-BlockSha3
.size BlockSha3_base,.-BlockSha3_base
#endif /* __APPLE__ */
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */

View File

@@ -63,7 +63,7 @@ static const word64 L_SHA3_transform_crypto_r[] = {
0x8000000080008008UL,
};
void BlockSha3(word64* state)
void BlockSha3_crypto(word64* state)
{
__asm__ __volatile__ (
#ifdef __APPLE__
@@ -181,7 +181,7 @@ void BlockSha3(word64* state)
);
}
#else
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
static const word64 L_SHA3_transform_base_r[] = {
0x1UL,
0x8082UL,
@@ -209,7 +209,7 @@ static const word64 L_SHA3_transform_base_r[] = {
0x8000000080008008UL,
};
void BlockSha3(word64* state)
void BlockSha3_base(word64* state)
{
__asm__ __volatile__ (
"stp x29, x30, [sp, #-64]!\n\t"
@@ -397,7 +397,6 @@ void BlockSha3(word64* state)
);
}
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */

View File

@@ -32,7 +32,6 @@
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_SHA512
#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512
#ifndef __APPLE__
.text
.type L_SHA512_transform_neon_len_k, %object
@@ -1093,7 +1092,7 @@ L_sha512_len_neon_start:
#ifndef __APPLE__
.size Transform_Sha512_Len_neon,.-Transform_Sha512_Len_neon
#endif /* __APPLE__ */
#else
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
#ifndef __APPLE__
.text
.type L_SHA512_transform_crypto_len_k, %object

View File

@@ -35,7 +35,6 @@
#include <wolfssl/wolfcrypt/sha512.h>
#ifdef WOLFSSL_SHA512
#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512
static const word64 L_SHA512_transform_neon_len_k[] = {
0x428a2f98d728ae22UL,
0x7137449123ef65cdUL,
@@ -1053,7 +1052,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
);
}
#else
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
static const word64 L_SHA512_transform_crypto_len_k[] = {
0x428a2f98d728ae22UL,
0x7137449123ef65cdUL,

View File

@@ -48,6 +48,7 @@
}
#endif
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#include <wolfssl/wolfcrypt/hash.h>
#include <wolfssl/wolfcrypt/logging.h>
@@ -62,6 +63,11 @@
#include <wolfssl/wolfcrypt/cryptocb.h>
#endif
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512)
static word32 cpuid_flags = 0;
static int cpuid_flags_set = 0;
#endif
#ifdef WOLFSSL_SHA512
static int InitSha512(wc_Sha512* sha512)
@@ -198,6 +204,13 @@ static int InitSha512_Family(wc_Sha512* sha512, void* heap, int devId,
if (ret != 0)
return ret;
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return ret;
@@ -432,6 +445,22 @@ static void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len
}
#undef DATA
#elif defined(__aarch64__)
static WC_INLINE void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data,
word32 len)
{
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
if (IS_AARCH64_SHA512(cpuid_flags)) {
Transform_Sha512_Len_crypto(sha512, data, len);
}
else
#endif
{
Transform_Sha512_Len_neon(sha512, data, len);
}
}
#endif
@@ -855,6 +884,14 @@ int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
return ret;
}
#endif
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
}
#endif
(void)devId;
return ret;

View File

@@ -62,9 +62,9 @@
}
#endif
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
#ifdef USE_INTEL_SPEEDUP
#if defined(USE_INTEL_SPEEDUP) || (defined(__aarch64__) && \
defined(WOLFSSL_ARMASM))
#include <wolfssl/wolfcrypt/cpuid.h>
word32 cpuid_flags;
@@ -81,6 +81,8 @@
#endif
#endif
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
#ifdef WOLFSSL_SHA3_SMALL
/* Rotate a 64-bit value left.
*
@@ -659,11 +661,37 @@ static int InitSha3(wc_Sha3* sha3)
SHA3_BLOCK_N = NULL;
}
}
#define SHA3_FUNC_PTR
#endif
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM)
if (!cpuid_flags_set) {
cpuid_flags = cpuid_get_flags();
cpuid_flags_set = 1;
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
if (IS_AARCH64_SHA3(cpuid_flags)) {
SHA3_BLOCK = BlockSha3_crypto;
SHA3_BLOCK_N = NULL;
}
else
#endif
{
SHA3_BLOCK = BlockSha3_base;
SHA3_BLOCK_N = NULL;
}
}
#define SHA3_FUNC_PTR
#endif
return 0;
}
#if defined(__aarch64__) && defined(WOLFSSL_ARMASM)
void BlockSha3(word64* s)
{
(*SHA3_BLOCK)(s);
}
#endif
/* Update the SHA-3 hash state with message data.
*
* sha3 wc_Sha3 object holding state.
@@ -700,7 +728,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
for (i = 0; i < p; i++) {
sha3->s[i] ^= Load64BitBigEndian(sha3->t + 8 * i);
}
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -709,7 +737,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
}
}
blocks = len / (p * 8U);
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
if ((SHA3_BLOCK_N != NULL) && (blocks > 0)) {
(*SHA3_BLOCK_N)(sha3->s, data, blocks, p * 8U);
len -= blocks * (p * 8U);
@@ -721,7 +749,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
for (i = 0; i < p; i++) {
sha3->s[i] ^= Load64Unaligned(data + 8 * i);
}
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -773,7 +801,7 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
#endif
for (j = 0; l - j >= rate; j += rate) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -785,7 +813,7 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
#endif
}
if (j != l) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(sha3->s);
#else
BlockSha3(sha3->s);
@@ -1503,7 +1531,7 @@ int wc_Shake128_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#endif
for (; (blockCnt > 0); blockCnt--) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(shake->s);
#else
BlockSha3(shake->s);
@@ -1641,7 +1669,7 @@ int wc_Shake256_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#endif
for (; (blockCnt > 0); blockCnt--) {
#ifdef USE_INTEL_SPEEDUP
#ifdef SHA3_FUNC_PTR
(*SHA3_BLOCK)(shake->s);
#else
BlockSha3(shake->s);
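
The sha3.c changes above route every block through SHA3_BLOCK, a pointer selected once in InitSha3() from the cpuid flags, so the exported BlockSha3() name stays stable whichever of BlockSha3_crypto/BlockSha3_base is used. A self-contained sketch of that dispatch shape; the block bodies below are dummies, not the real Keccak routines:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for BlockSha3_crypto/BlockSha3_base; bodies are placeholders. */
typedef void (*block_fn)(uint64_t* s);
static void block_crypto(uint64_t* s) { s[0] ^= 1; /* would use SHA-3 instrs */ }
static void block_base(uint64_t* s)   { s[0] ^= 2; /* plain 64-bit code */ }

static block_fn sha3_block;              /* like SHA3_BLOCK, set once at init */

static void init_sha3(int cpu_has_sha3)
{
    sha3_block = cpu_has_sha3 ? block_crypto : block_base;
}

static void block_sha3(uint64_t* s)      /* analogue of the BlockSha3() wrapper */
{
    (*sha3_block)(s);
}

int main(void)
{
    uint64_t state[25] = {0};             /* Keccak state is 25 lanes */
    init_sha3(0);                          /* pretend the SHA-3 extension is absent */
    block_sha3(state);
    printf("state[0] = %llu\n", (unsigned long long)state[0]);
    return 0;
}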

View File

@@ -226,8 +226,13 @@ WOLFSSL_LOCAL void sha3_block_n_bmi2(word64* s, const byte* data, word32 n,
WOLFSSL_LOCAL void sha3_block_bmi2(word64* s);
WOLFSSL_LOCAL void sha3_block_avx2(word64* s);
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM)
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
WOLFSSL_LOCAL void BlockSha3_crypto(word64 *s);
#endif
#if defined(WOLFSSL_ARMASM) || defined(WOLFSSL_RISCV_ASM)
WOLFSSL_LOCAL void BlockSha3_base(word64 *s);
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#elif defined(WOLFSSL_ARMASM) || defined(WOLFSSL_RISCV_ASM)
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#endif

View File

@@ -228,14 +228,11 @@ struct wc_Sha512 {
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512
void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data,
word32 len);
#define Transform_Sha512_Len Transform_Sha512_Len_neon
#else
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
word32 len);
#define Transform_Sha512_Len Transform_Sha512_Len_crypto
#endif
#else
extern void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data,