diff --git a/.gitignore b/.gitignore index 92d94d892..6149b85d7 100644 --- a/.gitignore +++ b/.gitignore @@ -321,3 +321,8 @@ doc/pdf # XCODE Index IDE/XCODE/Index + +# ARM DS-5 +\.settings/ +\.cproject +\.project diff --git a/IDE/ECLIPSE/SIFIVE/Makefile b/IDE/ECLIPSE/SIFIVE/Makefile new file mode 100644 index 000000000..594686aa2 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/Makefile @@ -0,0 +1,38 @@ +PROGRAM ?= wolfcrypt + +# This line must be added in your freedom-e-sdk/scripts/standalone.mk +# RISCV_CFLAGS += -I$(WOLFSSL_SRC_DIR) -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE -DWOLFSSL_USER_SETTINGS +# WOLFSSL_SRC_DIR variable must be set in the environment when GNU make is started. +# export WOLFSSL_SRC_DIR=~/freedom-e-sdk/software/wolfssl + +WOLFSSL_CFLAGS += -I$(WOLFSSL_SRC_DIR) \ + -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE \ + -DWOLFSSL_USER_SETTINGS + +SRC_FILES = $(wildcard $(WOLFSSL_SRC_DIR)/src/*.c) +SRC_FILES += $(wildcard $(WOLFSSL_SRC_DIR)/wolfcrypt/src/*.c) +SRC_FILES := $(filter-out %bio.c %misc.c %evp.c, $(SRC_FILES)) + +SRC =$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE/main.c \ + $(SRC_FILES) \ + $(WOLFSSL_SRC_DIR)/wolfcrypt/test/test.c \ + $(WOLFSSL_SRC_DIR)/wolfcrypt/benchmark/benchmark.c + +OPT_CFLAGS = -specs=nano.specs +#OPT_CFLAGS += -O3 -DTIME -DNOENUM -Wno-implicit -mexplicit-relocs -save-temps +#OPT_CFLAGS += -fno-inline -fno-builtin-printf -fno-common -falign-functions=4 + +# override the __stack_size and __heap_size default values of 0x400 +# SiFive HiFive1 has 16KB of data SRAM +# The __stack_size and __heap_size symbols are defined in the linker metal.default.ld +# script in the freedom-e-sdk. +override CFLAGS += $(OPT_CFLAGS) $(WOLFSSL_CFLAGS) \ + -Xlinker --defsym=__stack_size=0x1200 \ + -Xlinker --defsym=__heap_size=0x800 + + +$(PROGRAM): $(SRC) + $(CC) $(CFLAGS) $(SRC) $(LDFLAGS) $(LDLIBS) -o $@ + +clean: + rm -f $(PROGRAM) $(PROGRAM).hex diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md new file mode 100644 index 000000000..3e7e39303 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -0,0 +1,199 @@ +# SiFive RISC-V HiFive1 Port + +## Overview +You can enable the wolfSSL support for RISC-V using the `#define WOLFSSL_SIFIVE_RISC_V`. + +## Prerequisites +1. Follow the instructions on the SiFive GitHub [here](https://github.com/sifive/freedom-e-sdk) and SiFive website [here](https://www.sifive.com/) to download the freedom-e-sdk and software tools. +3. Run a simple hello application on your development board to confirm that your board functions as expected and the communication between your computer and the board works. + +## Usage +You can start with a wolfcrypt example project to integrate the wolfSSL source code. +wolfSSL supports a compile-time user configurable options in the `IDE/ECLIPSE/SIFIVE/user_settings.h` file. + +The `IDE/ECLIPSE/SIFIVE/main.c` example application provides a function to run the selected examples at compile time through the following two #defines in user_settings.h. You can define these macro options to disable the test run. +``` +- #undef NO_CRYPT_TEST +- #undef NO_CRYPT_BENCHMARK +``` + +## Setup +### Setting up the SDK with wolfSSL +1. Download the wolfSSL source code or a zip file from GitHub and place it under your SDK `$HOME` directory. You can also copy or simlink to the source. +``` + For example, + $ cd $HOME + $ git clone --depth=1 https://github.com/wolfSSL/wolfssl.git + +``` +2. Copy the wolfcrypt example project into your `freedom-e-sdk/software` directory. 
+
+```
+ $ cp -rf ~/wolfssl/IDE/ECLIPSE/SIFIVE ~/freedom-e-sdk/software/wolfcrypt
+```
+
+3. Edit your `~/freedom-e-sdk/scripts/standalone.mk` and add the following line after the last `RISCV_CFLAGS` entry:
+
+```
+ RISCV_CFLAGS += -I$(WOLFSSL_SRC_DIR) -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE -DWOLFSSL_USER_SETTINGS
+```
+
+4. The `WOLFSSL_SRC_DIR` variable must be set in the environment when GNU make is started:
+
+```
+ $ export WOLFSSL_SRC_DIR=~/wolfssl
+```
+
+5. Set up your riscv64 cross-compiler. The static-library build below expects `$RISCV_PATH` to point at the toolchain installation, for example:
+
+```
+ $ export RISCV_PATH=/opt/riscv
+```
+6. (Optional) Set up OpenOCD if your target supports it:
+
+```
+ $ export RISCV_OPENOCD_PATH=/opt/riscv-openocd
+```
+## Building and Running
+
+You can build from source or create a static library.
+
+1. Using the command line:
+
+```
+ $ cd freedom-e-sdk
+ $ make PROGRAM=wolfcrypt TARGET=sifive-hifive1-revb CONFIGURATION=debug clean software upload
+```
+This cleans, builds, and uploads the software for the sifive-hifive1-revb target, but you can combine these make goals and build for any of the supported targets.
+
+Review the test results on the target console.
+
+2. Building a static library for RISC-V using a cross-compiler:
+
+```
+$ cd $WOLFSSL_SRC_DIR
+
+$ ./configure --host=riscv64-unknown-elf \
+CC=riscv64-unknown-elf-gcc \
+AR=riscv64-unknown-elf-ar \
+AS=riscv64-unknown-elf-as \
+RANLIB=$RISCV_PATH/bin/riscv64-unknown-elf-gcc-ranlib \
+LD=riscv64-unknown-elf-ld \
+CXX=riscv64-unknown-elf-g++ \
+--disable-examples --enable-static --disable-shared \
+CFLAGS="-march=rv32imac -mabi=ilp32 -mcmodel=medlow -ffunction-sections -fdata-sections -I~/freedom-e-sdk/bsp/sifive-hifive1/install/include -O0 -g -DNO_FILESYSTEM -DWOLFSSL_NO_SOCK -DNO_WRITEV -DWOLFCRYPT_ONLY -DWOLFSSL_SIFIVE_RISC_V"
+
+$ make
+$ sudo make install
+```
+You can now build your software and link it against the wolfSSL `libwolfssl.a` static library.
+
+### `wolfcrypt_test()`
+
+wolfcrypt_test() prints a message on the target console similar to the following output:
+
+```
+SiFive HiFive1 Demo
+Setting clock to 320MHz
+Actual Clock 320MHz
+
+error test passed!
+MEMORY test passed!
+base64 test passed!
+asn test passed!
+SHA test passed!
+SHA-256 test passed!
+SHA-512 test passed!
+Hash test passed!
+HMAC-SHA test passed!
+HMAC-SHA256 test passed!
+HMAC-SHA512 test passed!
+GMAC test passed!
+Chacha test passed!
+POLY1305 test passed!
+ChaCha20-Poly1305 AEAD test passed!
+AES test passed!
+AES192 test passed!
+AES256 test passed!
+AES-GCM test passed!
+RANDOM test passed!
+ECC test passed!
+ECC buffer test passed!
+CURVE25519 test passed!
+ED25519 test passed!
+logging test passed!
+mutex test passed!
+Test complete
+```
+### `benchmark_test()`
+
+benchmark_test() prints a message on the target console similar to the following output.
+
+TARGET=sifive-hifive1-revb:
+
+```
+SiFive HiFive1 Demo
+Setting clock to 320MHz
+Actual Clock 320MHz
+
+------------------------------------------------------------------------------
+ wolfSSL version 4.0.0
+------------------------------------------------------------------------------
+wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each)
+RNG 250 KB took 1.098 seconds, 227.714 KB/s
+AES-128-CBC-enc 50 KB took 1.132 seconds, 44.175 KB/s
+AES-128-CBC-dec 50 KB took 1.142 seconds, 43.778 KB/s
+AES-192-CBC-enc 50 KB took 1.250 seconds, 40.007 KB/s
+AES-192-CBC-dec 50 KB took 1.260 seconds, 39.677 KB/s
+AES-256-CBC-enc 50 KB took 1.368 seconds, 36.552 KB/s
+AES-256-CBC-dec 50 KB took 1.378 seconds, 36.279 KB/s
+AES-128-GCM-enc 25 KB took 1.225 seconds, 20.412 KB/s
+AES-128-GCM-dec 25 KB took 1.225 seconds, 20.402 KB/s
+AES-192-GCM-enc 25 KB took 1.290 seconds, 19.373 KB/s
+AES-192-GCM-dec 25 KB took 1.291 seconds, 19.366 KB/s
+AES-256-GCM-enc 25 KB took 1.352 seconds, 18.487 KB/s
+AES-256-GCM-dec 25 KB took 1.353 seconds, 18.478 KB/s
+CHACHA 1 MB took 1.006 seconds, 1.020 MB/s
+CHA-POLY 700 KB took 1.032 seconds, 678.045 KB/s
+POLY1305 2 MB took 1.007 seconds, 2.255 MB/s
+SHA 2 MB took 1.002 seconds, 1.511 MB/s
+SHA-256 525 KB took 1.011 seconds, 519.279 KB/s
+SHA-512 275 KB took 1.017 seconds, 270.477 KB/s
+HMAC-SHA 1 MB took 1.013 seconds, 1.399 MB/s
+HMAC-SHA256 525 KB took 1.019 seconds, 515.020 KB/s
+HMAC-SHA512 275 KB took 1.032 seconds, 266.351 KB/s
+ECC 256 key gen 2 ops took 1.104 sec, avg 551.834 ms, 1.812 ops/sec
+ECDHE 256 agree 2 ops took 1.101 sec, avg 550.400 ms, 1.817 ops/sec
+ECDSA 256 sign 2 ops took 1.173 sec, avg 586.502 ms, 1.705 ops/sec
+ECDSA 256 verify 2 ops took 2.153 sec, avg 1076.294 ms, 0.929 ops/sec
+CURVE 25519 key gen 2 ops took 1.629 sec, avg 814.423 ms, 1.228 ops/sec
+CURVE 25519 agree 2 ops took 1.626 sec, avg 813.156 ms, 1.230 ops/sec
+ED 25519 key gen 1 ops took 1.436 sec, avg 1436.096 ms, 0.696 ops/sec
+ED 25519 sign 2 ops took 2.913 sec, avg 1456.421 ms, 0.687 ops/sec
+ED 25519 verify 2 ops took 5.012 sec, avg 2506.012 ms, 0.399 ops/sec
+Benchmark complete
+```
+
+## Tested Configurations
+- P-RNG (NIST DRBG) with SHA-256
+- SHA 1/256/512
+- AES 128/192/256 CBC/GCM
+- ECC 256 sign/verify/shared secret with fast math or Single Precision (SP) library
+- ED25519/Curve25519
+- HMAC
+- ChaCha20/Poly1305
+
+## Known Caveats
+- If the wolfCrypt test gets stuck in the `early_trap_vector` error handler, it is likely related to memory issues
+- The default `__stack_size` value of 0x400 is not enough for the ECC test to pass.
+The `IDE/ECLIPSE/SIFIVE/Makefile` overrides it with 0x1200 (4.5 KB), as shown below
+- Enabling RSA will cause the ECC test to fail due to memory shortage.
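+
+For reference, these are the linker symbol overrides used by the example `IDE/ECLIPSE/SIFIVE/Makefile`, trimmed here to just the `--defsym` options (the full rule also appends `$(OPT_CFLAGS)` and `$(WOLFSSL_CFLAGS)`). The values are a starting point for the HiFive1's 16KB of data SRAM and may need tuning for other targets:
+
+```
+# Override the __stack_size and __heap_size default values of 0x400
+# defined in the freedom-e-sdk metal.default.ld linker script
+override CFLAGS += -Xlinker --defsym=__stack_size=0x1200 \
+                   -Xlinker --defsym=__heap_size=0x800
+```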
+ +## References + +The test results were collected from a SiFive reference platform target with the following hardware, software and tool chains: +- HiFive1 Rev A/Rev B: HiFive1 Development Board with the Freedom Everywhere SoC, E300 +- freedom-e-sdk +- wolfssl [latest version](https://github.com/wolfSSL/wolfssl) + +For more information or questions, please email [support@wolfssl.com](mailto:support@wolfssl.com) diff --git a/IDE/ECLIPSE/SIFIVE/include.am b/IDE/ECLIPSE/SIFIVE/include.am new file mode 100644 index 000000000..5f9550dc2 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/include.am @@ -0,0 +1,9 @@ +# vim:ft=automake +# included from Top Level Makefile.am +# All paths should be given relative to the root + +EXTRA_DIST += \ + IDE/ECLIPSE/SIFIVE/README.md \ + IDE/ECLIPSE/SIFIVE/main.c \ + IDE/ECLIPSE/SIFIVE/Makefile\ + IDE/ECLIPSE/SIFIVE/user_settings.h diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c new file mode 100644 index 000000000..dc33ac163 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -0,0 +1,184 @@ +/* main.c + * + * Copyright (C) 2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ +#include +#include +#include + +/* wolfCrypt_Init/wolfCrypt_Cleanup */ +#include +#include +#include +#include + +#ifndef __METAL_MACHINE_HEADER +#define __METAL_MACHINE_HEADER "../../../../bsp/sifive-hifive1-revb/metal.h" +#endif +#include + +#ifndef NO_CRYPT_BENCHMARK + +/*-specs=nano.specs doesn’t include support for floating point in printf()*/ +asm (".global _printf_float"); + +#ifndef RTC_FREQ +#define RTC_FREQ 32768UL +#endif + +/* CLINT Registers (Core Local Interruptor) for time */ +#define CLINT_BASE 0x02000000UL +#define CLINT_REG_MTIME (*((volatile uint32_t *)(CLINT_BASE + 0xBFF8))) + +#define WOLFSSL_SIFIVE_RISC_V_DEBUG 0 + +double current_time(int reset) +{ + double now = CLINT_REG_MTIME; + (void)reset; + return now/RTC_FREQ; +} +#endif /* !NO_CRYPT_BENCHMARK */ + +#if WOLFSSL_SIFIVE_RISC_V_DEBUG +void check(int depth) { + char ch; + char *ptr = malloc(1); + + printf("stack at %p, heap at %p\n", &ch, ptr); + if (depth <= 0) + return; + + check(depth-1); + free(ptr); +} + +void mtime_sleep(uint32_t ticks) { + uint32_t start = CLINT_REG_MTIME; + + while((CLINT_REG_MTIME - start) < ticks) { + + } +} + +void delay(uint32_t sec) { + uint32_t ticks = sec * RTC_FREQ; + mtime_sleep(ticks); +} +#endif /* WOLFSSL_SIFIVE_RISC_V_DEBUG */ + +/* RNG CODE */ +/* TODO: Implement real RNG */ +static unsigned int gCounter; +unsigned int hw_rand(void) +{ + /* #warning Must implement your own random source */ + + return ++gCounter; +} + +unsigned int my_rng_seed_gen(void) +{ + return hw_rand(); +} + +int my_rng_gen_block(unsigned char* output, unsigned int sz) +{ + uint32_t i = 0; + uint32_t randReturnSize = sizeof(CUSTOM_RAND_TYPE); + + while (i < sz) + { + /* If not 
aligned or there is odd/remainder */ + if((i + randReturnSize) > sz || + ((uint32_t)&output[i] % randReturnSize) != 0 ) { + /* Single byte at a time */ + output[i++] = (unsigned char)my_rng_seed_gen(); + } + else { + /* Use native 8, 16, 32 or 64 copy instruction */ + *((CUSTOM_RAND_TYPE*)&output[i]) = my_rng_seed_gen(); + i += randReturnSize; + } + } + + return 0; +} + + +#if !defined(NO_CLOCK_SPEEDUP) && !defined(USE_CLOCK_HZ) + /* 320MHz */ + #define USE_CLOCK_HZ 320000000UL +#endif + +int main(void) +{ + int ret; + long clk_Hz = 16000000; /* default */ + +#if WOLFSSL_SIFIVE_RISC_V_DEBUG + printf("check stack and heap addresses\n"); + check(8); + printf("sleep for 10 seconds to verify timer, measure using a stopwatch\n"); + delay(10); + printf("awake after sleeping for 10 seconds\n"); +#endif + +#ifdef USE_CLOCK_HZ + /* Speed up clock */ + printf("SiFive HiFive1 Demo\n"); + printf("Setting clock to %dMHz\n", USE_CLOCK_HZ/1000000); + clk_Hz = metal_clock_set_rate_hz( + &__METAL_DT_SIFIVE_FE310_G000_PLL_HANDLE->clock, USE_CLOCK_HZ + ); +#endif + printf("Actual Clock %dMHz\n", clk_Hz/1000000); + + /* Reconfigure the SPI Bus for dual mode */ + #define QSPI0_CTRL 0x10014000UL + #define FESPI_REG_FFMT (*((volatile uint32_t *)(QSPI0_CTRL + 0x64))) + FESPI_REG_FFMT = 0xbb1447; + +#ifdef DEBUG_WOLFSSL + wolfSSL_Debugging_ON(); +#endif + + if ((ret = wolfCrypt_Init()) != 0) { + printf("wolfCrypt_Init failed %d\n", ret); + return -1; + } + +#ifndef NO_CRYPT_TEST + printf("\nwolfCrypt Test Started\n"); + wolfcrypt_test(NULL); + printf("\nwolfCrypt Test Completed\n"); +#endif + +#ifndef NO_CRYPT_BENCHMARK + printf("\nBenchmark Test Started\n"); + benchmark_test(NULL); + printf("\nBenchmark Test Completed\n"); +#endif + + if ((ret = wolfCrypt_Cleanup()) != 0) { + printf("wolfCrypt_Cleanup failed %d\n", ret); + return -1; + } + return 0; +} diff --git a/IDE/ECLIPSE/SIFIVE/user_settings.h b/IDE/ECLIPSE/SIFIVE/user_settings.h new file mode 100644 index 000000000..2f7f136cb --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/user_settings.h @@ -0,0 +1,592 @@ +/* user_settings.h + * + * Copyright (C) 2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Example Settings for SiFive HiFive1 */ + +#ifndef WOLFSSL_USER_SETTINGS_H +#define WOLFSSL_USER_SETTINGS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* ------------------------------------------------------------------------- */ +/* SiFive HiFive */ +/* ------------------------------------------------------------------------- */ +#undef WOLFSSL_SIFIVE_RISC_V +#define WOLFSSL_SIFIVE_RISC_V + + +/* ------------------------------------------------------------------------- */ +/* Platform */ +/* ------------------------------------------------------------------------- */ + +#undef WOLFSSL_GENERAL_ALIGNMENT +#define WOLFSSL_GENERAL_ALIGNMENT 4 + +#undef SINGLE_THREADED +#define SINGLE_THREADED + +#undef WOLFSSL_SMALL_STACK +#define WOLFSSL_SMALL_STACK + +#undef WOLFSSL_USER_IO +#define WOLFSSL_USER_IO + + +/* ------------------------------------------------------------------------- */ +/* Math Configuration */ +/* ------------------------------------------------------------------------- */ +#undef SIZEOF_LONG_LONG +#define SIZEOF_LONG_LONG 8 + +#undef USE_FAST_MATH + +#if 1 + #define USE_FAST_MATH + + #undef TFM_TIMING_RESISTANT + #define TFM_TIMING_RESISTANT + + /* Optimizations */ + //#define TFM_ARM +#endif + + +/* ------------------------------------------------------------------------- */ +/* Asymmetric */ +/* ------------------------------------------------------------------------- */ +/* RSA */ +/* Not enabled due to memory constraints on HiFive1 */ +#undef NO_RSA +#if 0 + #ifdef USE_FAST_MATH + /* Maximum math bits (Max RSA key bits * 2) */ + #undef FP_MAX_BITS + #define FP_MAX_BITS 4096 + #endif + + /* half as much memory but twice as slow */ + #undef RSA_LOW_MEM + #define RSA_LOW_MEM + + /* Enables blinding mode, to prevent timing attacks */ + #if 1 + #undef WC_RSA_BLINDING + #define WC_RSA_BLINDING + #else + #undef WC_NO_HARDEN + #define WC_NO_HARDEN + #endif + + /* RSA PSS Support */ + #if 0 + #define WC_RSA_PSS + #endif + + #if 0 + #define WC_RSA_NO_PADDING + #endif +#else + #define NO_RSA +#endif + +/* ECC */ +#undef HAVE_ECC +#if 1 + #define HAVE_ECC + + /* Manually define enabled curves */ + #undef ECC_USER_CURVES + #define ECC_USER_CURVES + + #ifdef ECC_USER_CURVES + /* Manual Curve Selection, FP_MAX_BITS must be adjusted accordingly */ + // #define HAVE_ECC192 + // #define HAVE_ECC224 + #undef NO_ECC256 + // #define HAVE_ECC384 + // #define HAVE_ECC521 + #endif + + /* Fixed point cache (speeds repeated operations against same private key) */ + #undef FP_ECC + //#define FP_ECC + #ifdef FP_ECC + /* Bits / Entries */ + #undef FP_ENTRIES + #define FP_ENTRIES 2 + #undef FP_LUT + #define FP_LUT 4 + #endif + + /* Optional ECC calculation method */ + /* Note: doubles heap usage, but slightly faster */ + #undef ECC_SHAMIR + //#define ECC_SHAMIR + + /* Reduces heap usage, but slower */ + #undef ECC_TIMING_RESISTANT + #define ECC_TIMING_RESISTANT + + /* Enable cofactor support */ + #undef HAVE_ECC_CDH + //#define HAVE_ECC_CDH + + /* Validate import */ + #undef WOLFSSL_VALIDATE_ECC_IMPORT + //#define WOLFSSL_VALIDATE_ECC_IMPORT + + /* Compressed Key Support */ + #undef HAVE_COMP_KEY + //#define HAVE_COMP_KEY + + /* Use alternate ECC size for ECC math */ + #ifdef USE_FAST_MATH + #ifdef NO_RSA + /* Custom fastmath size if not using RSA */ + /* MAX = 
ROUND32(ECC BITS 256) + SIZE_OF_MP_DIGIT(32) */ + #undef FP_MAX_BITS + #define FP_MAX_BITS (256 + 32) + #else + #undef ALT_ECC_SIZE + /* Disable alternate ECC size, since it uses HEAP allocations. + Heap is limited resource on HiFive1 */ + //#define ALT_ECC_SIZE + #endif + #endif +#endif + +/* DH */ +#undef NO_DH +#if 0 + /* Use table for DH instead of -lm (math) lib dependency */ + #if 0 + #define WOLFSSL_DH_CONST + #endif + + #define HAVE_FFDHE_2048 + //#define HAVE_FFDHE_4096 + //#define HAVE_FFDHE_6144 + //#define HAVE_FFDHE_8192 +#else + #define NO_DH +#endif + + +/* Wolf Single Precision Math */ +/* Optional ECC SECP256R1 acceleration using optimized C code */ +#undef WOLFSSL_SP +#if 1 + #define WOLFSSL_SP + #define WOLFSSL_SP_SMALL /* use smaller version of code (requires heap) */ + #define SP_WORD_SIZE 32 /* force 32-bit type */ + #define WOLFSSL_SP_MATH /* only SP math - eliminates fast math code */ + //#define WOLFSSL_SP_DIV_32 /* do not use 64-bit divides */ + + #ifdef HAVE_ECC + #define WOLFSSL_HAVE_SP_ECC + #endif + #ifndef NO_RSA + #define WOLFSSL_HAVE_SP_RSA + #endif +#endif + +/* Ed25519 / Curve25519 */ +#undef HAVE_CURVE25519 +#undef HAVE_ED25519 +#if 1 + #define HAVE_CURVE25519 + #define HAVE_ED25519 /* ED25519 Requires SHA512 */ + + /* Optionally use small math (less flash usage, but much slower) */ + #if 1 + /* Curve and Ed 25519 small */ + #define CURVED25519_SMALL + #endif +#endif + + +/* ------------------------------------------------------------------------- */ +/* Symmetric Ciphers */ +/* ------------------------------------------------------------------------- */ + +/* AES */ +#undef NO_AES +#if 1 + #undef HAVE_AES_CBC + #define HAVE_AES_CBC + + #undef HAVE_AESGCM + #define HAVE_AESGCM + + /* GCM Method: GCM_SMALL, GCM_WORD32 or GCM_TABLE */ + #define GCM_SMALL + + #undef WOLFSSL_AES_DIRECT + //#define WOLFSSL_AES_DIRECT + + #undef HAVE_AES_ECB + //#define HAVE_AES_ECB + + #undef WOLFSSL_AES_COUNTER + //#define WOLFSSL_AES_COUNTER + + #undef HAVE_AESCCM + //#define HAVE_AESCCM +#endif + +/* DES3 */ +#undef NO_DES3 +#if 0 +#else + #define NO_DES3 +#endif + +/* ChaCha20 / Poly1305 */ +#undef HAVE_CHACHA +#undef HAVE_POLY1305 +#if 1 + #define HAVE_CHACHA + #define HAVE_POLY1305 + + /* Needed for Poly1305 */ + #undef HAVE_ONE_TIME_AUTH + #define HAVE_ONE_TIME_AUTH +#endif + + +/* ------------------------------------------------------------------------- */ +/* Symmetric Hashing */ +/* ------------------------------------------------------------------------- */ +/* Sha */ +#undef NO_SHA +#if 1 + /* 1k smaller, but 25% slower */ + //#define USE_SLOW_SHA +#else + #define NO_SHA +#endif + +/* Sha256 */ +#undef NO_SHA256 +#if 1 + /* not unrolled - ~2k smaller and ~25% slower */ + //#define USE_SLOW_SHA256 + + /* Sha224 */ + #if 0 + #define WOLFSSL_SHA224 + #endif +#else + #define NO_SHA256 +#endif + +/* Sha512 */ +#undef WOLFSSL_SHA512 +#if 1 + #define WOLFSSL_SHA512 + + /* Sha384 */ + #undef WOLFSSL_SHA384 + #if 0 + #define WOLFSSL_SHA384 + #endif + + /* over twice as small, but 50% slower */ + #define USE_SLOW_SHA512 +#endif + +/* Sha3 */ +#undef WOLFSSL_SHA3 +#if 0 + #define WOLFSSL_SHA3 +#endif + +/* MD5 */ +#undef NO_MD5 +#if 0 + +#else + #define NO_MD5 +#endif + +/* Blake2B */ +#undef HAVE_BLAKE2 +#if 0 + #define HAVE_BLAKE2 +#endif + +/* Blake2S */ +#undef HAVE_BLAKE2S +#if 0 + #define HAVE_BLAKE2S +#endif + +/* HKDF */ +#undef HAVE_HKDF +#if 0 + #define HAVE_HKDF +#endif + +/* CMAC */ +#undef WOLFSSL_CMAC +#if 0 + #define WOLFSSL_CMAC +#endif + + +/* 
------------------------------------------------------------------------- */ +/* Benchmark / Test */ +/* ------------------------------------------------------------------------- */ +/* Use reduced benchmark / test sizes */ +#undef BENCH_EMBEDDED +#define BENCH_EMBEDDED + +#undef USE_CERT_BUFFERS_2048 +#define USE_CERT_BUFFERS_2048 + +#undef USE_CERT_BUFFERS_1024 +//#define USE_CERT_BUFFERS_1024 + +#undef USE_CERT_BUFFERS_256 +#define USE_CERT_BUFFERS_256 + + +/* ------------------------------------------------------------------------- */ +/* Debugging */ +/* ------------------------------------------------------------------------- */ + +#undef DEBUG_WOLFSSL +#undef NO_ERROR_STRINGS +#if 0 + #define DEBUG_WOLFSSL +#else + #if 0 + #define NO_ERROR_STRINGS + #endif +#endif + + +/* ------------------------------------------------------------------------- */ +/* Memory */ +/* ------------------------------------------------------------------------- */ + +/* Override Memory API's */ +#if 0 + #undef XMALLOC_OVERRIDE + #define XMALLOC_OVERRIDE + + /* prototypes for user heap override functions */ + /* Note: Realloc only required for normal math */ + #include /* for size_t */ + extern void *myMalloc(size_t n, void* heap, int type); + extern void myFree(void *p, void* heap, int type); + extern void *myRealloc(void *p, size_t n, void* heap, int type); + + #define XMALLOC(n, h, t) myMalloc(n, h, t) + #define XFREE(p, h, t) myFree(p, h, t) + #define XREALLOC(p, n, h, t) myRealloc(p, n, h, t) +#endif + +/* Static memory */ +#if 0 + /* Static memory requires fast math */ + #define WOLFSSL_STATIC_MEMORY + + /* Disable fallback malloc/free */ + #define WOLFSSL_NO_MALLOC + #if 1 + #define WOLFSSL_MALLOC_CHECK /* trap malloc failure */ + #endif +#endif + +/* Memory callbacks */ +#if 0 + #undef USE_WOLFSSL_MEMORY + #define USE_WOLFSSL_MEMORY + + /* Use this to measure / print heap usage */ + #if 1 + #undef WOLFSSL_TRACK_MEMORY + #define WOLFSSL_TRACK_MEMORY + + #undef WOLFSSL_DEBUG_MEMORY + #define WOLFSSL_DEBUG_MEMORY + #endif +#else + #ifndef WOLFSSL_STATIC_MEMORY + #define NO_WOLFSSL_MEMORY + /* Otherwise we will use stdlib malloc, free and realloc */ + #endif +#endif + + +/* ------------------------------------------------------------------------- */ +/* Port */ +/* ------------------------------------------------------------------------- */ + +/* Override Current Time */ +#if defined(WOLFSSL_SIFIVE_RISC_V) + #define WOLFSSL_USER_CURRTIME /* for benchmarks, uses "custom_time()" function */ + #define WOLFSSL_GMTIME + #define USER_TICKS +#else + // extern unsigned long my_time(unsigned long* timer); + // #define XTIME my_time +#endif + +/* ------------------------------------------------------------------------- */ +/* RNG */ +/* ------------------------------------------------------------------------- */ +#if 0 + /* Bypass P-RNG and use only HW RNG */ + #define CUSTOM_RAND_TYPE unsigned int + extern int my_rng_gen_block(unsigned char* output, unsigned int sz); + #undef CUSTOM_RAND_GENERATE_BLOCK + #define CUSTOM_RAND_GENERATE_BLOCK my_rng_gen_block +#else + #define HAVE_HASHDRBG + + /* Seed Source */ + /* Size of returned HW RNG value */ + #define CUSTOM_RAND_TYPE unsigned int + extern unsigned int my_rng_seed_gen(void); + #undef CUSTOM_RAND_GENERATE + #define CUSTOM_RAND_GENERATE my_rng_seed_gen +#endif + +/* ------------------------------------------------------------------------- */ +/* Enable Features */ +/* ------------------------------------------------------------------------- */ +#undef 
WOLFSSL_TLS13 +#if 0 + #define WOLFSSL_TLS13 +#endif + +#undef WOLFSSL_KEY_GEN +#if 0 + #define WOLFSSL_KEY_GEN +#endif + +/* reduce DH test time */ +#define WOLFSSL_OLD_PRIME_CHECK + +#undef KEEP_PEER_CERT +//#define KEEP_PEER_CERT + +#undef HAVE_COMP_KEY +//#define HAVE_COMP_KEY + +#undef HAVE_TLS_EXTENSIONS +#define HAVE_TLS_EXTENSIONS + +#undef HAVE_SUPPORTED_CURVES +#define HAVE_SUPPORTED_CURVES + +#undef WOLFSSL_BASE64_ENCODE +//#define WOLFSSL_BASE64_ENCODE + +/* TLS Session Cache */ +#if 0 + #define SMALL_SESSION_CACHE +#else + #define NO_SESSION_CACHE +#endif + + +/* ------------------------------------------------------------------------- */ +/* Disable Features */ +/* ------------------------------------------------------------------------- */ +#undef NO_WOLFSSL_SERVER +//#define NO_WOLFSSL_SERVER + +#undef NO_WOLFSSL_CLIENT +//#define NO_WOLFSSL_CLIENT + +#undef NO_CRYPT_TEST +//#define NO_CRYPT_TEST + +#undef NO_CRYPT_BENCHMARK +//#define NO_CRYPT_BENCHMARK + +#undef WOLFCRYPT_ONLY +//#define WOLFCRYPT_ONLY + +/* In-lining of misc.c functions */ +/* If defined, must include wolfcrypt/src/misc.c in build */ +/* Slower, but about 1k smaller */ +#undef NO_INLINE +//#define NO_INLINE + +#undef NO_FILESYSTEM +#define NO_FILESYSTEM + +#undef NO_WRITEV +#define NO_WRITEV + +#undef NO_MAIN_DRIVER +#define NO_MAIN_DRIVER + +#undef NO_DEV_RANDOM +#define NO_DEV_RANDOM + +#undef NO_DSA +#define NO_DSA + +#undef NO_RC4 +#define NO_RC4 + +#undef NO_OLD_TLS +#define NO_OLD_TLS + +#undef NO_HC128 +#define NO_HC128 + +#undef NO_RABBIT +#define NO_RABBIT + +#undef NO_PSK +#define NO_PSK + +#undef NO_MD4 +#define NO_MD4 + +#undef NO_PWDBASED +#define NO_PWDBASED + +#undef NO_CODING +//#define NO_CODING + +#undef NO_ASN_TIME +//#define NO_ASN_TIME + +#undef NO_CERTS +//#define NO_CERTS + +#undef NO_SIG_WRAPPER +//#define NO_SIG_WRAPPER + +#ifdef __cplusplus +} +#endif + +#endif /* WOLFSSL_USER_SETTINGS_H */ diff --git a/IDE/LPCXPRESSO/README.md b/IDE/LPCXPRESSO/README.md index 9a93c021a..e934caa20 100644 --- a/IDE/LPCXPRESSO/README.md +++ b/IDE/LPCXPRESSO/README.md @@ -2,15 +2,16 @@ To use, install the NXP LPCXpresso IDE and import the projects in a new workspace. -1. Run LPCXpresso and choose a workspace location. -2. Right click in the project exporer window and choose Inport. -3. Under General choose "Existing Projects into Workspace". -4. Under "Select root directory" click browse and select the wolfSSL root. -5. Check the "Search for nested projects" box. -5. Make sure "wolfssl" and "wolfssl_example" are checked under "Projects:". -6. Click finish. -7. Download the board and chip LPCOpen package for your platform. -8. Import the projects. For example "lpc_board_nxp_lpcxpresso_1837" and "lpc_chip_18xx" are the ones for the LPC18S37. +1. Change names of `LPCExpresso.project` and `LPCExpresso.cproject` files to `.project` and `.cproject` +2. Run LPCXpresso and choose a workspace location. +3. Right click in the project explorer window and choose Import. +4. Under General choose "Existing Projects into Workspace". +5. Under "Select root directory" click browse and select the wolfSSL root. +6. Check the "Search for nested projects" box. +7. Make sure "wolfssl" and "wolfssl_example" are checked under "Projects:". +8. Click finish. +9. Download the board and chip LPCOpen package for your platform. +10. Import the projects. For example "lpc_board_nxp_lpcxpresso_1837" and "lpc_chip_18xx" are the ones for the LPC18S37. 
To setup this example to work with different baords/chips you will need to locate the LPCOpen sources for LPCXpresso on the NXP website and import the board and chip projects. Then you will need to update the "wolfssl_example" project properties to reference these projects (C/C++ General -> Paths and Symbols -> References). See the [LPCOpen v2.xx LPCXpresso quickstart guide for all platforms](https://www.lpcware.com/content/project/lpcopen-platform-nxp-lpc-microcontrollers/lpcopen-v200-quickstart-guides/lpcopen-1) for additional information. diff --git a/IDE/include.am b/IDE/include.am index 205ee6a35..a70a88fef 100644 --- a/IDE/include.am +++ b/IDE/include.am @@ -18,6 +18,7 @@ include IDE/GCC-ARM/include.am include IDE/CSBENCH/include.am include IDE/ECLIPSE/DEOS/include.am include IDE/ECLIPSE/MICRIUM/include.am +include IDE/ECLIPSE/SIFIVE/include.am include IDE/mynewt/include.am include IDE/Renesas/cs+/Projects/include.am include IDE/Renesas/e2studio/Projects/include.am diff --git a/.cproject b/LPCExpresso.cproject similarity index 100% rename from .cproject rename to LPCExpresso.cproject diff --git a/.project b/LPCExpresso.project similarity index 100% rename from .project rename to LPCExpresso.project diff --git a/examples/benchmark/tls_bench.c b/examples/benchmark/tls_bench.c index c52935ad7..cd3f8fedd 100644 --- a/examples/benchmark/tls_bench.c +++ b/examples/benchmark/tls_bench.c @@ -611,7 +611,12 @@ static int bench_tls_client(info_t* info) cli_ctx = wolfSSL_CTX_new(wolfTLSv1_3_client_method()); #endif if (!tls13) +#if !defined(WOLFSSL_TLS13) cli_ctx = wolfSSL_CTX_new(wolfSSLv23_client_method()); +#elif !defined(WOLFSSL_NO_TLS12) + cli_ctx = wolfSSL_CTX_new(wolfTLSv1_2_client_method()); +#endif + if (cli_ctx == NULL) { printf("error creating ctx\n"); ret = MEMORY_E; goto exit; @@ -1195,10 +1200,10 @@ static void print_stats(stats_t* wcStat, const char* desc, const char* cipher, i cipher, wcStat->txTotal + wcStat->rxTotal, wcStat->connCount, - wcStat->txTime * 1000, wcStat->rxTime * 1000, - wcStat->txTotal / wcStat->txTime / 1024 / 1024, + wcStat->txTime * 1000, wcStat->rxTotal / wcStat->rxTime / 1024 / 1024, + wcStat->txTotal / wcStat->txTime / 1024 / 1024, wcStat->connTime * 1000, wcStat->connTime * 1000 / wcStat->connCount); } diff --git a/src/bio.c b/src/bio.c index d8349801b..c4b225759 100644 --- a/src/bio.c +++ b/src/bio.c @@ -19,6 +19,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +#include + #if !defined(WOLFSSL_BIO_INCLUDED) #ifndef WOLFSSL_IGNORE_FILE_WARN #warning bio.c does not need to be compiled separately from ssl.c diff --git a/src/include.am b/src/include.am index 4861f00ce..2eb5697e7 100644 --- a/src/include.am +++ b/src/include.am @@ -340,10 +340,14 @@ src_libwolfssl_la_SOURCES += wolfcrypt/src/rabbit.c endif if BUILD_CHACHA +if BUILD_ARMASM +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c +else src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha.c if BUILD_INTELASM src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha_asm.S endif +endif if BUILD_POLY1305 src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c endif diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index f4d041800..71b81086b 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -27,6 +27,10 @@ */ +#ifdef WOLFSSL_ARMASM + /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */ + +#else #ifdef HAVE_CONFIG_H #include #endif @@ -316,3 +320,4 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const 
byte* input, #endif /* HAVE_CHACHA*/ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/evp.c b/wolfcrypt/src/evp.c index 0312a68f6..8db6311c4 100644 --- a/wolfcrypt/src/evp.c +++ b/wolfcrypt/src/evp.c @@ -19,6 +19,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +#include + #if !defined(WOLFSSL_EVP_INCLUDED) #ifndef WOLFSSL_IGNORE_FILE_WARN #warning evp.c does not need to be compiled seperatly from ssl.c @@ -358,15 +360,15 @@ WOLFSSL_API int wolfSSL_EVP_CipherUpdate(WOLFSSL_EVP_CIPHER_CTX *ctx, if ((ctx->flags & WOLFSSL_EVP_CIPH_NO_PADDING) || (ctx->block_size == 1)) { ctx->lastUsed = 0; - XMEMCPY(ctx->lastBlock, &out[ctx->block_size * blocks], ctx->block_size); *outl+= ctx->block_size * blocks; } else { if (inl == 0) { ctx->lastUsed = 1; blocks = blocks - 1; /* save last block to check padding in * EVP_CipherFinal call */ + XMEMCPY(ctx->lastBlock, &out[ctx->block_size * blocks], + ctx->block_size); } - XMEMCPY(ctx->lastBlock, &out[ctx->block_size * blocks], ctx->block_size); *outl+= ctx->block_size * blocks; } } else { @@ -446,7 +448,10 @@ WOLFSSL_API int wolfSSL_EVP_CipherFinal(WOLFSSL_EVP_CIPHER_CTX *ctx, if ((fl = checkPad(ctx, ctx->lastBlock)) >= 0) { XMEMCPY(out, ctx->lastBlock, fl); *outl = fl; - } else return 0; + } + else { + return WOLFSSL_FAILURE; + } } /* return error in cases where the block length is incorrect */ if (ctx->lastUsed == 0 && ctx->bufUsed == 0) { diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index b052136aa..a0f57c5a2 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -157,16 +157,15 @@ fe_frombytes: _fe_frombytes: #endif /* __APPLE__ */ movq $0x7fffffffffffffff, %r9 - # Copy movq (%rsi), %rdx movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 24(%rsi), %r8 + andq %r9, %r8 movq %rdx, (%rdi) movq %rax, 8(%rdi) movq %rcx, 16(%rdi) movq %r8, 24(%rdi) - andq %r9, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_frombytes,.-fe_frombytes @@ -1264,7 +1263,7 @@ _fe_mul_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -1415,7 +1414,7 @@ _fe_sq_x64: movq $19, %rax adcq %rdx, %r13 mulq %r14 - # Add remaining produce results in + # Add remaining product results in addq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 @@ -1629,7 +1628,7 @@ _fe_sq2_x64: mulq %r14 # Add remaining produce results in addq %r15, %rcx - addq %r11, %r8 + adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 adcq %rax, %r10 @@ -2045,68 +2044,22 @@ L_curve25519_x64_bits: xorq %r10, 48(%rsp) xorq %r11, 56(%rsp) movq %rbp, %rbx - # Sub - movq 64(%rsp), %rcx - movq 72(%rsp), %r9 - movq 80(%rsp), %r10 - movq 88(%rsp), %r11 - subq 32(%rsp), %rcx - movq $0x00, %rbp - sbbq 40(%rsp), %r9 - movq $-19, %rax - sbbq 48(%rsp), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 56(%rsp), %r11 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %rcx - adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 - movq %rcx, 96(%rsp) - movq %r9, 104(%rsp) - movq %r10, 112(%rsp) - movq %r11, 120(%rsp) - # Sub - movq (%rdi), %rcx - movq 8(%rdi), %r9 - movq 16(%rdi), %r10 - movq 24(%rdi), %r11 - subq (%rsp), %rcx - movq $0x00, %rbp - sbbq 8(%rsp), %r9 - movq $-19, %rax - sbbq 16(%rsp), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rsp), %r11 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %rcx - 
adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 - movq %rcx, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) # Add movq (%rdi), %rcx movq 8(%rdi), %r9 - addq (%rsp), %rcx movq 16(%rdi), %r10 - adcq 8(%rsp), %r9 movq 24(%rdi), %rbp + movq %rcx, %r12 + addq (%rsp), %rcx + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, %r14 adcq 16(%rsp), %r10 - movq $-19, %rax + movq %rbp, %r15 adcq 24(%rsp), %rbp - movq $0x7fffffffffffffff, %rdx + movq $-19, %rax movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax @@ -2116,22 +2069,47 @@ L_curve25519_x64_bits: sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 + # Sub + subq (%rsp), %r12 + movq $0x00, %rbp + sbbq 8(%rsp), %r13 + movq $-19, %rax + sbbq 16(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) + movq %r12, 128(%rsp) + movq %r13, 136(%rsp) + movq %r14, 144(%rsp) + movq %r15, 152(%rsp) # Add movq 64(%rsp), %rcx movq 72(%rsp), %r9 - addq 32(%rsp), %rcx movq 80(%rsp), %r10 - adcq 40(%rsp), %r9 movq 88(%rsp), %rbp + movq %rcx, %r12 + addq 32(%rsp), %rcx + movq %r9, %r13 + adcq 40(%rsp), %r9 + movq %r10, %r14 adcq 48(%rsp), %r10 - movq $-19, %rax + movq %rbp, %r15 adcq 56(%rsp), %rbp - movq $0x7fffffffffffffff, %rdx + movq $-19, %rax movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax @@ -2141,10 +2119,31 @@ L_curve25519_x64_bits: sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 + # Sub + subq 32(%rsp), %r12 + movq $0x00, %rbp + sbbq 40(%rsp), %r13 + movq $-19, %rax + sbbq 48(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 56(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) + movq %r12, 96(%rsp) + movq %r13, 104(%rsp) + movq %r14, 112(%rsp) + movq %r15, 120(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rax @@ -2270,7 +2269,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2423,7 +2422,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2549,7 +2548,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2675,7 +2674,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2706,15 +2705,19 @@ L_curve25519_x64_bits: # Add movq 32(%rsp), %rcx movq 40(%rsp), %r9 - addq (%rsp), %rcx movq 48(%rsp), %r10 - adcq 8(%rsp), %r9 movq 56(%rsp), %rbp + movq %rcx, %r12 + addq (%rsp), %rcx + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, %r14 adcq 16(%rsp), %r10 - movq $-19, %rax + movq %rbp, %r15 adcq 24(%rsp), %rbp - movq $0x7fffffffffffffff, %rdx + movq $-19, %rax movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx 
sarq $63, %rbp # Mask the modulus andq %rbp, %rax @@ -2724,35 +2727,31 @@ L_curve25519_x64_bits: sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 - movq %rcx, 64(%rsp) - movq %r9, 72(%rsp) - movq %r10, 80(%rsp) - movq %r11, 88(%rsp) # Sub - movq 32(%rsp), %rcx - movq 40(%rsp), %r9 - movq 48(%rsp), %r10 - movq 56(%rsp), %r11 - subq (%rsp), %rcx + subq (%rsp), %r12 movq $0x00, %rbp - sbbq 8(%rsp), %r9 + sbbq 8(%rsp), %r13 movq $-19, %rax - sbbq 16(%rsp), %r10 + sbbq 16(%rsp), %r14 movq $0x7fffffffffffffff, %rdx - sbbq 24(%rsp), %r11 + sbbq 24(%rsp), %r15 sbbq $0x00, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Add modulus (if underflow) - addq %rax, %rcx - adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 + movq %rcx, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + movq %r12, (%rsp) + movq %r13, 8(%rsp) + movq %r14, 16(%rsp) + movq %r15, 24(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax @@ -2878,7 +2877,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3029,7 +3028,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3188,7 +3187,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3366,7 +3365,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3519,7 +3518,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3939,7 +3938,7 @@ L_curve25519_x64_inv_8: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4405,7 +4404,7 @@ _fe_ge_to_p2_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4561,7 +4560,7 @@ _fe_ge_to_p2_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4717,7 +4716,7 @@ _fe_ge_to_p2_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4905,7 +4904,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5061,7 +5060,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5217,7 +5216,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 
@@ -5373,7 +5372,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5535,7 +5534,7 @@ _fe_ge_dbl_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5663,7 +5662,7 @@ _fe_ge_dbl_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5803,7 +5802,7 @@ _fe_ge_dbl_x64: mulq %r15 # Add remaining produce results in addq %rcx, %r8 - addq %r12, %r9 + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 @@ -5958,7 +5957,7 @@ _fe_ge_dbl_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -6316,7 +6315,7 @@ _fe_ge_madd_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -6472,7 +6471,7 @@ _fe_ge_madd_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -6628,7 +6627,7 @@ _fe_ge_madd_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7014,7 +7013,7 @@ _fe_ge_msub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7170,7 +7169,7 @@ _fe_ge_msub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7326,7 +7325,7 @@ _fe_ge_msub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7712,7 +7711,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7868,7 +7867,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8024,7 +8023,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8180,7 +8179,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8566,7 +8565,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8722,7 +8721,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8878,7 +8877,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, 
%r9 adcq %r13, %r10 adcq %r14, %r11 @@ -9034,7 +9033,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -10052,68 +10051,22 @@ L_curve25519_avx2_bits: xorq %r11, 48(%rsp) xorq %r12, 56(%rsp) movq %rax, 184(%rsp) - # Sub - movq 64(%rsp), %r9 - movq 72(%rsp), %r10 - movq 80(%rsp), %r11 - movq 88(%rsp), %r12 - subq 32(%rsp), %r9 - movq $0x00, %rax - sbbq 40(%rsp), %r10 - movq $-19, %rcx - sbbq 48(%rsp), %r11 - movq $0x7fffffffffffffff, %rbx - sbbq 56(%rsp), %r12 - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Add modulus (if underflow) - addq %rcx, %r9 - adcq %rax, %r10 - adcq %rax, %r11 - adcq %rbx, %r12 - movq %r9, 96(%rsp) - movq %r10, 104(%rsp) - movq %r11, 112(%rsp) - movq %r12, 120(%rsp) - # Sub - movq (%rdi), %r9 - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - movq 24(%rdi), %r12 - subq (%rsp), %r9 - movq $0x00, %rax - sbbq 8(%rsp), %r10 - movq $-19, %rcx - sbbq 16(%rsp), %r11 - movq $0x7fffffffffffffff, %rbx - sbbq 24(%rsp), %r12 - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Add modulus (if underflow) - addq %rcx, %r9 - adcq %rax, %r10 - adcq %rax, %r11 - adcq %rbx, %r12 - movq %r9, 128(%rsp) - movq %r10, 136(%rsp) - movq %r11, 144(%rsp) - movq %r12, 152(%rsp) # Add movq (%rdi), %r9 movq 8(%rdi), %r10 - addq (%rsp), %r9 movq 16(%rdi), %r11 - adcq 8(%rsp), %r10 movq 24(%rdi), %rax + movq %r9, %r13 + addq (%rsp), %r9 + movq %r10, %r14 + adcq 8(%rsp), %r10 + movq %r11, %r15 adcq 16(%rsp), %r11 - movq $-19, %rcx + movq %rax, %rbp adcq 24(%rsp), %rax - movq $0x7fffffffffffffff, %rbx + movq $-19, %rcx movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx @@ -10123,22 +10076,47 @@ L_curve25519_avx2_bits: sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 + # Sub + subq (%rsp), %r13 + movq $0x00, %rax + sbbq 8(%rsp), %r14 + movq $-19, %rcx + sbbq 16(%rsp), %r15 + movq $0x7fffffffffffffff, %rbx + sbbq 24(%rsp), %rbp + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) + movq %r13, 128(%rsp) + movq %r14, 136(%rsp) + movq %r15, 144(%rsp) + movq %rbp, 152(%rsp) # Add movq 64(%rsp), %r9 movq 72(%rsp), %r10 - addq 32(%rsp), %r9 movq 80(%rsp), %r11 - adcq 40(%rsp), %r10 movq 88(%rsp), %rax + movq %r9, %r13 + addq 32(%rsp), %r9 + movq %r10, %r14 + adcq 40(%rsp), %r10 + movq %r11, %r15 adcq 48(%rsp), %r11 - movq $-19, %rcx + movq %rax, %rbp adcq 56(%rsp), %rax - movq $0x7fffffffffffffff, %rbx + movq $-19, %rcx movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx @@ -10148,10 +10126,31 @@ L_curve25519_avx2_bits: sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 + # Sub + subq 32(%rsp), %r13 + movq $0x00, %rax + sbbq 40(%rsp), %r14 + movq $-19, %rcx + sbbq 48(%rsp), %r15 + movq $0x7fffffffffffffff, %rbx + sbbq 56(%rsp), %rbp + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) + movq %r13, 96(%rsp) + movq %r14, 104(%rsp) + movq %r15, 112(%rsp) + movq %rbp, 120(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rdx @@ -10607,15 
+10606,19 @@ L_curve25519_avx2_bits: # Add movq 32(%rsp), %r9 movq 40(%rsp), %r10 - addq (%rsp), %r9 movq 48(%rsp), %r11 - adcq 8(%rsp), %r10 movq 56(%rsp), %rax + movq %r9, %r13 + addq (%rsp), %r9 + movq %r10, %r14 + adcq 8(%rsp), %r10 + movq %r11, %r15 adcq 16(%rsp), %r11 - movq $-19, %rcx + movq %rax, %rbp adcq 24(%rsp), %rax - movq $0x7fffffffffffffff, %rbx + movq $-19, %rcx movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx @@ -10625,35 +10628,31 @@ L_curve25519_avx2_bits: sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 - movq %r9, 64(%rsp) - movq %r10, 72(%rsp) - movq %r11, 80(%rsp) - movq %r12, 88(%rsp) # Sub - movq 32(%rsp), %r9 - movq 40(%rsp), %r10 - movq 48(%rsp), %r11 - movq 56(%rsp), %r12 - subq (%rsp), %r9 + subq (%rsp), %r13 movq $0x00, %rax - sbbq 8(%rsp), %r10 + sbbq 8(%rsp), %r14 movq $-19, %rcx - sbbq 16(%rsp), %r11 + sbbq 16(%rsp), %r15 movq $0x7fffffffffffffff, %rbx - sbbq 24(%rsp), %r12 + sbbq 24(%rsp), %rbp sbbq $0x00, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Add modulus (if underflow) - addq %rcx, %r9 - adcq %rax, %r10 - adcq %rax, %r11 - adcq %rbx, %r12 - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp + movq %r9, 64(%rsp) + movq %r10, 72(%rsp) + movq %r11, 80(%rsp) + movq %r12, 88(%rsp) + movq %r13, (%rsp) + movq %r14, 8(%rsp) + movq %r15, 16(%rsp) + movq %rbp, 24(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rdx diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am index 3e7b3a377..ba1f7b6a7 100644 --- a/wolfcrypt/src/include.am +++ b/wolfcrypt/src/include.am @@ -48,6 +48,9 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \ wolfcrypt/src/port/ti/ti-ccm.c \ wolfcrypt/src/port/pic32/pic32mz-crypt.c \ wolfcrypt/src/port/nrf51.c \ + wolfcrypt/src/port/arm/armv8-aes.c \ + wolfcrypt/src/port/arm/armv8-sha256.c \ + wolfcrypt/src/port/arm/armv8-chacha.c \ wolfcrypt/src/port/arm/armv8-curve25519.c \ wolfcrypt/src/port/arm/armv7-curve25519.c \ wolfcrypt/src/port/arm/armv8-sha512-asm.c \ diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c new file mode 100644 index 000000000..76487d683 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -0,0 +1,2858 @@ +/* armv8-chacha.c + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + */ + +/* The paper NEON crypto by Daniel J. 
Bernstein and Peter Schwabe was used to optimize for ARM + * https://cryptojedi.org/papers/neoncrypto-20120320.pdf + */ + +#ifdef WOLFSSL_ARMASM + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include + +#ifdef HAVE_CHACHA + +#include +#include +#include +#include +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif + +#ifdef CHACHA_AEAD_TEST + #include +#endif + +#ifdef CHACHA_TEST + #include +#endif + +#ifdef BIG_ENDIAN_ORDER + #define LITTLE32(x) ByteReverseWord32(x) +#else + #define LITTLE32(x) (x) +#endif + +/* Number of rounds */ +#define ROUNDS 20 + +#define U32C(v) (v##U) +#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF)) +#define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0]) + +#define PLUS(v,w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) + +#define ARM_SIMD_LEN_BYTES 16 + +/** + * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version + * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. + */ +int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) +{ + word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ + +#ifdef CHACHA_AEAD_TEST + word32 i; + printf("NONCE : "); + for (i = 0; i < CHACHA_IV_BYTES; i++) { + printf("%02x", inIv[i]); + } + printf("\n\n"); +#endif + + if (ctx == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(temp, inIv, CHACHA_IV_BYTES); + + ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ + ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */ + ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */ + ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ + + return 0; +} + +/* "expand 32-byte k" as unsigned 32 byte */ +static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +/* "expand 16-byte k" as unsigned 16 byte */ +static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; + +/** + * Key setup. 
8 word iv (nonce) + */ +int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) +{ + const word32* constants; + const byte* k; + +#ifdef XSTREAM_ALIGN + word32 alignKey[8]; +#endif + + if (ctx == NULL) + return BAD_FUNC_ARG; + + if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) + return BAD_FUNC_ARG; + +#ifdef XSTREAM_ALIGN + if ((wolfssl_word)key % 4) { + WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); + XMEMCPY(alignKey, key, keySz); + k = (byte*)alignKey; + } + else { + k = key; + } +#else + k = key; +#endif /* XSTREAM_ALIGN */ + +#ifdef CHACHA_AEAD_TEST + word32 i; + printf("ChaCha key used :\n"); + for (i = 0; i < keySz; i++) { + printf("%02x", key[i]); + if ((i + 1) % 8 == 0) + printf("\n"); + } + printf("\n\n"); +#endif + + ctx->X[4] = U8TO32_LITTLE(k + 0); + ctx->X[5] = U8TO32_LITTLE(k + 4); + ctx->X[6] = U8TO32_LITTLE(k + 8); + ctx->X[7] = U8TO32_LITTLE(k + 12); + if (keySz == CHACHA_MAX_KEY_SZ) { + k += 16; + constants = sigma; + } + else { + constants = tau; + } + ctx->X[ 8] = U8TO32_LITTLE(k + 0); + ctx->X[ 9] = U8TO32_LITTLE(k + 4); + ctx->X[10] = U8TO32_LITTLE(k + 8); + ctx->X[11] = U8TO32_LITTLE(k + 12); + ctx->X[ 0] = constants[0]; + ctx->X[ 1] = constants[1]; + ctx->X[ 2] = constants[2]; + ctx->X[ 3] = constants[3]; + + return 0; +} + +static const word32 L_chacha20_neon_inc_first_word[] = { + 0x1, + 0x0, + 0x0, + 0x0, +}; + +#ifdef __aarch64__ + +static const word32 L_chacha20_neon_add_all_counters[] = { + 0x0, + 0x1, + 0x2, + 0x3, +}; + +static const word32 L_chacha20_neon_rol8[] = { + 0x2010003, + 0x6050407, + 0xa09080b, + 0xe0d0c0f, +}; + +static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m, byte* c, word32 bytes) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_320 with %d bytes\n", bytes); +#endif /*CHACHA_TEST */ + + __asm__ __volatile__ ( + /* + * The layout of used registers is: + * ARM + * w4-w19: these registers hold the fifth Chacha block for calculation in regular ARM + * w20: loop counter for how many even-odd rounds need to be executed + * w21: the counter offset for the block in ARM registers + * NEON + * v0-v15: the vi'th register holds the i'th word of four blocks during the quarter rounds. + * these registers are later transposed make ADDing the input and XORing the message easier. 
+ * v16-v19: these are helper registers that are used as temporary location to store data + * v20-v23: load the next message block + * v24-v27: the 64 byte intial Chacha block + * v28: vector to increment the counter words of each block + * v29: vector of 5's to increment counters between L_chacha20_arm64_outer_%= loops + * v30: table lookup indices to rotate values by 8 + */ + + /* Load counter-add values for each block */ + "LD1 {v28.4s}, [%[L_chacha20_neon_add_all_counters]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v30.16b}, [%[L_chacha20_neon_rol8]] \n\t" + /* For adding 5 to each counter-add for next 320-byte chunk */ + "MOVI v29.4s, #5 \n\t" + /* Counter for 5th block in regular registers */ + "MOV w21, #4 \n\t" + /* Load state to encrypt */ + "LD1 {v24.4s-v27.4s}, [%[input]] \n\t" + "\n" + "L_chacha20_arm64_outer_%=: \n\t" + /* Move state into regular registers */ + "MOV x4, v24.d[0] \n\t" + "MOV x6, v24.d[1] \n\t" + "MOV x8, v25.d[0] \n\t" + "MOV x10, v25.d[1] \n\t" + "MOV x12, v26.d[0] \n\t" + "MOV x14, v26.d[1] \n\t" + "MOV x16, v27.d[0] \n\t" + "MOV x18, v27.d[1] \n\t" + /* Move state into vector registers (x4) */ + "DUP v0.4s, v24.s[0] \n\t" + "DUP v1.4s, v24.s[1] \n\t" + "LSR x5, x4, #32 \n\t" + "DUP v2.4s, v24.s[2] \n\t" + "DUP v3.4s, v24.s[3] \n\t" + "LSR x7, x6, #32 \n\t" + "DUP v4.4s, v25.s[0] \n\t" + "DUP v5.4s, v25.s[1] \n\t" + "LSR x9, x8, #32 \n\t" + "DUP v6.4s, v25.s[2] \n\t" + "DUP v7.4s, v25.s[3] \n\t" + "LSR x11, x10, #32 \n\t" + "DUP v8.4s, v26.s[0] \n\t" + "DUP v9.4s, v26.s[1] \n\t" + "LSR x13, x12, #32 \n\t" + "DUP v10.4s, v26.s[2] \n\t" + "DUP v11.4s, v26.s[3] \n\t" + "LSR x15, x14, #32 \n\t" + "DUP v12.4s, v27.s[0] \n\t" + "DUP v13.4s, v27.s[1] \n\t" + "LSR x17, x16, #32 \n\t" + "DUP v14.4s, v27.s[2] \n\t" + "DUP v15.4s, v27.s[3] \n\t" + "LSR x19, x18, #32 \n\t" + /* Add to counter word */ + "ADD v12.4s, v12.4s, v28.4s \n\t" + "ADD w16, w16, w21 \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w20, #10 \n\t" + "\n" + "L_chacha20_arm64_inner_%=: \n\t" + "SUBS w20, w20, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4s, v0.4s, v4.4s \n\t" + "ADD w4, w4, w8 \n\t" + "ADD v1.4s, v1.4s, v5.4s \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v2.4s, v2.4s, v6.4s \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v3.4s, v3.4s, v7.4s \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v12.16b, v12.16b, v0.16b \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v13.16b, v13.16b, v1.16b \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v14.16b, v14.16b, v2.16b \n\t" + "EOR w18, w18, w6 \n\t" + "EOR v15.16b, v15.16b, v3.16b \n\t" + "EOR w19, w19, w7 \n\t" + "REV32 v12.8h, v12.8h \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v13.8h, v13.8h \n\t" + "ROR w17, w17, #16 \n\t" + "REV32 v14.8h, v14.8h \n\t" + "ROR w18, w18, #16 \n\t" + "REV32 v15.8h, v15.8h \n\t" + "ROR w19, w19, #16 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v8.4s, v8.4s, v12.4s \n\t" + "ADD w12, w12, w16 \n\t" + "ADD v9.4s, v9.4s, v13.4s \n\t" + "ADD w13, w13, w17 \n\t" + "ADD v10.4s, v10.4s, v14.4s \n\t" + "ADD w14, w14, w18 \n\t" + "ADD v11.4s, v11.4s, v15.4s \n\t" + "ADD w15, w15, w19 \n\t" + "EOR v16.16b, v4.16b, v8.16b \n\t" + "EOR w8, w8, w12 \n\t" + "EOR v17.16b, v5.16b, v9.16b \n\t" + "EOR w9, w9, w13 \n\t" + "EOR v18.16b, v6.16b, v10.16b \n\t" + "EOR w10, w10, w14 \n\t" + "EOR v19.16b, v7.16b, v11.16b \n\t" + "EOR w11, w11, w15 \n\t" + "SHL v4.4s, v16.4s, #12 \n\t" + "ROR w8, w8, #20 \n\t" + "SHL v5.4s, v17.4s, #12 \n\t" + "ROR w9, w9, #20 \n\t" + "SHL v6.4s, v18.4s, #12 \n\t" + "ROR w10, w10, #20 
\n\t" + "SHL v7.4s, v19.4s, #12 \n\t" + "ROR w11, w11, #20 \n\t" + "SRI v4.4s, v16.4s, #20 \n\t" + "SRI v5.4s, v17.4s, #20 \n\t" + "SRI v6.4s, v18.4s, #20 \n\t" + "SRI v7.4s, v19.4s, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4s, v0.4s, v4.4s \n\t" + "ADD w4, w4, w8 \n\t" + "ADD v1.4s, v1.4s, v5.4s \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v2.4s, v2.4s, v6.4s \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v3.4s, v3.4s, v7.4s \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v12.16b, v12.16b, v0.16b \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v13.16b, v13.16b, v1.16b \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v14.16b, v14.16b, v2.16b \n\t" + "EOR w18, w18, w6 \n\t" + "EOR v15.16b, v15.16b, v3.16b \n\t" + "EOR w19, w19, w7 \n\t" + "TBL v12.16b, { v12.16b }, v30.16b \n\t" + "ROR w16, w16, #24 \n\t" + "TBL v13.16b, { v13.16b }, v30.16b \n\t" + "ROR w17, w17, #24 \n\t" + "TBL v14.16b, { v14.16b }, v30.16b \n\t" + "ROR w18, w18, #24 \n\t" + "TBL v15.16b, { v15.16b }, v30.16b \n\t" + "ROR w19, w19, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v8.4s, v8.4s, v12.4s \n\t" + "ADD w12, w12, w16 \n\t" + "ADD v9.4s, v9.4s, v13.4s \n\t" + "ADD w13, w13, w17 \n\t" + "ADD v10.4s, v10.4s, v14.4s \n\t" + "ADD w14, w14, w18 \n\t" + "ADD v11.4s, v11.4s, v15.4s \n\t" + "ADD w15, w15, w19 \n\t" + "EOR v16.16b, v4.16b, v8.16b \n\t" + "EOR w8, w8, w12 \n\t" + "EOR v17.16b, v5.16b, v9.16b \n\t" + "EOR w9, w9, w13 \n\t" + "EOR v18.16b, v6.16b, v10.16b \n\t" + "EOR w10, w10, w14 \n\t" + "EOR v19.16b, v7.16b, v11.16b \n\t" + "EOR w11, w11, w15 \n\t" + "SHL v4.4s, v16.4s, #7 \n\t" + "ROR w8, w8, #25 \n\t" + "SHL v5.4s, v17.4s, #7 \n\t" + "ROR w9, w9, #25 \n\t" + "SHL v6.4s, v18.4s, #7 \n\t" + "ROR w10, w10, #25 \n\t" + "SHL v7.4s, v19.4s, #7 \n\t" + "ROR w11, w11, #25 \n\t" + "SRI v4.4s, v16.4s, #25 \n\t" + "SRI v5.4s, v17.4s, #25 \n\t" + "SRI v6.4s, v18.4s, #25 \n\t" + "SRI v7.4s, v19.4s, #25 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4s, v0.4s, v5.4s \n\t" + "ADD w4, w4, w9 \n\t" + "ADD v1.4s, v1.4s, v6.4s \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v2.4s, v2.4s, v7.4s \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v3.4s, v3.4s, v4.4s \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v15.16b, v15.16b, v0.16b \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v12.16b, v12.16b, v1.16b \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v13.16b, v13.16b, v2.16b \n\t" + "EOR w17, w17, w6 \n\t" + "EOR v14.16b, v14.16b, v3.16b \n\t" + "EOR w18, w18, w7 \n\t" + "REV32 v15.8h, v15.8h \n\t" + "ROR w19, w19, #16 \n\t" + "REV32 v12.8h, v12.8h \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v13.8h, v13.8h \n\t" + "ROR w17, w17, #16 \n\t" + "REV32 v14.8h, v14.8h \n\t" + "ROR w18, w18, #16 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v10.4s, v10.4s, v15.4s \n\t" + "ADD w14, w14, w19 \n\t" + "ADD v11.4s, v11.4s, v12.4s \n\t" + "ADD w15, w15, w16 \n\t" + "ADD v8.4s, v8.4s, v13.4s \n\t" + "ADD w12, w12, w17 \n\t" + "ADD v9.4s, v9.4s, v14.4s \n\t" + "ADD w13, w13, w18 \n\t" + "EOR v16.16b, v5.16b, v10.16b \n\t" + "EOR w9, w9, w14 \n\t" + "EOR v17.16b, v6.16b, v11.16b \n\t" + "EOR w10, w10, w15 \n\t" + "EOR v18.16b, v7.16b, v8.16b \n\t" + "EOR w11, w11, w12 \n\t" + "EOR v19.16b, v4.16b, v9.16b \n\t" + "EOR w8, w8, w13 \n\t" + "SHL v5.4s, v16.4s, #12 \n\t" + "ROR w9, w9, #20 \n\t" + "SHL v6.4s, v17.4s, #12 \n\t" + "ROR w10, w10, #20 \n\t" + "SHL v7.4s, v18.4s, #12 \n\t" + "ROR w11, w11, #20 \n\t" + "SHL v4.4s, v19.4s, #12 \n\t" + "ROR w8, w8, #20 \n\t" + "SRI v5.4s, v16.4s, #20 \n\t" + "SRI v6.4s, v17.4s, #20 \n\t" + "SRI v7.4s, v18.4s, #20 \n\t" + "SRI v4.4s, v19.4s, #20 \n\t" + /* 
a += b; d ^= a; d <<<= 8; */ + "ADD v0.4s, v0.4s, v5.4s \n\t" + "ADD w4, w4, w9 \n\t" + "ADD v1.4s, v1.4s, v6.4s \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v2.4s, v2.4s, v7.4s \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v3.4s, v3.4s, v4.4s \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v15.16b, v15.16b, v0.16b \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v12.16b, v12.16b, v1.16b \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v13.16b, v13.16b, v2.16b \n\t" + "EOR w17, w17, w6 \n\t" + "EOR v14.16b, v14.16b, v3.16b \n\t" + "EOR w18, w18, w7 \n\t" + "TBL v15.16b, { v15.16b }, v30.16b \n\t" + "ROR w19, w19, #24 \n\t" + "TBL v12.16b, { v12.16b }, v30.16b \n\t" + "ROR w16, w16, #24 \n\t" + "TBL v13.16b, { v13.16b }, v30.16b \n\t" + "ROR w17, w17, #24 \n\t" + "TBL v14.16b, { v14.16b }, v30.16b \n\t" + "ROR w18, w18, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v10.4s, v10.4s, v15.4s \n\t" + "ADD w14, w14, w19 \n\t" + "ADD v11.4s, v11.4s, v12.4s \n\t" + "ADD w15, w15, w16 \n\t" + "ADD v8.4s, v8.4s, v13.4s \n\t" + "ADD w12, w12, w17 \n\t" + "ADD v9.4s, v9.4s, v14.4s \n\t" + "ADD w13, w13, w18 \n\t" + "EOR v16.16b, v5.16b, v10.16b \n\t" + "EOR w9, w9, w14 \n\t" + "EOR v17.16b, v6.16b, v11.16b \n\t" + "EOR w10, w10, w15 \n\t" + "EOR v18.16b, v7.16b, v8.16b \n\t" + "EOR w11, w11, w12 \n\t" + "EOR v19.16b, v4.16b, v9.16b \n\t" + "EOR w8, w8, w13 \n\t" + "SHL v5.4s, v16.4s, #7 \n\t" + "ROR w9, w9, #25 \n\t" + "SHL v6.4s, v17.4s, #7 \n\t" + "ROR w10, w10, #25 \n\t" + "SHL v7.4s, v18.4s, #7 \n\t" + "ROR w11, w11, #25 \n\t" + "SHL v4.4s, v19.4s, #7 \n\t" + "ROR w8, w8, #25 \n\t" + "SRI v5.4s, v16.4s, #25 \n\t" + "SRI v6.4s, v17.4s, #25 \n\t" + "SRI v7.4s, v18.4s, #25 \n\t" + "SRI v4.4s, v19.4s, #25 \n\t" + "BNE L_chacha20_arm64_inner_%= \n\t" + /* Add counter now rather than after transposed */ + "ADD v12.4s, v12.4s, v28.4s \n\t" + "ADD w16, w16, w21 \n\t" + /* Load message */ + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + /* Transpose vectors */ + "TRN1 v16.4s, v0.4s, v1.4s \n\t" + "TRN1 v18.4s, v2.4s, v3.4s \n\t" + "TRN2 v17.4s, v0.4s, v1.4s \n\t" + "TRN2 v19.4s, v2.4s, v3.4s \n\t" + "TRN1 v0.2d, v16.2d, v18.2d \n\t" + "TRN1 v1.2d, v17.2d, v19.2d \n\t" + "TRN2 v2.2d, v16.2d, v18.2d \n\t" + "TRN2 v3.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v4.4s, v5.4s \n\t" + "TRN1 v18.4s, v6.4s, v7.4s \n\t" + "TRN2 v17.4s, v4.4s, v5.4s \n\t" + "TRN2 v19.4s, v6.4s, v7.4s \n\t" + "TRN1 v4.2d, v16.2d, v18.2d \n\t" + "TRN1 v5.2d, v17.2d, v19.2d \n\t" + "TRN2 v6.2d, v16.2d, v18.2d \n\t" + "TRN2 v7.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v8.4s, v9.4s \n\t" + "TRN1 v18.4s, v10.4s, v11.4s \n\t" + "TRN2 v17.4s, v8.4s, v9.4s \n\t" + "TRN2 v19.4s, v10.4s, v11.4s \n\t" + "TRN1 v8.2d, v16.2d, v18.2d \n\t" + "TRN1 v9.2d, v17.2d, v19.2d \n\t" + "TRN2 v10.2d, v16.2d, v18.2d \n\t" + "TRN2 v11.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v12.4s, v13.4s \n\t" + "TRN1 v18.4s, v14.4s, v15.4s \n\t" + "TRN2 v17.4s, v12.4s, v13.4s \n\t" + "TRN2 v19.4s, v14.4s, v15.4s \n\t" + "TRN1 v12.2d, v16.2d, v18.2d \n\t" + "TRN1 v13.2d, v17.2d, v19.2d \n\t" + "TRN2 v14.2d, v16.2d, v18.2d \n\t" + "TRN2 v15.2d, v17.2d, v19.2d \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v16.4s, v0.4s, v24.4s \n\t" + "ADD v17.4s, v4.4s, v25.4s \n\t" + "ADD v18.4s, v8.4s, v26.4s \n\t" + "ADD v19.4s, v12.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v1.4s, 
v24.4s \n\t" + "ADD v17.4s, v5.4s, v25.4s \n\t" + "ADD v18.4s, v9.4s, v26.4s \n\t" + "ADD v19.4s, v13.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v2.4s, v24.4s \n\t" + "ADD v17.4s, v6.4s, v25.4s \n\t" + "ADD v18.4s, v10.4s, v26.4s \n\t" + "ADD v19.4s, v14.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v3.4s, v24.4s \n\t" + "ADD v17.4s, v7.4s, v25.4s \n\t" + "ADD v18.4s, v11.4s, v26.4s \n\t" + "ADD v19.4s, v15.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + /* Move regular registers into vector registers for adding and xor */ + "ORR x4, x4, x5, LSL #32 \n\t" + "ORR x6, x6, x7, LSL #32 \n\t" + "ORR x8, x8, x9, LSL #32 \n\t" + "MOV v16.d[0], x4 \n\t" + "ORR x10, x10, x11, LSL #32 \n\t" + "MOV v16.d[1], x6 \n\t" + "ORR x12, x12, x13, LSL #32 \n\t" + "MOV v17.d[0], x8 \n\t" + "ORR x14, x14, x15, LSL #32 \n\t" + "MOV v17.d[1], x10 \n\t" + "ORR x16, x16, x17, LSL #32 \n\t" + "MOV v18.d[0], x12 \n\t" + "ORR x18, x18, x19, LSL #32 \n\t" + "MOV v18.d[1], x14 \n\t" + "MOV v19.d[0], x16 \n\t" + "MOV v19.d[1], x18 \n\t" + /* Add back state, XOR in message and store */ + "ADD v16.4s, v16.4s, v24.4s \n\t" + "ADD v17.4s, v17.4s, v25.4s \n\t" + "ADD v18.4s, v18.4s, v26.4s \n\t" + "ADD v19.4s, v19.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "ADD w21, w21, #5 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "SUBS %[bytes], %[bytes], #320 \n\t" + "ADD v28.4s, v28.4s, v29.4s \n\t" + "BNE L_chacha20_arm64_outer_%= \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), + [bytes] "+r" (bytes) + : [L_chacha20_neon_add_all_counters] "r" (L_chacha20_neon_add_all_counters), + [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8) + : "memory", "cc", + "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", + "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" + ); +} +#endif /* __aarch64__ */ + +/** + * Converts word into bytes with rotations having been done. 
+ */ +static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_256\n"); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + // v0-v3 - first block + // v12 first block helper + // v4-v7 - second block + // v13 second block helper + // v8-v11 - third block + // v14 third block helper + // w4-w19 - fourth block + + // v0 0 1 2 3 + // v1 4 5 6 7 + // v2 8 9 10 11 + // v3 12 13 14 15 + // load CHACHA state with indices placed as shown above + /* Load state to encrypt */ + "LD1 {v20.4S-v23.4S}, [%[input]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v24.16B}, [%[L_chacha20_neon_rol8]] \n\t" + /* Move state into regular registers */ + "MOV x4, v20.D[0] \n\t" + "MOV x6, v20.D[1] \n\t" + "MOV x8, v21.D[0] \n\t" + "MOV x10, v21.D[1] \n\t" + "MOV x12, v22.D[0] \n\t" + "MOV x14, v22.D[1] \n\t" + "MOV x16, v23.D[0] \n\t" + "MOV x18, v23.D[1] \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v20.16B \n\t" + "MOV v1.16B, v21.16B \n\t" + "LSR x19, x18, #32 \n\t" + "MOV v2.16B, v22.16B \n\t" + "ADD w20, w16, #1 \n\t" + "MOV v3.16B, v23.16B \n\t" + "LSR x17, x16, #32 \n\t" + "MOV v4.16B, v20.16B \n\t" + "MOV v5.16B, v21.16B \n\t" + "LSR x15, x14, #32 \n\t" + "MOV v6.16B, v22.16B \n\t" + "ADD w21, w16, #2 \n\t" + "MOV v7.16B, v23.16B \n\t" + "LSR x13, x12, #32 \n\t" + "MOV v8.16B, v20.16B \n\t" + "MOV v9.16B, v21.16B \n\t" + "LSR x11, x10, #32 \n\t" + "MOV v10.16B, v22.16B \n\t" + "ADD w16, w16, #3 \n\t" + "MOV v11.16B, v23.16B \n\t" + "LSR x9, x8, #32 \n\t" + /* Set counter word */ + "MOV v7.S[0], w20 \n\t" + "LSR x7, x6, #32 \n\t" + "MOV v11.S[0], w21 \n\t" + "LSR x5, x4, #32 \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w3, #10 \n\t" + "\n" + "L_chacha20_arm64_256_loop_%=: \n\t" + "SUBS w3, w3, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD w4, w4, w8 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "EOR w18, w18, w6 \n\t" + "REV32 v3.8H, v3.8H \n\t" + "EOR w19, w19, w7 \n\t" + "REV32 v7.8H, v7.8H \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v11.8H, v11.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ROR w17, w17, #16 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ROR w18, w18, #16 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w19, w19, #16 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ADD w12, w12, w16 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ADD w13, w13, w17 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ADD w14, w14, w18 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w15, w15, w19 \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "EOR w8, w8, w12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "EOR w9, w9, w13 \n\t" + "SHL v9.4S, v14.4S, #12 \n\t" + "EOR w10, w10, w14 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "EOR w11, w11, w15 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + "ROR w8, w8, #20 \n\t" + "SRI v9.4S, v14.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ROR w9, w9, #20 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ROR w10, w10, #20 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ROR w11, w11, #20 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w4, w4, w8 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "ADD w5, w5, w9 \n\t" + "EOR v7.16B, 
v7.16B, v4.16B \n\t" + "ADD w6, w6, w10 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "ADD w7, w7, w11 \n\t" + "TBL v3.16B, { v3.16B }, v24.16B \n\t" + "EOR w16, w16, w4 \n\t" + "TBL v7.16B, { v7.16B }, v24.16B \n\t" + "EOR w17, w17, w5 \n\t" + "TBL v11.16B, { v11.16B }, v24.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "EOR w18, w18, w6 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR w19, w19, w7 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w16, w16, #24 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ROR w17, w17, #24 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ROR w18, w18, #24 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ROR w19, w19, #24 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w12, w12, w16 \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "ADD w13, w13, w17 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "ADD w14, w14, w18 \n\t" + "SHL v9.4S, v14.4S, #7 \n\t" + "ADD w15, w15, w19 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EOR w8, w8, w12 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EOR w9, w9, w13 \n\t" + "SRI v9.4S, v14.4S, #25 \n\t" + "EOR w10, w10, w14 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EOR w11, w11, w15 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "ROR w8, w8, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "ROR w9, w9, #25 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #4 \n\t" + "ROR w10, w10, #25 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "ROR w11, w11, #25 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #12 \n\t" + "EXT v9.16B, v9.16B, v9.16B, #4 \n\t" + "EXT v10.16B, v10.16B, v10.16B, #8 \n\t" + "EXT v11.16B, v11.16B, v11.16B, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD w4, w4, w9 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "EOR w17, w17, w6 \n\t" + "REV32 v3.8H, v3.8H \n\t" + "EOR w18, w18, w7 \n\t" + "REV32 v7.8H, v7.8H \n\t" + "ROR w19, w19, #16 \n\t" + "REV32 v11.8H, v11.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ROR w16, w16, #16 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ROR w17, w17, #16 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w18, w18, #16 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ADD w14, w14, w19 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ADD w15, w15, w16 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ADD w12, w12, w17 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w13, w13, w18 \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "EOR w9, w9, w14 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "EOR w10, w10, w15 \n\t" + "SHL v9.4S, v14.4S, #12 \n\t" + "EOR w11, w11, w12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "EOR w8, w8, w13 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + "ROR w9, w9, #20 \n\t" + "SRI v9.4S, v14.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ROR w10, w10, #20 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ROR w11, w11, #20 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ROR w8, w8, #20 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w4, w4, w9 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "ADD w5, w5, w10 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "ADD w6, w6, w11 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "ADD w7, w7, w8 \n\t" + "TBL v3.16B, { v3.16B }, v24.16B \n\t" + "EOR w19, w19, w4 \n\t" + "TBL v7.16B, { v7.16B }, v24.16B \n\t" + "EOR w16, w16, w5 \n\t" + "TBL v11.16B, { v11.16B }, v24.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "EOR w17, w17, 
w6 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR w18, w18, w7 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w19, w19, #24 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ROR w16, w16, #24 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ROR w17, w17, #24 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ROR w18, w18, #24 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w14, w14, w19 \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "ADD w15, w15, w16 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "ADD w12, w12, w17 \n\t" + "SHL v9.4S, v14.4S, #7 \n\t" + "ADD w13, w13, w18 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EOR w9, w9, w14 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EOR w10, w10, w15 \n\t" + "SRI v9.4S, v14.4S, #25 \n\t" + "EOR w11, w11, w12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EOR w8, w8, w13 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "ROR w9, w9, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "ROR w10, w10, #25 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #12 \n\t" + "ROR w11, w11, #25 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "ROR w8, w8, #25 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #4 \n\t" + "EXT v9.16B, v9.16B, v9.16B, #12 \n\t" + "EXT v10.16B, v10.16B, v10.16B, #8 \n\t" + "EXT v11.16B, v11.16B, v11.16B, #4 \n\t" + "BNE L_chacha20_arm64_256_loop_%= \n\t" + /* Load message */ + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + /* Add one (2 added during calculating vector results) */ + "ADD w16, w16, #1 \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v0.4S, v0.4S, v20.4S \n\t" + "ADD v1.4S, v1.4S, v21.4S \n\t" + "ADD v2.4S, v2.4S, v22.4S \n\t" + "ADD v3.4S, v3.4S, v23.4S \n\t" + "EOR v0.16B, v0.16B, v16.16B \n\t" + "EOR v1.16B, v1.16B, v17.16B \n\t" + "EOR v2.16B, v2.16B, v18.16B \n\t" + "EOR v3.16B, v3.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t" + "MOV v23.S[0], w20 \n\t" + "ADD v4.4S, v4.4S, v20.4S \n\t" + "ADD v5.4S, v5.4S, v21.4S \n\t" + "ADD v6.4S, v6.4S, v22.4S \n\t" + "ADD v7.4S, v7.4S, v23.4S \n\t" + "EOR v4.16B, v4.16B, v16.16B \n\t" + "EOR v5.16B, v5.16B, v17.16B \n\t" + "EOR v6.16B, v6.16B, v18.16B \n\t" + "EOR v7.16B, v7.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + "MOV v23.S[0], w21 \n\t" + "ADD v8.4S, v8.4S, v20.4S \n\t" + "ADD v9.4S, v9.4S, v21.4S \n\t" + "ADD v10.4S, v10.4S, v22.4S \n\t" + "ADD v11.4S, v11.4S, v23.4S \n\t" + "EOR v8.16B, v8.16B, v16.16B \n\t" + "EOR v9.16B, v9.16B, v17.16B \n\t" + "EOR v10.16B, v10.16B, v18.16B \n\t" + "EOR v11.16B, v11.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v8.4S-v11.4S}, [%[c]], #64 \n\t" + /* Move regular registers into vector registers for adding and xor */ + "ORR x4, x4, x5, lsl #32 \n\t" + "ORR x6, x6, x7, lsl #32 \n\t" + "ORR x8, x8, x9, lsl #32 \n\t" + "MOV v12.D[0], x4 \n\t" + "ORR x10, x10, x11, lsl #32 \n\t" + "MOV v12.D[1], x6 \n\t" + "ORR x12, x12, x13, lsl #32 \n\t" + "MOV v13.D[0], x8 \n\t" + "ORR x14, x14, x15, lsl #32 \n\t" + "MOV v13.D[1], x10 \n\t" + "ORR x16, x16, x17, lsl #32 \n\t" + "MOV v14.D[0], x12 \n\t" + "ORR x18, x18, x19, lsl #32 \n\t" + "MOV v14.D[1], x14 \n\t" + "MOV v15.D[0], x16 \n\t" + "MOV v15.D[1], x18 \n\t" + /* Add back state, XOR in message and store */ + "ADD v12.4S, v12.4S, v20.4S \n\t" + "ADD v13.4S, v13.4S, v21.4S \n\t" + "ADD v14.4S, v14.4S, v22.4S \n\t" + "ADD v15.4S, v15.4S, v23.4S \n\t" + "EOR v12.16B, v12.16B, v16.16B \n\t" + "EOR v13.16B, v13.16B, v17.16B \n\t" + "EOR v14.16B, v14.16B, v18.16B \n\t" + "EOR v15.16B, v15.16B, 
v19.16B \n\t" + "ST1 {v12.4S-v15.4S}, [%[c]], #64 \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", + "x10", "x11", "x12", "x13", "x14", "x15", "x16", + "x17", "x18", "x19", "x20", "x21", "v0", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23" + ); +#else + word32 x[CHACHA_CHUNK_WORDS]; + word32* x_addr = x; + __asm__ __volatile__ ( + // The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to optimize for ARM + // https://cryptojedi.org/papers/neoncrypto-20120320.pdf + + ".align 2 \n\t" + "LDR r14, %[input] \n\t" // load input address + + "LDM r14, { r0-r12 } \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 + "VMOV d0, r0, r1 \n\t" + "VMOV d1, r2, r3 \n\t" + "VMOV d2, r4, r5 \n\t" + "VMOV d3, r6, r7 \n\t" + "VMOV d4, r8, r9 \n\t" + "STRD r10, r11, %[x_10] \n\t" + "VMOV d5, r10, r11 \n\t" + "LDRD r11, r10, [r14, #4*14] \n\t" + "VMOV q4, q0 \n\t" + "VMOV q5, q1 \n\t" + "VMOV q6, q2 \n\t" + "VMOV q8, q0 \n\t" + "VMOV q9, q1 \n\t" + "VMOV q10, q2 \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 15 14 12 + "VMOV d7, r11, r10 \n\t" + "STR r10, %[x_15] \n\t" + "VMOV d15, r11, r10 \n\t" + "VMOV d23, r11, r10 \n\t" + "MOV r10, r12 \n\t" + "MOV r12, r11 \n\t" + "LDR r11, [r14, #4*13] \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + + "MOV r14, %[rounds] \n\t" + + "VMOV d6, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" + "VMOV d14, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" + "VMOV d22, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" // ARM calculates the fourth block (two was already added earlier) + "\n" + "L_chacha20_arm32_256_loop_%=: \n\t" + "SUBS r14, r14, #1 \n\t" + + // 0, 4, 8, 12 + // 1, 5, 9, 13 + + // ODD ROUND + "ADD r0, r0, r4 \n\t" // 0 0 4 + "VADD.I32 q0, q0, q1 \n\t" + "ADD r1, r1, r5 \n\t" // 1 1 5 + "VADD.I32 q4, q4, q5 \n\t" + "EOR r10, r10, r0 \n\t" // 12 12 0 + "VADD.I32 q8, q8, q9 \n\t" + "EOR r11, r11, r1 \n\t" // 13 13 1 + "VEOR q12, q3, q0 \n\t" + "ROR r10, r10, #16 \n\t" // 12 12 + "VEOR q13, q7, q4 \n\t" + "ROR r11, r11, #16 \n\t" // 13 13 + "VEOR q14, q11, q8 \n\t" + "ADD r8, r8, r10 \n\t" // 8 8 12 + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q12 \n\t" + "ADD r9, r9, r11 \n\t" // 9 9 13 + "VREV32.16 q7, q13 \n\t" + "EOR r4, r4, r8 \n\t" // 4 4 8 + "VREV32.16 q11, q14 \n\t" + + "EOR r5, r5, r9 \n\t" // 5 5 9 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r4, r4, #20 \n\t" // 4 4 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r5, r5, #20 \n\t" // 5 5 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r0, r0, r4 \n\t" // 0 0 4 + "VEOR q12, q1, q2 \n\t" + "ADD r1, r1, r5 \n\t" // 1 1 5 + "VEOR q13, q5, q6 \n\t" + "EOR r10, r10, r0 \n\t" // 12 12 0 + "VEOR q14, q9, q10 \n\t" + "EOR r11, r11, r1 \n\t" // 13 13 1 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #12 \n\t" + "ROR r10, r10, #24 \n\t" // 12 12 + "VSHL.I32 q5, q13, #12 \n\t" + "ROR r11, r11, #24 \n\t" // 13 13 + "VSHL.I32 q9, q14, #12 \n\t" + "ADD r8, r8, r10 \n\t" // 8 8 12 + "VSRI.I32 q1, q12, #20 \n\t" + "ADD r9, r9, r11 \n\t" // 9 9 13 + "VSRI.I32 q5, q13, #20 \n\t" + "STR r11, %[x_13] \n\t" + "VSRI.I32 q9, q14, #20 \n\t" + + "LDR r11, %[x_15] \n\t" + "VADD.I32 q0, q0, q1 \n\t" + "EOR 
r4, r4, r8 \n\t" // 4 4 8 + "VADD.I32 q4, q4, q5 \n\t" + "STR r8, %[x_8] \n\t" + "VADD.I32 q8, q8, q9 \n\t" + "LDR r8, %[x_10] \n\t" + "VEOR q12, q3, q0 \n\t" + "EOR r5, r5, r9 \n\t" // 5 5 9 + "VEOR q13, q7, q4 \n\t" + "STR r9, %[x_9] \n\t" + "VEOR q14, q11, q8 \n\t" + "LDR r9, %[x_11] \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q12, #8 \n\t" + "ROR r4, r4, #25 \n\t" // 4 4 + "VSHL.I32 q7, q13, #8 \n\t" + "ROR r5, r5, #25 \n\t" // 5 5 + "VSHL.I32 q11, q14, #8 \n\t" + + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 10 11 12 15 14 + + // 2, 6, 10, 14 + // 3, 7, 11, 15 + + "ADD r2, r2, r6 \n\t" // 2 2 6 + "VSRI.I32 q3, q12, #24 \n\t" + "ADD r3, r3, r7 \n\t" // 3 3 7 + "VSRI.I32 q7, q13, #24 \n\t" + "EOR r12, r12, r2 \n\t" // 14 14 2 + "VSRI.I32 q11, q14, #24 \n\t" + + "EOR r11, r11, r3 \n\t" // 15 15 3 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r12, r12, #16 \n\t" // 14 14 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r11, r11, #16 \n\t" // 15 15 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r8, r8, r12 \n\t" // 10 10 14 + "VEOR q12, q1, q2 \n\t" + "ADD r9, r9, r11 \n\t" // 11 11 15 + "VEOR q13, q5, q6 \n\t" + "EOR r6, r6, r8 \n\t" // 6 6 10 + "VEOR q14, q9, q10 \n\t" + "EOR r7, r7, r9 \n\t" // 7 7 11 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #7 \n\t" + "ROR r6, r6, #20 \n\t" // 6 6 + "VSHL.I32 q5, q13, #7 \n\t" + "ROR r7, r7, #20 \n\t" // 7 7 + "VSHL.I32 q9, q14, #7 \n\t" + "ADD r2, r2, r6 \n\t" // 2 2 6 + "VSRI.I32 q1, q12, #25 \n\t" + "ADD r3, r3, r7 \n\t" // 3 3 7 + "VSRI.I32 q5, q13, #25 \n\t" + "EOR r12, r12, r2 \n\t" // 14 14 2 + "VSRI.I32 q9, q14, #25 \n\t" + + // EVEN ROUND + + "EOR r11, r11, r3 \n\t" // 15 15 3 + "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one + "ROR r12, r12, #24 \n\t" // 14 14 + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "ROR r11, r11, #24 \n\t" // 15 15 + "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three + + "ADD r8, r8, r12 \n\t" // 10 10 14 + "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one + "ADD r9, r9, r11 \n\t" // 11 11 15 + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "EOR r6, r6, r8 \n\t" // 6 6 10 + "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three + + "EOR r7, r7, r9 \n\t" // 7 7 11 + "VEXT.8 q9, q9, q9, #4 \n\t" // permute elements left by one + "ROR r6, r6, #25 \n\t" // 6 6 + "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two + "ROR r7, r7, #25 \n\t" // 7 7 + "VEXT.8 q11, q11, q11, #12 \n\t" // permute elements left by three + + // 0, 5, 10, 15 + // 1, 6, 11, 12 + + "ADD r0, r0, r5 \n\t" // 0 0 5 + "VADD.I32 q0, q0, q1 \n\t" + "ADD r1, r1, r6 \n\t" // 1 1 6 + "VADD.I32 q4, q4, q5 \n\t" + "EOR r11, r11, r0 \n\t" // 15 15 0 + "VADD.I32 q8, q8, q9 \n\t" + "EOR r10, r10, r1 \n\t" // 12 12 1 + "VEOR q12, q3, q0 \n\t" + "ROR r11, r11, #16 \n\t" // 15 15 + "VEOR q13, q7, q4 \n\t" + "ROR r10, r10, #16 \n\t" // 12 12 + "VEOR q14, q11, q8 \n\t" + "ADD r8, r8, r11 \n\t" // 10 10 15 + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q12 \n\t" + "ADD r9, r9, r10 \n\t" // 11 11 12 + "VREV32.16 q7, q13 \n\t" + "EOR r5, r5, r8 \n\t" // 5 5 10 + "VREV32.16 q11, q14 \n\t" + + "EOR r6, r6, r9 \n\t" // 6 6 11 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r5, r5, #20 \n\t" // 5 5 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r6, r6, #20 \n\t" // 6 6 + "VADD.I32 q10, q10, q11 \n\t" + "ADD 
r0, r0, r5 \n\t" // 0 0 5 + "VEOR q12, q1, q2 \n\t" + "ADD r1, r1, r6 \n\t" // 1 1 6 + "VEOR q13, q5, q6 \n\t" + "EOR r11, r11, r0 \n\t" // 15 15 0 + "VEOR q14, q9, q10 \n\t" + "EOR r10, r10, r1 \n\t" // 12 12 1 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #12 \n\t" + "ROR r11, r11, #24 \n\t" // 15 15 + "VSHL.I32 q5, q13, #12 \n\t" + "ROR r10, r10, #24 \n\t" // 12 12 + "VSHL.I32 q9, q14, #12 \n\t" + "ADD r8, r8, r11 \n\t" // 10 10 15 + "VSRI.I32 q1, q12, #20 \n\t" + "STR r11, %[x_15] \n\t" + "VSRI.I32 q5, q13, #20 \n\t" + "LDR r11, %[x_13] \n\t" + "VSRI.I32 q9, q14, #20 \n\t" + + "ADD r9, r9, r10 \n\t" // 11 11 12 + "VADD.I32 q0, q0, q1 \n\t" + "EOR r5, r5, r8 \n\t" // 5 5 10 + "VADD.I32 q4, q4, q5 \n\t" + "STR r8, %[x_10] \n\t" + "VADD.I32 q8, q8, q9 \n\t" + "LDR r8, %[x_8] \n\t" + "VEOR q12, q3, q0 \n\t" + "EOR r6, r6, r9 \n\t" // 6 6 11 + "VEOR q13, q7, q4 \n\t" + "STR r9, %[x_11] \n\t" + "VEOR q14, q11, q8 \n\t" + "LDR r9, %[x_9] \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q12, #8 \n\t" + "ROR r5, r5, #25 \n\t" // 5 5 + "VSHL.I32 q7, q13, #8 \n\t" + "ROR r6, r6, #25 \n\t" // 6 6 + "VSHL.I32 q11, q14, #8 \n\t" + + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + + // 2, 7, 8, 13 + // 3, 4, 9, 14 + + "ADD r2, r2, r7 \n\t" // 2 2 7 + "VSRI.I32 q3, q12, #24 \n\t" + "ADD r3, r3, r4 \n\t" // 3 3 4 + "VSRI.I32 q7, q13, #24 \n\t" + "EOR r11, r11, r2 \n\t" // 13 13 2 + "VSRI.I32 q11, q14, #24 \n\t" + + "EOR r12, r12, r3 \n\t" // 14 14 3 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r11, r11, #16 \n\t" // 13 13 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r12, r12, #16 \n\t" // 14 14 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r8, r8, r11 \n\t" // 8 8 13 + "VEOR q12, q1, q2 \n\t" + "ADD r9, r9, r12 \n\t" // 9 9 14 + "VEOR q13, q5, q6 \n\t" + "EOR r7, r7, r8 \n\t" // 7 7 8 + "VEOR q14, q9, q10 \n\t" + "EOR r4, r4, r9 \n\t" // 4 4 9 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #7 \n\t" + "ROR r7, r7, #20 \n\t" // 7 7 + "VSHL.I32 q5, q13, #7 \n\t" + "ROR r4, r4, #20 \n\t" // 4 4 + "VSHL.I32 q9, q14, #7 \n\t" + "ADD r2, r2, r7 \n\t" // 2 2 7 + "VSRI.I32 q1, q12, #25 \n\t" + "ADD r3, r3, r4 \n\t" // 3 3 4 + "VSRI.I32 q5, q13, #25 \n\t" + "EOR r11, r11, r2 \n\t" // 13 13 2 + "VSRI.I32 q9, q14, #25 \n\t" + + "EOR r12, r12, r3 \n\t" // 14 14 3 + "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three + "ROR r11, r11, #24 \n\t" // 13 13 + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "ROR r12, r12, #24 \n\t" // 14 14 + "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one + + "ADD r8, r8, r11 \n\t" // 8 8 13 + "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three + "ADD r9, r9, r12 \n\t" // 9 9 14 + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "EOR r7, r7, r8 \n\t" // 7 7 8 + "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one + + "EOR r4, r4, r9 \n\t" // 4 4 9 + "VEXT.8 q9, q9, q9, #12 \n\t" // permute elements left by three + "ROR r7, r7, #25 \n\t" // 7 7 + "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two + "ROR r4, r4, #25 \n\t" // 4 4 + "VEXT.8 q11, q11, q11, #4 \n\t" // permute elements left by one + + "BNE L_chacha20_arm32_256_loop_%= \n\t" + + "LDR r14, %[x_addr] \n\t" // load address of x to r14 + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + "ADD 
r10, r10, #3 \n\t" // add three here to make later NEON easier + "STM r14, { r0-r9 } \n\t" + "STRD r10, r11, [r14, #4*12] \n\t" + "LDR r9, %[input] \n\t" // load input address + "STR r12, [r14, #4*14] \n\t" + "LDR r10, %[c] \n\t" // load c address + + "VLDM r9, { q12-q15 } \n\t" + "LDR r12, %[m] \n\t" // load m address + + "VADD.I32 q0, q0, q12 \n\t" + "VADD.I32 q1, q1, q13 \n\t" + "VADD.I32 q2, q2, q14 \n\t" + "VADD.I32 q3, q3, q15 \n\t" + + "VADD.I32 q4, q4, q12 \n\t" + "VADD.I32 q5, q5, q13 \n\t" + "VADD.I32 q6, q6, q14 \n\t" + "VADD.I32 q7, q7, q15 \n\t" + + "MOV r11, #1 \n\t" + + "VADD.I32 q8, q8, q12 \n\t" + "VMOV.I32 q12, #0 \n\t" + "VADD.I32 q9, q9, q13 \n\t" + "VMOV.I32 d24[0], r11 \n\t" + "VADD.I32 q10, q10, q14 \n\t" + "VADD.I32 q11, q11, q15 \n\t" + + "VADD.I32 q11, q11, q12 \n\t" // add one to counter + "VADD.I32 q7, q7, q12 \n\t" // add one to counter + "VADD.I32 q11, q11, q12 \n\t" // add one to counter + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q0, q0, q12 \n\t" + "VEOR q1, q1, q13 \n\t" + "VEOR q2, q2, q14 \n\t" + "VEOR q3, q3, q15 \n\t" + "VSTM r10!, { q0-q3 } \n\t" // store to c + + "VLDM r14, { q0-q3 } \n\t " // load final block from x + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q4, q4, q12 \n\t" + "VEOR q5, q5, q13 \n\t" + "VEOR q6, q6, q14 \n\t" + "VEOR q7, q7, q15 \n\t" + "VSTM r10!, { q4-q7 } \n\t" // store to c + + "VLDM r9, { q4-q7 } \n\t" // load input + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q8, q8, q12 \n\t" + "VEOR q9, q9, q13 \n\t" + "VEOR q10, q10, q14 \n\t" + "VEOR q11, q11, q15 \n\t" + "VSTM r10!, { q8-q11 } \n\t" // store to c + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VADD.I32 q0, q0, q4 \n\t" + "VADD.I32 q1, q1, q5 \n\t" + "VADD.I32 q2, q2, q6 \n\t" + "VADD.I32 q3, q3, q7 \n\t" // three was added earlier + "VEOR q0, q0, q12 \n\t" + "VEOR q1, q1, q13 \n\t" + "VEOR q2, q2, q14 \n\t" + "VEOR q3, q3, q15 \n\t" + "VSTM r10!, { q0-q3 } \n\t" // store to c + + : [c] "+m" (c), + [x_0] "=m" (x), + [x_8] "=m" (x[8]), + [x_9] "=m" (x[9]), + [x_10] "=m" (x[10]), + [x_11] "=m" (x[11]), + [x_13] "=m" (x[13]), + [x_15] "=m" (x[15]) + : [rounds] "I" (ROUNDS/2), [input] "m" (input), + [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES), + [m] "m" (m), [x_addr] "m" (x_addr) + : "memory", "cc", + "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r14", + "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif /* __aarch64__ */ + return CHACHA_CHUNK_BYTES * 4; +} + + +static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_128\n"); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + /* Load incrementer register to modify counter */ + "LD1 {v22.16B}, [%[L_chacha20_neon_inc_first_word]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v23.16B}, [%[L_chacha20_neon_rol8]] \n\t" + /* Load state to encrypt */ + "LD1 {v18.4S-v21.4S}, [%[input]] \n\t" + /* Load message */ + "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v18.16B \n\t" + "MOV v1.16B, v19.16B \n\t" + "MOV v2.16B, v20.16B \n\t" + "MOV v3.16B, v21.16B \n\t" + "MOV v4.16B, v18.16B \n\t" + "MOV v5.16B, v19.16B \n\t" + "MOV v6.16B, v20.16B \n\t" + "MOV v7.16B, v21.16B \n\t" + /* Add counter word */ + "ADD v7.4S, v7.4S, v22.4S \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w3, #10 
\n\t" + "\n" + "L_chacha20_arm64_128_loop_%=: \n\t" + "SUBS w3, w3, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + "REV32 v7.8H, v7.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "TBL v3.16B, { v3.16B }, v23.16B \n\t" + "TBL v7.16B, { v7.16B }, v23.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #4 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + "REV32 v7.8H, v7.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "TBL v3.16B, { v3.16B }, v23.16B \n\t" + "TBL v7.16B, { v7.16B }, v23.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #12 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #4 \n\t" + "BNE L_chacha20_arm64_128_loop_%= \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v0.4S, v0.4S, v18.4S \n\t" + "ADD v1.4S, v1.4S, v19.4S \n\t" + "ADD v2.4S, v2.4S, v20.4S \n\t" + "ADD v3.4S, v3.4S, v21.4S \n\t" + "EOR v0.16B, v0.16B, v14.16B \n\t" + "EOR v1.16B, v1.16B, v15.16B \n\t" + "EOR v2.16B, v2.16B, v16.16B \n\t" + "EOR v3.16B, v3.16B, v17.16B \n\t" + "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t" + "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t" + "ADD v21.4S, v21.4S, v22.4S \n\t" + "ADD v4.4S, v4.4S, v18.4S \n\t" + "ADD v5.4S, v5.4S, v19.4S \n\t" + "ADD v6.4S, v6.4S, v20.4S \n\t" + "ADD v7.4S, v7.4S, v21.4S \n\t" + "EOR v4.16B, v4.16B, v14.16B \n\t" + "EOR v5.16B, v5.16B, v15.16B \n\t" + "EOR v6.16B, v6.16B, v16.16B \n\t" + "EOR v7.16B, v7.16B, 
v17.16B \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8), + [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21" + ); +#else + __asm__ __volatile__ ( + "MOV r11, %[rounds] \n\t" + "MOV r12, #1 \n\t" + "VLDM %[input], { q0-q3 } \n\t" + "VMOV.I32 q8, #0 \n\t" + "VMOV q4, q0 \n\t" + "VMOV.I32 d16[0], r12 \n\t" + "VMOV q5, q1 \n\t" + "VMOV q6, q2 \n\t" + "VADD.I32 q7, q3, q8 \n\t" // add one to counter + + // store input + "VMOV q10, q0 \n\t" + "VMOV q11, q1 \n\t" + "VMOV q12, q2 \n\t" + "VMOV q13, q3 \n\t" + "\n" + "L_chacha20_arm32_128_loop_%=: \n\t" + "SUBS r11, r11, #1 \n\t" + + // ODD ROUND + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q8 \n\t" + "VREV32.16 q7, q9 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #12 \n\t" + "VSHL.I32 q5, q9, #12 \n\t" + "VSRI.I32 q1, q8, #20 \n\t" + "VSRI.I32 q5, q9, #20 \n\t" + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q8, #8 \n\t" + "VSHL.I32 q7, q9, #8 \n\t" + "VSRI.I32 q3, q8, #24 \n\t" + "VSRI.I32 q7, q9, #24 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #7 \n\t" + "VSHL.I32 q5, q9, #7 \n\t" + "VSRI.I32 q1, q8, #25 \n\t" + "VSRI.I32 q5, q9, #25 \n\t" + + // EVEN ROUND + + "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three + + "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q8 \n\t" + "VREV32.16 q7, q9 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #12 \n\t" + "VSHL.I32 q5, q9, #12 \n\t" + "VSRI.I32 q1, q8, #20 \n\t" + "VSRI.I32 q5, q9, #20 \n\t" + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q8, #8 \n\t" + "VSHL.I32 q7, q9, #8 \n\t" + "VSRI.I32 q3, q8, #24 \n\t" + "VSRI.I32 q7, q9, #24 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD 
instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #7 \n\t" + "VSHL.I32 q5, q9, #7 \n\t" + "VSRI.I32 q1, q8, #25 \n\t" + "VSRI.I32 q5, q9, #25 \n\t" + + "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one + + "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one + + "BNE L_chacha20_arm32_128_loop_%= \n\t" + + "VMOV.I32 q8, #0 \n\t" + "VADD.I32 q0, q0, q10 \n\t" + "VADD.I32 q1, q1, q11 \n\t" + "VMOV.I32 d16[0], r12 \n\t" + "VADD.I32 q2, q2, q12 \n\t" + "VADD.I32 q3, q3, q13 \n\t" + + "VADD.I32 q13, q13, q8 \n\t" // add one to counter + + "VADD.I32 q4, q4, q10 \n\t" + "VADD.I32 q5, q5, q11 \n\t" + "VADD.I32 q6, q6, q12 \n\t" + "VADD.I32 q7, q7, q13 \n\t" + + "VLDM %[m], { q8-q15 } \n\t" + "VEOR q0, q0, q8 \n\t" + "VEOR q1, q1, q9 \n\t" + "VEOR q2, q2, q10 \n\t" + "VEOR q3, q3, q11 \n\t" + "VEOR q4, q4, q12 \n\t" + "VEOR q5, q5, q13 \n\t" + "VEOR q6, q6, q14 \n\t" + "VEOR q7, q7, q15 \n\t" + "VSTM %[c], { q0-q7 } \n\t" + + : [c] "+r" (c), [m] "+r" (m) + : [rounds] "I" (ROUNDS/2), [input] "r" (input), + [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES) + : "memory", "cc", + "r11", "r12", + "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif /* __aarch64__ */ + return CHACHA_CHUNK_BYTES * 2; +} + +static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, + byte* c, word32 bytes) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_64 with %d bytes\n", bytes); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + /* Load index look-up for rotating left 8 bits */ + "LD1 {v13.16B}, [%[L_chacha20_neon_rol8]] \n\t" + "LD1 {v14.4S}, [%[L_chacha20_neon_inc_first_word]] \n\t" + /* Load state to encrypt */ + "LD1 {v8.4S-v11.4S}, [%[input]] \n\t" + "\n" + "L_chacha20_arm64_64_loop_%=: \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v8.16B \n\t" + "MOV v1.16B, v9.16B \n\t" + "MOV v2.16B, v10.16B \n\t" + "MOV v3.16B, v11.16B \n\t" + /* Add counter word */ + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c 
+= d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL 
v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + 
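+        /* (The EXT instructions above rotated rows v1, v2 and v3 left by one,
+         * two and three 32-bit lanes, so this even round applies the same
+         * column-wise quarter-round sequence to the diagonals of the 4x4
+         * state; the opposite EXTs after the round rotate the rows back.) */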
"ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + 
"SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Add back state */ + "ADD v0.4S, v0.4S, v8.4S \n\t" + "ADD v1.4S, v1.4S, v9.4S \n\t" + "ADD v2.4S, v2.4S, v10.4S \n\t" + "ADD v3.4S, v3.4S, v11.4S \n\t" + "CMP %[bytes], #64 \n\t" + "BLT L_chacha20_arm64_64_lt_64_%= \n\t" + "LD1 {v4.4S-v7.4S}, [%[m]], #64 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "EOR v5.16B, v5.16B, v1.16B \n\t" + "EOR v6.16B, v6.16B, v2.16B \n\t" + "EOR v7.16B, v7.16B, 
v3.16B \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + "SUBS %[bytes], %[bytes], #64 \n\t" + "ADD v11.4S, v11.4S, v14.4S \n\t" + "BNE L_chacha20_arm64_64_loop_%= \n\t" + "B L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_64_%=: \n\t" + "CMP %[bytes], #32 \n\t" + "BLT L_chacha20_arm64_64_lt_32_%= \n\t" + "LD1 {v4.4S, v5.4S}, [%[m]], #32 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "EOR v5.16B, v5.16B, v1.16B \n\t" + "ST1 {v4.4S, v5.4S}, [%[c]], #32 \n\t" + "SUBS %[bytes], %[bytes], #32 \n\t" + "MOV v0.16B, v2.16B \n\t" + "MOV v1.16B, v3.16B \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_32_%=: \n\t" + "CMP %[bytes], #16 \n\t" + "BLT L_chacha20_arm64_64_lt_16_%= \n\t" + "LD1 {v4.4S}, [%[m]], #16 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "ST1 {v4.4S}, [%[c]], #16 \n\t" + "SUBS %[bytes], %[bytes], #16 \n\t" + "MOV v0.16B, v1.16B \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_16_%=: \n\t" + "CMP %[bytes], #8 \n\t" + "BLT L_chacha20_arm64_64_lt_8_%= \n\t" + "LD1 {v4.2S}, [%[m]], #8 \n\t" + "EOR v4.8B, v4.8B, v0.8B \n\t" + "ST1 {v4.2S}, [%[c]], #8 \n\t" + "SUBS %[bytes], %[bytes], #8 \n\t" + "MOV v0.D[0], v0.D[1] \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_8_%=: \n\t" + "MOV x4, v0.D[0] \n\t" + "LSL x5, %[bytes], #3 \n\t" + "\n" + "L_chacha20_arm64_64_loop_lt_8_%=: \n\t" + "LDRB w6, [%[m], %[bytes]] \n\t" + "ROR x7, x4, x5 \n\t" + "EOR w6, w6, w7 \n\t" + "STRB w6, [%[c], %[bytes]] \n\t" + "SUBS %[bytes], %[bytes], #1 \n\t" + "SUB x5, x5, #8 \n\t" + "BGE L_chacha20_arm64_64_loop_lt_8_%= \n\t" + "\n" + "L_chacha20_arm64_64_done_%=: \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8), + [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "x4", "x5", "x6", "x7", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +#else + __asm__ __volatile__ ( + /* Get the input state */ + "VLDM %[input], { q8-q11 } \n\t" + /* Get the incrementer register */ + "VLDM %[L_chacha20_neon_inc_first_word], { q14 } \n\t" + "\n" + "L_chacha20_arm32_64_outer_loop_%=: \n\t" + /* Copy over the input state */ + "VMOV q0, q8 \n\t" + "VMOV q1, q9 \n\t" + "VMOV q2, q10 \n\t" + "VMOV q3, q11 \n\t" + /* Compute quarter rounds */ + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + 
"VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, 
#7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + 
"VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 
q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Add back state */ + "VADD.I32 q0, q0, q8 \n\t" + "VADD.I32 q1, q1, q9 \n\t" + "VADD.I32 q2, q2, q10 \n\t" + "VADD.I32 q3, q3, q11 \n\t" + "CMP %[bytes], #64 \n\t" + "BLT L_chacha20_arm32_64_lt_64_%= \n\t" + /* XOR full 64 byte block */ + "VLDM %[m], { q4-q7 } \n\t" + "ADD %[m], %[m], #64 \n\t" + "VEOR q0, q0, q4 \n\t" + "VEOR q1, q1, q5 \n\t" + "VEOR q2, q2, q6 \n\t" + "VEOR q3, q3, q7 \n\t" + "VSTM %[c], { q0-q3 } \n\t" + "ADD %[c], %[c], #64 \n\t" + "SUBS %[bytes], %[bytes], #64 \n\t" + "VADD.I32 q11, q11, q14 \n\t" + "BNE L_chacha20_arm32_64_outer_loop_%= \n\t" + "B L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_64_%=: \n\t" + /* XOR 32 bytes */ + "CMP %[bytes], #32 \n\t" + "BLT L_chacha20_arm32_64_lt_32_%= \n\t" + "VLDM %[m], { q4-q5 } \n\t" + "ADD %[m], %[m], #32 \n\t" + "VEOR q4, q4, q0 \n\t" + "VEOR q5, q5, q1 \n\t" + "VSTM %[c], { q4-q5 } \n\t" + "ADD %[c], %[c], #32 \n\t" + "SUBS %[bytes], %[bytes], #32 \n\t" + "VMOV q0, q2 \n\t" + "VMOV q1, q3 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_32_%=: \n\t" + /* XOR 16 bytes */ + "CMP %[bytes], #16 \n\t" + "BLT L_chacha20_arm32_64_lt_16_%= \n\t" + "VLDM %[m], { q4 } \n\t" + "ADD %[m], %[m], #16 \n\t" + "VEOR q4, q4, q0 \n\t" + "VSTM %[c], { q4 } \n\t" + "ADD %[c], %[c], #16 \n\t" + "SUBS %[bytes], %[bytes], #16 \n\t" + "VMOV q0, q1 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + 
"L_chacha20_arm32_64_lt_16_%=: \n\t" + /* XOR 8 bytes */ + "CMP %[bytes], #8 \n\t" + "BLT L_chacha20_arm32_64_lt_8_%= \n\t" + "VLDR d8, [%[m], #0] \n\t" + "ADD %[m], %[m], #8 \n\t" + "VEOR d8, d8, d0 \n\t" + "VSTR d8, [%[c], #0] \n\t" + "ADD %[c], %[c], #8 \n\t" + "SUBS %[bytes], %[bytes], #8 \n\t" + "VMOV d0, d1 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_8_%=: \n\t" + /* XOR 4 bytes */ + "CMP %[bytes], #4 \n\t" + "BLT L_chacha20_arm32_64_lt_4_%= \n\t" + "LDR r12, [%[m]], #4 \n\t" + "VMOV r14, d0[0] \n\t" + "EOR r12, r12, r14 \n\t" + "STR r12, [%[c]], #4 \n\t" + "SUBS %[bytes], %[bytes], #4 \n\t" + "VTRN.32 d0, d0 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_4_%=: \n\t" + /* XOR remaining bytes */ + "VMOV r14, d0[0] \n\t" + "\n" + "L_chacha20_arm32_64_lt_4_loop_%=: \n\t" + "LDRB r12, [%[m]], #1 \n\t" + "EOR r12, r12, r14 \n\t" + "STRB r12, [%[c]], #1 \n\t" + "SUBS %[bytes], %[bytes], #1 \n\t" + "LSR r14, r14, #8 \n\t" + "BGT L_chacha20_arm32_64_lt_4_loop_%= \n\t" + "\n" + "L_chacha20_arm32_64_done_%=: \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes) + : [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "cc", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q14", "r12", "r14" + ); +#endif /* __aarch64__ */ +} + +/** + * Encrypt a stream of bytes + */ +static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, + word32 bytes) +{ + int processed; + +#ifdef __aarch64__ + if (bytes >= CHACHA_CHUNK_BYTES * 5) { + processed = (bytes / (CHACHA_CHUNK_BYTES * 5)) * CHACHA_CHUNK_BYTES * 5; + wc_Chacha_encrypt_320(ctx->X, m, c, processed); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 4) { +#else + while (bytes >= CHACHA_CHUNK_BYTES * 4) { +#endif /*__aarch64__ */ + processed = wc_Chacha_encrypt_256(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 2) { + processed = wc_Chacha_encrypt_128(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes > 0) { + wc_Chacha_encrypt_64(ctx->X, m, c, bytes); + if (bytes > 64) + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + } +} + +/** + * API to encrypt/decrypt a message of any size. 
+ */ +int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, + word32 msglen) +{ + if (ctx == NULL || output == NULL || input == NULL) + return BAD_FUNC_ARG; + + wc_Chacha_encrypt_bytes(ctx, input, output, msglen); + + return 0; +} + +#endif /* HAVE_CHACHA*/ + +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/st/stsafe.c b/wolfcrypt/src/port/st/stsafe.c index 9b5e7503c..93ad35d4d 100644 --- a/wolfcrypt/src/port/st/stsafe.c +++ b/wolfcrypt/src/port/st/stsafe.c @@ -39,7 +39,7 @@ int SSL_STSAFE_LoadDeviceCertificate(byte** pRawCertificate, /* Try reading device certificate from ST-SAFE Zone 0 */ err = stsafe_interface_read_device_certificate_raw( - pRawCertificate, pRawCertificateLen); + pRawCertificate, (uint32_t*)pRawCertificateLen); if (err == 0) { #if 0 /* example for loading into WOLFSSL_CTX */ @@ -154,7 +154,7 @@ int SSL_STSAFE_VerifyPeerCertCb(WOLFSSL* ssl, if (err == 0) { /* Verify signature */ err = stsafe_interface_verify(curve_id, (uint8_t*)hash, sigRS, - pubKeyX, pubKeyY, result); + pubKeyX, pubKeyY, (int32_t*)result); } wc_ecc_free(&key); @@ -325,4 +325,191 @@ int SSL_STSAFE_SetupPkCallbackCtx(WOLFSSL* ssl, void* user_ctx) #endif /* HAVE_PK_CALLBACKS */ +#ifdef WOLF_CRYPTO_CB + +int wolfSSL_STSAFE_CryptoDevCb(int devId, wc_CryptoInfo* info, void* ctx) +{ + int rc = CRYPTOCB_UNAVAILABLE; + wolfSTSAFE_CryptoCb_Ctx* stsCtx = (wolfSTSAFE_CryptoCb_Ctx*)ctx; + + if (info == NULL || ctx == NULL) + return BAD_FUNC_ARG; + + (void)devId; + (void)stsCtx; + + if (info->algo_type == WC_ALGO_TYPE_SEED) { + /* use the STSAFE hardware for RNG seed */ + #if !defined(WC_NO_RNG) && defined(USE_STSAFE_RNG_SEED) + while (info->seed.sz > 0) { + rc = stsafe_interface_getrandom(info->seed.seed, info->seed.sz); + if (rc < 0) { + return rc; + } + info->seed.seed += rc; + info->seed.sz -= rc; + } + rc = 0; + #else + rc = CRYPTOCB_UNAVAILABLE; + #endif + } +#ifdef HAVE_ECC + else if (info->algo_type == WC_ALGO_TYPE_PK) { + #ifdef USE_STSAFE_VERBOSE + printf("STSAFE Pk: Type %d\n", info->pk.type); + #endif + + if (info->pk.type == WC_PK_TYPE_EC_KEYGEN) { + byte pubKeyRaw[STSAFE_MAX_PUBKEY_RAW_LEN]; + StSafeA_KeySlotNumber slot; + StSafeA_CurveId curve_id; + int ecc_curve, key_sz; + + WOLFSSL_MSG("STSAFE: ECC KeyGen"); + + /* get curve */ + ecc_curve = info->pk.eckg.curveId; + curve_id = stsafe_get_ecc_curve_id(ecc_curve); + key_sz = stsafe_get_key_size(curve_id); + + /* generate new ephemeral key on device */ + rc = stsafe_interface_create_key(&slot, curve_id, + (uint8_t*)pubKeyRaw); + if (rc != 0) { + return rc; + } + + /* load generated public key into key, used by wolfSSL */ + rc = wc_ecc_import_unsigned(info->pk.eckg.key, pubKeyRaw, + &pubKeyRaw[key_sz], NULL, ecc_curve); + } + else if (info->pk.type == WC_PK_TYPE_ECDSA_SIGN) { + byte digest[STSAFE_MAX_KEY_LEN]; + byte sigRS[STSAFE_MAX_SIG_LEN]; + byte *r, *s; + StSafeA_CurveId curve_id; + word32 inSz = info->pk.eccsign.inlen; + int key_sz; + + WOLFSSL_MSG("STSAFE: ECC Sign"); + + curve_id = stsafe_get_curve_mode(); + key_sz = stsafe_get_key_size(curve_id); + + /* truncate input to match key size */ + if (inSz > key_sz) + inSz = key_sz; + + /* Build input digest */ + XMEMSET(&digest[0], 0, sizeof(digest)); + XMEMCPY(&digest[key_sz - inSz], info->pk.eccsign.in, inSz); + + /* Sign using slot 0: Result is R then S */ + /* Sign will always use the curve type in slot 0 + (the TLS curve needs to match) */ + XMEMSET(sigRS, 0, sizeof(sigRS)); + rc = stsafe_interface_sign(STSAFE_A_SLOT_0, curve_id, + (uint8_t*)info->pk.eccsign.in, sigRS); 
+ if (rc != 0) { + return rc; + } + + /* Convert R and S to signature */ + r = &sigRS[0]; + s = &sigRS[key_sz]; + rc = wc_ecc_rs_raw_to_sig((const byte*)r, key_sz, (const byte*)s, + key_sz, info->pk.eccsign.out, info->pk.eccsign.outlen); + if (rc != 0) { + WOLFSSL_MSG("Error converting RS to Signature"); + } + } + else if (info->pk.type == WC_PK_TYPE_ECDSA_VERIFY) { + byte sigRS[STSAFE_MAX_SIG_LEN]; + byte *r, *s; + word32 r_len = STSAFE_MAX_SIG_LEN/2, s_len = STSAFE_MAX_SIG_LEN/2; + byte pubKeyX[STSAFE_MAX_PUBKEY_RAW_LEN/2]; + byte pubKeyY[STSAFE_MAX_PUBKEY_RAW_LEN/2]; + word32 pubKeyX_len = sizeof(pubKeyX); + word32 pubKeyY_len = sizeof(pubKeyY); + StSafeA_CurveId curve_id; + int ecc_curve, key_sz; + + WOLFSSL_MSG("STSAFE: ECC Verify"); + + if (info->pk.eccverify.key == NULL) + return BAD_FUNC_ARG; + + /* determine curve */ + ecc_curve = info->pk.eccverify.key->dp->id; + curve_id = stsafe_get_ecc_curve_id(ecc_curve); + key_sz = stsafe_get_key_size(curve_id); + + /* Extract Raw X and Y coordinates of the public key */ + rc = wc_ecc_export_public_raw(info->pk.eccverify.key, + pubKeyX, &pubKeyX_len, + pubKeyY, &pubKeyY_len); + if (rc == 0) { + /* Extract R and S from signature */ + XMEMSET(sigRS, 0, sizeof(sigRS)); + r = &sigRS[0]; + s = &sigRS[key_sz]; + rc = wc_ecc_sig_to_rs(info->pk.eccverify.sig, + info->pk.eccverify.siglen, r, &r_len, s, &s_len); + (void)r_len; + (void)s_len; + } + if (rc == 0) { + /* Verify signature */ + rc = stsafe_interface_verify(curve_id, + (uint8_t*)info->pk.eccverify.hash, sigRS, pubKeyX, pubKeyY, + (int32_t*)info->pk.eccverify.res); + } + } + else if (info->pk.type == WC_PK_TYPE_ECDH) { + byte otherKeyX[STSAFE_MAX_KEY_LEN]; + byte otherKeyY[STSAFE_MAX_KEY_LEN]; + word32 otherKeyX_len = sizeof(otherKeyX); + word32 otherKeyY_len = sizeof(otherKeyY); + StSafeA_CurveId curve_id; + int ecc_curve; + + WOLFSSL_MSG("STSAFE: PMS"); + + if (info->pk.ecdh.public_key == NULL) + return BAD_FUNC_ARG; + + /* get curve */ + ecc_curve = info->pk.ecdh.public_key->dp->id; + curve_id = stsafe_get_ecc_curve_id(ecc_curve); + + /* Export otherKey raw X and Y */ + rc = wc_ecc_export_public_raw(info->pk.ecdh.public_key, + &otherKeyX[0], (word32*)&otherKeyX_len, + &otherKeyY[0], (word32*)&otherKeyY_len); + if (rc == 0) { + /* Compute shared secret */ + *info->pk.ecdh.outlen = 0; + rc = stsafe_interface_shared_secret(curve_id, + otherKeyX, otherKeyY, + info->pk.ecdh.out, (int32_t*)info->pk.ecdh.outlen); + } + } + } +#endif /* HAVE_ECC */ + + /* need to return negative here for error */ + if (rc != 0 && rc != CRYPTOCB_UNAVAILABLE) { + WOLFSSL_MSG("STSAFE: CryptoCb failed"); + #ifdef USE_STSAFE_VERBOSE + printf("STSAFE: CryptoCb failed %d\n", rc); + #endif + rc = WC_HW_E; + } + + return rc; +} + +#endif /* WOLF_CRYPTO_CB */ + #endif /* WOLFSSL_STSAFEA100 */ diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c index 7ab7f92ec..4903ac9ea 100644 --- a/wolfcrypt/src/random.c +++ b/wolfcrypt/src/random.c @@ -2345,7 +2345,6 @@ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz) #else #pragma message("Warning: write a real random seed!!!!, just for testing now") #endif - int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz) { word32 i; diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index 7e3e90dcc..6a5b9861a 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -39,10 +39,6 @@ defined(WOLFSSL_HAVE_SP_ECC) #ifdef RSA_LOW_MEM -#ifndef SP_RSA_PRIVATE_EXP_D -#define SP_RSA_PRIVATE_EXP_D -#endif - #ifndef WOLFSSL_SP_SMALL #define 
WOLFSSL_SP_SMALL #endif @@ -3670,7 +3666,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA || WOLFSSL_HAVE_SP_DH) && !WOLFSSL_RSA_PUBLIC_ONLY */ -#ifdef WOLFSSL_HAVE_SP_DH +#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. * @@ -3685,7 +3681,8 @@ static void sp_2048_mont_norm_64(sp_digit* r, sp_digit* m) sp_2048_sub_in_place_64(r, m); } -#endif /* WOLFSSL_HAVE_SP_DH */ +#endif /* WOLFSSL_HAVE_SP_RSA || WOLFSSL_HAVE_SP_DH */ + /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -4072,8 +4069,8 @@ static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, sp_digit* a, sp_digit* m) return sp_2048_div_64_cond(a, m, NULL, r); } -#if (defined(SP_RSA_PRIVATE_EXP_D) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ - defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ + defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL /* Modular exponentiate a to the e mod m. (r = a^e mod m) * @@ -4346,7 +4343,7 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, return err; } #endif /* WOLFSSL_SP_SMALL */ -#endif /* (SP_RSA_PRIVATE_EXP_D && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. @@ -9134,7 +9131,8 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA || WOLFSSL_HAVE_SP_DH) && !WOLFSSL_RSA_PUBLIC_ONLY */ -#ifdef WOLFSSL_HAVE_SP_DH +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ + defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * @@ -9149,7 +9147,9 @@ static void sp_3072_mont_norm_96(sp_digit* r, sp_digit* m) sp_3072_sub_in_place_96(r, m); } -#endif /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ + + /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -9542,7 +9542,7 @@ static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, sp_digit* a, sp_digit* m) return sp_3072_div_96_cond(a, m, NULL, r); } -#if (defined(SP_RSA_PRIVATE_EXP_D) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -9816,7 +9816,7 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, return err; } #endif /* WOLFSSL_SP_SMALL */ -#endif /* (SP_RSA_PRIVATE_EXP_D && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. 
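Stepping back to the NEON ChaCha20 implementation earlier in this patch: every `ADD`/`EOR` group in those unrolled rounds is one ChaCha quarter round applied to four 32-bit lanes at once. The `SHL`+`SRI` (and `VSHL`+`VSRI`) pairs are the shift-and-insert idiom for a 32-bit rotate, `REV32 ...8H` performs the rotate by 16, the rotate by 8 is done with the `L_chacha20_neon_rol8` byte table via `TBL` on aarch64 (and with a `VSHL #8`/`VSRI #24` pair on 32-bit ARM), and the `EXT`/`VEXT` instructions rotate the b, c and d rows by 4, 8 and 12 bytes to switch between the column ("odd") and diagonal ("even") arrangements. For reference, a minimal portable C sketch of that quarter round (not part of the patch):

```
#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One ChaCha20 quarter round on four 32-bit state words. The NEON code
 * in the patch runs four of these in parallel, one per 32-bit lane. */
static void chacha_quarter_round(uint32_t* a, uint32_t* b,
                                 uint32_t* c, uint32_t* d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}
```

The alternating column/diagonal passes unrolled in the assembly make up ChaCha20's ten double rounds (20 rounds total) before the initial state is added back in.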
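The dispatch in the new `wc_Chacha_encrypt_bytes()` advances the 32-bit block counter (state word 12, addressed as `X[CHACHA_IV_BYTES]`) by the number of whole 64-byte keystream blocks each kernel consumed. By the time the tail `wc_Chacha_encrypt_64()` call runs, 1 to 127 bytes remain, so it may consume one or two keystream blocks; that is why it gets one unconditional increment plus a second one when more than 64 bytes remain. A small sketch of the bookkeeping, assuming `CHACHA_CHUNK_BYTES == 64` (the helper name is hypothetical, not wolfSSL API):

```
#include <stdint.h>

/* Hypothetical helper: where the ChaCha block counter must end up after
 * 'processed' bytes of keystream have been consumed. */
static uint32_t chacha_counter_after(uint32_t counter, uint32_t processed)
{
    counter += processed / 64;        /* whole keystream blocks */
    if ((processed % 64) != 0)
        counter += 1;                 /* a partial block still uses one */
    return counter;
}

/* Example: 1000 bytes on aarch64 -> 960 bytes via the 320-byte kernel
 * (counter += 15), then 40 bytes via the 64-byte tail (counter += 1),
 * matching chacha_counter_after(0, 1000) == 16. */
```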
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index cfe7f37f6..565b67070 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -4363,14 +4363,138 @@ int chacha_test(void) const byte* keys[] = {key1, key2, key3, key4}; - static const byte ivs1[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; - static const byte ivs2[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; - static const byte ivs3[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01}; - static const byte ivs4[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + static const byte ivs1[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + static const byte ivs2[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + static const byte ivs3[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00}; + static const byte ivs4[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; const byte* ivs[] = {ivs1, ivs2, ivs3, ivs4}; +#ifndef BENCH_EMBEDDED + static const byte cipher_big_result[] = { + 0x06, 0xa6, 0x5d, 0x31, 0x21, 0x6c, 0xdb, 0x37, 0x48, 0x7c, 0x01, 0x9d, + 0x72, 0xdf, 0x0a, 0x5b, 0x64, 0x74, 0x20, 0xba, 0x9e, 0xe0, 0x26, 0x7a, + 0xbf, 0xdf, 0x83, 0x34, 0x3b, 0x4f, 0x94, 0x3f, 0x37, 0x89, 0xaf, 0x00, + 0xdf, 0x0f, 0x2e, 0x75, 0x16, 0x41, 0xf6, 0x7a, 0x86, 0x94, 0x9d, 0x32, + 0x56, 0xf0, 0x79, 0x71, 0x68, 0x6f, 0xa6, 0x6b, 0xc6, 0x59, 0x49, 0xf6, + 0x10, 0x34, 0x03, 0x03, 0x16, 0x53, 0x9a, 0x98, 0x2a, 0x46, 0xde, 0x17, + 0x06, 0x65, 0x70, 0xca, 0x0a, 0x1f, 0xab, 0x80, 0x26, 0x96, 0x3f, 0x3e, + 0x7a, 0x3c, 0xa8, 0x87, 0xbb, 0x65, 0xdd, 0x5e, 0x07, 0x7b, 0x34, 0xe0, + 0x56, 0xda, 0x32, 0x13, 0x30, 0xc9, 0x0c, 0xd7, 0xba, 0xe4, 0x1f, 0xa6, + 0x91, 0x4f, 0x72, 0x9f, 0xd9, 0x5c, 0x62, 0x7d, 0xa6, 0xc2, 0xbc, 0x87, + 0xae, 0x64, 0x11, 0x94, 0x3b, 0xbc, 0x6c, 0x23, 0xbd, 0x7d, 0x00, 0xb4, + 0x99, 0xf2, 0x68, 0xb5, 0x59, 0x70, 0x93, 0xad, 0x69, 0xd0, 0xb1, 0x28, + 0x70, 0x92, 0xeb, 0xec, 0x39, 0x80, 0x82, 0xde, 0x44, 0xe2, 0x8a, 0x26, + 0xb3, 0xe9, 0x45, 0xcf, 0x83, 0x76, 0x9f, 0x6a, 0xa0, 0x46, 0x4a, 0x3d, + 0x26, 0x56, 0xaf, 0x49, 0x41, 0x26, 0x1b, 0x6a, 0x41, 0x37, 0x65, 0x91, + 0x72, 0xc4, 0xe7, 0x3c, 0x17, 0x31, 0xae, 0x2e, 0x2b, 0x31, 0x45, 0xe4, + 0x93, 0xd3, 0x10, 0xaa, 0xc5, 0x62, 0xd5, 0x11, 0x4b, 0x57, 0x1d, 0xad, + 0x48, 0x06, 0xd0, 0x0d, 0x98, 0xa5, 0xc6, 0x5b, 0xd0, 0x9e, 0x22, 0xc0, + 0x00, 0x32, 0x5a, 0xf5, 0x1c, 0x89, 0x6d, 0x54, 0x97, 0x55, 0x6b, 0x46, + 0xc5, 0xc7, 0xc4, 0x48, 0x9c, 0xbf, 0x47, 0xdc, 0x03, 0xc4, 0x1b, 0xcb, + 0x65, 0xa6, 0x91, 0x9d, 0x6d, 0xf1, 0xb0, 0x7a, 0x4d, 0x3b, 0x03, 0x95, + 0xf4, 0x8b, 0x0b, 0xae, 0x39, 0xff, 0x3f, 0xf6, 0xc0, 0x14, 0x18, 0x8a, + 0xe5, 0x19, 0xbd, 0xc1, 0xb4, 0x05, 0x4e, 0x29, 0x2f, 0x0b, 0x33, 0x76, + 0x28, 0x16, 0xa4, 0xa6, 0x93, 0x04, 0xb5, 0x55, 0x6b, 0x89, 0x3d, 0xa5, + 0x0f, 0xd3, 0xad, 0xfa, 0xd9, 0xfd, 0x05, 0x5d, 0x48, 0x94, 0x25, 0x5a, + 0x2c, 0x9a, 0x94, 0x80, 0xb0, 0xe7, 0xcb, 0x4d, 0x77, 0xbf, 0xca, 0xd8, + 0x55, 0x48, 0xbd, 0x66, 0xb1, 0x85, 0x81, 0xb1, 0x37, 0x79, 0xab, 0x52, + 0x08, 0x14, 0x12, 0xac, 0xcd, 0x45, 0x4d, 0x53, 0x6b, 0xca, 0x96, 0xc7, + 0x3b, 0x2f, 0x73, 0xb1, 0x5a, 0x23, 0xbd, 0x65, 0xd5, 0xea, 0x17, 0xb3, + 0xdc, 0xa1, 0x17, 0x1b, 0x2d, 0xb3, 0x9c, 0xd0, 0xdb, 0x41, 0x77, 0xef, + 0x93, 0x20, 0x52, 0x3e, 0x9d, 0xf5, 0xbf, 0x33, 0xf7, 0x52, 0xc1, 0x90, + 0xa0, 0x15, 0x17, 0xce, 0xf7, 0xf7, 0xd0, 0x3a, 0x3b, 0xd1, 0x72, 0x56, + 0x31, 0x81, 0xae, 0x60, 0xab, 0x40, 0xc1, 0xd1, 0x28, 0x77, 0x53, 0xac, + 0x9f, 0x11, 0x0a, 0x88, 0x36, 0x4b, 0xda, 0x57, 0xa7, 0x28, 0x5c, 0x85, + 0xd3, 0x85, 0x9b, 0x79, 
0xad, 0x05, 0x1c, 0x37, 0x14, 0x5e, 0x0d, 0xd0, + 0x23, 0x03, 0x42, 0x1d, 0x48, 0x5d, 0xc5, 0x3c, 0x5a, 0x08, 0xa9, 0x0d, + 0x6e, 0x82, 0x7c, 0x2e, 0x3c, 0x41, 0xcc, 0x96, 0x8e, 0xad, 0xee, 0x2a, + 0x61, 0x0b, 0x16, 0x0f, 0xa9, 0x24, 0x40, 0x85, 0xbc, 0x9f, 0x28, 0x8d, + 0xe6, 0x68, 0x4d, 0x8f, 0x30, 0x48, 0xd9, 0x73, 0x73, 0x6c, 0x9a, 0x7f, + 0x67, 0xf7, 0xde, 0x4c, 0x0a, 0x8b, 0xe4, 0xb3, 0x08, 0x2a, 0x52, 0xda, + 0x54, 0xee, 0xcd, 0xb5, 0x62, 0x4a, 0x26, 0x20, 0xfb, 0x40, 0xbb, 0x39, + 0x3a, 0x0f, 0x09, 0xe8, 0x00, 0xd1, 0x24, 0x97, 0x60, 0xe9, 0x83, 0x83, + 0xfe, 0x9f, 0x9c, 0x15, 0xcf, 0x69, 0x03, 0x9f, 0x03, 0xe1, 0xe8, 0x6e, + 0xbd, 0x87, 0x58, 0x68, 0xee, 0xec, 0xd8, 0x29, 0x46, 0x23, 0x49, 0x92, + 0x72, 0x95, 0x5b, 0x49, 0xca, 0xe0, 0x45, 0x59, 0xb2, 0xca, 0xf4, 0xfc, + 0xb7, 0x59, 0x37, 0x49, 0x28, 0xbc, 0xf3, 0xd7, 0x61, 0xbc, 0x4b, 0xf3, + 0xa9, 0x4b, 0x2f, 0x05, 0xa8, 0x01, 0xa5, 0xdc, 0x00, 0x6e, 0x01, 0xb6, + 0x45, 0x3c, 0xd5, 0x49, 0x7d, 0x5c, 0x25, 0xe8, 0x31, 0x87, 0xb2, 0xb9, + 0xbf, 0xb3, 0x01, 0x62, 0x0c, 0xd0, 0x48, 0x77, 0xa2, 0x34, 0x0f, 0x16, + 0x22, 0x28, 0xee, 0x54, 0x08, 0x93, 0x3b, 0xe4, 0xde, 0x7e, 0x63, 0xf7, + 0x97, 0x16, 0x5d, 0x71, 0x58, 0xc2, 0x2e, 0xf2, 0x36, 0xa6, 0x12, 0x65, + 0x94, 0x17, 0xac, 0x66, 0x23, 0x7e, 0xc6, 0x72, 0x79, 0x24, 0xce, 0x8f, + 0x55, 0x19, 0x97, 0x44, 0xfc, 0x55, 0xec, 0x85, 0x26, 0x27, 0xdb, 0x38, + 0xb1, 0x42, 0x0a, 0xdd, 0x05, 0x99, 0x28, 0xeb, 0x03, 0x6c, 0x9a, 0xe9, + 0x17, 0xf6, 0x2c, 0xb0, 0xfe, 0xe7, 0xa4, 0xa7, 0x31, 0xda, 0x4d, 0xb0, + 0x29, 0xdb, 0xdd, 0x8d, 0x12, 0x13, 0x9c, 0xb4, 0xcc, 0x83, 0x97, 0xfb, + 0x1a, 0xdc, 0x08, 0xd6, 0x30, 0x62, 0xe8, 0xeb, 0x8b, 0x61, 0xcb, 0x1d, + 0x06, 0xe3, 0xa5, 0x4d, 0x35, 0xdb, 0x59, 0xa8, 0x2d, 0x87, 0x27, 0x44, + 0x6f, 0xc0, 0x38, 0x97, 0xe4, 0x85, 0x00, 0x02, 0x09, 0xf6, 0x69, 0x3a, + 0xcf, 0x08, 0x1b, 0x21, 0xbb, 0x79, 0xb1, 0xa1, 0x34, 0x09, 0xe0, 0x80, + 0xca, 0xb0, 0x78, 0x8a, 0x11, 0x97, 0xd4, 0x07, 0xbe, 0x1b, 0x6a, 0x5d, + 0xdb, 0xd6, 0x1f, 0x76, 0x6b, 0x16, 0xf0, 0x58, 0x84, 0x5f, 0x59, 0xce, + 0x62, 0x34, 0xc3, 0xdf, 0x94, 0xb8, 0x2f, 0x84, 0x68, 0xf0, 0xb8, 0x51, + 0xd9, 0x6d, 0x8e, 0x4a, 0x1d, 0xe6, 0x5c, 0xd8, 0x86, 0x25, 0xe3, 0x24, + 0xfd, 0x21, 0x61, 0x13, 0x48, 0x3e, 0xf6, 0x7d, 0xa6, 0x71, 0x9b, 0xd2, + 0x6e, 0xe6, 0xd2, 0x08, 0x94, 0x62, 0x6c, 0x98, 0xfe, 0x2f, 0x9c, 0x88, + 0x7e, 0x78, 0x15, 0x02, 0x00, 0xf0, 0xba, 0x24, 0x91, 0xf2, 0xdc, 0x47, + 0x51, 0x4d, 0x15, 0x5e, 0x91, 0x5f, 0x57, 0x5b, 0x1d, 0x35, 0x24, 0x45, + 0x75, 0x9b, 0x88, 0x75, 0xf1, 0x2f, 0x85, 0xe7, 0x89, 0xd1, 0x01, 0xb4, + 0xc8, 0x18, 0xb7, 0x97, 0xef, 0x4b, 0x90, 0xf4, 0xbf, 0x10, 0x27, 0x3c, + 0x60, 0xff, 0xc4, 0x94, 0x20, 0x2f, 0x93, 0x4b, 0x4d, 0xe3, 0x80, 0xf7, + 0x2c, 0x71, 0xd9, 0xe3, 0x68, 0xb4, 0x77, 0x2b, 0xc7, 0x0d, 0x39, 0x92, + 0xef, 0x91, 0x0d, 0xb2, 0x11, 0x50, 0x0e, 0xe8, 0xad, 0x3b, 0xf6, 0xb5, + 0xc6, 0x14, 0x4d, 0x33, 0x53, 0xa7, 0x60, 0x15, 0xc7, 0x27, 0x51, 0xdc, + 0x54, 0x29, 0xa7, 0x0d, 0x6a, 0x7b, 0x72, 0x13, 0xad, 0x7d, 0x41, 0x19, + 0x4e, 0x42, 0x49, 0xcc, 0x42, 0xe4, 0xbd, 0x99, 0x13, 0xd9, 0x7f, 0xf3, + 0x38, 0xa4, 0xb6, 0x33, 0xed, 0x07, 0x48, 0x7e, 0x8e, 0x82, 0xfe, 0x3a, + 0x9d, 0x75, 0x93, 0xba, 0x25, 0x4e, 0x37, 0x3c, 0x0c, 0xd5, 0x69, 0xa9, + 0x2d, 0x9e, 0xfd, 0xe8, 0xbb, 0xf5, 0x0c, 0xe2, 0x86, 0xb9, 0x5e, 0x6f, + 0x28, 0xe4, 0x19, 0xb3, 0x0b, 0xa4, 0x86, 0xd7, 0x24, 0xd0, 0xb8, 0x89, + 0x7b, 0x76, 0xec, 0x05, 0x10, 0x5b, 0x68, 0xe9, 0x58, 0x66, 0xa3, 0xc5, + 0xb6, 0x63, 0x20, 0x0e, 0x0e, 0xea, 0x3d, 0x61, 0x5e, 0xda, 0x3d, 0x3c, + 0xf9, 0xfd, 0xed, 0xa9, 
0xdb, 0x52, 0x94, 0x8a, 0x00, 0xca, 0x3c, 0x8d, + 0x66, 0x8f, 0xb0, 0xf0, 0x5a, 0xca, 0x3f, 0x63, 0x71, 0xbf, 0xca, 0x99, + 0x37, 0x9b, 0x75, 0x97, 0x89, 0x10, 0x6e, 0xcf, 0xf2, 0xf5, 0xe3, 0xd5, + 0x45, 0x9b, 0xad, 0x10, 0x71, 0x6c, 0x5f, 0x6f, 0x7f, 0x22, 0x77, 0x18, + 0x2f, 0xf9, 0x99, 0xc5, 0x69, 0x58, 0x03, 0x12, 0x86, 0x82, 0x3e, 0xbf, + 0xc2, 0x12, 0x35, 0x43, 0xa3, 0xd9, 0x18, 0x4f, 0x41, 0x11, 0x6b, 0xf3, + 0x67, 0xaf, 0x3d, 0x78, 0xe4, 0x22, 0x2d, 0xb3, 0x48, 0x43, 0x31, 0x1d, + 0xef, 0xa8, 0xba, 0x49, 0x8e, 0xa9, 0xa7, 0xb6, 0x18, 0x77, 0x84, 0xca, + 0xbd, 0xa2, 0x02, 0x1b, 0x6a, 0xf8, 0x5f, 0xda, 0xff, 0xcf, 0x01, 0x6a, + 0x86, 0x69, 0xa9, 0xe9, 0xcb, 0x60, 0x1e, 0x15, 0xdc, 0x8f, 0x5d, 0x39, + 0xb5, 0xce, 0x55, 0x5f, 0x47, 0x97, 0xb1, 0x19, 0x6e, 0x21, 0xd6, 0x13, + 0x39, 0xb2, 0x24, 0xe0, 0x62, 0x82, 0x9f, 0xed, 0x12, 0x81, 0xed, 0xee, + 0xab, 0xd0, 0x2f, 0x19, 0x89, 0x3f, 0x57, 0x2e, 0xc2, 0xe2, 0x67, 0xe8, + 0xae, 0x03, 0x56, 0xba, 0xd4, 0xd0, 0xa4, 0x89, 0x03, 0x06, 0x5b, 0xcc, + 0xf2, 0x22, 0xb8, 0x0e, 0x76, 0x79, 0x4a, 0x42, 0x1d, 0x37, 0x51, 0x5a, + 0xaa, 0x46, 0x6c, 0x2a, 0xdd, 0x66, 0xfe, 0xc6, 0x68, 0xc3, 0x38, 0xa2, + 0xae, 0x5b, 0x98, 0x24, 0x5d, 0x43, 0x05, 0x82, 0x38, 0x12, 0xd3, 0xd1, + 0x75, 0x2d, 0x4f, 0x61, 0xbd, 0xb9, 0x10, 0x87, 0x44, 0x2a, 0x78, 0x07, + 0xff, 0xf4, 0x0f, 0xa1, 0xf3, 0x68, 0x9f, 0xbe, 0xae, 0xa2, 0x91, 0xf0, + 0xc7, 0x55, 0x7a, 0x52, 0xd5, 0xa3, 0x8d, 0x6f, 0xe4, 0x90, 0x5c, 0xf3, + 0x5f, 0xce, 0x3d, 0x23, 0xf9, 0x8e, 0xae, 0x14, 0xfb, 0x82, 0x9a, 0xa3, + 0x04, 0x5f, 0xbf, 0xad, 0x3e, 0xf2, 0x97, 0x0a, 0x60, 0x40, 0x70, 0x19, + 0x72, 0xad, 0x66, 0xfb, 0x78, 0x1b, 0x84, 0x6c, 0x98, 0xbc, 0x8c, 0xf8, + 0x4f, 0xcb, 0xb5, 0xf6, 0xaf, 0x7a, 0xb7, 0x93, 0xef, 0x67, 0x48, 0x02, + 0x2c, 0xcb, 0xe6, 0x77, 0x0f, 0x7b, 0xc1, 0xee, 0xc5, 0xb6, 0x2d, 0x7e, + 0x62, 0xa0, 0xc0, 0xa7, 0xa5, 0x80, 0x31, 0x92, 0x50, 0xa1, 0x28, 0x22, + 0x95, 0x03, 0x17, 0xd1, 0x0f, 0xf6, 0x08, 0xe5, 0xec + }; +#define CHACHA_BIG_TEST_SIZE 1305 +#ifndef WOLFSSL_SMALL_STACK + byte cipher_big[CHACHA_BIG_TEST_SIZE] = {0}; + byte plain_big[CHACHA_BIG_TEST_SIZE] = {0}; + byte input_big[CHACHA_BIG_TEST_SIZE] = {0}; +#else + byte* cipher_big; + byte* plain_big; + byte* input_big; +#endif /* WOLFSSL_SMALL_STACK */ + int block_size; +#endif /* BENCH_EMBEDDED */ byte a[] = {0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90}; byte b[] = {0x45,0x40,0xf0,0x5a,0x9f,0x1f,0xb2,0x96}; @@ -4384,6 +4508,26 @@ int chacha_test(void) test_chacha[2] = c; test_chacha[3] = d; +#ifndef BENCH_EMBEDDED +#ifdef WOLFSSL_SMALL_STACK + cipher_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (cipher_big == NULL) { + return MEMORY_E; + } + plain_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (plain_big == NULL) { + return MEMORY_E; + } + input_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (input_big == NULL) { + return MEMORY_E; + } + XMEMSET(cipher_big, 0, sizeof(CHACHA_BIG_TEST_SIZE)); + XMEMSET(plain_big, 0, sizeof(CHACHA_BIG_TEST_SIZE)); + XMEMSET(input_big, 0, sizeof(CHACHA_BIG_TEST_SIZE)); +#endif /* WOLFSSL_SMALL_STACK */ +#endif /* BENCH_EMBEDDED */ + for (i = 0; i < times; ++i) { if (i < 3) { keySz = 32; @@ -4444,6 +4588,66 @@ int chacha_test(void) if (XMEMCMP(plain + 64, sliver, 64)) return -4320; +#ifndef BENCH_EMBEDDED + /* test of encrypting more data */ + keySz = 32; + + ret |= wc_Chacha_SetKey(&enc, keys[0], keySz); + ret |= wc_Chacha_SetKey(&dec, keys[0], keySz); + if (ret != 0) + return ret; + + ret |= 
wc_Chacha_SetIV(&enc, ivs[2], 0); + ret |= wc_Chacha_SetIV(&dec, ivs[2], 0); + if (ret != 0) + return ret; + + ret |= wc_Chacha_Process(&enc, cipher_big, plain_big, CHACHA_BIG_TEST_SIZE); + ret |= wc_Chacha_Process(&dec, plain_big, cipher_big, CHACHA_BIG_TEST_SIZE); + if (ret != 0) + return ret; + + if (XMEMCMP(plain_big, input_big, sizeof(input_big))) + return -4330; + + if (XMEMCMP(cipher_big, cipher_big_result, CHACHA_BIG_TEST_SIZE)) + return -4331; + + for (i = 0; i < 18; ++i) { + /* this will test all paths */ + // block sizes: 1 2 3 4 7 8 15 16 31 32 63 64 127 128 255 256 511 512 + block_size = (2 << (i%9)) - (i<9?1:0); + keySz = 32; + + ret |= wc_Chacha_SetKey(&enc, keys[0], keySz); + ret |= wc_Chacha_SetKey(&dec, keys[0], keySz); + if (ret != 0) + return ret; + + ret |= wc_Chacha_SetIV(&enc, ivs[2], 0); + ret |= wc_Chacha_SetIV(&dec, ivs[2], 0); + if (ret != 0) + return ret; + + ret |= wc_Chacha_Process(&enc, cipher_big, plain_big, block_size); + ret |= wc_Chacha_Process(&dec, plain_big, cipher_big, block_size); + if (ret != 0) + return ret; + + if (XMEMCMP(plain_big, input_big, block_size)) + return -4340-i; + + if (XMEMCMP(cipher_big, cipher_big_result, block_size)) + return -4360-i; + } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(cipher_big, NULL, DYNAMIC_TYPE_TMP_BUFFER); + XFREE(plain_big, NULL, DYNAMIC_TYPE_TMP_BUFFER); + XFREE(input_big, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif /* WOLFSSL_SMALL_STACK */ +#endif /* BENCH_EMBEDDED */ + return 0; } #endif /* HAVE_CHACHA */ @@ -5457,10 +5661,17 @@ static int aes_key_size_test(void) word32 keySize; #endif +#if !defined(HAVE_FIPS) || \ + defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) + /* w/ FIPS v1 (cert 2425) wc_AesInit just returns 0 always as it's not + * supported with that FIPS version */ ret = wc_AesInit(NULL, HEAP_HINT, devId); if (ret != BAD_FUNC_ARG) return -4800; +#endif + ret = wc_AesInit(&aes, HEAP_HINT, devId); + /* 0 check OK for FIPSv1 */ if (ret != 0) return -4801; @@ -13094,6 +13305,113 @@ static int openssl_aes_test(void) return -7334; } + /* set buffers to be exact size to catch potential over read/write */ + { + /* EVP_CipherUpdate test */ + const byte cbcPlain[] = + { + 0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96, + 0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a, + 0xae,0x2d,0x8a,0x57,0x1e,0x03,0xac,0x9c, + 0x9e,0xb7,0x6f,0xac,0x45,0xaf,0x8e,0x51, + 0x30,0xc8,0x1c,0x46,0xa3,0x5c,0xe4,0x11, + 0xe5,0xfb,0xc1,0x19,0x1a,0x0a,0x52,0xef, + 0xf6,0x9f,0x24,0x45,0xdf,0x4f,0x9b,0x17, + 0xad,0x2b,0x41,0x7b,0xe6,0x6c,0x37,0x10 + }; + + byte key[] = "0123456789abcdef "; /* align */ + byte iv[] = "1234567890abcdef "; /* align */ + + #define EVP_TEST_BUF_SZ 18 + #define EVP_TEST_BUF_PAD 32 + byte cipher[EVP_TEST_BUF_SZ]; + byte plain [EVP_TEST_BUF_SZ]; + byte padded[EVP_TEST_BUF_PAD]; + EVP_CIPHER_CTX en; + EVP_CIPHER_CTX de; + int outlen ; + int total = 0; + + EVP_CIPHER_CTX_init(&en); + if (EVP_CipherInit(&en, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 1) == 0) + return -7370; + if (EVP_CIPHER_CTX_set_padding(&en, 0) != 1) + return -7372; + if (EVP_CipherUpdate(&en, (byte*)cipher, &outlen, + (byte*)cbcPlain, EVP_TEST_BUF_SZ) == 0) + return -7372; + if (outlen != 16) + return -7373; + total += outlen; + + /* should fail here */ + if (EVP_CipherFinal(&en, (byte*)&cipher[total], &outlen) != 0) + return -7374; + + /* turn padding back on and do successful encrypt */ + total = 0; + EVP_CIPHER_CTX_init(&en); + if (EVP_CipherInit(&en, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 1) == 0) + return -7375; 
+ if (EVP_CIPHER_CTX_set_padding(&en, 1) != 1) + return -7376; + if (EVP_CipherUpdate(&en, (byte*)padded, &outlen, + (byte*)cbcPlain, EVP_TEST_BUF_SZ) == 0) + return -7377; + if (outlen != 16) + return -7378; + total += outlen; + + if (EVP_CipherFinal(&en, (byte*)&padded[total], &outlen) == 0) + return -7379; + total += outlen; + if (total != 32) + return -7380; + XMEMCPY(cipher, padded, EVP_TEST_BUF_SZ); + + /* test out of bounds read on buffers w/o padding during decryption */ + total = 0; + EVP_CIPHER_CTX_init(&de); + if (EVP_CipherInit(&de, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 0) == 0) + return -7381; + + if (EVP_CIPHER_CTX_set_padding(&de, 0) != 1) + return -7382; + if (EVP_CipherUpdate(&de, (byte*)plain, &outlen, (byte*)cipher, + EVP_TEST_BUF_SZ) == 0) + return -7383; + if (outlen != 16) + return -7384; + total += outlen; + + /* should fail since not using padding */ + if (EVP_CipherFinal(&de, (byte*)&plain[total], &outlen) != 0) + return -7385; + + total = 0; + EVP_CIPHER_CTX_init(&de); + if (EVP_CipherInit(&de, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 0) == 0) + return -7386; + if (EVP_CIPHER_CTX_set_padding(&de, 1) != 1) + return -7387; + if (EVP_CipherUpdate(&de, (byte*)padded, &outlen, (byte*)padded, + EVP_TEST_BUF_PAD) == 0) + return -7388; + if (outlen != 16) + return -7389; + total += outlen; + + if (EVP_CipherFinal(&de, (byte*)&padded[total], &outlen) == 0) + return -7390; + if (XMEMCMP(padded, cbcPlain, EVP_TEST_BUF_SZ)) + return -7391; + } + { /* evp_cipher test: EVP_aes_128_cbc */ EVP_CIPHER_CTX ctx; @@ -13774,8 +14092,9 @@ int openssl_test(void) #endif /* NO_DES3 */ #if !defined(NO_AES) && !defined(WOLFCRYPT_ONLY) - if (openssl_aes_test() != 0) + if (openssl_aes_test() != 0) { return -7412; + } #if defined(WOLFSSL_AES_128) && defined(HAVE_AES_CBC) { /* evp_cipher test: EVP_aes_128_cbc */ @@ -15991,6 +16310,19 @@ static int ecc_test_make_pub(WC_RNG* rng) wc_ecc_init_ex(&key, HEAP_HINT, devId); +#ifdef USE_CERT_BUFFERS_256 + tmp = (byte*)XMALLOC((size_t)sizeof_ecc_key_der_256, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); + if (tmp == NULL) { + return -8311; + } + exportBuf = (byte*)XMALLOC((size_t)sizeof_ecc_key_der_256, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); + if (exportBuf == NULL) { + XFREE(tmp, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); + return -8312; + } + XMEMCPY(tmp, ecc_key_der_256, (size_t)sizeof_ecc_key_der_256); + tmpSz = (size_t)sizeof_ecc_key_der_256; +#else tmp = (byte*)XMALLOC(FOURK_BUF, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); if (tmp == NULL) { return -8311; @@ -16000,11 +16332,6 @@ static int ecc_test_make_pub(WC_RNG* rng) XFREE(tmp, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); return -8312; } - -#ifdef USE_CERT_BUFFERS_256 - XMEMCPY(tmp, ecc_key_der_256, (size_t)sizeof_ecc_key_der_256); - tmpSz = (size_t)sizeof_ecc_key_der_256; -#else file = XFOPEN(eccKeyDerFile, "rb"); if (!file) { ERROR_OUT(-8313, done); @@ -23704,7 +24031,8 @@ static int myCryptoDevCb(int devIdArg, wc_CryptoInfo* info, void* ctx) } else if (info->algo_type == WC_ALGO_TYPE_SEED) { #ifndef WC_NO_RNG - static byte seed[] = { 0x00, 0x00, 0x00, 0x01 }; + static byte seed[sizeof(word32)] = { 0x00, 0x00, 0x00, 0x01 }; + word32* seedWord32 = (word32*)seed; word32 len; /* wc_GenerateSeed is a local symbol so we need to fake the entropy. 
*/ @@ -23715,7 +24043,7 @@ static int myCryptoDevCb(int devIdArg, wc_CryptoInfo* info, void* ctx) XMEMCPY(info->seed.seed, seed, sizeof(seed)); info->seed.seed += len; info->seed.sz -= len; - (*((word32*)seed))++; + (*seedWord32)++; } ret = 0; diff --git a/wolfssl/wolfcrypt/port/st/stsafe.h b/wolfssl/wolfcrypt/port/st/stsafe.h index 4a60470db..e7c451d90 100644 --- a/wolfssl/wolfcrypt/port/st/stsafe.h +++ b/wolfssl/wolfcrypt/port/st/stsafe.h @@ -29,6 +29,8 @@ #ifdef WOLFSSL_STSAFEA100 +/* The wolf STSAFE interface layer */ +/* Please contact wolfSSL for the STSAFE port files */ #include "stsafe_interface.h" #ifndef STSAFE_MAX_KEY_LEN @@ -52,11 +54,11 @@ WOLFSSL_API int SSL_STSAFE_VerifyPeerCertCb(WOLFSSL* ssl, const unsigned char* hash, unsigned int hashSz, const unsigned char* keyDer, unsigned int keySz, int* result, void* ctx); -WOLFSSL_API int SSL_STSAFE_SignCertificateCb(WOLFSSL* ssl, +WOLFSSL_API int SSL_STSAFE_SignCertificateCb(WOLFSSL* ssl, const byte* in, word32 inSz, - byte* out, word32* outSz, + byte* out, word32* outSz, const byte* key, word32 keySz, void* ctx); -WOLFSSL_API int SSL_STSAFE_SharedSecretCb(WOLFSSL* ssl, +WOLFSSL_API int SSL_STSAFE_SharedSecretCb(WOLFSSL* ssl, ecc_key* otherKey, unsigned char* pubKeyDer, unsigned int* pubKeySz, unsigned char* out, unsigned int* outlen, @@ -65,7 +67,27 @@ WOLFSSL_API int SSL_STSAFE_SharedSecretCb(WOLFSSL* ssl, /* Helper API's for setting up callbacks */ WOLFSSL_API int SSL_STSAFE_SetupPkCallbacks(WOLFSSL_CTX* ctx); WOLFSSL_API int SSL_STSAFE_SetupPkCallbackCtx(WOLFSSL* ssl, void* user_ctx); +#endif /* HAVE_PK_CALLBACKS */ + + +#ifdef WOLF_CRYPTO_CB + +#include + +/* Device ID that's unique and valid (not INVALID_DEVID -2) */ +#define WOLF_STSAFE_DEVID 0x53545341; /* STSA */ + +typedef struct wolfSTSAFE_CryptoCb_Ctx { +#ifdef HAVE_ECC + ecc_key wolfEccKey; #endif + int devId; +} wolfSTSAFE_CryptoCb_Ctx; + +WOLFSSL_API int wolfSSL_STSAFE_CryptoDevCb(int devId, wc_CryptoInfo* info, + void* ctx); + +#endif /* WOLF_CRYPTO_CB */ #endif /* WOLFSSL_STSAFEA100 */
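The `wolfSSL_STSAFE_CryptoDevCb()` callback and `wolfSTSAFE_CryptoCb_Ctx` context added above are intended to be wired into wolfCrypt's crypto-callback framework. A minimal sketch of how an application might register them, assuming the standard `wc_CryptoCb_RegisterDevice()` / `wolfSSL_CTX_SetDevId()` APIs; the `use_stsafe_crypto_cb()` helper is hypothetical and uses the same device-ID value the header defines:

```
#include <wolfssl/options.h>
#include <wolfssl/ssl.h>
#include <wolfssl/wolfcrypt/cryptocb.h>
#include <wolfssl/wolfcrypt/port/st/stsafe.h>

static wolfSTSAFE_CryptoCb_Ctx gStsafeCtx;

/* Hypothetical helper: route wolfCrypt operations for a TLS context to
 * the STSAFE-A100 through the callback added in this patch. */
static int use_stsafe_crypto_cb(WOLFSSL_CTX* ctx)
{
    int devId = 0x53545341; /* value of WOLF_STSAFE_DEVID ("STSA") */
    int rc;

    XMEMSET(&gStsafeCtx, 0, sizeof(gStsafeCtx));
    gStsafeCtx.devId = devId;

    /* register the device callback with wolfCrypt */
    rc = wc_CryptoCb_RegisterDevice(devId, wolfSSL_STSAFE_CryptoDevCb,
                                    &gStsafeCtx);
    if (rc != 0)
        return rc;

    /* sessions created from this context will use the device */
    wolfSSL_CTX_SetDevId(ctx, devId);
    return 0;
}
```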
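Returning to the `chacha_test()` additions earlier in the patch: the `/* this will test all paths */` loop derives its sizes from `(2 << (i % 9)) - (i < 9 ? 1 : 0)`, which for `i = 0..17` yields exactly the values listed in the comment (1, 2, 3, 4, 7, 8, ..., 511, 512) and so crosses every chunking boundary used by `wc_Chacha_encrypt_bytes()` (64, 128, 256 and, on aarch64, 320 bytes). A standalone snippet that prints the generated sizes:

```
#include <stdio.h>

int main(void)
{
    int i;
    for (i = 0; i < 18; i++) {
        int block_size = (2 << (i % 9)) - (i < 9 ? 1 : 0);
        printf("%d ", block_size);
    }
    /* prints: 1 3 7 15 31 63 127 255 511 2 4 8 16 32 64 128 256 512 */
    printf("\n");
    return 0;
}
```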
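The new exact-size `EVP_CipherUpdate` test also leans on AES-CBC padding arithmetic: with padding disabled, an 18-byte input produces one 16-byte block from `EVP_CipherUpdate` and `EVP_CipherFinal` fails on the 2 leftover bytes, while with PKCS#7 padding enabled the final call emits a second 16-byte block (2 data bytes plus 14 pad bytes), giving the 32-byte total the test checks for. A tiny sketch of that length calculation (the helper is hypothetical):

```
#include <stddef.h>

/* PKCS#7-padded ciphertext length for a 16-byte block cipher such as
 * AES-CBC: padding always adds 1..16 bytes. */
static size_t aes_cbc_padded_len(size_t len)
{
    return (len / 16 + 1) * 16;
}

/* aes_cbc_padded_len(18) == 32, matching the test's expected total. */
```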