Merge pull request #10530 from SparkiDev/riscv_unaligned_fix

RISC-V ASM unaligned read/writes: alternative assembly
2026-07-05 17:00:49 +02:00 · 2026-06-03 16:29:33 -07:00
parent df9f3e4cf9 018e937a91
commit 4cce154024
10 changed files with 699 additions and 170 deletions
@@ -888,6 +888,7 @@ WOLFSSL_RENESAS_RZN2L
 WOLFSSL_RENESAS_TLS
 WOLFSSL_RENESAS_TSIP_IAREWRX
 WOLFSSL_REQUIRE_TCA
+WOLFSSL_RISCV_ASM_NO_UNALIGNED
 WOLFSSL_RNG_USE_FULL_SEED
 WOLFSSL_RSA_CHECK_D_ON_DECRYPT
 WOLFSSL_RSA_DECRYPT_TO_0_LEN
@@ -3859,7 +3859,7 @@ do
    # FSL, FSR, FSRI, CMOV, CMIX - QEMU doesn't know about these instructions
    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BIT_MANIPULATION_TERNARY"
    ;;
-  zkn|zkned)
+  zkned)
    # AES encrypt/decrypt, SHA-2
    ENABLED_RISCV_ASM=yes
    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_SCALAR_CRYPTO_ASM"
@@ -1871,8 +1871,7 @@ int wc_AesSetKey(Aes* aes, const byte* key, word32 keyLen, const byte* iv,
 static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
 {
    __asm__ __volatile__ (
-        "ld          t2, 0(%[in])\n\t"
-        "ld          t3, 8(%[in])\n\t"
+        UNALIGNED_LD2(t2, t3, 0, %[in], t0)
        "ld          a3, 0(%[key])\n\t"
        "ld          a4, 8(%[key])\n\t"
        "ld          a5, 16(%[key])\n\t"
@@ -1897,8 +1896,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
        AESENC_2_ROUNDS(208, 216, 224, 232)
      "L_aes_encrypt_done:\n\t"
        AESENC_LAST_ROUND()
-        "sd          t2, 0(%[out])\n\t"
-        "sd          t3, 8(%[out])\n\t"
+        UNALIGNED_SD2(t2, t3, 0, %[out], t0)
        :
        : [in] "r" (in), [out] "r" (out), [key] "r" (aes->key),
          [rounds] "r" (aes->rounds)
@@ -1918,8 +1916,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
 static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
 {
    __asm__ __volatile__ (
-        "ld          t2, 0(%[in])\n\t"
-        "ld          t3, 8(%[in])\n\t"
+        UNALIGNED_LD2(t2, t3, 0, %[in], t0)
        "ld          a3, 0(%[key])\n\t"
        "ld          a4, 8(%[key])\n\t"
        "ld          a5, 16(%[key])\n\t"
@@ -1944,8 +1941,7 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
        AESDEC_2_ROUNDS(208, 216, 224, 232)
      "L_aes_decrypt_done:\n\t"
        AESDEC_LAST_ROUND()
-        "sd          t2, 0(%[out])\n\t"
-        "sd          t3, 8(%[out])\n\t"
+        UNALIGNED_SD2(t2, t3, 0, %[out], t0)
        :
        : [in] "r" (in), [out] "r" (out), [key] "r" (aes->key),
          [rounds] "r" (aes->rounds)
@@ -3209,8 +3205,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
        LOAD_WORD_REV(t2, 8, %[in])
        LOAD_WORD_REV(t3, 12, %[in])
 #else
-        "ld         t1,  0(%[in])\n\t"
-        "ld         t3,  8(%[in])\n\t"
+        UNALIGNED_LD2(t1, t3, 0, %[in], t0)
        REV8(REG_T1, REG_T1)
        REV8(REG_T3, REG_T3)
        "srli       t0, t1, 32\n\t"
@@ -3376,16 +3371,14 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
        REV8(REG_T1, REG_T1)
        REV8(REG_T3, REG_T3)
        /* Write encrypted block to output. */
-        "sd         t1,  0(%[out])\n\t"
-        "sd         t3,  8(%[out])\n\t"
+        UNALIGNED_SD2(t1, t3, 0, %[out], t0)
 #else
        PACK(REG_T1, REG_A5, REG_A4)
        PACK(REG_T3, REG_A7, REG_A6)
        REV8(REG_T1, REG_T1)
        REV8(REG_T3, REG_T3)
        /* Write encrypted block to output. */
-        "sd         t1,  0(%[out])\n\t"
-        "sd         t3,  8(%[out])\n\t"
+        UNALIGNED_SD2(t1, t3, 0, %[out], t0)
 #endif

        :
@@ -3641,8 +3634,7 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
        LOAD_WORD_REV(t2, 8, %[in])
        LOAD_WORD_REV(t3, 12, %[in])
 #else
-        "ld         t1,  0(%[in])\n\t"
-        "ld         t3,  8(%[in])\n\t"
+        UNALIGNED_LD2(t1, t3, 0, %[in], t0)
        REV8(REG_T1, REG_T1)
        REV8(REG_T3, REG_T3)
        "srli       t0, t1, 32\n\t"
@@ -3793,16 +3785,14 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
        REV8(REG_T1, REG_T1)
        REV8(REG_T3, REG_T3)
        /* Write decrypted block to output. */
-        "sd         t1,  0(%[out])\n\t"
-        "sd         t3,  8(%[out])\n\t"
+        UNALIGNED_SD2(t1, t3, 0, %[out], t0)
 #else
        PACK(REG_T1, REG_A5, REG_A4)
        PACK(REG_T3, REG_A7, REG_A6)
        REV8(REG_T1, REG_T1)
        REV8(REG_T3, REG_T3)
        /* Write decrypted block to output. */
-        "sd         t1,  0(%[out])\n\t"
-        "sd         t3,  8(%[out])\n\t"
+        UNALIGNED_SD2(t1, t3, 0, %[out], t0)
 #endif

        :
@@ -4113,7 +4103,7 @@ static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
 */
 int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
 {
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    word32 processed;
    int ret = 0;

@@ -4563,8 +4553,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
    byte* s, word32 sSz)
 {
    if (gcm != NULL) {
-        byte x[WC_AES_BLOCK_SIZE];
-        byte scratch[WC_AES_BLOCK_SIZE];
+        ALIGN8 byte x[WC_AES_BLOCK_SIZE];
+        ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
        byte* h = gcm->H;

        __asm__ __volatile__ (
@@ -4896,8 +4886,8 @@ static void GMULT(byte* x, byte* y)
 void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
    byte* s, word32 sSz)
 {
-    byte x[WC_AES_BLOCK_SIZE];
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte x[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    word32 blocks, partial;
    byte* h;

@@ -5163,8 +5153,7 @@ static void ghash_blocks(byte* x, byte* y, const byte* in, word32 blocks)

    "L_ghash_loop:\n\t"
        /* Load input block. */
-        "ld          t5, 0(%[in])\n\t"
-        "ld          a5, 8(%[in])\n\t"
+        UNALIGNED_LD2(t5, a5, 0, %[in], t4)
        /* Reverse bits to match x. */
 #ifdef WOLFSSL_RISCV_BIT_MANIPULATION
        BREV8(REG_T5, REG_T5)
@@ -5307,8 +5296,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
    byte* s, word32 sSz)
 {
    if (gcm != NULL) {
-        byte x[WC_AES_BLOCK_SIZE];
-        byte scratch[WC_AES_BLOCK_SIZE];
+        ALIGN8 byte x[WC_AES_BLOCK_SIZE];
+        ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
        word32 blocks, partial;
        byte* h = gcm->H;

@@ -5388,8 +5377,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz,
    const byte* aad, word32 aadSz)
 {
-    byte counter[WC_AES_BLOCK_SIZE];
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    /* Noticed different optimization levels treated head of array different.
     * Some cases was stack pointer plus offset others was a register containing
     * address. To make uniform for passing in to inline assembly code am using
@@ -5886,8 +5875,8 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz,
    const byte* aad, word32 aadSz)
 {
-    byte counter[WC_AES_BLOCK_SIZE];
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    /* Noticed different optimization levels treated head of array different.
     * Some cases was stack pointer plus offset others was a register containing
     * address. To make uniform for passing in to inline assembly code am using
@@ -6398,8 +6387,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz,
    const byte* aad, word32 aadSz)
 {
-    byte counter[WC_AES_BLOCK_SIZE];
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    /* Noticed different optimization levels treated head of array different.
     * Some cases was stack pointer plus offset others was a register containing
     * address. To make uniform for passing in to inline assembly code am using
@@ -7003,8 +6992,8 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    const byte* aad, word32 aadSz)
 {
    int ret = 0;
-    byte counter[WC_AES_BLOCK_SIZE];
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    /* Noticed different optimization levels treated head of array different.
     * Some cases was stack pointer plus offset others was a register containing
     * address. To make uniform for passing in to inline assembly code am using
@@ -7512,8 +7501,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    const byte* aad, word32 aadSz)
 {
    int ret = 0;
-    byte counter[WC_AES_BLOCK_SIZE];
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    /* Noticed different optimization levels treated head of array different.
     * Some cases was stack pointer plus offset others was a register containing
     * address. To make uniform for passing in to inline assembly code am using
@@ -8035,8 +8024,8 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    const byte* aad, word32 aadSz)
 {
    int ret = 0;
-    byte counter[WC_AES_BLOCK_SIZE];
-    byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
    /* Noticed different optimization levels treated head of array different.
     * Some cases was stack pointer plus offset others was a register containing
     * address. To make uniform for passing in to inline assembly code am using
@@ -8733,8 +8722,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
    byte* s, word32 sSz)
 {
    if (gcm != NULL) {
-        byte x[WC_AES_BLOCK_SIZE];
-        byte scratch[WC_AES_BLOCK_SIZE];
+        ALIGN8 byte x[WC_AES_BLOCK_SIZE];
+        ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
        word32 blocks, partial;

        XMEMSET(x, 0, WC_AES_BLOCK_SIZE);
@@ -8834,9 +8823,9 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    word32 partial = sz % WC_AES_BLOCK_SIZE;
    const byte* p = in;
    byte* c = out;
-    ALIGN16 byte counter[WC_AES_BLOCK_SIZE];
-    ALIGN16 byte initialCounter[WC_AES_BLOCK_SIZE];
-    ALIGN16 byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte initialCounter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];

    /* Validate parameters. */
    if ((aes == NULL) || (nonce == NULL) || (nonceSz == 0) || (tag == NULL) ||
@@ -8934,10 +8923,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
    word32 partial = sz % WC_AES_BLOCK_SIZE;
    const byte* c = in;
    byte* p = out;
-    ALIGN16 byte counter[WC_AES_BLOCK_SIZE];
-    ALIGN16 byte scratch[WC_AES_BLOCK_SIZE];
-    ALIGN16 byte Tprime[WC_AES_BLOCK_SIZE];
-    ALIGN16 byte EKY0[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte Tprime[WC_AES_BLOCK_SIZE];
+    ALIGN8 byte EKY0[WC_AES_BLOCK_SIZE];
    sword32 res;

    /* Validate parameters. */
@@ -1825,9 +1825,9 @@ static WC_INLINE void wc_chacha_encrypt_64(const word32* input, const byte* m,
        VSETIVLI(REG_X0, 2, 1, 1, 0b011, 0b000)
        VMV_X_S(REG_T0, REG_V0)
        VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000)
-        "ld     t1, (%[m])\n\t"
+        UNALIGNED_LD(t1, 0, %[m], t2)
        "xor    t1, t1, t0\n\t"
-        "sd     t1, (%[c])\n\t"
+        UNALIGNED_SD(t1, 0, %[c], t2)
        "addi   %[bytes], %[bytes], -8\n\t"
        "addi   %[c], %[c], 8\n\t"
        "addi   %[m], %[m], 8\n\t"
@@ -2155,10 +2155,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
        "bltz   %[bytes], L_chacha20_riscv_over\n\t"

 #if !defined(WOLFSSL_RISCV_BIT_MANIPULATION)
-        "ld     t0, 0(%[m])\n\t"
-        "ld     t1, 8(%[m])\n\t"
-        "ld     t2, 16(%[m])\n\t"
-        "ld     s1, 24(%[m])\n\t"
+        UNALIGNED_LD4(t0, t1, t2, s1, 0, %[m], a3)
        "xor    a4, a4, t0\n\t"
        "xor    a6, a6, t1\n\t"
        "xor    t3, t3, t2\n\t"
@@ -2171,10 +2168,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
        "xor    a7, a7, t1\n\t"
        "xor    t4, t4, t2\n\t"
        "xor    t6, t6, s1\n\t"
-        "ld     t0, 32(%[m])\n\t"
-        "ld     t1, 40(%[m])\n\t"
-        "ld     t2, 48(%[m])\n\t"
-        "ld     s1, 56(%[m])\n\t"
+        UNALIGNED_LD4(t0, t1, t2, s1, 32, %[m], a3)
        "xor    s2, s2, t0\n\t"
        "xor    s4, s4, t1\n\t"
        "xor    s6, s6, t2\n\t"
@@ -2187,22 +2181,8 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
        "xor    s5, s5, t1\n\t"
        "xor    s7, s7, t2\n\t"
        "xor    s9, s9, s1\n\t"
-        "sw     a4, 0(%[c])\n\t"
-        "sw     a5, 4(%[c])\n\t"
-        "sw     a6, 8(%[c])\n\t"
-        "sw     a7, 12(%[c])\n\t"
-        "sw     t3, 16(%[c])\n\t"
-        "sw     t4, 20(%[c])\n\t"
-        "sw     t5, 24(%[c])\n\t"
-        "sw     t6, 28(%[c])\n\t"
-        "sw     s2, 32(%[c])\n\t"
-        "sw     s3, 36(%[c])\n\t"
-        "sw     s4, 40(%[c])\n\t"
-        "sw     s5, 44(%[c])\n\t"
-        "sw     s6, 48(%[c])\n\t"
-        "sw     s7, 52(%[c])\n\t"
-        "sw     s8, 56(%[c])\n\t"
-        "sw     s9, 60(%[c])\n\t"
+        UNALIGNED_SW8(a4, a5, a6, a7, t3, t4, t5, t6, 0, %[c], t0)
+        UNALIGNED_SW8(s2, s3, s4, s5, s6, s7, s8, s9, 32, %[c], t0)
 #else
        PACK(REG_A4, REG_A4, REG_A5)
        PACK(REG_A6, REG_A6, REG_A7)
@@ -2212,14 +2192,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
        PACK(REG_S4, REG_S4, REG_S5)
        PACK(REG_S6, REG_S6, REG_S7)
        PACK(REG_S8, REG_S8, REG_S9)
-        "ld     a5, 0(%[m])\n\t"
-        "ld     a7, 8(%[m])\n\t"
-        "ld     t4, 16(%[m])\n\t"
-        "ld     t6, 24(%[m])\n\t"
-        "ld     s3, 32(%[m])\n\t"
-        "ld     s5, 40(%[m])\n\t"
-        "ld     s7, 48(%[m])\n\t"
-        "ld     s9, 56(%[m])\n\t"
+        UNALIGNED_LD8(a5, a7, t4, t6, s3, s5, s7, s9, 0, %[m], t0)
        "xor    a4, a4, a5\n\t"
        "xor    a6, a6, a7\n\t"
        "xor    t3, t3, t4\n\t"
@@ -2228,14 +2201,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
        "xor    s4, s4, s5\n\t"
        "xor    s6, s6, s7\n\t"
        "xor    s8, s8, s9\n\t"
-        "sd     a4, 0(%[c])\n\t"
-        "sd     a6, 8(%[c])\n\t"
-        "sd     t3, 16(%[c])\n\t"
-        "sd     t5, 24(%[c])\n\t"
-        "sd     s2, 32(%[c])\n\t"
-        "sd     s4, 40(%[c])\n\t"
-        "sd     s6, 48(%[c])\n\t"
-        "sd     s8, 56(%[c])\n\t"
+        UNALIGNED_SD8(a4, a6, t3, t5, s2, s4, s6, s8, 0, %[c], t0)
 #endif

        "addi   %[m], %[m], 64\n\t"
@@ -2268,10 +2234,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
        "bltz   t0, L_chacha20_riscv_32bit\n\t"
        "addi   a3, a3, -1\n\t"
     "L_chacha20_riscv_64bit_loop:\n\t"
-        "ld     t0, (%[m])\n\t"
+        UNALIGNED_LD(t0, 0, %[m], t2)
        "ld     t1, (%[over])\n\t"
        "xor    t0, t0, t1\n\t"
-        "sd     t0, (%[c])\n\t"
+        UNALIGNED_SD(t0, 0, %[c], t2)
        "addi   %[m], %[m], 8\n\t"
        "addi   %[c], %[c], 8\n\t"
        "addi   %[over], %[over], 8\n\t"
@@ -2282,10 +2248,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
     "L_chacha20_riscv_32bit:\n\t"
        "addi   t0, a3, -4\n\t"
        "bltz   t0, L_chacha20_riscv_16bit\n\t"
-        "lw     t0, (%[m])\n\t"
+        UNALIGNED_LW(t0, 0, %[m], t2)
        "lw     t1, (%[over])\n\t"
        "xor    t0, t0, t1\n\t"
-        "sw     t0, (%[c])\n\t"
+        UNALIGNED_SW(t0, 0, %[c], t2)
        "addi   %[m], %[m], 4\n\t"
        "addi   %[c], %[c], 4\n\t"
        "addi   %[over], %[over], 4\n\t"
@@ -2293,10 +2259,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
     "L_chacha20_riscv_16bit:\n\t"
        "addi   t0, a3, -2\n\t"
        "bltz   t0, L_chacha20_riscv_8bit\n\t"
-        "lh     t0, (%[m])\n\t"
+        UNALIGNED_LH(t0, 0, %[m], t2)
        "lh     t1, (%[over])\n\t"
        "xor    t0, t0, t1\n\t"
-        "sh     t0, (%[c])\n\t"
+        UNALIGNED_SH(t0, 0, %[c], t2)
        "addi   %[m], %[m], 2\n\t"
        "addi   %[c], %[c], 2\n\t"
        "addi   %[over], %[over], 2\n\t"
@@ -145,8 +145,7 @@ static WC_INLINE void poly1305_blocks_riscv64_16(Poly1305* ctx,

    "L_poly1305_riscv64_16_64_loop_%=:\n\t"
        /* Load m */
-        "ld     t0, (%[m])\n\t"
-        "ld     t1, 8(%[m])\n\t"
+        UNALIGNED_LD2(t0, t1, 0, %[m], t5)
        /* Split m into 26, 52, 52 */
        SPLIT_130(t2, t3, t4, t0, t1, %[notLast], t5)

@@ -285,8 +284,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m,

    "L_poly1305_riscv64_vec_loop_%=:\n\t"
        /* m0 + nfin */
-        "ld     t0, 0(%[m])\n\t"
-        "ld     t1, 8(%[m])\n\t"
+        UNALIGNED_LD2(t0, t1, 0, %[m], t5)
        "li     t6, 1\n\t"
        /* Split m into 24, 52, 52 */
        SPLIT_130(t2, t3, t4, t0, t1, t6, t5)
@@ -294,8 +292,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m,
        VMV_S_X(REG_V12, REG_T3)
        VMV_S_X(REG_V13, REG_T4)
        /* m1+ nfin */
-        "ld     t0, 16(%[m])\n\t"
-        "ld     t1, 24(%[m])\n\t"
+        UNALIGNED_LD2(t0, t1, 16, %[m], t5)
        /* Split m into 24, 52, 52 */
        SPLIT_130(t2, t3, t4, t0, t1, t6, t5)
        VMV_S_X(REG_V14, REG_T2)
@@ -464,10 +461,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)

    __asm__ __volatile__ (
        /* Load key material */
-        "ld     t0, 0(%[key])\n\t"
-        "ld     t1, 8(%[key])\n\t"
-        "ld     t2, 16(%[key])\n\t"
-        "ld     t3, 24(%[key])\n\t"
+        UNALIGNED_LD4(t0, t1, t2, t3, 0, %[key], t4)
        /* Load clamp */
        "ld     t4, 0(%[clamp])\n\t"
        "ld     t5, 8(%[clamp])\n\t"
@@ -636,8 +630,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
        "sltu   t3, t1, t3\n\t"
        "add    t2, t2, t3\n\t"
        "andi   t2, t2, 3\n\t"
-        "sd     t0, 0(%[mac])\n\t"
-        "sd     t1, 8(%[mac])\n\t"
+        UNALIGNED_SD2(t0, t1, 0, %[mac], t2)
        /* Zero out h. */
        "sd     x0, %[ctx_h_0]\n\t"
        "sd     x0, %[ctx_h_1]\n\t"
@@ -484,14 +484,7 @@ static WC_OMIT_FRAME_POINTER WC_INLINE void Sha256Transform(wc_Sha256* sha256,
        LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7)
        LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7)
 #else
-        "lwu    a4, 0(%[data])\n\t"
-        "lwu    s0, 4(%[data])\n\t"
-        "lwu    a5, 8(%[data])\n\t"
-        "lwu    s1, 12(%[data])\n\t"
-        "lwu    a6, 16(%[data])\n\t"
-        "lwu    s2, 20(%[data])\n\t"
-        "lwu    a7, 24(%[data])\n\t"
-        "lwu    s3, 28(%[data])\n\t"
+        UNALIGNED_LWU8(a4, s0, a5, s1, a6, s2, a7, s3, 0, %[data], t4)
        PACK_BB(s0, s0, a4, REG_S0, REG_S0, REG_A4)
        PACK_BB(s1, s1, a5, REG_S1, REG_S1, REG_A5)
        PACK_BB(s2, s2, a6, REG_S2, REG_S2, REG_A6)
@@ -500,14 +493,7 @@ static WC_OMIT_FRAME_POINTER WC_INLINE void Sha256Transform(wc_Sha256* sha256,
        REV8(REG_S1, REG_S1)
        REV8(REG_S2, REG_S2)
        REV8(REG_S3, REG_S3)
-        "lwu    a4, 32(%[data])\n\t"
-        "lwu    s4, 36(%[data])\n\t"
-        "lwu    a5, 40(%[data])\n\t"
-        "lwu    s5, 44(%[data])\n\t"
-        "lwu    a6, 48(%[data])\n\t"
-        "lwu    s6, 52(%[data])\n\t"
-        "lwu    a7, 56(%[data])\n\t"
-        "lwu    s7, 60(%[data])\n\t"
+        UNALIGNED_LWU8(a4, s4, a5, s5, a6, s6, a7, s7, 32, %[data], t4)
        PACK_BB(s4, s4, a4, REG_S4, REG_S4, REG_A4)
        PACK_BB(s5, s5, a5, REG_S5, REG_S5, REG_A5)
        PACK_BB(s6, s6, a6, REG_S6, REG_S6, REG_A6)
@@ -840,31 +826,18 @@ static WC_INLINE void Sha256Final(wc_Sha256* sha256, byte* hash)
        "srli   t2, t3, 32\n\t"
        "srli   a4, a5, 32\n\t"
        "srli   a6, a7, 32\n\t"
-        "sw     t0, 0(%[hash])\n\t"
-        "sw     t1, 4(%[hash])\n\t"
-        "sw     t2, 8(%[hash])\n\t"
-        "sw     t3, 12(%[hash])\n\t"
-        "sw     a4, 16(%[hash])\n\t"
-        "sw     a5, 20(%[hash])\n\t"
-        "sw     a6, 24(%[hash])\n\t"
-        "sw     a7, 28(%[hash])\n\t"
+        UNALIGNED_SW8(t0, t1, t2, t3, a4, a5, a6, a7, 0, %[hash], t4)
 #else
        LOAD_WORD_REV(t0, 0, %[digest], t2, t3, t4)
        LOAD_WORD_REV(t1, 4, %[digest], t2, t3, t4)
        LOAD_WORD_REV(a4, 8, %[digest], t2, t3, t4)
        LOAD_WORD_REV(a5, 12, %[digest], t2, t3, t4)
-        "sw     t0, 0(%[hash])\n\t"
-        "sw     t1, 4(%[hash])\n\t"
-        "sw     a4, 8(%[hash])\n\t"
-        "sw     a5, 12(%[hash])\n\t"
+        UNALIGNED_SW4(t0, t1, a4, a5, 0, %[hash], t2)
        LOAD_WORD_REV(t0, 16, %[digest], t2, t3, t4)
        LOAD_WORD_REV(t1, 20, %[digest], t2, t3, t4)
        LOAD_WORD_REV(a4, 24, %[digest], t2, t3, t4)
        LOAD_WORD_REV(a5, 28, %[digest], t2, t3, t4)
-        "sw     t0, 16(%[hash])\n\t"
-        "sw     t1, 20(%[hash])\n\t"
-        "sw     a4, 24(%[hash])\n\t"
-        "sw     a5, 28(%[hash])\n\t"
+        UNALIGNED_SW4(t0, t1, a4, a5, 16, %[hash], t2)
 #endif
        :
        : [digest] "r" (sha256->digest), [hash] "r" (hash)
@@ -139,7 +139,7 @@ static const word64 hash_keccak_r[24] =

 #endif

-void BlockSha3(word64* s)
+WC_OMIT_FRAME_POINTER void BlockSha3(word64* s)
 {
    const word64* r = hash_keccak_r;

@@ -554,14 +554,7 @@ static WC_INLINE void Sha512Transform(wc_Sha512* sha512, const byte* data,
        LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7)
        LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7)
 #else
-        "ld     t4,  0(%[data])\n\t"
-        "ld     s1,  8(%[data])\n\t"
-        "ld     s2, 16(%[data])\n\t"
-        "ld     s3, 24(%[data])\n\t"
-        "ld     s4, 32(%[data])\n\t"
-        "ld     s5, 40(%[data])\n\t"
-        "ld     s6, 48(%[data])\n\t"
-        "ld     s7, 56(%[data])\n\t"
+        UNALIGNED_LD8(t4, s1, s2, s3, s4, s5, s6, s7, 0, %[data], t5)
        REV8(REG_T4, REG_T4)
        REV8(REG_S1, REG_S1)
        REV8(REG_S2, REG_S2)
@@ -946,14 +939,7 @@ static WC_INLINE void Sha512Final(wc_Sha512* sha512, byte* hash, int hashLen)
        REV8(REG_S9, REG_S9)
        REV8(REG_S10, REG_S10)
        REV8(REG_S11, REG_S11)
-        "sd     t0, 0(%[hash])\n\t"
-        "sd     t1, 8(%[hash])\n\t"
-        "sd     t2, 16(%[hash])\n\t"
-        "sd     t3, 24(%[hash])\n\t"
-        "sd     s8, 32(%[hash])\n\t"
-        "sd     s9, 40(%[hash])\n\t"
-        "sd     s10, 48(%[hash])\n\t"
-        "sd     s11, 56(%[hash])\n\t"
+        UNALIGNED_SD8(t0, t1, t2, t3, s8, s9, s10, s11, 0, %[hash], t4)
 #else
        LOAD_DWORD_REV(t0,  0, %[digest], a4, a5, a6, a7)
        LOAD_DWORD_REV(t1,  8, %[digest], a4, a5, a6, a7)
@@ -963,14 +949,7 @@ static WC_INLINE void Sha512Final(wc_Sha512* sha512, byte* hash, int hashLen)
        LOAD_DWORD_REV(s9,  40, %[digest], a4, a5, a6, a7)
        LOAD_DWORD_REV(s10,  48, %[digest], a4, a5, a6, a7)
        LOAD_DWORD_REV(s11,  56, %[digest], a4, a5, a6, a7)
-        "sd     t0, 0(%[hash])\n\t"
-        "sd     t1, 8(%[hash])\n\t"
-        "sd     t2, 16(%[hash])\n\t"
-        "sd     t3, 24(%[hash])\n\t"
-        "sd     s8, 32(%[hash])\n\t"
-        "sd     s9, 40(%[hash])\n\t"
-        "sd     s10, 48(%[hash])\n\t"
-        "sd     s11, 56(%[hash])\n\t"
+        UNALIGNED_SD8(t0, t1, t2, t3, s8, s9, s10, s11, 0, %[hash], t4)
 #endif
        :
        : [digest] "r" (sha512->digest), [hash] "r" (hashRes)
@@ -86,9 +86,10 @@ typedef struct ChaCha {
    byte extra[12];
 #endif
    word32 left;                            /* number of bytes leftover */
-#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(USE_ARM_CHACHA_SPEEDUP) || \
-    defined(WOLFSSL_RISCV_ASM)
+#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(USE_ARM_CHACHA_SPEEDUP)
    word32 over[CHACHA_CHUNK_WORDS];
+#elif defined(WOLFSSL_RISCV_ASM)
+    ALIGN8 word32 over[CHACHA_CHUNK_WORDS];
 #endif
 } ChaCha;

@@ -181,6 +181,633 @@
 /* 32-bit width when loading. */
 #define WIDTH_32  0b110

+/*
+ * Scalar load/store helpers.
+ *
+ * Each macro performs the same operation as the ld/lwu/lw/lh/sd/sw/sh
+ * instruction it is named after. By default it expands to that native
+ * instruction. When built with WOLFSSL_RISCV_ASM_NO_UNALIGNED - for cores that
+ * don't support misaligned access - it checks the effective address alignment
+ * at run time and dispatches to the widest supported sequence: word-wise
+ * (lwu/sw) when 4-byte aligned, half-wise (lhu/sh) when 2-byte aligned,
+ * otherwise byte-wise (lbu/sb). The narrower _BY_BYTE / _BY_HALF / _BY_WORD
+ * forms are also exposed for sites that already know the alignment. Values
+ * are little-endian, matching the native instructions either way.
+ *
+ * Bulk variants UNALIGNED_<LD|SD|LWU|LW|SW><N> (N = 2, 4, 8) issue N
+ * consecutive accesses starting at o(p) - stride 8 bytes for LD/SD, 4 bytes
+ * for LWU/LW/SW. Under WOLFSSL_RISCV_ASM_NO_UNALIGNED they share a single
+ * alignment check across all N elements; otherwise they are just N native
+ * instructions back-to-back.
+ *
+ *   r = data register: destination (loads) or source (stores)
+ *   o = constant byte offset
+ *   p = base address register
+ *   t = scratch register - must differ from r and p; clobbered. For stores the
+ *       data register r is preserved. Only used when
+ *       WOLFSSL_RISCV_ASM_NO_UNALIGNED is defined; ignored otherwise.
+ */
+
+/* Apply X to N doublewords at 8-byte stride starting at o(p). */
+#define UNALIGNED_DW_REP2(X, r0, r1, o, p, t)                           \
+    X(r0, o,    p, t)                                                   \
+    X(r1, o+8,  p, t)
+#define UNALIGNED_DW_REP4(X, r0, r1, r2, r3, o, p, t)                   \
+    X(r0, o,    p, t)                                                   \
+    X(r1, o+8,  p, t)                                                   \
+    X(r2, o+16, p, t)                                                   \
+    X(r3, o+24, p, t)
+#define UNALIGNED_DW_REP8(X, r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)   \
+    X(r0, o,    p, t)                                                   \
+    X(r1, o+8,  p, t)                                                   \
+    X(r2, o+16, p, t)                                                   \
+    X(r3, o+24, p, t)                                                   \
+    X(r4, o+32, p, t)                                                   \
+    X(r5, o+40, p, t)                                                   \
+    X(r6, o+48, p, t)                                                   \
+    X(r7, o+56, p, t)
+
+/* Apply X to N words at 4-byte stride starting at o(p). */
+#define UNALIGNED_W_REP2(X, r0, r1, o, p, t)                            \
+    X(r0, o,    p, t)                                                   \
+    X(r1, o+4,  p, t)
+#define UNALIGNED_W_REP4(X, r0, r1, r2, r3, o, p, t)                    \
+    X(r0, o,    p, t)                                                   \
+    X(r1, o+4,  p, t)                                                   \
+    X(r2, o+8,  p, t)                                                   \
+    X(r3, o+12, p, t)
+#define UNALIGNED_W_REP8(X, r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)    \
+    X(r0, o,    p, t)                                                   \
+    X(r1, o+4,  p, t)                                                   \
+    X(r2, o+8,  p, t)                                                   \
+    X(r3, o+12, p, t)                                                   \
+    X(r4, o+16, p, t)                                                   \
+    X(r5, o+20, p, t)                                                   \
+    X(r6, o+24, p, t)                                                   \
+    X(r7, o+28, p, t)
+
+#ifndef WOLFSSL_RISCV_ASM_NO_UNALIGNED
+
+/* Load 64-bits. */
+#define UNALIGNED_LD(r, o, p, t)                \
+    "ld     " #r ", " #o "(" #p ")\n\t"
+
+/* Load 32-bits, zero extended. */
+#define UNALIGNED_LWU(r, o, p, t)               \
+    "lwu    " #r ", " #o "(" #p ")\n\t"
+
+/* Load 32-bits, sign extended. */
+#define UNALIGNED_LW(r, o, p, t)                \
+    "lw     " #r ", " #o "(" #p ")\n\t"
+
+/* Load 16-bits, sign extended. */
+#define UNALIGNED_LH(r, o, p, t)                \
+    "lh     " #r ", " #o "(" #p ")\n\t"
+
+/* Store 64-bits. */
+#define UNALIGNED_SD(r, o, p, t)                \
+    "sd     " #r ", " #o "(" #p ")\n\t"
+
+/* Store 32-bits. */
+#define UNALIGNED_SW(r, o, p, t)                \
+    "sw     " #r ", " #o "(" #p ")\n\t"
+
+/* Store 16-bits. */
+#define UNALIGNED_SH(r, o, p, t)                \
+    "sh     " #r ", " #o "(" #p ")\n\t"
+
+/* Bulk variants - hardware handles unaligned access, so just emit N native
+ * instructions. */
+#define UNALIGNED_LD2(r0, r1, o, p, t)                              \
+    UNALIGNED_DW_REP2(UNALIGNED_LD, r0, r1, o, p, t)
+#define UNALIGNED_LD4(r0, r1, r2, r3, o, p, t)                      \
+    UNALIGNED_DW_REP4(UNALIGNED_LD, r0, r1, r2, r3, o, p, t)
+#define UNALIGNED_LD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)      \
+    UNALIGNED_DW_REP8(UNALIGNED_LD,                                 \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
+#define UNALIGNED_SD2(r0, r1, o, p, t)                              \
+    UNALIGNED_DW_REP2(UNALIGNED_SD, r0, r1, o, p, t)
+#define UNALIGNED_SD4(r0, r1, r2, r3, o, p, t)                      \
+    UNALIGNED_DW_REP4(UNALIGNED_SD, r0, r1, r2, r3, o, p, t)
+#define UNALIGNED_SD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)      \
+    UNALIGNED_DW_REP8(UNALIGNED_SD,                                 \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
+#define UNALIGNED_LWU2(r0, r1, o, p, t)                             \
+    UNALIGNED_W_REP2(UNALIGNED_LWU, r0, r1, o, p, t)
+#define UNALIGNED_LWU4(r0, r1, r2, r3, o, p, t)                     \
+    UNALIGNED_W_REP4(UNALIGNED_LWU, r0, r1, r2, r3, o, p, t)
+#define UNALIGNED_LWU8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)     \
+    UNALIGNED_W_REP8(UNALIGNED_LWU,                                 \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
+#define UNALIGNED_LW2(r0, r1, o, p, t)                              \
+    UNALIGNED_W_REP2(UNALIGNED_LW, r0, r1, o, p, t)
+#define UNALIGNED_LW4(r0, r1, r2, r3, o, p, t)                      \
+    UNALIGNED_W_REP4(UNALIGNED_LW, r0, r1, r2, r3, o, p, t)
+#define UNALIGNED_LW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)      \
+    UNALIGNED_W_REP8(UNALIGNED_LW,                                  \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
+#define UNALIGNED_SW2(r0, r1, o, p, t)                              \
+    UNALIGNED_W_REP2(UNALIGNED_SW, r0, r1, o, p, t)
+#define UNALIGNED_SW4(r0, r1, r2, r3, o, p, t)                      \
+    UNALIGNED_W_REP4(UNALIGNED_SW, r0, r1, r2, r3, o, p, t)
+#define UNALIGNED_SW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)      \
+    UNALIGNED_W_REP8(UNALIGNED_SW,                                  \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
+
+#else
+
+/* Load 64-bits.
+ *
+ * r: destination register, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * Each BY_* variant builds the value from pieces of one width. The piece at
+ * the lowest address ends up in the least significant bits of r.
+ *   UNALIGNED_LD_BY_BYTE   8 lbu loads merged with slli/or.
+ *   UNALIGNED_LD_BY_HALF   4 lhu loads merged with slli/or.
+ *   UNALIGNED_LD_BY_WORD   2 lwu loads merged with slli/or.
+ *   UNALIGNED_LD_BY_DWORD  a single ld; t is not used.
+ *
+ * In the multi-load variants r is written first and then merged with pieces
+ * read through p into t, so r, p and t must be three different registers.
+ */
+#define UNALIGNED_LD_BY_BYTE(r, o, p, t)        \
+    "lbu    " #r ", " #o "+0(" #p ")\n\t"       \
+    "lbu    " #t ", " #o "+1(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 8\n\t"              \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+2(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 16\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+3(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 24\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+4(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 32\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+5(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 40\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+6(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 48\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+7(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 56\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"
+#define UNALIGNED_LD_BY_HALF(r, o, p, t)        \
+    "lhu    " #r ", " #o "+0(" #p ")\n\t"       \
+    "lhu    " #t ", " #o "+2(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 16\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lhu    " #t ", " #o "+4(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 32\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lhu    " #t ", " #o "+6(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 48\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"
+#define UNALIGNED_LD_BY_WORD(r, o, p, t)        \
+    "lwu    " #r ", " #o "+0(" #p ")\n\t"       \
+    "lwu    " #t ", " #o "+4(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 32\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"
+#define UNALIGNED_LD_BY_DWORD(r, o, p, t)       \
+    "ld     " #r ", " #o "(" #p ")\n\t"
+/* Assumes o is a multiple of 8.
+ *
+ * Tests the low three bits of p and picks the widest piece size that the
+ * alignment of p allows: ld when p is 8-byte aligned, otherwise lwu pieces
+ * (4-byte aligned), lhu pieces (2-byte aligned) or lbu pieces. Only p is
+ * tested, which is why o must not change the alignment.
+ *
+ * t holds the alignment bits during the test and is then reused as scratch.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_LD(r, o, p, t)                \
+    "andi   " #t ", " #p ", 7\n\t"              \
+    "bnez   " #t ", 1f\n\t"                     \
+    UNALIGNED_LD_BY_DWORD(r, o, p, t)           \
+    "j      4f\n\t"                             \
+    "1:\n\t"                                    \
+    "andi   " #t ", " #t ", 3\n\t"              \
+    "bnez   " #t ", 2f\n\t"                     \
+    UNALIGNED_LD_BY_WORD(r, o, p, t)            \
+    "j      4f\n\t"                             \
+    "2:\n\t"                                    \
+    "andi   " #t ", " #t ", 1\n\t"              \
+    "bnez   " #t ", 3f\n\t"                     \
+    UNALIGNED_LD_BY_HALF(r, o, p, t)            \
+    "j      4f\n\t"                             \
+    "3:\n\t"                                    \
+    UNALIGNED_LD_BY_BYTE(r, o, p, t)            \
+    "4:\n\t"
+
+/* Load 32-bits, zero extended.
+ *
+ * r: destination register, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The piece at the lowest address ends up in the least significant bits of
+ * r. Only unsigned loads are used and nothing is shifted above bit 31, so
+ * the upper 32 bits of r are zero.
+ *   UNALIGNED_LWU_BY_BYTE  4 lbu loads merged with slli/or.
+ *   UNALIGNED_LWU_BY_HALF  2 lhu loads merged with slli/or.
+ *   UNALIGNED_LWU_BY_WORD  a single lwu; t is not used.
+ *
+ * In the multi-load variants r, p and t must be three different registers.
+ */
+#define UNALIGNED_LWU_BY_BYTE(r, o, p, t)       \
+    "lbu    " #r ", " #o "+0(" #p ")\n\t"       \
+    "lbu    " #t ", " #o "+1(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 8\n\t"              \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+2(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 16\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"         \
+    "lbu    " #t ", " #o "+3(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 24\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"
+#define UNALIGNED_LWU_BY_HALF(r, o, p, t)       \
+    "lhu    " #r ", " #o "+0(" #p ")\n\t"       \
+    "lhu    " #t ", " #o "+2(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 16\n\t"             \
+    "or     " #r ", " #r ", " #t "\n\t"
+#define UNALIGNED_LWU_BY_WORD(r, o, p, t)       \
+    "lwu    " #r ", " #o "(" #p ")\n\t"
+/* Assumes o is a multiple of 4.
+ *
+ * Tests the low two bits of p and picks the widest piece size that the
+ * alignment of p allows: lwu when p is 4-byte aligned, otherwise lhu pieces
+ * (2-byte aligned) or lbu pieces. Only p is tested, which is why o must not
+ * change the alignment.
+ *
+ * t holds the alignment bits during the test and is then reused as scratch.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LWU(r, o, p, t)               \
+    "andi   " #t ", " #p ", 3\n\t"              \
+    "bnez   " #t ", 1f\n\t"                     \
+    UNALIGNED_LWU_BY_WORD(r, o, p, t)           \
+    "j      3f\n\t"                             \
+    "1:\n\t"                                    \
+    "andi   " #t ", " #t ", 1\n\t"              \
+    "bnez   " #t ", 2f\n\t"                     \
+    UNALIGNED_LWU_BY_HALF(r, o, p, t)           \
+    "j      3f\n\t"                             \
+    "2:\n\t"                                    \
+    UNALIGNED_LWU_BY_BYTE(r, o, p, t)           \
+    "3:\n\t"
+
+/* Load 32-bits, sign extended.
+ *
+ * r: destination register, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ *   UNALIGNED_LW_BY_BYTE  UNALIGNED_LWU_BY_BYTE followed by sext.w on r.
+ *   UNALIGNED_LW_BY_HALF  UNALIGNED_LWU_BY_HALF followed by sext.w on r.
+ *   UNALIGNED_LW_BY_WORD  a single lw; t is not used.
+ */
+#define UNALIGNED_LW_BY_BYTE(r, o, p, t)        \
+    UNALIGNED_LWU_BY_BYTE(r, o, p, t)           \
+    "sext.w " #r ", " #r "\n\t"
+#define UNALIGNED_LW_BY_HALF(r, o, p, t)        \
+    UNALIGNED_LWU_BY_HALF(r, o, p, t)           \
+    "sext.w " #r ", " #r "\n\t"
+#define UNALIGNED_LW_BY_WORD(r, o, p, t)        \
+    "lw     " #r ", " #o "(" #p ")\n\t"
+/* Assumes o is a multiple of 4.
+ *
+ * Tests the low two bits of p and picks the widest piece size that the
+ * alignment of p allows: lw when p is 4-byte aligned, otherwise the
+ * half-word variant (2-byte aligned) or the byte variant. Only p is tested,
+ * which is why o must not change the alignment.
+ *
+ * t holds the alignment bits during the test and is then reused as scratch.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LW(r, o, p, t)                \
+    "andi   " #t ", " #p ", 3\n\t"              \
+    "bnez   " #t ", 1f\n\t"                     \
+    UNALIGNED_LW_BY_WORD(r, o, p, t)            \
+    "j      3f\n\t"                             \
+    "1:\n\t"                                    \
+    "andi   " #t ", " #t ", 1\n\t"              \
+    "bnez   " #t ", 2f\n\t"                     \
+    UNALIGNED_LW_BY_HALF(r, o, p, t)            \
+    "j      3f\n\t"                             \
+    "2:\n\t"                                    \
+    UNALIGNED_LW_BY_BYTE(r, o, p, t)            \
+    "3:\n\t"
+
+/* Load 16-bits, sign extended.
+ *
+ * r: destination register, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The low byte is read with lbu and the high byte with lb, so the sign of
+ * the high byte carries through the shift and the or into r.
+ * UNALIGNED_LH always uses the byte-wise form; there is no alignment test
+ * and no local label. r, p and t must be three different registers.
+ */
+#define UNALIGNED_LH_BY_BYTE(r, o, p, t)        \
+    "lbu    " #r ", " #o "+0(" #p ")\n\t"       \
+    "lb     " #t ", " #o "+1(" #p ")\n\t"       \
+    "slli   " #t ", " #t ", 8\n\t"              \
+    "or     " #r ", " #r ", " #t "\n\t"
+#define UNALIGNED_LH(r, o, p, t)                \
+    UNALIGNED_LH_BY_BYTE(r, o, p, t)
+
+/* Store 64-bits.
+ *
+ * r: source register (left unchanged), o: constant offset, p: pointer
+ * register, t: scratch register that is overwritten.
+ *
+ * Each BY_* variant writes the value in pieces of one width. The least
+ * significant piece of r goes to the lowest address.
+ *   UNALIGNED_SD_BY_BYTE   8 sb stores, using srli into t for bytes 1 to 7.
+ *   UNALIGNED_SD_BY_HALF   4 sh stores, using srli into t for the upper 3.
+ *   UNALIGNED_SD_BY_WORD   2 sw stores, using srli into t for the upper one.
+ *   UNALIGNED_SD_BY_DWORD  a single sd; t is not used.
+ *
+ * In the multi-store variants every shift reads r again and p is used after
+ * t is written, so r, p and t must be three different registers.
+ */
+#define UNALIGNED_SD_BY_BYTE(r, o, p, t)        \
+    "sb     " #r ", " #o "+0(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 8\n\t"              \
+    "sb     " #t ", " #o "+1(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 16\n\t"             \
+    "sb     " #t ", " #o "+2(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 24\n\t"             \
+    "sb     " #t ", " #o "+3(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 32\n\t"             \
+    "sb     " #t ", " #o "+4(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 40\n\t"             \
+    "sb     " #t ", " #o "+5(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 48\n\t"             \
+    "sb     " #t ", " #o "+6(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 56\n\t"             \
+    "sb     " #t ", " #o "+7(" #p ")\n\t"
+#define UNALIGNED_SD_BY_HALF(r, o, p, t)        \
+    "sh     " #r ", " #o "+0(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 16\n\t"             \
+    "sh     " #t ", " #o "+2(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 32\n\t"             \
+    "sh     " #t ", " #o "+4(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 48\n\t"             \
+    "sh     " #t ", " #o "+6(" #p ")\n\t"
+#define UNALIGNED_SD_BY_WORD(r, o, p, t)        \
+    "sw     " #r ", " #o "+0(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 32\n\t"             \
+    "sw     " #t ", " #o "+4(" #p ")\n\t"
+#define UNALIGNED_SD_BY_DWORD(r, o, p, t)       \
+    "sd     " #r ", " #o "(" #p ")\n\t"
+/* Assumes o is a multiple of 8.
+ *
+ * Tests the low three bits of p and picks the widest piece size that the
+ * alignment of p allows: sd when p is 8-byte aligned, otherwise sw pieces
+ * (4-byte aligned), sh pieces (2-byte aligned) or sb pieces. Only p is
+ * tested, which is why o must not change the alignment.
+ *
+ * t holds the alignment bits during the test and is then reused as scratch.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_SD(r, o, p, t)                \
+    "andi   " #t ", " #p ", 7\n\t"              \
+    "bnez   " #t ", 1f\n\t"                     \
+    UNALIGNED_SD_BY_DWORD(r, o, p, t)           \
+    "j      4f\n\t"                             \
+    "1:\n\t"                                    \
+    "andi   " #t ", " #t ", 3\n\t"              \
+    "bnez   " #t ", 2f\n\t"                     \
+    UNALIGNED_SD_BY_WORD(r, o, p, t)            \
+    "j      4f\n\t"                             \
+    "2:\n\t"                                    \
+    "andi   " #t ", " #t ", 1\n\t"              \
+    "bnez   " #t ", 3f\n\t"                     \
+    UNALIGNED_SD_BY_HALF(r, o, p, t)            \
+    "j      4f\n\t"                             \
+    "3:\n\t"                                    \
+    UNALIGNED_SD_BY_BYTE(r, o, p, t)            \
+    "4:\n\t"
+
+/* Store 32-bits.
+ *
+ * r: source register (left unchanged), o: constant offset, p: pointer
+ * register, t: scratch register that is overwritten.
+ *
+ * Writes the low 32 bits of r; the least significant piece goes to the
+ * lowest address.
+ *   UNALIGNED_SW_BY_BYTE  4 sb stores, using srli into t for bytes 1 to 3.
+ *   UNALIGNED_SW_BY_HALF  2 sh stores, using srli into t for the upper one.
+ *   UNALIGNED_SW_BY_WORD  a single sw; t is not used.
+ *
+ * In the multi-store variants r, p and t must be three different registers.
+ */
+#define UNALIGNED_SW_BY_BYTE(r, o, p, t)        \
+    "sb     " #r ", " #o "+0(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 8\n\t"              \
+    "sb     " #t ", " #o "+1(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 16\n\t"             \
+    "sb     " #t ", " #o "+2(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 24\n\t"             \
+    "sb     " #t ", " #o "+3(" #p ")\n\t"
+#define UNALIGNED_SW_BY_HALF(r, o, p, t)        \
+    "sh     " #r ", " #o "+0(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 16\n\t"             \
+    "sh     " #t ", " #o "+2(" #p ")\n\t"
+#define UNALIGNED_SW_BY_WORD(r, o, p, t)        \
+    "sw     " #r ", " #o "(" #p ")\n\t"
+/* Assumes o is a multiple of 4.
+ *
+ * Tests the low two bits of p and picks the widest piece size that the
+ * alignment of p allows: sw when p is 4-byte aligned, otherwise sh pieces
+ * (2-byte aligned) or sb pieces. Only p is tested, which is why o must not
+ * change the alignment.
+ *
+ * t holds the alignment bits during the test and is then reused as scratch.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_SW(r, o, p, t)                \
+    "andi   " #t ", " #p ", 3\n\t"              \
+    "bnez   " #t ", 1f\n\t"                     \
+    UNALIGNED_SW_BY_WORD(r, o, p, t)            \
+    "j      3f\n\t"                             \
+    "1:\n\t"                                    \
+    "andi   " #t ", " #t ", 1\n\t"              \
+    "bnez   " #t ", 2f\n\t"                     \
+    UNALIGNED_SW_BY_HALF(r, o, p, t)            \
+    "j      3f\n\t"                             \
+    "2:\n\t"                                    \
+    UNALIGNED_SW_BY_BYTE(r, o, p, t)            \
+    "3:\n\t"
+
+/* Store 16-bits.
+ *
+ * r: source register (left unchanged), o: constant offset, p: pointer
+ * register, t: scratch register that is overwritten.
+ *
+ * Writes the low byte of r at o+0 and the next byte, shifted into t, at
+ * o+1. UNALIGNED_SH always uses the byte-wise form; there is no alignment
+ * test and no local label. r, p and t must be three different registers.
+ */
+#define UNALIGNED_SH_BY_BYTE(r, o, p, t)        \
+    "sb     " #r ", " #o "+0(" #p ")\n\t"       \
+    "srli   " #t ", " #r ", 8\n\t"              \
+    "sb     " #t ", " #o "+1(" #p ")\n\t"
+#define UNALIGNED_SH(r, o, p, t)                \
+    UNALIGNED_SH_BY_BYTE(r, o, p, t)
+
+/* Load 2 64-bits. Assumes o is a multiple of 8.
+ *
+ * r0, r1: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LD_BY_*
+ * macro is handed to UNALIGNED_DW_REP2 together with both registers.
+ * UNALIGNED_DW_REP2 is defined outside this section - presumably it steps
+ * the offset by 8 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_LD2(r0, r1, o, p, t)                                  \
+    "andi   " #t ", " #p ", 7\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_DWORD, r0, r1, o, p, t)           \
+    "j      4f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 3\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_WORD, r0, r1, o, p, t)            \
+    "j      4f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 3f\n\t"                                             \
+    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_HALF, r0, r1, o, p, t)            \
+    "j      4f\n\t"                                                     \
+    "3:\n\t"                                                            \
+    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_BYTE, r0, r1, o, p, t)            \
+    "4:\n\t"
+
+/* Load 4 64-bits. Assumes o is a multiple of 8.
+ *
+ * r0..r3: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LD_BY_*
+ * macro is handed to UNALIGNED_DW_REP4 together with the four registers.
+ * UNALIGNED_DW_REP4 is defined outside this section - presumably it steps
+ * the offset by 8 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_LD4(r0, r1, r2, r3, o, p, t)                          \
+    "andi   " #t ", " #p ", 7\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_DWORD, r0, r1, r2, r3, o, p, t)   \
+    "j      4f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 3\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_WORD, r0, r1, r2, r3, o, p, t)    \
+    "j      4f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 3f\n\t"                                             \
+    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_HALF, r0, r1, r2, r3, o, p, t)    \
+    "j      4f\n\t"                                                     \
+    "3:\n\t"                                                            \
+    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_BYTE, r0, r1, r2, r3, o, p, t)    \
+    "4:\n\t"
+
+/* Load 8 64-bits. Assumes o is a multiple of 8.
+ *
+ * r0..r7: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LD_BY_*
+ * macro is handed to UNALIGNED_DW_REP8 together with the eight registers.
+ * UNALIGNED_DW_REP8 is defined outside this section - presumably it steps
+ * the offset by 8 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_LD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)          \
+    "andi   " #t ", " #p ", 7\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_DWORD,                            \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      4f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 3\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_WORD,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      4f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 3f\n\t"                                             \
+    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_HALF,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      4f\n\t"                                                     \
+    "3:\n\t"                                                            \
+    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_BYTE,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "4:\n\t"
+
+/* Store 2 64-bits. Assumes o is a multiple of 8.
+ *
+ * r0, r1: source registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_SD_BY_*
+ * macro is handed to UNALIGNED_DW_REP2 together with both registers.
+ * UNALIGNED_DW_REP2 is defined outside this section - presumably it steps
+ * the offset by 8 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_SD2(r0, r1, o, p, t)                                  \
+    "andi   " #t ", " #p ", 7\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_DWORD, r0, r1, o, p, t)           \
+    "j      4f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 3\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_WORD, r0, r1, o, p, t)            \
+    "j      4f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 3f\n\t"                                             \
+    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_HALF, r0, r1, o, p, t)            \
+    "j      4f\n\t"                                                     \
+    "3:\n\t"                                                            \
+    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_BYTE, r0, r1, o, p, t)            \
+    "4:\n\t"
+
+/* Store 4 64-bits. Assumes o is a multiple of 8.
+ *
+ * r0..r3: source registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_SD_BY_*
+ * macro is handed to UNALIGNED_DW_REP4 together with the four registers.
+ * UNALIGNED_DW_REP4 is defined outside this section - presumably it steps
+ * the offset by 8 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_SD4(r0, r1, r2, r3, o, p, t)                          \
+    "andi   " #t ", " #p ", 7\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_DWORD, r0, r1, r2, r3, o, p, t)   \
+    "j      4f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 3\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_WORD, r0, r1, r2, r3, o, p, t)    \
+    "j      4f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 3f\n\t"                                             \
+    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_HALF, r0, r1, r2, r3, o, p, t)    \
+    "j      4f\n\t"                                                     \
+    "3:\n\t"                                                            \
+    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_BYTE, r0, r1, r2, r3, o, p, t)    \
+    "4:\n\t"
+
+/* Store 8 64-bits. Assumes o is a multiple of 8.
+ *
+ * r0..r7: source registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_SD_BY_*
+ * macro is handed to UNALIGNED_DW_REP8 together with the eight registers.
+ * UNALIGNED_DW_REP8 is defined outside this section - presumably it steps
+ * the offset by 8 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2, 3 and 4.
+ */
+#define UNALIGNED_SD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)          \
+    "andi   " #t ", " #p ", 7\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_DWORD,                            \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      4f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 3\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_WORD,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      4f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 3f\n\t"                                             \
+    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_HALF,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      4f\n\t"                                                     \
+    "3:\n\t"                                                            \
+    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_BYTE,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "4:\n\t"
+
+/* Load 2 32-bits, zero extended. Assumes o is a multiple of 4.
+ *
+ * r0, r1: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LWU_BY_*
+ * macro is handed to UNALIGNED_W_REP2 together with both registers.
+ * UNALIGNED_W_REP2 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LWU2(r0, r1, o, p, t)                                 \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP2(UNALIGNED_LWU_BY_WORD, r0, r1, o, p, t)            \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP2(UNALIGNED_LWU_BY_HALF, r0, r1, o, p, t)            \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP2(UNALIGNED_LWU_BY_BYTE, r0, r1, o, p, t)            \
+    "3:\n\t"
+
+/* Load 4 32-bits, zero extended. Assumes o is a multiple of 4.
+ *
+ * r0..r3: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LWU_BY_*
+ * macro is handed to UNALIGNED_W_REP4 together with the four registers.
+ * UNALIGNED_W_REP4 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LWU4(r0, r1, r2, r3, o, p, t)                         \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP4(UNALIGNED_LWU_BY_WORD, r0, r1, r2, r3, o, p, t)    \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP4(UNALIGNED_LWU_BY_HALF, r0, r1, r2, r3, o, p, t)    \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP4(UNALIGNED_LWU_BY_BYTE, r0, r1, r2, r3, o, p, t)    \
+    "3:\n\t"
+
+/* Load 8 32-bits, zero extended. Assumes o is a multiple of 4.
+ *
+ * r0..r7: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LWU_BY_*
+ * macro is handed to UNALIGNED_W_REP8 together with the eight registers.
+ * UNALIGNED_W_REP8 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LWU8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)         \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP8(UNALIGNED_LWU_BY_WORD,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP8(UNALIGNED_LWU_BY_HALF,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP8(UNALIGNED_LWU_BY_BYTE,                             \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "3:\n\t"
+
+/* Load 2 32-bits, sign extended. Assumes o is a multiple of 4.
+ *
+ * r0, r1: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LW_BY_*
+ * macro is handed to UNALIGNED_W_REP2 together with both registers.
+ * UNALIGNED_W_REP2 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LW2(r0, r1, o, p, t)                                  \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP2(UNALIGNED_LW_BY_WORD, r0, r1, o, p, t)             \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP2(UNALIGNED_LW_BY_HALF, r0, r1, o, p, t)             \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP2(UNALIGNED_LW_BY_BYTE, r0, r1, o, p, t)             \
+    "3:\n\t"
+
+/* Load 4 32-bits, sign extended. Assumes o is a multiple of 4.
+ *
+ * r0..r3: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LW_BY_*
+ * macro is handed to UNALIGNED_W_REP4 together with the four registers.
+ * UNALIGNED_W_REP4 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LW4(r0, r1, r2, r3, o, p, t)                          \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP4(UNALIGNED_LW_BY_WORD, r0, r1, r2, r3, o, p, t)     \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP4(UNALIGNED_LW_BY_HALF, r0, r1, r2, r3, o, p, t)     \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP4(UNALIGNED_LW_BY_BYTE, r0, r1, r2, r3, o, p, t)     \
+    "3:\n\t"
+
+/* Load 8 32-bits, sign extended. Assumes o is a multiple of 4.
+ *
+ * r0..r7: destination registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_LW_BY_*
+ * macro is handed to UNALIGNED_W_REP8 together with the eight registers.
+ * UNALIGNED_W_REP8 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_LW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)          \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP8(UNALIGNED_LW_BY_WORD,                              \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP8(UNALIGNED_LW_BY_HALF,                              \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP8(UNALIGNED_LW_BY_BYTE,                              \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "3:\n\t"
+
+/* Store 2 32-bits. Assumes o is a multiple of 4.
+ *
+ * r0, r1: source registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_SW_BY_*
+ * macro is handed to UNALIGNED_W_REP2 together with both registers.
+ * UNALIGNED_W_REP2 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_SW2(r0, r1, o, p, t)                                  \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP2(UNALIGNED_SW_BY_WORD, r0, r1, o, p, t)             \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP2(UNALIGNED_SW_BY_HALF, r0, r1, o, p, t)             \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP2(UNALIGNED_SW_BY_BYTE, r0, r1, o, p, t)             \
+    "3:\n\t"
+
+/* Store 4 32-bits. Assumes o is a multiple of 4.
+ *
+ * r0..r3: source registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_SW_BY_*
+ * macro is handed to UNALIGNED_W_REP4 together with the four registers.
+ * UNALIGNED_W_REP4 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_SW4(r0, r1, r2, r3, o, p, t)                          \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP4(UNALIGNED_SW_BY_WORD, r0, r1, r2, r3, o, p, t)     \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP4(UNALIGNED_SW_BY_HALF, r0, r1, r2, r3, o, p, t)     \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP4(UNALIGNED_SW_BY_BYTE, r0, r1, r2, r3, o, p, t)     \
+    "3:\n\t"
+
+/* Store 8 32-bits. Assumes o is a multiple of 4.
+ *
+ * r0..r7: source registers, o: constant offset, p: pointer register,
+ * t: scratch register that is overwritten.
+ *
+ * The alignment of p is tested once, then the matching UNALIGNED_SW_BY_*
+ * macro is handed to UNALIGNED_W_REP8 together with the eight registers.
+ * UNALIGNED_W_REP8 is defined outside this section - presumably it steps
+ * the offset by 4 per register; confirm at its definition.
+ * Emits the numeric local labels 1, 2 and 3.
+ */
+#define UNALIGNED_SW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)          \
+    "andi   " #t ", " #p ", 3\n\t"                                      \
+    "bnez   " #t ", 1f\n\t"                                             \
+    UNALIGNED_W_REP8(UNALIGNED_SW_BY_WORD,                              \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      3f\n\t"                                                     \
+    "1:\n\t"                                                            \
+    "andi   " #t ", " #t ", 1\n\t"                                      \
+    "bnez   " #t ", 2f\n\t"                                             \
+    UNALIGNED_W_REP8(UNALIGNED_SW_BY_HALF,                              \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "j      3f\n\t"                                                     \
+    "2:\n\t"                                                            \
+    UNALIGNED_W_REP8(UNALIGNED_SW_BY_BYTE,                              \
+        r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)                        \
+    "3:\n\t"
+
+#endif /* !WOLFSSL_RISCV_ASM_NO_UNALIGNED */
+

 #define VLSEG_V(vd, rs1, cnt, width) \
    ASM_WORD(0b0000111 | (width << 12) | (0b10101000 << 20) |   \