mirror of
https://github.com/wolfSSL/wolfssl.git
synced 2026-07-05 17:00:49 +02:00
Merge pull request #10530 from SparkiDev/riscv_unaligned_fix
RISC-V ASM unaligned read/writes: alternative assembly
This commit is contained in:
@@ -888,6 +888,7 @@ WOLFSSL_RENESAS_RZN2L
|
||||
WOLFSSL_RENESAS_TLS
|
||||
WOLFSSL_RENESAS_TSIP_IAREWRX
|
||||
WOLFSSL_REQUIRE_TCA
|
||||
WOLFSSL_RISCV_ASM_NO_UNALIGNED
|
||||
WOLFSSL_RNG_USE_FULL_SEED
|
||||
WOLFSSL_RSA_CHECK_D_ON_DECRYPT
|
||||
WOLFSSL_RSA_DECRYPT_TO_0_LEN
|
||||
|
||||
+1
-1
@@ -3859,7 +3859,7 @@ do
|
||||
# FSL, FSR, FSRI, CMOV, CMIX - QEMU doesn't know about these instructions
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BIT_MANIPULATION_TERNARY"
|
||||
;;
|
||||
zkn|zkned)
|
||||
zkned)
|
||||
# AES encrypt/decrypt, SHA-2
|
||||
ENABLED_RISCV_ASM=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_SCALAR_CRYPTO_ASM"
|
||||
|
||||
@@ -1871,8 +1871,7 @@ int wc_AesSetKey(Aes* aes, const byte* key, word32 keyLen, const byte* iv,
|
||||
static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
|
||||
{
|
||||
__asm__ __volatile__ (
|
||||
"ld t2, 0(%[in])\n\t"
|
||||
"ld t3, 8(%[in])\n\t"
|
||||
UNALIGNED_LD2(t2, t3, 0, %[in], t0)
|
||||
"ld a3, 0(%[key])\n\t"
|
||||
"ld a4, 8(%[key])\n\t"
|
||||
"ld a5, 16(%[key])\n\t"
|
||||
@@ -1897,8 +1896,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
|
||||
AESENC_2_ROUNDS(208, 216, 224, 232)
|
||||
"L_aes_encrypt_done:\n\t"
|
||||
AESENC_LAST_ROUND()
|
||||
"sd t2, 0(%[out])\n\t"
|
||||
"sd t3, 8(%[out])\n\t"
|
||||
UNALIGNED_SD2(t2, t3, 0, %[out], t0)
|
||||
:
|
||||
: [in] "r" (in), [out] "r" (out), [key] "r" (aes->key),
|
||||
[rounds] "r" (aes->rounds)
|
||||
@@ -1918,8 +1916,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
|
||||
static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
|
||||
{
|
||||
__asm__ __volatile__ (
|
||||
"ld t2, 0(%[in])\n\t"
|
||||
"ld t3, 8(%[in])\n\t"
|
||||
UNALIGNED_LD2(t2, t3, 0, %[in], t0)
|
||||
"ld a3, 0(%[key])\n\t"
|
||||
"ld a4, 8(%[key])\n\t"
|
||||
"ld a5, 16(%[key])\n\t"
|
||||
@@ -1944,8 +1941,7 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
|
||||
AESDEC_2_ROUNDS(208, 216, 224, 232)
|
||||
"L_aes_decrypt_done:\n\t"
|
||||
AESDEC_LAST_ROUND()
|
||||
"sd t2, 0(%[out])\n\t"
|
||||
"sd t3, 8(%[out])\n\t"
|
||||
UNALIGNED_SD2(t2, t3, 0, %[out], t0)
|
||||
:
|
||||
: [in] "r" (in), [out] "r" (out), [key] "r" (aes->key),
|
||||
[rounds] "r" (aes->rounds)
|
||||
@@ -3209,8 +3205,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
|
||||
LOAD_WORD_REV(t2, 8, %[in])
|
||||
LOAD_WORD_REV(t3, 12, %[in])
|
||||
#else
|
||||
"ld t1, 0(%[in])\n\t"
|
||||
"ld t3, 8(%[in])\n\t"
|
||||
UNALIGNED_LD2(t1, t3, 0, %[in], t0)
|
||||
REV8(REG_T1, REG_T1)
|
||||
REV8(REG_T3, REG_T3)
|
||||
"srli t0, t1, 32\n\t"
|
||||
@@ -3376,16 +3371,14 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out)
|
||||
REV8(REG_T1, REG_T1)
|
||||
REV8(REG_T3, REG_T3)
|
||||
/* Write encrypted block to output. */
|
||||
"sd t1, 0(%[out])\n\t"
|
||||
"sd t3, 8(%[out])\n\t"
|
||||
UNALIGNED_SD2(t1, t3, 0, %[out], t0)
|
||||
#else
|
||||
PACK(REG_T1, REG_A5, REG_A4)
|
||||
PACK(REG_T3, REG_A7, REG_A6)
|
||||
REV8(REG_T1, REG_T1)
|
||||
REV8(REG_T3, REG_T3)
|
||||
/* Write encrypted block to output. */
|
||||
"sd t1, 0(%[out])\n\t"
|
||||
"sd t3, 8(%[out])\n\t"
|
||||
UNALIGNED_SD2(t1, t3, 0, %[out], t0)
|
||||
#endif
|
||||
|
||||
:
|
||||
@@ -3641,8 +3634,7 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
|
||||
LOAD_WORD_REV(t2, 8, %[in])
|
||||
LOAD_WORD_REV(t3, 12, %[in])
|
||||
#else
|
||||
"ld t1, 0(%[in])\n\t"
|
||||
"ld t3, 8(%[in])\n\t"
|
||||
UNALIGNED_LD2(t1, t3, 0, %[in], t0)
|
||||
REV8(REG_T1, REG_T1)
|
||||
REV8(REG_T3, REG_T3)
|
||||
"srli t0, t1, 32\n\t"
|
||||
@@ -3793,16 +3785,14 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out)
|
||||
REV8(REG_T1, REG_T1)
|
||||
REV8(REG_T3, REG_T3)
|
||||
/* Write decrypted block to output. */
|
||||
"sd t1, 0(%[out])\n\t"
|
||||
"sd t3, 8(%[out])\n\t"
|
||||
UNALIGNED_SD2(t1, t3, 0, %[out], t0)
|
||||
#else
|
||||
PACK(REG_T1, REG_A5, REG_A4)
|
||||
PACK(REG_T3, REG_A7, REG_A6)
|
||||
REV8(REG_T1, REG_T1)
|
||||
REV8(REG_T3, REG_T3)
|
||||
/* Write decrypted block to output. */
|
||||
"sd t1, 0(%[out])\n\t"
|
||||
"sd t3, 8(%[out])\n\t"
|
||||
UNALIGNED_SD2(t1, t3, 0, %[out], t0)
|
||||
#endif
|
||||
|
||||
:
|
||||
@@ -4113,7 +4103,7 @@ static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
|
||||
*/
|
||||
int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
|
||||
{
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
word32 processed;
|
||||
int ret = 0;
|
||||
|
||||
@@ -4563,8 +4553,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
|
||||
byte* s, word32 sSz)
|
||||
{
|
||||
if (gcm != NULL) {
|
||||
byte x[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte x[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
byte* h = gcm->H;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
@@ -4896,8 +4886,8 @@ static void GMULT(byte* x, byte* y)
|
||||
void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
|
||||
byte* s, word32 sSz)
|
||||
{
|
||||
byte x[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte x[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
word32 blocks, partial;
|
||||
byte* h;
|
||||
|
||||
@@ -5163,8 +5153,7 @@ static void ghash_blocks(byte* x, byte* y, const byte* in, word32 blocks)
|
||||
|
||||
"L_ghash_loop:\n\t"
|
||||
/* Load input block. */
|
||||
"ld t5, 0(%[in])\n\t"
|
||||
"ld a5, 8(%[in])\n\t"
|
||||
UNALIGNED_LD2(t5, a5, 0, %[in], t4)
|
||||
/* Reverse bits to match x. */
|
||||
#ifdef WOLFSSL_RISCV_BIT_MANIPULATION
|
||||
BREV8(REG_T5, REG_T5)
|
||||
@@ -5307,8 +5296,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
|
||||
byte* s, word32 sSz)
|
||||
{
|
||||
if (gcm != NULL) {
|
||||
byte x[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte x[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
word32 blocks, partial;
|
||||
byte* h = gcm->H;
|
||||
|
||||
@@ -5388,8 +5377,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz,
|
||||
const byte* aad, word32 aadSz)
|
||||
{
|
||||
byte counter[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
/* Noticed different optimization levels treated head of array different.
|
||||
* Some cases was stack pointer plus offset others was a register containing
|
||||
* address. To make uniform for passing in to inline assembly code am using
|
||||
@@ -5886,8 +5875,8 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz,
|
||||
const byte* aad, word32 aadSz)
|
||||
{
|
||||
byte counter[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
/* Noticed different optimization levels treated head of array different.
|
||||
* Some cases was stack pointer plus offset others was a register containing
|
||||
* address. To make uniform for passing in to inline assembly code am using
|
||||
@@ -6398,8 +6387,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz,
|
||||
const byte* aad, word32 aadSz)
|
||||
{
|
||||
byte counter[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
/* Noticed different optimization levels treated head of array different.
|
||||
* Some cases was stack pointer plus offset others was a register containing
|
||||
* address. To make uniform for passing in to inline assembly code am using
|
||||
@@ -7003,8 +6992,8 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
const byte* aad, word32 aadSz)
|
||||
{
|
||||
int ret = 0;
|
||||
byte counter[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
/* Noticed different optimization levels treated head of array different.
|
||||
* Some cases was stack pointer plus offset others was a register containing
|
||||
* address. To make uniform for passing in to inline assembly code am using
|
||||
@@ -7512,8 +7501,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
const byte* aad, word32 aadSz)
|
||||
{
|
||||
int ret = 0;
|
||||
byte counter[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
/* Noticed different optimization levels treated head of array different.
|
||||
* Some cases was stack pointer plus offset others was a register containing
|
||||
* address. To make uniform for passing in to inline assembly code am using
|
||||
@@ -8035,8 +8024,8 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
const byte* aad, word32 aadSz)
|
||||
{
|
||||
int ret = 0;
|
||||
byte counter[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
/* Noticed different optimization levels treated head of array different.
|
||||
* Some cases was stack pointer plus offset others was a register containing
|
||||
* address. To make uniform for passing in to inline assembly code am using
|
||||
@@ -8733,8 +8722,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz,
|
||||
byte* s, word32 sSz)
|
||||
{
|
||||
if (gcm != NULL) {
|
||||
byte x[WC_AES_BLOCK_SIZE];
|
||||
byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte x[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
word32 blocks, partial;
|
||||
|
||||
XMEMSET(x, 0, WC_AES_BLOCK_SIZE);
|
||||
@@ -8834,9 +8823,9 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
word32 partial = sz % WC_AES_BLOCK_SIZE;
|
||||
const byte* p = in;
|
||||
byte* c = out;
|
||||
ALIGN16 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN16 byte initialCounter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN16 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte initialCounter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
|
||||
/* Validate parameters. */
|
||||
if ((aes == NULL) || (nonce == NULL) || (nonceSz == 0) || (tag == NULL) ||
|
||||
@@ -8934,10 +8923,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
||||
word32 partial = sz % WC_AES_BLOCK_SIZE;
|
||||
const byte* c = in;
|
||||
byte* p = out;
|
||||
ALIGN16 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN16 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN16 byte Tprime[WC_AES_BLOCK_SIZE];
|
||||
ALIGN16 byte EKY0[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte counter[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte scratch[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte Tprime[WC_AES_BLOCK_SIZE];
|
||||
ALIGN8 byte EKY0[WC_AES_BLOCK_SIZE];
|
||||
sword32 res;
|
||||
|
||||
/* Validate parameters. */
|
||||
|
||||
@@ -1825,9 +1825,9 @@ static WC_INLINE void wc_chacha_encrypt_64(const word32* input, const byte* m,
|
||||
VSETIVLI(REG_X0, 2, 1, 1, 0b011, 0b000)
|
||||
VMV_X_S(REG_T0, REG_V0)
|
||||
VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000)
|
||||
"ld t1, (%[m])\n\t"
|
||||
UNALIGNED_LD(t1, 0, %[m], t2)
|
||||
"xor t1, t1, t0\n\t"
|
||||
"sd t1, (%[c])\n\t"
|
||||
UNALIGNED_SD(t1, 0, %[c], t2)
|
||||
"addi %[bytes], %[bytes], -8\n\t"
|
||||
"addi %[c], %[c], 8\n\t"
|
||||
"addi %[m], %[m], 8\n\t"
|
||||
@@ -2155,10 +2155,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
"bltz %[bytes], L_chacha20_riscv_over\n\t"
|
||||
|
||||
#if !defined(WOLFSSL_RISCV_BIT_MANIPULATION)
|
||||
"ld t0, 0(%[m])\n\t"
|
||||
"ld t1, 8(%[m])\n\t"
|
||||
"ld t2, 16(%[m])\n\t"
|
||||
"ld s1, 24(%[m])\n\t"
|
||||
UNALIGNED_LD4(t0, t1, t2, s1, 0, %[m], a3)
|
||||
"xor a4, a4, t0\n\t"
|
||||
"xor a6, a6, t1\n\t"
|
||||
"xor t3, t3, t2\n\t"
|
||||
@@ -2171,10 +2168,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
"xor a7, a7, t1\n\t"
|
||||
"xor t4, t4, t2\n\t"
|
||||
"xor t6, t6, s1\n\t"
|
||||
"ld t0, 32(%[m])\n\t"
|
||||
"ld t1, 40(%[m])\n\t"
|
||||
"ld t2, 48(%[m])\n\t"
|
||||
"ld s1, 56(%[m])\n\t"
|
||||
UNALIGNED_LD4(t0, t1, t2, s1, 32, %[m], a3)
|
||||
"xor s2, s2, t0\n\t"
|
||||
"xor s4, s4, t1\n\t"
|
||||
"xor s6, s6, t2\n\t"
|
||||
@@ -2187,22 +2181,8 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
"xor s5, s5, t1\n\t"
|
||||
"xor s7, s7, t2\n\t"
|
||||
"xor s9, s9, s1\n\t"
|
||||
"sw a4, 0(%[c])\n\t"
|
||||
"sw a5, 4(%[c])\n\t"
|
||||
"sw a6, 8(%[c])\n\t"
|
||||
"sw a7, 12(%[c])\n\t"
|
||||
"sw t3, 16(%[c])\n\t"
|
||||
"sw t4, 20(%[c])\n\t"
|
||||
"sw t5, 24(%[c])\n\t"
|
||||
"sw t6, 28(%[c])\n\t"
|
||||
"sw s2, 32(%[c])\n\t"
|
||||
"sw s3, 36(%[c])\n\t"
|
||||
"sw s4, 40(%[c])\n\t"
|
||||
"sw s5, 44(%[c])\n\t"
|
||||
"sw s6, 48(%[c])\n\t"
|
||||
"sw s7, 52(%[c])\n\t"
|
||||
"sw s8, 56(%[c])\n\t"
|
||||
"sw s9, 60(%[c])\n\t"
|
||||
UNALIGNED_SW8(a4, a5, a6, a7, t3, t4, t5, t6, 0, %[c], t0)
|
||||
UNALIGNED_SW8(s2, s3, s4, s5, s6, s7, s8, s9, 32, %[c], t0)
|
||||
#else
|
||||
PACK(REG_A4, REG_A4, REG_A5)
|
||||
PACK(REG_A6, REG_A6, REG_A7)
|
||||
@@ -2212,14 +2192,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
PACK(REG_S4, REG_S4, REG_S5)
|
||||
PACK(REG_S6, REG_S6, REG_S7)
|
||||
PACK(REG_S8, REG_S8, REG_S9)
|
||||
"ld a5, 0(%[m])\n\t"
|
||||
"ld a7, 8(%[m])\n\t"
|
||||
"ld t4, 16(%[m])\n\t"
|
||||
"ld t6, 24(%[m])\n\t"
|
||||
"ld s3, 32(%[m])\n\t"
|
||||
"ld s5, 40(%[m])\n\t"
|
||||
"ld s7, 48(%[m])\n\t"
|
||||
"ld s9, 56(%[m])\n\t"
|
||||
UNALIGNED_LD8(a5, a7, t4, t6, s3, s5, s7, s9, 0, %[m], t0)
|
||||
"xor a4, a4, a5\n\t"
|
||||
"xor a6, a6, a7\n\t"
|
||||
"xor t3, t3, t4\n\t"
|
||||
@@ -2228,14 +2201,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
"xor s4, s4, s5\n\t"
|
||||
"xor s6, s6, s7\n\t"
|
||||
"xor s8, s8, s9\n\t"
|
||||
"sd a4, 0(%[c])\n\t"
|
||||
"sd a6, 8(%[c])\n\t"
|
||||
"sd t3, 16(%[c])\n\t"
|
||||
"sd t5, 24(%[c])\n\t"
|
||||
"sd s2, 32(%[c])\n\t"
|
||||
"sd s4, 40(%[c])\n\t"
|
||||
"sd s6, 48(%[c])\n\t"
|
||||
"sd s8, 56(%[c])\n\t"
|
||||
UNALIGNED_SD8(a4, a6, t3, t5, s2, s4, s6, s8, 0, %[c], t0)
|
||||
#endif
|
||||
|
||||
"addi %[m], %[m], 64\n\t"
|
||||
@@ -2268,10 +2234,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
"bltz t0, L_chacha20_riscv_32bit\n\t"
|
||||
"addi a3, a3, -1\n\t"
|
||||
"L_chacha20_riscv_64bit_loop:\n\t"
|
||||
"ld t0, (%[m])\n\t"
|
||||
UNALIGNED_LD(t0, 0, %[m], t2)
|
||||
"ld t1, (%[over])\n\t"
|
||||
"xor t0, t0, t1\n\t"
|
||||
"sd t0, (%[c])\n\t"
|
||||
UNALIGNED_SD(t0, 0, %[c], t2)
|
||||
"addi %[m], %[m], 8\n\t"
|
||||
"addi %[c], %[c], 8\n\t"
|
||||
"addi %[over], %[over], 8\n\t"
|
||||
@@ -2282,10 +2248,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
"L_chacha20_riscv_32bit:\n\t"
|
||||
"addi t0, a3, -4\n\t"
|
||||
"bltz t0, L_chacha20_riscv_16bit\n\t"
|
||||
"lw t0, (%[m])\n\t"
|
||||
UNALIGNED_LW(t0, 0, %[m], t2)
|
||||
"lw t1, (%[over])\n\t"
|
||||
"xor t0, t0, t1\n\t"
|
||||
"sw t0, (%[c])\n\t"
|
||||
UNALIGNED_SW(t0, 0, %[c], t2)
|
||||
"addi %[m], %[m], 4\n\t"
|
||||
"addi %[c], %[c], 4\n\t"
|
||||
"addi %[over], %[over], 4\n\t"
|
||||
@@ -2293,10 +2259,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
|
||||
"L_chacha20_riscv_16bit:\n\t"
|
||||
"addi t0, a3, -2\n\t"
|
||||
"bltz t0, L_chacha20_riscv_8bit\n\t"
|
||||
"lh t0, (%[m])\n\t"
|
||||
UNALIGNED_LH(t0, 0, %[m], t2)
|
||||
"lh t1, (%[over])\n\t"
|
||||
"xor t0, t0, t1\n\t"
|
||||
"sh t0, (%[c])\n\t"
|
||||
UNALIGNED_SH(t0, 0, %[c], t2)
|
||||
"addi %[m], %[m], 2\n\t"
|
||||
"addi %[c], %[c], 2\n\t"
|
||||
"addi %[over], %[over], 2\n\t"
|
||||
|
||||
@@ -145,8 +145,7 @@ static WC_INLINE void poly1305_blocks_riscv64_16(Poly1305* ctx,
|
||||
|
||||
"L_poly1305_riscv64_16_64_loop_%=:\n\t"
|
||||
/* Load m */
|
||||
"ld t0, (%[m])\n\t"
|
||||
"ld t1, 8(%[m])\n\t"
|
||||
UNALIGNED_LD2(t0, t1, 0, %[m], t5)
|
||||
/* Split m into 26, 52, 52 */
|
||||
SPLIT_130(t2, t3, t4, t0, t1, %[notLast], t5)
|
||||
|
||||
@@ -285,8 +284,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m,
|
||||
|
||||
"L_poly1305_riscv64_vec_loop_%=:\n\t"
|
||||
/* m0 + nfin */
|
||||
"ld t0, 0(%[m])\n\t"
|
||||
"ld t1, 8(%[m])\n\t"
|
||||
UNALIGNED_LD2(t0, t1, 0, %[m], t5)
|
||||
"li t6, 1\n\t"
|
||||
/* Split m into 24, 52, 52 */
|
||||
SPLIT_130(t2, t3, t4, t0, t1, t6, t5)
|
||||
@@ -294,8 +292,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m,
|
||||
VMV_S_X(REG_V12, REG_T3)
|
||||
VMV_S_X(REG_V13, REG_T4)
|
||||
/* m1 + nfin */
|
||||
"ld t0, 16(%[m])\n\t"
|
||||
"ld t1, 24(%[m])\n\t"
|
||||
UNALIGNED_LD2(t0, t1, 16, %[m], t5)
|
||||
/* Split m into 24, 52, 52 */
|
||||
SPLIT_130(t2, t3, t4, t0, t1, t6, t5)
|
||||
VMV_S_X(REG_V14, REG_T2)
|
||||
@@ -464,10 +461,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
|
||||
__asm__ __volatile__ (
|
||||
/* Load key material */
|
||||
"ld t0, 0(%[key])\n\t"
|
||||
"ld t1, 8(%[key])\n\t"
|
||||
"ld t2, 16(%[key])\n\t"
|
||||
"ld t3, 24(%[key])\n\t"
|
||||
UNALIGNED_LD4(t0, t1, t2, t3, 0, %[key], t4)
|
||||
/* Load clamp */
|
||||
"ld t4, 0(%[clamp])\n\t"
|
||||
"ld t5, 8(%[clamp])\n\t"
|
||||
@@ -636,8 +630,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
|
||||
"sltu t3, t1, t3\n\t"
|
||||
"add t2, t2, t3\n\t"
|
||||
"andi t2, t2, 3\n\t"
|
||||
"sd t0, 0(%[mac])\n\t"
|
||||
"sd t1, 8(%[mac])\n\t"
|
||||
UNALIGNED_SD2(t0, t1, 0, %[mac], t2)
|
||||
/* Zero out h. */
|
||||
"sd x0, %[ctx_h_0]\n\t"
|
||||
"sd x0, %[ctx_h_1]\n\t"
|
||||
|
||||
@@ -484,14 +484,7 @@ static WC_OMIT_FRAME_POINTER WC_INLINE void Sha256Transform(wc_Sha256* sha256,
|
||||
LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7)
|
||||
LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7)
|
||||
#else
|
||||
"lwu a4, 0(%[data])\n\t"
|
||||
"lwu s0, 4(%[data])\n\t"
|
||||
"lwu a5, 8(%[data])\n\t"
|
||||
"lwu s1, 12(%[data])\n\t"
|
||||
"lwu a6, 16(%[data])\n\t"
|
||||
"lwu s2, 20(%[data])\n\t"
|
||||
"lwu a7, 24(%[data])\n\t"
|
||||
"lwu s3, 28(%[data])\n\t"
|
||||
UNALIGNED_LWU8(a4, s0, a5, s1, a6, s2, a7, s3, 0, %[data], t4)
|
||||
PACK_BB(s0, s0, a4, REG_S0, REG_S0, REG_A4)
|
||||
PACK_BB(s1, s1, a5, REG_S1, REG_S1, REG_A5)
|
||||
PACK_BB(s2, s2, a6, REG_S2, REG_S2, REG_A6)
|
||||
@@ -500,14 +493,7 @@ static WC_OMIT_FRAME_POINTER WC_INLINE void Sha256Transform(wc_Sha256* sha256,
|
||||
REV8(REG_S1, REG_S1)
|
||||
REV8(REG_S2, REG_S2)
|
||||
REV8(REG_S3, REG_S3)
|
||||
"lwu a4, 32(%[data])\n\t"
|
||||
"lwu s4, 36(%[data])\n\t"
|
||||
"lwu a5, 40(%[data])\n\t"
|
||||
"lwu s5, 44(%[data])\n\t"
|
||||
"lwu a6, 48(%[data])\n\t"
|
||||
"lwu s6, 52(%[data])\n\t"
|
||||
"lwu a7, 56(%[data])\n\t"
|
||||
"lwu s7, 60(%[data])\n\t"
|
||||
UNALIGNED_LWU8(a4, s4, a5, s5, a6, s6, a7, s7, 32, %[data], t4)
|
||||
PACK_BB(s4, s4, a4, REG_S4, REG_S4, REG_A4)
|
||||
PACK_BB(s5, s5, a5, REG_S5, REG_S5, REG_A5)
|
||||
PACK_BB(s6, s6, a6, REG_S6, REG_S6, REG_A6)
|
||||
@@ -840,31 +826,18 @@ static WC_INLINE void Sha256Final(wc_Sha256* sha256, byte* hash)
|
||||
"srli t2, t3, 32\n\t"
|
||||
"srli a4, a5, 32\n\t"
|
||||
"srli a6, a7, 32\n\t"
|
||||
"sw t0, 0(%[hash])\n\t"
|
||||
"sw t1, 4(%[hash])\n\t"
|
||||
"sw t2, 8(%[hash])\n\t"
|
||||
"sw t3, 12(%[hash])\n\t"
|
||||
"sw a4, 16(%[hash])\n\t"
|
||||
"sw a5, 20(%[hash])\n\t"
|
||||
"sw a6, 24(%[hash])\n\t"
|
||||
"sw a7, 28(%[hash])\n\t"
|
||||
UNALIGNED_SW8(t0, t1, t2, t3, a4, a5, a6, a7, 0, %[hash], t4)
|
||||
#else
|
||||
LOAD_WORD_REV(t0, 0, %[digest], t2, t3, t4)
|
||||
LOAD_WORD_REV(t1, 4, %[digest], t2, t3, t4)
|
||||
LOAD_WORD_REV(a4, 8, %[digest], t2, t3, t4)
|
||||
LOAD_WORD_REV(a5, 12, %[digest], t2, t3, t4)
|
||||
"sw t0, 0(%[hash])\n\t"
|
||||
"sw t1, 4(%[hash])\n\t"
|
||||
"sw a4, 8(%[hash])\n\t"
|
||||
"sw a5, 12(%[hash])\n\t"
|
||||
UNALIGNED_SW4(t0, t1, a4, a5, 0, %[hash], t2)
|
||||
LOAD_WORD_REV(t0, 16, %[digest], t2, t3, t4)
|
||||
LOAD_WORD_REV(t1, 20, %[digest], t2, t3, t4)
|
||||
LOAD_WORD_REV(a4, 24, %[digest], t2, t3, t4)
|
||||
LOAD_WORD_REV(a5, 28, %[digest], t2, t3, t4)
|
||||
"sw t0, 16(%[hash])\n\t"
|
||||
"sw t1, 20(%[hash])\n\t"
|
||||
"sw a4, 24(%[hash])\n\t"
|
||||
"sw a5, 28(%[hash])\n\t"
|
||||
UNALIGNED_SW4(t0, t1, a4, a5, 16, %[hash], t2)
|
||||
#endif
|
||||
:
|
||||
: [digest] "r" (sha256->digest), [hash] "r" (hash)
|
||||
|
||||
@@ -139,7 +139,7 @@ static const word64 hash_keccak_r[24] =
|
||||
|
||||
#endif
|
||||
|
||||
void BlockSha3(word64* s)
|
||||
WC_OMIT_FRAME_POINTER void BlockSha3(word64* s)
|
||||
{
|
||||
const word64* r = hash_keccak_r;
|
||||
|
||||
|
||||
@@ -554,14 +554,7 @@ static WC_INLINE void Sha512Transform(wc_Sha512* sha512, const byte* data,
|
||||
LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7)
|
||||
LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7)
|
||||
#else
|
||||
"ld t4, 0(%[data])\n\t"
|
||||
"ld s1, 8(%[data])\n\t"
|
||||
"ld s2, 16(%[data])\n\t"
|
||||
"ld s3, 24(%[data])\n\t"
|
||||
"ld s4, 32(%[data])\n\t"
|
||||
"ld s5, 40(%[data])\n\t"
|
||||
"ld s6, 48(%[data])\n\t"
|
||||
"ld s7, 56(%[data])\n\t"
|
||||
UNALIGNED_LD8(t4, s1, s2, s3, s4, s5, s6, s7, 0, %[data], t5)
|
||||
REV8(REG_T4, REG_T4)
|
||||
REV8(REG_S1, REG_S1)
|
||||
REV8(REG_S2, REG_S2)
|
||||
@@ -946,14 +939,7 @@ static WC_INLINE void Sha512Final(wc_Sha512* sha512, byte* hash, int hashLen)
|
||||
REV8(REG_S9, REG_S9)
|
||||
REV8(REG_S10, REG_S10)
|
||||
REV8(REG_S11, REG_S11)
|
||||
"sd t0, 0(%[hash])\n\t"
|
||||
"sd t1, 8(%[hash])\n\t"
|
||||
"sd t2, 16(%[hash])\n\t"
|
||||
"sd t3, 24(%[hash])\n\t"
|
||||
"sd s8, 32(%[hash])\n\t"
|
||||
"sd s9, 40(%[hash])\n\t"
|
||||
"sd s10, 48(%[hash])\n\t"
|
||||
"sd s11, 56(%[hash])\n\t"
|
||||
UNALIGNED_SD8(t0, t1, t2, t3, s8, s9, s10, s11, 0, %[hash], t4)
|
||||
#else
|
||||
LOAD_DWORD_REV(t0, 0, %[digest], a4, a5, a6, a7)
|
||||
LOAD_DWORD_REV(t1, 8, %[digest], a4, a5, a6, a7)
|
||||
@@ -963,14 +949,7 @@ static WC_INLINE void Sha512Final(wc_Sha512* sha512, byte* hash, int hashLen)
|
||||
LOAD_DWORD_REV(s9, 40, %[digest], a4, a5, a6, a7)
|
||||
LOAD_DWORD_REV(s10, 48, %[digest], a4, a5, a6, a7)
|
||||
LOAD_DWORD_REV(s11, 56, %[digest], a4, a5, a6, a7)
|
||||
"sd t0, 0(%[hash])\n\t"
|
||||
"sd t1, 8(%[hash])\n\t"
|
||||
"sd t2, 16(%[hash])\n\t"
|
||||
"sd t3, 24(%[hash])\n\t"
|
||||
"sd s8, 32(%[hash])\n\t"
|
||||
"sd s9, 40(%[hash])\n\t"
|
||||
"sd s10, 48(%[hash])\n\t"
|
||||
"sd s11, 56(%[hash])\n\t"
|
||||
UNALIGNED_SD8(t0, t1, t2, t3, s8, s9, s10, s11, 0, %[hash], t4)
|
||||
#endif
|
||||
:
|
||||
: [digest] "r" (sha512->digest), [hash] "r" (hashRes)
|
||||
|
||||
@@ -86,9 +86,10 @@ typedef struct ChaCha {
|
||||
byte extra[12];
|
||||
#endif
|
||||
word32 left; /* number of bytes leftover */
|
||||
#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(USE_ARM_CHACHA_SPEEDUP) || \
|
||||
defined(WOLFSSL_RISCV_ASM)
|
||||
#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(USE_ARM_CHACHA_SPEEDUP)
|
||||
word32 over[CHACHA_CHUNK_WORDS];
|
||||
#elif defined(WOLFSSL_RISCV_ASM)
|
||||
ALIGN8 word32 over[CHACHA_CHUNK_WORDS];
|
||||
#endif
|
||||
} ChaCha;
|
||||
|
||||
|
||||
@@ -181,6 +181,633 @@
|
||||
/* 32-bit width when loading. */
|
||||
#define WIDTH_32 0b110
|
||||
|
||||
/*
|
||||
* Scalar load/store helpers.
|
||||
*
|
||||
* Each macro performs the same operation as the ld/lwu/lw/lh/sd/sw/sh
|
||||
* instruction it is named after. By default it expands to that native
|
||||
* instruction. When built with WOLFSSL_RISCV_ASM_NO_UNALIGNED - for cores that
|
||||
* don't support misaligned access - it checks the effective address alignment
|
||||
* at run time and dispatches to the widest supported sequence: word-wise
|
||||
* (lwu/sw) when 4-byte aligned, half-wise (lhu/sh) when 2-byte aligned,
|
||||
* otherwise byte-wise (lbu/sb). The narrower _BY_BYTE / _BY_HALF / _BY_WORD
|
||||
* forms are also exposed for sites that already know the alignment. Values
|
||||
* are little-endian, matching the native instructions either way.
|
||||
*
|
||||
* Bulk variants UNALIGNED_<LD|SD|LWU|LW|SW><N> (N = 2, 4, 8) issue N
|
||||
* consecutive accesses starting at o(p) - stride 8 bytes for LD/SD, 4 bytes
|
||||
* for LWU/LW/SW. Under WOLFSSL_RISCV_ASM_NO_UNALIGNED they share a single
|
||||
* alignment check across all N elements; otherwise they are just N native
|
||||
* instructions back-to-back.
|
||||
*
|
||||
* r = data register: destination (loads) or source (stores)
|
||||
* o = constant byte offset
|
||||
* p = base address register
|
||||
* t = scratch register - must differ from r and p; clobbered. For stores the
|
||||
* data register r is preserved. Only used when
|
||||
* WOLFSSL_RISCV_ASM_NO_UNALIGNED is defined; ignored otherwise.
|
||||
*/
|
||||
|
||||
/* Apply X to N doublewords at 8-byte stride starting at o(p). */
|
||||
#define UNALIGNED_DW_REP2(X, r0, r1, o, p, t) \
|
||||
X(r0, o, p, t) \
|
||||
X(r1, o+8, p, t)
|
||||
#define UNALIGNED_DW_REP4(X, r0, r1, r2, r3, o, p, t) \
|
||||
X(r0, o, p, t) \
|
||||
X(r1, o+8, p, t) \
|
||||
X(r2, o+16, p, t) \
|
||||
X(r3, o+24, p, t)
|
||||
#define UNALIGNED_DW_REP8(X, r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
|
||||
X(r0, o, p, t) \
|
||||
X(r1, o+8, p, t) \
|
||||
X(r2, o+16, p, t) \
|
||||
X(r3, o+24, p, t) \
|
||||
X(r4, o+32, p, t) \
|
||||
X(r5, o+40, p, t) \
|
||||
X(r6, o+48, p, t) \
|
||||
X(r7, o+56, p, t)
|
||||
|
||||
/* Apply X to N words at 4-byte stride starting at o(p). */
|
||||
#define UNALIGNED_W_REP2(X, r0, r1, o, p, t) \
|
||||
X(r0, o, p, t) \
|
||||
X(r1, o+4, p, t)
|
||||
#define UNALIGNED_W_REP4(X, r0, r1, r2, r3, o, p, t) \
|
||||
X(r0, o, p, t) \
|
||||
X(r1, o+4, p, t) \
|
||||
X(r2, o+8, p, t) \
|
||||
X(r3, o+12, p, t)
|
||||
#define UNALIGNED_W_REP8(X, r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
|
||||
X(r0, o, p, t) \
|
||||
X(r1, o+4, p, t) \
|
||||
X(r2, o+8, p, t) \
|
||||
X(r3, o+12, p, t) \
|
||||
X(r4, o+16, p, t) \
|
||||
X(r5, o+20, p, t) \
|
||||
X(r6, o+24, p, t) \
|
||||
X(r7, o+28, p, t)
|
||||
|
||||
#ifndef WOLFSSL_RISCV_ASM_NO_UNALIGNED
|
||||
|
||||
/* Load 64-bits. */
|
||||
#define UNALIGNED_LD(r, o, p, t) \
|
||||
"ld " #r ", " #o "(" #p ")\n\t"
|
||||
|
||||
/* Load 32-bits, zero extended. */
|
||||
#define UNALIGNED_LWU(r, o, p, t) \
|
||||
"lwu " #r ", " #o "(" #p ")\n\t"
|
||||
|
||||
/* Load 32-bits, sign extended. */
|
||||
#define UNALIGNED_LW(r, o, p, t) \
|
||||
"lw " #r ", " #o "(" #p ")\n\t"
|
||||
|
||||
/* Load 16-bits, sign extended. */
|
||||
#define UNALIGNED_LH(r, o, p, t) \
|
||||
"lh " #r ", " #o "(" #p ")\n\t"
|
||||
|
||||
/* Store 64-bits. */
|
||||
#define UNALIGNED_SD(r, o, p, t) \
|
||||
"sd " #r ", " #o "(" #p ")\n\t"
|
||||
|
||||
/* Store 32-bits. */
|
||||
#define UNALIGNED_SW(r, o, p, t) \
|
||||
"sw " #r ", " #o "(" #p ")\n\t"
|
||||
|
||||
/* Store 16-bits. */
|
||||
#define UNALIGNED_SH(r, o, p, t) \
|
||||
"sh " #r ", " #o "(" #p ")\n\t"
|
||||
|
||||
/* Bulk variants - hardware handles unaligned access, so just emit N native
|
||||
* instructions. */
|
||||
#define UNALIGNED_LD2(r0, r1, o, p, t) \
|
||||
UNALIGNED_DW_REP2(UNALIGNED_LD, r0, r1, o, p, t)
|
||||
#define UNALIGNED_LD4(r0, r1, r2, r3, o, p, t) \
|
||||
UNALIGNED_DW_REP4(UNALIGNED_LD, r0, r1, r2, r3, o, p, t)
|
||||
#define UNALIGNED_LD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
|
||||
UNALIGNED_DW_REP8(UNALIGNED_LD, \
|
||||
r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
|
||||
#define UNALIGNED_SD2(r0, r1, o, p, t) \
|
||||
UNALIGNED_DW_REP2(UNALIGNED_SD, r0, r1, o, p, t)
|
||||
#define UNALIGNED_SD4(r0, r1, r2, r3, o, p, t) \
|
||||
UNALIGNED_DW_REP4(UNALIGNED_SD, r0, r1, r2, r3, o, p, t)
|
||||
#define UNALIGNED_SD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
|
||||
UNALIGNED_DW_REP8(UNALIGNED_SD, \
|
||||
r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
|
||||
#define UNALIGNED_LWU2(r0, r1, o, p, t) \
|
||||
UNALIGNED_W_REP2(UNALIGNED_LWU, r0, r1, o, p, t)
|
||||
#define UNALIGNED_LWU4(r0, r1, r2, r3, o, p, t) \
|
||||
UNALIGNED_W_REP4(UNALIGNED_LWU, r0, r1, r2, r3, o, p, t)
|
||||
#define UNALIGNED_LWU8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
|
||||
UNALIGNED_W_REP8(UNALIGNED_LWU, \
|
||||
r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
|
||||
#define UNALIGNED_LW2(r0, r1, o, p, t) \
|
||||
UNALIGNED_W_REP2(UNALIGNED_LW, r0, r1, o, p, t)
|
||||
#define UNALIGNED_LW4(r0, r1, r2, r3, o, p, t) \
|
||||
UNALIGNED_W_REP4(UNALIGNED_LW, r0, r1, r2, r3, o, p, t)
|
||||
#define UNALIGNED_LW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
|
||||
UNALIGNED_W_REP8(UNALIGNED_LW, \
|
||||
r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
|
||||
#define UNALIGNED_SW2(r0, r1, o, p, t) \
|
||||
UNALIGNED_W_REP2(UNALIGNED_SW, r0, r1, o, p, t)
|
||||
#define UNALIGNED_SW4(r0, r1, r2, r3, o, p, t) \
|
||||
UNALIGNED_W_REP4(UNALIGNED_SW, r0, r1, r2, r3, o, p, t)
|
||||
#define UNALIGNED_SW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
|
||||
UNALIGNED_W_REP8(UNALIGNED_SW, \
|
||||
r0, r1, r2, r3, r4, r5, r6, r7, o, p, t)
|
||||
|
||||
#else
|
||||
|
||||
/* Load 64-bits.
 *
 * Each macro expands to a string of RISC-V assembly for use inside an
 * inline asm statement.
 *
 * r  destination register.
 * o  constant byte offset from p (stringified into the address operand).
 * p  register holding the base address.
 * t  scratch register; overwritten.
 *
 * The value is assembled little-endian: the unit at offset o+0 becomes the
 * least significant part of r. r is written before the later loads through
 * p, and t is overwritten between uses, so r, p and t must all be different
 * registers.
 */

/* Assemble 64 bits from eight single-byte loads. */
#define UNALIGNED_LD_BY_BYTE(r, o, p, t) \
    "lbu " #r ", " #o "+0(" #p ")\n\t" \
    "lbu " #t ", " #o "+1(" #p ")\n\t" \
    "slli " #t ", " #t ", 8\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+2(" #p ")\n\t" \
    "slli " #t ", " #t ", 16\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+3(" #p ")\n\t" \
    "slli " #t ", " #t ", 24\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+4(" #p ")\n\t" \
    "slli " #t ", " #t ", 32\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+5(" #p ")\n\t" \
    "slli " #t ", " #t ", 40\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+6(" #p ")\n\t" \
    "slli " #t ", " #t ", 48\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+7(" #p ")\n\t" \
    "slli " #t ", " #t ", 56\n\t" \
    "or " #r ", " #r ", " #t "\n\t"

/* Assemble 64 bits from four 16-bit loads (address must be 2-byte aligned). */
#define UNALIGNED_LD_BY_HALF(r, o, p, t) \
    "lhu " #r ", " #o "+0(" #p ")\n\t" \
    "lhu " #t ", " #o "+2(" #p ")\n\t" \
    "slli " #t ", " #t ", 16\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lhu " #t ", " #o "+4(" #p ")\n\t" \
    "slli " #t ", " #t ", 32\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lhu " #t ", " #o "+6(" #p ")\n\t" \
    "slli " #t ", " #t ", 48\n\t" \
    "or " #r ", " #r ", " #t "\n\t"

/* Assemble 64 bits from two 32-bit loads (address must be 4-byte aligned). */
#define UNALIGNED_LD_BY_WORD(r, o, p, t) \
    "lwu " #r ", " #o "+0(" #p ")\n\t" \
    "lwu " #t ", " #o "+4(" #p ")\n\t" \
    "slli " #t ", " #t ", 32\n\t" \
    "or " #r ", " #r ", " #t "\n\t"

/* Single 64-bit load (address must be 8-byte aligned); t is unused. */
#define UNALIGNED_LD_BY_DWORD(r, o, p, t) \
    "ld " #r ", " #o "(" #p ")\n\t"

/* Load 64 bits using the widest access the alignment of p permits.
 *
 * Assumes o is a multiple of 8: only the low bits of p are tested, so o must
 * not change the alignment.
 *
 * Defines the numeric local labels 1, 2, 3 and 4; surrounding assembly must
 * not branch across this expansion using those label numbers.
 */
#define UNALIGNED_LD(r, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_LD_BY_DWORD(r, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_LD_BY_WORD(r, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_LD_BY_HALF(r, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_LD_BY_BYTE(r, o, p, t) \
    "4:\n\t"

/* Load 32-bits, zero extended.
 *
 * Each macro expands to a string of RISC-V assembly for use inside an
 * inline asm statement.
 *
 * r  destination register.
 * o  constant byte offset from p (stringified into the address operand).
 * p  register holding the base address.
 * t  scratch register; overwritten.
 *
 * The value is assembled little-endian from zero-extending loads, so the
 * upper 32 bits of r end up zero. r, p and t must all be different
 * registers: r is written before the later loads through p.
 */

/* Assemble 32 bits from four single-byte loads. */
#define UNALIGNED_LWU_BY_BYTE(r, o, p, t) \
    "lbu " #r ", " #o "+0(" #p ")\n\t" \
    "lbu " #t ", " #o "+1(" #p ")\n\t" \
    "slli " #t ", " #t ", 8\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+2(" #p ")\n\t" \
    "slli " #t ", " #t ", 16\n\t" \
    "or " #r ", " #r ", " #t "\n\t" \
    "lbu " #t ", " #o "+3(" #p ")\n\t" \
    "slli " #t ", " #t ", 24\n\t" \
    "or " #r ", " #r ", " #t "\n\t"

/* Assemble 32 bits from two 16-bit loads (address must be 2-byte aligned). */
#define UNALIGNED_LWU_BY_HALF(r, o, p, t) \
    "lhu " #r ", " #o "+0(" #p ")\n\t" \
    "lhu " #t ", " #o "+2(" #p ")\n\t" \
    "slli " #t ", " #t ", 16\n\t" \
    "or " #r ", " #r ", " #t "\n\t"

/* Single zero-extending 32-bit load (address must be 4-byte aligned). */
#define UNALIGNED_LWU_BY_WORD(r, o, p, t) \
    "lwu " #r ", " #o "(" #p ")\n\t"

/* Load 32 bits, zero extended, using the widest access the alignment of p
 * permits.
 *
 * Assumes o is a multiple of 4: only the low bits of p are tested.
 *
 * Defines the numeric local labels 1, 2 and 3; surrounding assembly must not
 * branch across this expansion using those label numbers.
 */
#define UNALIGNED_LWU(r, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_LWU_BY_WORD(r, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_LWU_BY_HALF(r, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_LWU_BY_BYTE(r, o, p, t) \
    "3:\n\t"

/* Load 32-bits, sign extended.
 *
 * Each macro expands to a string of RISC-V assembly for use inside an
 * inline asm statement.
 *
 * r  destination register.
 * o  constant byte offset from p (stringified into the address operand).
 * p  register holding the base address.
 * t  scratch register; overwritten.
 *
 * The byte and half-word variants reuse the zero-extending UNALIGNED_LWU_*
 * sequences and then sign-extend the low 32 bits of r with sext.w.
 */

/* Four single-byte loads followed by sign extension. */
#define UNALIGNED_LW_BY_BYTE(r, o, p, t) \
    UNALIGNED_LWU_BY_BYTE(r, o, p, t) \
    "sext.w " #r ", " #r "\n\t"

/* Two 16-bit loads followed by sign extension (2-byte aligned address). */
#define UNALIGNED_LW_BY_HALF(r, o, p, t) \
    UNALIGNED_LWU_BY_HALF(r, o, p, t) \
    "sext.w " #r ", " #r "\n\t"

/* Single sign-extending 32-bit load (4-byte aligned address). */
#define UNALIGNED_LW_BY_WORD(r, o, p, t) \
    "lw " #r ", " #o "(" #p ")\n\t"

/* Load 32 bits, sign extended, using the widest access the alignment of p
 * permits.
 *
 * Assumes o is a multiple of 4: only the low bits of p are tested.
 *
 * Defines the numeric local labels 1, 2 and 3; surrounding assembly must not
 * branch across this expansion using those label numbers.
 */
#define UNALIGNED_LW(r, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_LW_BY_WORD(r, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_LW_BY_HALF(r, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_LW_BY_BYTE(r, o, p, t) \
    "3:\n\t"

/* Load 16-bits, sign extended.
 *
 * r  destination register.
 * o  constant byte offset from p (stringified into the address operand).
 * p  register holding the base address.
 * t  scratch register; overwritten.
 *
 * The low byte is loaded zero-extended and the high byte sign-extended (lb),
 * so after the shift and OR the 16-bit value is sign-extended in r.
 * r, p and t must all be different registers.
 */
#define UNALIGNED_LH_BY_BYTE(r, o, p, t) \
    "lbu " #r ", " #o "+0(" #p ")\n\t" \
    "lb " #t ", " #o "+1(" #p ")\n\t" \
    "slli " #t ", " #t ", 8\n\t" \
    "or " #r ", " #r ", " #t "\n\t"

/* Always uses byte accesses; no alignment test is performed. */
#define UNALIGNED_LH(r, o, p, t) \
    UNALIGNED_LH_BY_BYTE(r, o, p, t)

/* Store 64-bits.
 *
 * Each macro expands to a string of RISC-V assembly for use inside an
 * inline asm statement.
 *
 * r  register holding the value to store; left unchanged.
 * o  constant byte offset from p (stringified into the address operand).
 * p  register holding the base address.
 * t  scratch register; overwritten.
 *
 * The value is written little-endian: the least significant unit of r goes
 * to offset o+0. t receives shifted copies of r, so t must differ from both
 * r and p.
 */

/* Write 64 bits as eight single-byte stores. */
#define UNALIGNED_SD_BY_BYTE(r, o, p, t) \
    "sb " #r ", " #o "+0(" #p ")\n\t" \
    "srli " #t ", " #r ", 8\n\t" \
    "sb " #t ", " #o "+1(" #p ")\n\t" \
    "srli " #t ", " #r ", 16\n\t" \
    "sb " #t ", " #o "+2(" #p ")\n\t" \
    "srli " #t ", " #r ", 24\n\t" \
    "sb " #t ", " #o "+3(" #p ")\n\t" \
    "srli " #t ", " #r ", 32\n\t" \
    "sb " #t ", " #o "+4(" #p ")\n\t" \
    "srli " #t ", " #r ", 40\n\t" \
    "sb " #t ", " #o "+5(" #p ")\n\t" \
    "srli " #t ", " #r ", 48\n\t" \
    "sb " #t ", " #o "+6(" #p ")\n\t" \
    "srli " #t ", " #r ", 56\n\t" \
    "sb " #t ", " #o "+7(" #p ")\n\t"

/* Write 64 bits as four 16-bit stores (address must be 2-byte aligned). */
#define UNALIGNED_SD_BY_HALF(r, o, p, t) \
    "sh " #r ", " #o "+0(" #p ")\n\t" \
    "srli " #t ", " #r ", 16\n\t" \
    "sh " #t ", " #o "+2(" #p ")\n\t" \
    "srli " #t ", " #r ", 32\n\t" \
    "sh " #t ", " #o "+4(" #p ")\n\t" \
    "srli " #t ", " #r ", 48\n\t" \
    "sh " #t ", " #o "+6(" #p ")\n\t"

/* Write 64 bits as two 32-bit stores (address must be 4-byte aligned). */
#define UNALIGNED_SD_BY_WORD(r, o, p, t) \
    "sw " #r ", " #o "+0(" #p ")\n\t" \
    "srli " #t ", " #r ", 32\n\t" \
    "sw " #t ", " #o "+4(" #p ")\n\t"

/* Single 64-bit store (address must be 8-byte aligned); t is unused. */
#define UNALIGNED_SD_BY_DWORD(r, o, p, t) \
    "sd " #r ", " #o "(" #p ")\n\t"

/* Store 64 bits using the widest access the alignment of p permits.
 *
 * Assumes o is a multiple of 8: only the low bits of p are tested, so o must
 * not change the alignment.
 *
 * Defines the numeric local labels 1, 2, 3 and 4; surrounding assembly must
 * not branch across this expansion using those label numbers.
 */
#define UNALIGNED_SD(r, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_SD_BY_DWORD(r, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_SD_BY_WORD(r, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_SD_BY_HALF(r, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_SD_BY_BYTE(r, o, p, t) \
    "4:\n\t"

/* Store 32-bits.
 *
 * Each macro expands to a string of RISC-V assembly for use inside an
 * inline asm statement.
 *
 * r  register holding the value to store (low 32 bits); left unchanged.
 * o  constant byte offset from p (stringified into the address operand).
 * p  register holding the base address.
 * t  scratch register; overwritten.
 *
 * The value is written little-endian. t receives shifted copies of r, so t
 * must differ from both r and p.
 */

/* Write 32 bits as four single-byte stores. */
#define UNALIGNED_SW_BY_BYTE(r, o, p, t) \
    "sb " #r ", " #o "+0(" #p ")\n\t" \
    "srli " #t ", " #r ", 8\n\t" \
    "sb " #t ", " #o "+1(" #p ")\n\t" \
    "srli " #t ", " #r ", 16\n\t" \
    "sb " #t ", " #o "+2(" #p ")\n\t" \
    "srli " #t ", " #r ", 24\n\t" \
    "sb " #t ", " #o "+3(" #p ")\n\t"

/* Write 32 bits as two 16-bit stores (address must be 2-byte aligned). */
#define UNALIGNED_SW_BY_HALF(r, o, p, t) \
    "sh " #r ", " #o "+0(" #p ")\n\t" \
    "srli " #t ", " #r ", 16\n\t" \
    "sh " #t ", " #o "+2(" #p ")\n\t"

/* Single 32-bit store (address must be 4-byte aligned); t is unused. */
#define UNALIGNED_SW_BY_WORD(r, o, p, t) \
    "sw " #r ", " #o "(" #p ")\n\t"

/* Store 32 bits using the widest access the alignment of p permits.
 *
 * Assumes o is a multiple of 4: only the low bits of p are tested.
 *
 * Defines the numeric local labels 1, 2 and 3; surrounding assembly must not
 * branch across this expansion using those label numbers.
 */
#define UNALIGNED_SW(r, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_SW_BY_WORD(r, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_SW_BY_HALF(r, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_SW_BY_BYTE(r, o, p, t) \
    "3:\n\t"

/* Store 16-bits.
 *
 * r  register holding the value to store (low 16 bits); left unchanged.
 * o  constant byte offset from p (stringified into the address operand).
 * p  register holding the base address.
 * t  scratch register; overwritten with r shifted right by 8, so it must
 *    differ from both r and p.
 *
 * The low byte is written to o+0 and the next byte to o+1.
 */
#define UNALIGNED_SH_BY_BYTE(r, o, p, t) \
    "sb " #r ", " #o "+0(" #p ")\n\t" \
    "srli " #t ", " #r ", 8\n\t" \
    "sb " #t ", " #o "+1(" #p ")\n\t"

/* Always uses byte accesses; no alignment test is performed. */
#define UNALIGNED_SH(r, o, p, t) \
    UNALIGNED_SH_BY_BYTE(r, o, p, t)

/* Load 2 64-bits. Assumes o is a multiple of 8.
 *
 * The alignment of p is tested once and the chosen access width
 * (dword / word / half / byte) is used for both registers.
 * UNALIGNED_DW_REP2 is defined earlier in this file; presumably it applies
 * the given load to r0 and r1 at successive 8-byte offsets - confirm there.
 *
 * r0, r1  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2, 3 and 4.
 */
#define UNALIGNED_LD2(r0, r1, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_DWORD, r0, r1, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_WORD, r0, r1, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_HALF, r0, r1, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_LD_BY_BYTE, r0, r1, o, p, t) \
    "4:\n\t"

/* Load 4 64-bits. Assumes o is a multiple of 8.
 *
 * The alignment of p is tested once and the chosen access width
 * (dword / word / half / byte) is used for all four registers.
 * UNALIGNED_DW_REP4 is defined earlier in this file; presumably it applies
 * the given load to r0..r3 at successive 8-byte offsets - confirm there.
 *
 * r0..r3  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2, 3 and 4.
 */
#define UNALIGNED_LD4(r0, r1, r2, r3, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_DWORD, r0, r1, r2, r3, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_WORD, r0, r1, r2, r3, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_HALF, r0, r1, r2, r3, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_LD_BY_BYTE, r0, r1, r2, r3, o, p, t) \
    "4:\n\t"

/* Load 8 64-bits. Assumes o is a multiple of 8.
 *
 * The alignment of p is tested once and the chosen access width
 * (dword / word / half / byte) is used for all eight registers.
 * UNALIGNED_DW_REP8 is defined earlier in this file; presumably it applies
 * the given load to r0..r7 at successive 8-byte offsets - confirm there.
 *
 * r0..r7  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2, 3 and 4.
 */
#define UNALIGNED_LD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_DWORD, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_WORD, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_HALF, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_LD_BY_BYTE, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "4:\n\t"

/* Store 2 64-bits. Assumes o is a multiple of 8.
 *
 * The alignment of p is tested once and the chosen access width
 * (dword / word / half / byte) is used for both registers.
 * UNALIGNED_DW_REP2 is defined earlier in this file; presumably it applies
 * the given store to r0 and r1 at successive 8-byte offsets - confirm there.
 *
 * r0, r1  registers holding the values to store.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2, 3 and 4.
 */
#define UNALIGNED_SD2(r0, r1, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_DWORD, r0, r1, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_WORD, r0, r1, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_HALF, r0, r1, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_DW_REP2(UNALIGNED_SD_BY_BYTE, r0, r1, o, p, t) \
    "4:\n\t"

/* Store 4 64-bits. Assumes o is a multiple of 8.
 *
 * The alignment of p is tested once and the chosen access width
 * (dword / word / half / byte) is used for all four registers.
 * UNALIGNED_DW_REP4 is defined earlier in this file; presumably it applies
 * the given store to r0..r3 at successive 8-byte offsets - confirm there.
 *
 * r0..r3  registers holding the values to store.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2, 3 and 4.
 */
#define UNALIGNED_SD4(r0, r1, r2, r3, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_DWORD, r0, r1, r2, r3, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_WORD, r0, r1, r2, r3, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_HALF, r0, r1, r2, r3, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_DW_REP4(UNALIGNED_SD_BY_BYTE, r0, r1, r2, r3, o, p, t) \
    "4:\n\t"

/* Store 8 64-bits. Assumes o is a multiple of 8.
 *
 * The alignment of p is tested once and the chosen access width
 * (dword / word / half / byte) is used for all eight registers.
 * UNALIGNED_DW_REP8 is defined earlier in this file; presumably it applies
 * the given store to r0..r7 at successive 8-byte offsets - confirm there.
 *
 * r0..r7  registers holding the values to store.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2, 3 and 4.
 */
#define UNALIGNED_SD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "andi " #t ", " #p ", 7\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_DWORD, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 4f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 3\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_WORD, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 4f\n\t" \
    "2:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 3f\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_HALF, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 4f\n\t" \
    "3:\n\t" \
    UNALIGNED_DW_REP8(UNALIGNED_SD_BY_BYTE, \
                      r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "4:\n\t"

/* Load 2 32-bits, zero extended. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for both registers. UNALIGNED_W_REP2 is
 * defined earlier in this file; presumably it applies the given load to r0
 * and r1 at successive 4-byte offsets - confirm there.
 *
 * r0, r1  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_LWU2(r0, r1, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_LWU_BY_WORD, r0, r1, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_LWU_BY_HALF, r0, r1, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_LWU_BY_BYTE, r0, r1, o, p, t) \
    "3:\n\t"

/* Load 4 32-bits, zero extended. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for all four registers. UNALIGNED_W_REP4 is
 * defined earlier in this file; presumably it applies the given load to
 * r0..r3 at successive 4-byte offsets - confirm there.
 *
 * r0..r3  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_LWU4(r0, r1, r2, r3, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_LWU_BY_WORD, r0, r1, r2, r3, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_LWU_BY_HALF, r0, r1, r2, r3, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_LWU_BY_BYTE, r0, r1, r2, r3, o, p, t) \
    "3:\n\t"

/* Load 8 32-bits, zero extended. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for all eight registers. UNALIGNED_W_REP8 is
 * defined earlier in this file; presumably it applies the given load to
 * r0..r7 at successive 4-byte offsets - confirm there.
 *
 * r0..r7  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_LWU8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_LWU_BY_WORD, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_LWU_BY_HALF, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_LWU_BY_BYTE, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "3:\n\t"

/* Load 2 32-bits, sign extended. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for both registers. UNALIGNED_W_REP2 is
 * defined earlier in this file; presumably it applies the given load to r0
 * and r1 at successive 4-byte offsets - confirm there.
 *
 * r0, r1  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_LW2(r0, r1, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_LW_BY_WORD, r0, r1, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_LW_BY_HALF, r0, r1, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_LW_BY_BYTE, r0, r1, o, p, t) \
    "3:\n\t"

/* Load 4 32-bits, sign extended. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for all four registers. UNALIGNED_W_REP4 is
 * defined earlier in this file; presumably it applies the given load to
 * r0..r3 at successive 4-byte offsets - confirm there.
 *
 * r0..r3  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_LW4(r0, r1, r2, r3, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_LW_BY_WORD, r0, r1, r2, r3, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_LW_BY_HALF, r0, r1, r2, r3, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_LW_BY_BYTE, r0, r1, r2, r3, o, p, t) \
    "3:\n\t"

/* Load 8 32-bits, sign extended. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for all eight registers. UNALIGNED_W_REP8 is
 * defined earlier in this file; presumably it applies the given load to
 * r0..r7 at successive 4-byte offsets - confirm there.
 *
 * r0..r7  destination registers.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_LW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_LW_BY_WORD, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_LW_BY_HALF, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_LW_BY_BYTE, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "3:\n\t"

/* Store 2 32-bits. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for both registers. UNALIGNED_W_REP2 is
 * defined earlier in this file; presumably it applies the given store to r0
 * and r1 at successive 4-byte offsets - confirm there.
 *
 * r0, r1  registers holding the values to store.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_SW2(r0, r1, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_SW_BY_WORD, r0, r1, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_SW_BY_HALF, r0, r1, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP2(UNALIGNED_SW_BY_BYTE, r0, r1, o, p, t) \
    "3:\n\t"

/* Store 4 32-bits. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for all four registers. UNALIGNED_W_REP4 is
 * defined earlier in this file; presumably it applies the given store to
 * r0..r3 at successive 4-byte offsets - confirm there.
 *
 * r0..r3  registers holding the values to store.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_SW4(r0, r1, r2, r3, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_SW_BY_WORD, r0, r1, r2, r3, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_SW_BY_HALF, r0, r1, r2, r3, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP4(UNALIGNED_SW_BY_BYTE, r0, r1, r2, r3, o, p, t) \
    "3:\n\t"

/* Store 8 32-bits. Assumes o is a multiple of 4.
 *
 * The alignment of p is tested once and the chosen access width
 * (word / half / byte) is used for all eight registers. UNALIGNED_W_REP8 is
 * defined earlier in this file; presumably it applies the given store to
 * r0..r7 at successive 4-byte offsets - confirm there.
 *
 * r0..r7  registers holding the values to store.
 * o       constant byte offset from p.
 * p       register holding the base address.
 * t       scratch register; overwritten.
 *
 * Defines the numeric local labels 1, 2 and 3.
 */
#define UNALIGNED_SW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "andi " #t ", " #p ", 3\n\t" \
    "bnez " #t ", 1f\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_SW_BY_WORD, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 3f\n\t" \
    "1:\n\t" \
    "andi " #t ", " #t ", 1\n\t" \
    "bnez " #t ", 2f\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_SW_BY_HALF, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "j 3f\n\t" \
    "2:\n\t" \
    UNALIGNED_W_REP8(UNALIGNED_SW_BY_BYTE, \
                     r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \
    "3:\n\t"

#endif /* !WOLFSSL_RISCV_ASM_NO_UNALIGNED */
|
||||
|
||||
|
||||
#define VLSEG_V(vd, rs1, cnt, width) \
|
||||
ASM_WORD(0b0000111 | (width << 12) | (0b10101000 << 20) | \
|
||||
|
||||
Reference in New Issue
Block a user