ARMv8 : AES-CTR/CBC/GCM speed ups and refactor AES

Jacob Barthelmeh
2016-09-15 06:03:48 +00:00
parent 78c0f98ea9
commit 6d82cba29c
5 changed files with 2254 additions and 464 deletions

View File

@@ -89,8 +89,12 @@ endif
endif
if BUILD_AES
if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes.c
endif
endif
if BUILD_CMAC
src_libwolfssl_la_SOURCES += wolfcrypt/src/cmac.c

View File

@@ -344,153 +344,6 @@ void wc_AesAsyncFree(Aes* aes)
#ifdef HAVE_AES_DECRYPT
#error nRF51 AES Hardware does not support decrypt
#endif /* HAVE_AES_DECRYPT */
#elif defined(WOLFSSL_ARMASM)
static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
byte* keyPt = (byte*)aes->key;
word32 rounds = aes->rounds;
/*
AESE XORs the input with the round key,
shifts the rows of the XORed result,
and substitutes bytes of the shifted rows
*/
__asm__ __volatile__ (
"LD1 {v0.16b}, [%[CtrIn]] \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v3.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v3.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"#subtract rounds done so far and see if should continue\n"
"MOV w12, %w[R] \n"
"SUB w12, w12, #10 \n"
"CBZ w12, final \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"SUB w12, w12, #2 \n"
"CBZ w12, final \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"#Final AddRoundKey then store result \n"
"final: \n"
"LD1 {v1.16b}, [%[Key]], #16 \n"
"EOR v0.16b, v0.16b, v1.16b \n"
"ST1 {v0.16b}, [%[CtrOut]] \n"
:[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (rounds),
"=r" (inBlock)
:"0" (outBlock), [Key] "1" (keyPt), [R] "2" (rounds),
[CtrIn] "3" (inBlock)
: "cc", "memory", "w12"
);
return 0;
}
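/* Illustrative sketch (not part of this diff): driving the single-block
 * routine above through the public direct API. Assumes the build defines
 * WOLFSSL_AES_DIRECT so wc_AesSetKeyDirect()/wc_AesEncryptDirect() are
 * available; the helper name is a placeholder and error handling is minimal. */
#include <wolfssl/wolfcrypt/aes.h>

static int encrypt_one_block(const byte* key16, const byte* in16, byte* out16)
{
    Aes aes;
    int ret = wc_AesSetKeyDirect(&aes, key16, 16, NULL, AES_ENCRYPTION);
    if (ret != 0)
        return ret;
    wc_AesEncryptDirect(&aes, out16, in16); /* one AES_BLOCK_SIZE block */
    return 0;
}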
#ifdef HAVE_AES_DECRYPT
static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
byte* keyPt = (byte*)aes->key;
word32 rounds = aes->rounds;
/*
AESD XORs the input with the round key,
applies inverse shift rows to the XORed result,
and applies inverse byte substitution to the shifted rows
*/
__asm__ __volatile__ (
"LD1 {v0.16b}, [%[CtrIn]] \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v3.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v4.16b \n"
"AESIMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v3.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v4.16b \n"
"AESIMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"#subtract rounds done so far and see if should continue\n"
"MOV w12, %w[R] \n"
"SUB w12, w12, #10 \n"
"CBZ w12, finalDec \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"SUB w12, w12, #2 \n"
"CBZ w12, finalDec \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"#Final AddRoundKey then store result \n"
"finalDec: \n"
"LD1 {v1.16b}, [%[Key]], #16 \n"
"EOR v0.16b, v0.16b, v1.16b \n"
"ST1 {v0.4s}, [%[CtrOut]] \n"
:[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (rounds),
"=r" (inBlock)
:"0" (outBlock), [Key] "1" (keyPt), [R] "2" (rounds),
[CtrIn] "3" (inBlock)
: "cc", "memory", "w12"
);
return 0;
}
#endif /* HAVE_AES_DECRYPT */
#else
/* using wolfCrypt software AES implementation */
@@ -1794,196 +1647,6 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
return wc_AesSetKey(aes, userKey, keylen, iv, dir);
}
#elif defined(WOLFSSL_ARMASM)
static const byte rcon[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,0x1B, 0x36
/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
/* Similar to the wolfSSL software implementation of expanding the AES key.
* Changed the locations where table lookups were made to use the hardware
* instruction instead. Also altered the decryption key setup to match. */
int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
const byte* iv, int dir)
{
word32 temp, *rk = aes->key;
unsigned int i = 0;
#if defined(AES_MAX_KEY_SIZE)
const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);
#endif
if (!((keylen == 16) || (keylen == 24) || (keylen == 32)))
return BAD_FUNC_ARG;
#if defined(AES_MAX_KEY_SIZE)
/* Check key length */
if (keylen > max_key_len) {
return BAD_FUNC_ARG;
}
#endif
#ifdef WOLFSSL_AES_COUNTER
aes->left = 0;
#endif /* WOLFSSL_AES_COUNTER */
aes->rounds = keylen/4 + 6;
XMEMCPY(rk, userKey, keylen);
switch(keylen)
{
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128
case 16:
while (1)
{
temp = rk[3];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
temp = rotrFixed(temp, 8);
rk[4] = rk[0] ^ temp ^ rcon[i];
rk[5] = rk[4] ^ rk[1];
rk[6] = rk[5] ^ rk[2];
rk[7] = rk[6] ^ rk[3];
if (++i == 10)
break;
rk += 4;
}
break;
#endif /* 128 */
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192
case 24:
/* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */
while (1)
{
temp = rk[5];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
temp = rotrFixed(temp, 8);
rk[ 6] = rk[ 0] ^ temp ^ rcon[i];
rk[ 7] = rk[ 1] ^ rk[ 6];
rk[ 8] = rk[ 2] ^ rk[ 7];
rk[ 9] = rk[ 3] ^ rk[ 8];
if (++i == 8)
break;
rk[10] = rk[ 4] ^ rk[ 9];
rk[11] = rk[ 5] ^ rk[10];
rk += 6;
}
break;
#endif /* 192 */
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256
case 32:
while (1)
{
temp = rk[7];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
temp = rotrFixed(temp, 8);
rk[8] = rk[0] ^ temp ^ rcon[i];
rk[ 9] = rk[ 1] ^ rk[ 8];
rk[10] = rk[ 2] ^ rk[ 9];
rk[11] = rk[ 3] ^ rk[10];
if (++i == 7)
break;
temp = rk[11];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
rk[12] = rk[ 4] ^ temp;
rk[13] = rk[ 5] ^ rk[12];
rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14];
rk += 8;
}
break;
#endif /* 256 */
default:
return BAD_FUNC_ARG;
}
if (dir == AES_DECRYPTION)
{
#ifdef HAVE_AES_DECRYPT
unsigned int j;
rk = aes->key;
/* invert the order of the round keys: */
for (i = 0, j = 4* aes->rounds; i < j; i += 4, j -= 4) {
temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
}
/* apply the inverse MixColumn transform to all round keys but the
first and the last: */
for (i = 1; i < aes->rounds; i++) {
rk += 4;
__asm__ volatile (
"LD1 {v0.16b}, [%[in]] \n"
"AESIMC v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]]\n"
: [out] "=r" (rk)
: [in] "0" (rk)
: "cc", "memory", "v0"
);
}
#else
WOLFSSL_MSG("AES Decryption not compiled in");
return BAD_FUNC_ARG;
#endif /* HAVE_AES_DECRYPT */
}
return wc_AesSetIV(aes, iv);
}
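/* Reference note (a restatement of FIPS-197, not introduced by this patch):
 * the 128-bit branch above computes
 *     rk[4] = rk[0] ^ rotr8(SubWord(rk[3])) ^ rcon[i]
 * The DUP/MOVI/AESE/UMOV snippet yields SubWord(temp): with every lane of the
 * state holding the same word and the round key set to zero, AESE's ShiftRows
 * step is a no-op, leaving only the S-box substitution. Because SubWord works
 * bytewise it commutes with rotation, so RotWord can be applied afterwards as
 * rotrFixed(temp, 8). */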
#if defined(WOLFSSL_AES_DIRECT)
int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
const byte* iv, int dir)
{
return wc_AesSetKey(aes, userKey, keylen, iv, dir);
}
#endif
#else
static int wc_AesSetKeyLocal(Aes* aes, const byte* userKey, word32 keylen,
const byte* iv, int dir)
@@ -3165,7 +2828,7 @@ static INLINE void IncrementGcmCounter(byte* inOutCtr)
}
#if defined(GCM_SMALL) || defined(GCM_TABLE) || defined(WOLFSSL_ARMASM)
#if defined(GCM_SMALL) || defined(GCM_TABLE)
static INLINE void FlattenSzInBits(byte* buf, word32 sz)
{
@@ -3249,20 +2912,6 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
if (ret == 0) {
wc_AesEncrypt(aes, iv, aes->H);
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
{
word32* pt = (word32*)aes->H;
__asm__ volatile (
"LD1 {v0.16b}, [%[h]] \n"
"RBIT v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]] \n"
: [out] "=r" (pt)
: [h] "0" (pt)
: "cc", "memory"
);
return ret; /* no need to generate GCM_TABLE */
}
#endif
#ifdef GCM_TABLE
GenerateM0(aes);
#endif /* GCM_TABLE */
@@ -3699,118 +3348,7 @@ static int AES_GCM_decrypt(const unsigned char *in,
#endif /* WOLFSSL_AESNI */
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
/* PMULL and RBIT only with AArch64 */
/* Use ARM hardware for polynomial multiply */
static void GMULT(byte* X, byte* Y)
{
word32* Xpt = (word32*)X;
word32* Ypt = (word32*)Y;
__asm__ volatile (
"LD1 {v0.16b}, [%[inX]] \n"
"LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */
"RBIT v0.16b, v0.16b \n"
/* Algorithm 1 from Intel GCM white paper.
"Carry-Less Multiplication and Its Usage for Computing the GCM Mode"
*/
"PMULL v3.1q, v0.1d, v1.1d \n" /* a0 * b0 = C */
"PMULL2 v4.1q, v0.2d, v1.2d \n" /* a1 * b1 = D */
"EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */
"PMULL v6.1q, v0.1d, v5.1d \n" /* a0 * b1 = E */
"PMULL2 v5.1q, v0.2d, v5.2d \n" /* a1 * b0 = F */
"#Set a register to all 0s using EOR \n"
"EOR v7.16b, v7.16b, v7.16b \n"
"EOR v5.16b, v5.16b, v6.16b \n" /* F ^ E */
"EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */
"EOR v3.16b, v3.16b, v6.16b \n" /* low 128 bits in v3 */
"EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */
"EOR v4.16b, v4.16b, v6.16b \n" /* high 128 bits in v4 */
/* Based on the white paper "Implementing GCM on ARMv8"
by Conrado P. L. Gouvea and Julio Lopez:
reduction of the 256-bit value using Algorithm 5 */
"MOVI v8.16b, #0x87 \n"
"USHR v8.2d, v8.2d, #56 \n"
/* v8 is now 0x00000000000000870000000000000087 reflected 0xe1....*/
"PMULL2 v5.1q, v4.2d, v8.2d \n"
"EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */
"EOR v4.16b, v4.16b, v6.16b \n"
"EXT v6.16b, v7.16b, v5.16b, #8 \n"
"EOR v3.16b, v3.16b, v6.16b \n"
"PMULL v5.1q, v4.1d, v8.1d \n"
"EOR v4.16b, v3.16b, v5.16b \n"
"RBIT v4.16b, v4.16b \n"
"STR q4, [%[out]] \n"
: [out] "=r" (Xpt), "=r" (Ypt)
: [inX] "0" (Xpt), [inY] "1" (Ypt)
: "cc", "memory", "v3", "v4", "v5", "v6", "v7", "v8"
);
}
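/* Background note (a restatement of the cited papers, not new code): with
 * X = a1*x^64 + a0 and Y = b1*x^64 + b0, the schoolbook carry-less product is
 *     X*Y = D*x^128 + (E ^ F)*x^64 + C
 * where C = a0*b0, D = a1*b1, E = a0*b1 and F = a1*b0, matching the PMULL/
 * PMULL2 results above. The 256-bit result is then reduced modulo the GHASH
 * polynomial x^128 + x^7 + x^2 + x + 1; the 0x87 constant loaded into v8 is
 * the bit pattern of x^7 + x^2 + x + 1 used for that reduction (Algorithm 5
 * of the Gouvea/Lopez paper). */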
/* Currently a copy of the GCM_SMALL wolfSSL version. Duplicated and kept
* separate for future optimizations. */
static void GHASH(Aes* aes, const byte* a, word32 aSz,
const byte* c, word32 cSz, byte* s, word32 sSz)
{
byte x[AES_BLOCK_SIZE];
byte scratch[AES_BLOCK_SIZE];
word32 blocks, partial;
byte* h = aes->H;
XMEMSET(x, 0, AES_BLOCK_SIZE);
/* Hash in A, the Additional Authentication Data */
if (aSz != 0 && a != NULL) {
blocks = aSz / AES_BLOCK_SIZE;
partial = aSz % AES_BLOCK_SIZE;
while (blocks--) {
xorbuf(x, a, AES_BLOCK_SIZE);
GMULT(x, h);
a += AES_BLOCK_SIZE;
}
if (partial != 0) {
XMEMSET(scratch, 0, AES_BLOCK_SIZE);
XMEMCPY(scratch, a, partial);
xorbuf(x, scratch, AES_BLOCK_SIZE);
GMULT(x, h);
}
}
/* Hash in C, the Ciphertext */
if (cSz != 0 && c != NULL) {
blocks = cSz / AES_BLOCK_SIZE;
partial = cSz % AES_BLOCK_SIZE;
while (blocks--) {
xorbuf(x, c, AES_BLOCK_SIZE);
GMULT(x, h);
c += AES_BLOCK_SIZE;
}
if (partial != 0) {
XMEMSET(scratch, 0, AES_BLOCK_SIZE);
XMEMCPY(scratch, c, partial);
xorbuf(x, scratch, AES_BLOCK_SIZE);
GMULT(x, h);
}
}
/* Hash in the lengths of A and C in bits */
FlattenSzInBits(&scratch[0], aSz);
FlattenSzInBits(&scratch[8], cSz);
xorbuf(x, scratch, AES_BLOCK_SIZE);
GMULT(x, h);
/* Copy the result into s. */
XMEMCPY(s, x, sSz);
}
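/* Illustrative sketch (not part of this diff): how GMULT/GHASH above are
 * exercised through the public AEAD API. The helper name and buffer names are
 * placeholders; error handling is minimal. */
#include <wolfssl/wolfcrypt/aes.h>

static int gcm_seal(const byte* key, word32 keySz,
                    const byte* iv, word32 ivSz,
                    const byte* aad, word32 aadSz,
                    const byte* msg, word32 msgSz,
                    byte* out, byte* tag /* AES_BLOCK_SIZE bytes */)
{
    Aes aes;
    int ret = wc_AesGcmSetKey(&aes, key, keySz);
    if (ret == 0)
        ret = wc_AesGcmEncrypt(&aes, out, msg, msgSz, iv, ivSz,
                               tag, AES_BLOCK_SIZE, aad, aadSz);
    return ret;
}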
/* not using ARMASM for multiplication */
#elif defined(GCM_SMALL)
#if defined(GCM_SMALL)
static void GMULT(byte* X, byte* Y)
{
byte Z[AES_BLOCK_SIZE];

File diff suppressed because it is too large.

View File

@@ -69,6 +69,10 @@ int wolfCrypt_Init()
}
#endif
#ifdef WOLFSSL_ARMASM
WOLFSSL_MSG("Using ARM hardware acceleration");
#endif
initRefCount = 1;
}

View File

@@ -2815,6 +2815,7 @@ int aes_test(void)
ret = wc_AesCbcEncrypt(&enc, bigCipher, bigMsg, msgSz);
if (ret != 0)
return -1032;
ret = wc_AesCbcDecrypt(&dec, bigPlain, bigCipher, msgSz);
if (ret != 0)
return -1033;
@@ -2878,6 +2879,64 @@ int aes_test(void)
0xc2
};
/* test vector from "Recommendation for Block Cipher Modes of Operation"
* NIST Special Publication 800-38A */
const byte ctr192Key[] =
{
0x8e,0x73,0xb0,0xf7,0xda,0x0e,0x64,0x52,
0xc8,0x10,0xf3,0x2b,0x80,0x90,0x79,0xe5,
0x62,0xf8,0xea,0xd2,0x52,0x2c,0x6b,0x7b
};
const byte ctr192Iv[] =
{
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,
0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
};
const byte ctr192Plain[] =
{
0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96,
0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a
};
const byte ctr192Cipher[] =
{
0x1a,0xbc,0x93,0x24,0x17,0x52,0x1c,0xa2,
0x4f,0x2b,0x04,0x59,0xfe,0x7e,0x6e,0x0b
};
/* test vector from "Recommendation for Block Cipher Modes of Operation"
* NIST Special Publication 800-38A */
const byte ctr256Key[] =
{
0x60,0x3d,0xeb,0x10,0x15,0xca,0x71,0xbe,
0x2b,0x73,0xae,0xf0,0x85,0x7d,0x77,0x81,
0x1f,0x35,0x2c,0x07,0x3b,0x61,0x08,0xd7,
0x2d,0x98,0x10,0xa3,0x09,0x14,0xdf,0xf4
};
const byte ctr256Iv[] =
{
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,
0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
};
const byte ctr256Plain[] =
{
0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96,
0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a
};
const byte ctr256Cipher[] =
{
0x60,0x1e,0xc3,0x13,0x77,0x57,0x89,0xa5,
0xb7,0xa7,0xf5,0x04,0xbb,0xf3,0xd2,0x28
};
wc_AesSetKeyDirect(&enc, ctrKey, AES_BLOCK_SIZE, ctrIv, AES_ENCRYPTION);
/* Ctr only uses encrypt, even on key setup */
wc_AesSetKeyDirect(&dec, ctrKey, AES_BLOCK_SIZE, ctrIv, AES_ENCRYPTION);
@@ -2914,6 +2973,40 @@ int aes_test(void)
if (XMEMCMP(cipher, oddCipher, 9))
return -71;
/* 192 bit key */
wc_AesSetKeyDirect(&enc, ctr192Key, sizeof(ctr192Key),
ctr192Iv, AES_ENCRYPTION);
/* Ctr only uses encrypt, even on key setup */
wc_AesSetKeyDirect(&dec, ctr192Key, sizeof(ctr192Key),
ctr192Iv, AES_ENCRYPTION);
XMEMSET(plain, 0, sizeof(plain));
wc_AesCtrEncrypt(&enc, plain, ctr192Cipher, sizeof(ctr192Cipher));
if (XMEMCMP(plain, ctr192Plain, sizeof(ctr192Plain)))
return -72;
wc_AesCtrEncrypt(&dec, cipher, ctr192Plain, sizeof(ctr192Plain));
if (XMEMCMP(ctr192Cipher, cipher, sizeof(ctr192Cipher)))
return -73;
/* 256 bit key */
wc_AesSetKeyDirect(&enc, ctr256Key, sizeof(ctr256Key),
ctr256Iv, AES_ENCRYPTION);
/* Ctr only uses encrypt, even on key setup */
wc_AesSetKeyDirect(&dec, ctr256Key, sizeof(ctr256Key),
ctr256Iv, AES_ENCRYPTION);
XMEMSET(plain, 0, sizeof(plain));
wc_AesCtrEncrypt(&enc, plain, ctr256Cipher, sizeof(ctr256Cipher));
if (XMEMCMP(plain, ctr256Plain, sizeof(ctr256Plain)))
return -74;
wc_AesCtrEncrypt(&dec, cipher, ctr256Plain, sizeof(ctr256Plain));
if (XMEMCMP(ctr256Cipher, cipher, sizeof(ctr256Cipher)))
return -75;
}
#endif /* WOLFSSL_AES_COUNTER */
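/* Illustrative sketch (not part of this diff): CTR-mode streaming as the
 * odd-sized calls above exercise it. Consecutive wc_AesCtrEncrypt() calls may
 * use any length; leftover key-stream bytes are carried in the Aes object,
 * and decryption uses the same routine with a key scheduled for encryption.
 * The helper name and chunk split are placeholders; return checks omitted. */
#include <wolfssl/wolfcrypt/aes.h>

static void ctr_stream_example(const byte* key16, const byte* iv16,
                               const byte* msg, word32 msgSz, byte* out)
{
    Aes aes;
    word32 firstChunk = 9; /* deliberately not a multiple of AES_BLOCK_SIZE */

    wc_AesSetKeyDirect(&aes, key16, 16, iv16, AES_ENCRYPTION);
    if (msgSz <= firstChunk) {
        wc_AesCtrEncrypt(&aes, out, msg, msgSz);
        return;
    }
    wc_AesCtrEncrypt(&aes, out, msg, firstChunk);
    wc_AesCtrEncrypt(&aes, out + firstChunk, msg + firstChunk,
                     msgSz - firstChunk);
}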