diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index d9126bf07..30cd73594 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -1269,8 +1269,6 @@ static void GMULT(byte* X, byte* Y) } #endif -/* Currently is a copy from GCM_SMALL wolfSSL version. Duplicated and set - * seperate for future optimizations. */ static void GHASH(Aes* aes, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { @@ -1320,28 +1318,35 @@ static void GHASH(Aes* aes, const byte* a, word32 aSz, FlattenSzInBits(&scratch[0], aSz); FlattenSzInBits(&scratch[8], cSz); xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, h); - /* Copy the result into s. */ + /* Copy the result (minus last GMULT) into s. */ XMEMCPY(s, x, sSz); } -int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +/* internal function : see wc_AesGcmEncrypt */ +static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { - word32 blocks = sz / AES_BLOCK_SIZE; - word32 partial = sz % AES_BLOCK_SIZE; - const byte* p = in; - byte* c = out; + word32 blocks; + word32 partial; byte counter[AES_BLOCK_SIZE]; byte initialCounter[AES_BLOCK_SIZE]; - byte *ctr ; + byte x[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; - ctr = counter ; + /* Different optimization levels treat the head of an array differently: + in some cases it is the stack pointer plus an offset, in others a register + holding the address. To keep this uniform when passing into the inline + assembly code, pointers to the head of each local array are used. + */ + byte* ctr = counter; + byte* iCtr = initialCounter; + byte* xPt = x; + byte* sPt = scratch; + byte* keyPt; /* pointer to handle pointer advancement */ XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); if (ivSz == NONCE_SZ) { @@ -1350,37 +1355,56 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, } else { GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); + } + XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); + + + /* Hash in the Additional Authentication Data */ + XMEMSET(x, 0, AES_BLOCK_SIZE); + if (authInSz != 0 && authIn != NULL) { + blocks = authInSz / AES_BLOCK_SIZE; + partial = authInSz % AES_BLOCK_SIZE; + /* do as many blocks as possible */ + while (blocks--) { + xorbuf(x, authIn, AES_BLOCK_SIZE); + GMULT(x, aes->H); + authIn += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, authIn, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } } - XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); /* do as many blocks as possible */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz % AES_BLOCK_SIZE; if (blocks > 0) { - /* pointer needed because it is incremented when read, causing - * an issue with call to encrypt/decrypt leftovers */ - byte* keyPt = (byte*)aes->key; - switch(aes->rounds) { - case 10: /* AES 128 BLOCK */ - __asm__ __volatile__ ( + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v13.2d}, [%[ctr]] \n" "#Create vector with the value 1 \n" "MOVI v14.16b, #1 \n" "USHR v14.2d, v14.2d, #56 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EOR v13.16b, v13.16b, v13.16b \n" - "EXT v14.16b, v14.16b, v13.16b, #8\n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EXT v14.16b, v14.16b, v22.16b, #8\n" - "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" - "LD1 {v13.2d}, 
[%[ctr]] \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "AESGCM128Block: \n" + /*************************************************** + Get first out block for GHASH using AES encrypt + ***************************************************/ "REV64 v13.16b, v13.16b \n" /* network order */ + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" "EXT v13.16b, v13.16b, v13.16b, #8 \n" "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ "EXT v13.16b, v13.16b, v13.16b, #8 \n" "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" "MOV v0.16b, v13.16b \n" "AESE v0.16b, v1.16b \n" "AESMC v0.16b, v0.16b \n" @@ -1388,17 +1412,23 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v3.16b \n" "AESMC v0.16b, v0.16b \n" + "LD1 {v16.2d}, %[inY] \n" "AESE v0.16b, v4.16b \n" "AESMC v0.16b, v0.16b \n" "SUB w11, w11, #1 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" "AESE v0.16b, v5.16b \n" "AESMC v0.16b, v0.16b \n" + "MOVI v23.16b, #0x87 \n" "AESE v0.16b, v6.16b \n" "AESMC v0.16b, v0.16b \n" + "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ "AESE v0.16b, v7.16b \n" "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d}, [%[input]], #16 \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v10.16b \n" @@ -1406,190 +1436,945 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v0.16b, v0.16b, v12.16b \n" "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v0.16b \n" + + "CBZ w11, AESGCMend \n" /* only one block, jump to final GHASH */ + "LD1 {v12.2d}, [%[input]], #16 \n" + + /*************************************************** + Interweave GHASH and encrypt if more than one block + ***************************************************/ + "AESGCM128Block: \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "EOR v15.16b, v17.16b, v15.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "MOV v0.16b, v13.16b \n" + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v10.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v0.16b, 
v0.16b, v11.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "MOV v15.16b, v0.16b \n" + "RBIT v17.16b, v19.16b \n" "CBZ w11, AESGCMend \n" "LD1 {v12.2d}, [%[input]], #16 \n" "B AESGCM128Block \n" + /*************************************************** + GHASH on last block + ***************************************************/ "AESGCMend: \n" - "#store current counter value at the end \n" + "EOR v15.16b, v17.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ + + "#store current AES counter value \n" "ST1 {v13.2d}, [%[ctrOut]] \n" + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ - :[out] "=r" (c), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (p) - :"0" (c), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (p) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "#Reduce product from multiplication \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "EOR v19.16b, v19.16b, v21.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "RBIT v17.16b, v19.16b \n" + "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ + + :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) + ,[xOut] "=r" (xPt),"=m" (aes->H) + :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (in) + ,[inX] "4" (xPt), [inY] "m" (aes->H) + : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ); - break; - - case 12: /* AES 192 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64\n" - - "#Create vector with the value 1 \n" - "MOVI v16.16b, #1 \n" - "USHR v16.2d, v16.2d, #56 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EOR v14.16b, v14.16b, v14.16b \n" - "EXT v16.16b, v16.16b, v14.16b, #8\n" - - "LD1 {v9.2d-v12.2d}, [%[Key]], #64\n" - "LD1 {v13.2d}, [%[Key]], #16 \n" - "LD1 {v14.2d}, [%[input]], #16 \n" - "LD1 {v15.2d}, [%[ctr]] \n" - - "AESGCM192Block: \n" - "REV64 v15.16b, v15.16b \n" /* network order */ - "EXT v15.16b, v15.16b, v15.16b, #8 \n" - "ADD v15.2d, v15.2d, v16.2d \n" /* add 1 to counter */ - "EXT v15.16b, v15.16b, v15.16b, #8 \n" - "REV64 v15.16b, v15.16b \n" /* revert from network order */ - "MOV v0.16b, v15.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - 
"AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n" - - "EOR v0.16b, v0.16b, v14.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, AESGCM192end \n" - "LD1 {v14.2d}, [%[input]], #16 \n" - "B AESGCM192Block \n" - - "AESGCM192end: \n" - "#store current counter value at the end \n" - "ST1 {v15.16b}, [%[ctrOut]] \n" - - :[out] "=r" (c), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (p) - :"0" (c), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (p) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16" - ); - break; - case 14: /* AES 256 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "#Create vector with the value 1 \n" - "MOVI v18.16b, #1 \n" - "USHR v18.2d, v18.2d, #56 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EOR v19.16b, v19.16b, v19.16b \n" - "EXT v18.16b, v18.16b, v19.16b, #8 \n" - - "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" - "LD1 {v17.2d}, [%[ctr]] \n" - "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" - "LD1 {v16.2d}, [%[input]], #16 \n" - - "AESGCM256Block: \n" - "REV64 v17.16b, v17.16b \n" /* network order */ - "EXT v17.16b, v17.16b, v17.16b, #8 \n" - "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */ - "EXT v17.16b, v17.16b, v17.16b, #8 \n" - "REV64 v17.16b, v17.16b \n" /* revert from network order */ - "MOV v0.16b, v17.16b \n" - - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v14.16b \n" - "EOR v0.16b, v0.16b, v15.16b \n" - - "EOR v0.16b, v0.16b, v16.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, AESGCM256end \n" - "LD1 {v16.2d}, [%[input]], #16 \n" - "B AESGCM256Block \n" - - "AESGCM256end:\n" - "#store current counter value at the end \n" - "ST1 {v17.2d}, [%[ctrOut]] \n" - - :[out] "=r" (c), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (p) - :"0" (c), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (p) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v17", "v18", "v19" - ); - break; - - default: - WOLFSSL_MSG("Bad AES-GCM round value"); - return BAD_FUNC_ARG; - } + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12" + ); } + /* take care of partial block sizes leftover */ if (partial != 0) { - IncrementGcmCounter(ctr); - wc_AesEncrypt(aes, ctr, scratch); - xorbuf(scratch, p, partial); - XMEMCPY(c, scratch, partial); + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); + xorbuf(scratch, in, partial); + XMEMCPY(out, scratch, partial); + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, out, partial); + 
xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); } - GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz); - wc_AesEncrypt(aes, initialCounter, scratch); - xorbuf(authTag, scratch, authTagSz); + /* Hash in the lengths of A and C in bits */ + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + FlattenSzInBits(&scratch[0], authInSz); + FlattenSzInBits(&scratch[8], sz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + XMEMCPY(scratch, x, AES_BLOCK_SIZE); + + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + + "LD1 {v16.16b}, [%[tag]] \n" + "LD1 {v17.16b}, %[h] \n" + "RBIT v16.16b, v16.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ + "LD1 {v0.2d}, [%[ctr]] \n" + + "#Set a register to all 0s using EOR \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "RBIT v19.16b, v19.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + "EOR v19.16b, v19.16b, v0.16b \n" + "STR q19, [%[out]] \n" + + :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) + :[tag] "0" (sPt), [Key] "1" (keyPt), + [ctr] "2" (iCtr) , [h] "m" (aes->H) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", + "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + ); + + + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); return 0; } +/* internal function : see wc_AesGcmEncrypt */ +static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks; + word32 partial; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte x[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + + /* Different optimization levels treat the head of an array differently: + in some cases it is the stack pointer plus an offset, in others a register + holding the address. To keep this uniform when passing into the inline + assembly code, pointers to the head of each local array are used. 
+ */ + byte* ctr = counter; + byte* iCtr = initialCounter; + byte* xPt = x; + byte* sPt = scratch; + byte* keyPt; /* pointer to handle pointer advancement */ + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == NONCE_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); + } + XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); + + + /* Hash in the Additional Authentication Data */ + XMEMSET(x, 0, AES_BLOCK_SIZE); + if (authInSz != 0 && authIn != NULL) { + blocks = authInSz / AES_BLOCK_SIZE; + partial = authInSz % AES_BLOCK_SIZE; + /* do as many blocks as possible */ + while (blocks--) { + xorbuf(x, authIn, AES_BLOCK_SIZE); + GMULT(x, aes->H); + authIn += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, authIn, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + } + + /* do as many blocks as possible */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz % AES_BLOCK_SIZE; + if (blocks > 0) { + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v13.2d}, [%[ctr]] \n" + + "#Create vector with the value 1 \n" + "MOVI v14.16b, #1 \n" + "USHR v14.2d, v14.2d, #56 \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EXT v14.16b, v14.16b, v22.16b, #8\n" + + + /*************************************************** + Get first out block for GHASH using AES encrypt + ***************************************************/ + "REV64 v13.16b, v13.16b \n" /* network order */ + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v16.2d}, %[inY] \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v30.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v0.16b \n" + + "CBZ w11, AESGCM192end \n" /* only one block, jump to final GHASH */ + "LD1 {v12.2d}, [%[input]], #16 \n" + + /*************************************************** + Interweave GHASH and encrypt if more than one block + ***************************************************/ + "AESGCM192Block: \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "EOR v15.16b, v17.16b, v15.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ + "EXT v13.16b, 
v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "MOV v0.16b, v13.16b \n" + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v30.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "MOV v15.16b, v0.16b \n" + "RBIT v17.16b, v19.16b \n" + + "CBZ w11, AESGCM192end \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "B AESGCM192Block \n" + + /*************************************************** + GHASH on last block + ***************************************************/ + "AESGCM192end: \n" + "EOR v15.16b, v17.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ + + "#store current AES counter value \n" + "ST1 {v13.2d}, [%[ctrOut]] \n" + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + + "#Reduce product from multiplication \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "EOR v19.16b, v19.16b, v21.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "RBIT v17.16b, v19.16b \n" + "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ + + :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) + ,[xOut] "=r" (xPt),"=m" (aes->H) + :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (in) + ,[inX] "4" (xPt), [inY] "m" (aes->H) + : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", 
"v14" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12" + ); + } + + /* take care of partial block sizes leftover */ + if (partial != 0) { + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); + xorbuf(scratch, in, partial); + XMEMCPY(out, scratch, partial); + + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, out, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + + /* Hash in the lengths of A and C in bits */ + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + FlattenSzInBits(&scratch[0], authInSz); + FlattenSzInBits(&scratch[8], sz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + XMEMCPY(scratch, x, AES_BLOCK_SIZE); + + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + + "LD1 {v16.16b}, [%[tag]] \n" + "LD1 {v17.16b}, %[h] \n" + "RBIT v16.16b, v16.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n" + "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ + "LD1 {v0.2d}, [%[ctr]] \n" + + "#Set a register to all 0s using EOR \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v30.16b \n" + "RBIT v19.16b, v19.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v19.16b, v19.16b, v0.16b \n" + "STR q19, [%[out]] \n" + + :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) + :[tag] "0" (sPt), [Key] "1" (keyPt), + [ctr] "2" (iCtr) , [h] "m" (aes->H) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", + "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + ); + + + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + + return 0; +} + + +/* internal function : see wc_AesGcmEncrypt */ +static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks; + word32 partial; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte x[AES_BLOCK_SIZE]; + 
byte scratch[AES_BLOCK_SIZE]; + + /* Different optimization levels treat the head of an array differently: + in some cases it is the stack pointer plus an offset, in others a register + holding the address. To keep this uniform when passing into the inline + assembly code, pointers to the head of each local array are used. + */ + byte* ctr = counter; + byte* iCtr = initialCounter; + byte* xPt = x; + byte* sPt = scratch; + byte* keyPt; /* pointer to handle pointer advancement */ + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == NONCE_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); + } + XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); + + + /* Hash in the Additional Authentication Data */ + XMEMSET(x, 0, AES_BLOCK_SIZE); + if (authInSz != 0 && authIn != NULL) { + blocks = authInSz / AES_BLOCK_SIZE; + partial = authInSz % AES_BLOCK_SIZE; + /* do as many blocks as possible */ + while (blocks--) { + xorbuf(x, authIn, AES_BLOCK_SIZE); + GMULT(x, aes->H); + authIn += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, authIn, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + } + + /* do as many blocks as possible */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz % AES_BLOCK_SIZE; + if (blocks > 0) { + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v13.2d}, [%[ctr]] \n" + + "#Create vector with the value 1 \n" + "MOVI v14.16b, #1 \n" + "USHR v14.2d, v14.2d, #56 \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EXT v14.16b, v14.16b, v22.16b, #8\n" + + + /*************************************************** + Get first out block for GHASH using AES encrypt + ***************************************************/ + "REV64 v13.16b, v13.16b \n" /* network order */ + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v16.2d}, %[inY] \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v28.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v29.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v30.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v0.16b \n" + + "CBZ w11, AESGCM256end \n" /* only one block, jump to final GHASH */ + "LD1 {v12.2d}, [%[input]], #16 \n" + + 
/*************************************************** + Interweave GHASH and encrypt if more than one block + ***************************************************/ + "AESGCM256Block: \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "EOR v15.16b, v17.16b, v15.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "MOV v0.16b, v13.16b \n" + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v28.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v29.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v30.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "MOV v15.16b, v0.16b \n" + "RBIT v17.16b, v19.16b \n" + + "CBZ w11, AESGCM256end \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "B AESGCM256Block \n" + + /*************************************************** + GHASH on last block + ***************************************************/ + "AESGCM256end: \n" + "EOR v15.16b, v17.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ + + "#store current AES counter value \n" + "ST1 {v13.2d}, [%[ctrOut]] \n" + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + + "#Reduce product from multiplication \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "EOR v19.16b, v19.16b, v21.16b \n" + "EXT v21.16b, v22.16b, 
v20.16b, #8 \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "RBIT v17.16b, v19.16b \n" + "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ + + :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) + ,[xOut] "=r" (xPt),"=m" (aes->H) + :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (in) + ,[inX] "4" (xPt), [inY] "m" (aes->H) + : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12" + ); + } + + /* take care of partial block sizes leftover */ + if (partial != 0) { + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); + xorbuf(scratch, in, partial); + XMEMCPY(out, scratch, partial); + + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, out, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + + /* Hash in the lengths of A and C in bits */ + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + FlattenSzInBits(&scratch[0], authInSz); + FlattenSzInBits(&scratch[8], sz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + XMEMCPY(scratch, x, AES_BLOCK_SIZE); + + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + + "LD1 {v16.16b}, [%[tag]] \n" + "LD1 {v17.16b}, %[h] \n" + "RBIT v16.16b, v16.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n" + "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ + "LD1 {v0.2d}, [%[ctr]] \n" + + "#Set a register to all 0s using EOR \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v28.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v29.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v30.16b \n" + "RBIT v19.16b, v19.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v19.16b, v19.16b, v0.16b \n" + "STR q19, [%[out]] \n" + + :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) + :[tag] "0" (sPt), [Key] "1" (keyPt), + [ctr] "2" (iCtr) , [h] "m" 
(aes->H) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", + "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + ); + + + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + + return 0; +} + + +/* aarch64 with PMULL and PMULL2 + * Encrypt and tag data using AES with GCM mode. + * aes: Aes structure having already been set with the set key function + * out: encrypted data output buffer + * in: plain text input buffer + * sz: size of plain text and out buffer + * iv: initialization vector + * ivSz: size of iv buffer + * authTag: buffer to hold tag + * authTagSz: size of tag buffer + * authIn: additional data buffer + * authInSz: size of additional data buffer + * + * Notes: + * GHASH multiplication based on Algorithm 1 from the Intel GCM white paper + * "Carry-Less Multiplication and Its Usage for Computing the GCM Mode" + * + * GHASH reduction based on the white paper "Implementing GCM on ARMv8" + * by Conrado P.L. Gouvea and Julio Lopez; the 256-bit value is reduced + * using Algorithm 5 + */ +int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + /* sanity checks */ + if (authTagSz > AES_BLOCK_SIZE) { + WOLFSSL_MSG("parameter authTagSz cannot be larger than 16 bytes"); + return BAD_FUNC_ARG; /* is bigger than the scratch buffer */ + } + + if (aes == NULL || (iv == NULL && ivSz > 0) || + (authTag == NULL && authTagSz > 0) || + (authIn == NULL && authInSz > 0) || + (in == NULL && sz > 0) || + (out == NULL && authTag == NULL)) { + WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); + return BAD_FUNC_ARG; + } + + switch (aes->rounds) { + case 10: + return Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + + case 12: + return Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + + case 14: + return Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + + default: + WOLFSSL_MSG("AES-GCM invalid round number"); + return BAD_FUNC_ARG; + } +} + + #ifdef HAVE_AES_DECRYPT +/* + * Check tag and decrypt data using AES with GCM mode. 
+ * aes: Aes structure having already been set with the set key function + * out: decrypted data output buffer + * in: cipher text buffer + * sz: size of cipher text and out buffer + * iv: initialization vector + * ivSz: size of iv buffer + * authTag: buffer holding tag + * authTagSz: size of tag buffer + * authIn: additional data buffer + * authInSz: size of additional data buffer + */ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, @@ -1606,6 +2391,21 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, ctr = counter ; + /* sanity checks */ + if (authTagSz > AES_BLOCK_SIZE) { + WOLFSSL_MSG("parameter authTagSz cannot be larger than 16 bytes"); + return BAD_FUNC_ARG; /* is bigger than the scratch buffer */ + } + + if (aes == NULL || (iv == NULL && ivSz > 0) || + (authTag == NULL && authTagSz > 0) || + (authIn == NULL && authInSz > 0) || + (in == NULL && sz > 0) || + (out == NULL && authTag == NULL)) { + WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); + return BAD_FUNC_ARG; + } + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); if (ivSz == NONCE_SZ) { XMEMCPY(initialCounter, iv, ivSz); @@ -1613,6 +2413,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, } else { GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); } XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); @@ -1623,6 +2424,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, byte EKY0[AES_BLOCK_SIZE]; GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime)); + GMULT(Tprime, aes->H); wc_AesEncrypt(aes, ctr, EKY0); xorbuf(Tprime, EKY0, sizeof(Tprime));
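Reviewer note (not part of the patch). The hunks above drop the final GMULT from GHASH() itself, so every caller now follows GHASH() with an explicit GMULT(x, aes->H); when checking the PMULL/PMULL2 paths it helps to keep the bit-serial multiply they replace in view. Below is a minimal plain-C sketch of that multiply, equivalent in behavior to the GCM_SMALL GMULT (X = X * Y in GF(2^128) using GCM's reflected bit order); the name soft_gmult and the GCM_BLOCK_SZ macro are illustrative stand-ins, not wolfSSL API.

#include <string.h>
#include <stdint.h>

#define GCM_BLOCK_SZ 16

/* X = X * Y in GF(2^128), GCM bit order; mirrors the table-free
 * bit-serial multiply that the PMULL assembly above replaces. */
static void soft_gmult(uint8_t X[GCM_BLOCK_SZ], const uint8_t Y[GCM_BLOCK_SZ])
{
    uint8_t Z[GCM_BLOCK_SZ] = {0};
    uint8_t V[GCM_BLOCK_SZ];
    int i, j, k;

    memcpy(V, X, GCM_BLOCK_SZ);
    for (i = 0; i < GCM_BLOCK_SZ; i++) {
        uint8_t y = Y[i];
        for (j = 0; j < 8; j++) {
            if (y & 0x80) {                 /* bit of Y set: Z ^= V */
                for (k = 0; k < GCM_BLOCK_SZ; k++)
                    Z[k] ^= V[k];
            }
            /* V *= x: shift right one bit in GCM's reflected
             * representation, folding the carry back in with
             * R = 0xE1 (x^128 + x^7 + x^2 + x + 1) */
            {
                int carry = V[GCM_BLOCK_SZ - 1] & 0x01;
                for (k = GCM_BLOCK_SZ - 1; k > 0; k--)
                    V[k] = (uint8_t)((V[k] >> 1) | (V[k - 1] << 7));
                V[0] >>= 1;
                if (carry)
                    V[0] ^= 0xE1;
            }
            y <<= 1;
        }
    }
    memcpy(X, Z, GCM_BLOCK_SZ);
}

Against this reference, the RBIT instructions around each PMULL/PMULL2 sequence undo the reflected bit order, which is what lets the 256-bit product be reduced with the 0x87 constant (Algorithm 5 of the Gouvea/Lopez paper cited in the new wc_AesGcmEncrypt comment) instead of the per-bit 0xE1 fold shown here.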