forked from wolfSSL/wolfssl
initial ARMv8 instructions
This commit is contained in:
@@ -344,7 +344,160 @@ void wc_AesAsyncFree(Aes* aes)
|
||||
#ifdef HAVE_AES_DECRYPT
|
||||
#error nRF51 AES Hardware does not support decrypt
|
||||
#endif /* HAVE_AES_DECRYPT */
|
||||
#elif defined(WOLFSSL_ARMASM)
|
||||
/* Encrypt one AES block using the ARMv8 Crypto Extensions.
 *
 * aes      initialized key schedule (aes->key, aes->rounds)
 * inBlock  16-byte plaintext input
 * outBlock 16-byte ciphertext output (may alias inBlock; result is staged
 *          in a local buffer and copied out)
 * returns  0 on success
 *
 * AESE performs AddRoundKey + SubBytes + ShiftRows in one instruction;
 * AESMC performs MixColumns.  The first 10 rounds are unrolled, then the
 * round count is checked to run the 2 (AES-192) or 4 (AES-256) extra
 * rounds before the final AddRoundKey.
 */
static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
    byte* keyPt = (byte*)aes->key;
    word32 rounds = aes->rounds;
    byte out[AES_BLOCK_SIZE];
    byte* output = out;
    byte* input = (byte*)inBlock;

    /*
      AESE exor's input with round key
      shift rows of exor'ed result
      sub bytes for shifted rows
    */
    __asm__ __volatile__ (
        "LD1 {v0.16b}, [%[CtrIn]], #16  \n"
        "LD1 {v1.16b-v4.16b}, [%[Key]], #64  \n"

        "AESE v0.16b, v1.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v3.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v4.16b  \n"
        "AESMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v4.16b}, [%[Key]], #64  \n"
        "AESE v0.16b, v1.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v3.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v4.16b  \n"
        "AESMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v2.16b}, [%[Key]], #32  \n"
        "AESE v0.16b, v1.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b  \n"

        "#subtract rounds done so far and see if should continue\n"
        "MOV w12, %w[R]    \n"
        "SUB w12, w12, #10 \n"
        /* use a numeric local label ("1f"/"1:") instead of a named one so
         * the asm can safely be emitted more than once per object file */
        "CBZ w12, 1f       \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v1.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b  \n"

        "SUB w12, w12, #2 \n"
        "CBZ w12, 1f      \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v1.16b  \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b  \n"

        "#Final AddRoundKey then store result \n"
        "1: \n"
        "LD1 {v1.16b}, [%[Key]], #16 \n"
        "EOR v0.16b, v0.16b, v1.16b  \n"
        "ST1 {v0.16b}, [%[CtrOut]]   \n"

        /* CtrIn is post-incremented by LD1, so it must be an in/out
         * operand; declaring it input-only (as before) lets the compiler
         * assume the register is unchanged. v0-v4 are written by the asm
         * and must appear in the clobber list. */
        :[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds), "=r" (input)
        :[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "3" (input),
         "0" (output)
        : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
    );

    XMEMCPY(outBlock, out, AES_BLOCK_SIZE);

    return 0;
}
|
||||
#ifdef HAVE_AES_DECRYPT
|
||||
/* Decrypt one AES block using the ARMv8 Crypto Extensions.
 *
 * aes      initialized key schedule; wc_AesSetKey() has already inverted
 *          the round-key order and applied AESIMC for decryption
 * inBlock  16-byte ciphertext input
 * outBlock 16-byte plaintext output (may alias inBlock; result is staged
 *          in a local buffer and copied out)
 * returns  0 on success
 *
 * Mirror of wc_AesEncrypt using AESD (inverse SubBytes/ShiftRows +
 * AddRoundKey) and AESIMC (inverse MixColumns).
 */
static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
    byte* keyPt = (byte*)aes->key;
    word32 rounds = aes->rounds;
    byte out[AES_BLOCK_SIZE];
    byte* output = out;
    byte* input = (byte*)inBlock;

    /*
      AESE exor's input with round key
      shift rows of exor'ed result
      sub bytes for shifted rows
    */
    __asm__ __volatile__ (
        "LD1 {v0.16b}, [%[CtrIn]], #16  \n"
        "LD1 {v1.16b-v4.16b}, [%[Key]], #64  \n"

        "AESD v0.16b, v1.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v3.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v4.16b   \n"
        "AESIMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v4.16b}, [%[Key]], #64  \n"
        "AESD v0.16b, v1.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v3.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v4.16b   \n"
        "AESIMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v2.16b}, [%[Key]], #32  \n"
        "AESD v0.16b, v1.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b   \n"

        "#subtract rounds done so far and see if should continue\n"
        "MOV w12, %w[R]    \n"
        "SUB w12, w12, #10 \n"
        /* numeric local label instead of "finalDec" so the asm can be
         * emitted more than once per object file without symbol clashes */
        "CBZ w12, 1f       \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32  \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v1.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b   \n"

        "SUB w12, w12, #2 \n"
        "CBZ w12, 1f      \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32  \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v1.16b   \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b   \n"

        "#Final AddRoundKey then store result \n"
        "1: \n"
        "LD1 {v1.16b}, [%[Key]], #16 \n"
        "EOR v0.16b, v0.16b, v1.16b  \n"
        /* store as .16b to match the .16b load above and the encrypt
         * path (was .4s; byte-identical on little-endian, but keeps the
         * element size consistent) */
        "ST1 {v0.16b}, [%[CtrOut]]   \n"

        /* v0-v4 are written by the asm and must be declared clobbered */
        :[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds), "=r" (input)
        :[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "3" (input),
         "0" (output)
        : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
    );

    XMEMCPY(outBlock, out, AES_BLOCK_SIZE);

    return 0;
}
|
||||
#endif /* HAVE_AES_DECRYPT */
|
||||
#else
|
||||
|
||||
/* using wolfCrypt software AES implementation */
|
||||
@@ -1533,7 +1686,6 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
|
||||
}
|
||||
#endif /* HAVE_AES_DECRYPT */
|
||||
#endif /* HAVE_AES_CBC || WOLFSSL_AES_DIRECT */
|
||||
|
||||
#endif /* NEED_AES_TABLES */
|
||||
|
||||
|
||||
@@ -1678,6 +1830,196 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
|
||||
{
|
||||
return wc_AesSetKey(aes, userKey, keylen, iv, dir);
|
||||
}
|
||||
#elif defined(WOLFSSL_ARMASM)
|
||||
/* AES key-schedule round constants (x^(i-1) in GF(2^8)).
 * For 128-bit blocks, Rijndael never uses more than 10 rcon values. */
static const byte rcon[] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,0x1B, 0x36
};
|
||||
|
||||
|
||||
/* Similar to wolfSSL software implementation of expanding the AES key.
|
||||
* Changed out the locations of where table look ups where made to
|
||||
* use hardware instruction. Also altered decryption key to match. */
|
||||
/* Expand an AES key into the round-key schedule, using the ARMv8 AESE
 * instruction as a hardware S-box in place of software table lookups.
 *
 * aes     AES context to fill (aes->key, aes->rounds)
 * userKey raw key material
 * keylen  key length in bytes: 16, 24, or 32
 * iv      optional IV, passed to wc_AesSetIV()
 * dir     AES_ENCRYPTION or AES_DECRYPTION; for decryption the schedule
 *         is additionally inverted and run through AESIMC
 * returns 0 on success, BAD_FUNC_ARG on invalid length/arguments,
 *         otherwise the result of wc_AesSetIV()
 */
int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
        const byte* iv, int dir)
{
    word32 temp, *rk = aes->key;
    unsigned int i = 0;

#if defined(AES_MAX_KEY_SIZE)
    const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);  /* bits -> bytes */
#endif

    /* only 128/192/256-bit keys are valid */
    if (!((keylen == 16) || (keylen == 24) || (keylen == 32)))
        return BAD_FUNC_ARG;

#if defined(AES_MAX_KEY_SIZE)
    /* Check key length */
    if (keylen > max_key_len) {
        return BAD_FUNC_ARG;
    }
#endif

#ifdef WOLFSSL_AES_COUNTER
    aes->left = 0;  /* no leftover CTR key-stream bytes yet */
#endif /* WOLFSSL_AES_COUNTER */

    /* 10/12/14 rounds for 16/24/32-byte keys */
    aes->rounds = keylen/4 + 6;
    XMEMCPY(rk, userKey, keylen);

    switch(keylen)
    {
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128
    case 16:
        while (1)
        {
            temp = rk[3];

            /* get table value from hardware: AESE on a zeroed state with
             * temp replicated to every lane reduces to SubWord(temp) —
             * DUP makes all columns equal so ShiftRows has no effect, and
             * the zero state makes AddRoundKey a no-op */
            __asm__ volatile (
                "DUP v1.4s, %w[in]  \n"
                "MOVI v0.16b, #0    \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            /* rotate to complete RotWord(SubWord(temp)) */
            temp = rotrFixed(temp, 8);
            rk[4] = rk[0] ^ temp ^ rcon[i];
            rk[5] = rk[4] ^ rk[1];
            rk[6] = rk[5] ^ rk[2];
            rk[7] = rk[6] ^ rk[3];
            if (++i == 10)
                break;
            rk += 4;
        }
        break;
#endif /* 128 */

#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192
    case 24:
        /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */
        while (1)
        {
            temp = rk[5];

            /* get table value from hardware (SubWord via AESE, see the
             * 128-bit case for how this works) */
            __asm__ volatile (
                "DUP v1.4s, %w[in]  \n"
                "MOVI v0.16b, #0    \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            temp = rotrFixed(temp, 8);
            rk[ 6] = rk[ 0] ^ temp ^ rcon[i];
            rk[ 7] = rk[ 1] ^ rk[ 6];
            rk[ 8] = rk[ 2] ^ rk[ 7];
            rk[ 9] = rk[ 3] ^ rk[ 8];
            if (++i == 8)
                break;
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
            rk += 6;
        }
        break;
#endif /* 192 */

#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256
    case 32:
        while (1)
        {
            temp = rk[7];

            /* get table value from hardware (SubWord via AESE) */
            __asm__ volatile (
                "DUP v1.4s, %w[in]  \n"
                "MOVI v0.16b, #0    \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            temp = rotrFixed(temp, 8);
            rk[8] = rk[0] ^ temp ^ rcon[i];
            rk[ 9] = rk[ 1] ^ rk[ 8];
            rk[10] = rk[ 2] ^ rk[ 9];
            rk[11] = rk[ 3] ^ rk[10];
            if (++i == 7)
                break;
            temp = rk[11];

            /* get table value from hardware; AES-256 applies SubWord
             * WITHOUT RotWord on this second half-step, hence no
             * rotrFixed() here */
            __asm__ volatile (
                "DUP v1.4s, %w[in]  \n"
                "MOVI v0.16b, #0    \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            rk[12] = rk[ 4] ^ temp;
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];

            rk += 8;
        }
        break;
#endif /* 256 */

    default:
        return BAD_FUNC_ARG;
    }

    if (dir == AES_DECRYPTION)
    {
#ifdef HAVE_AES_DECRYPT
        unsigned int j;
        rk = aes->key;

        /* invert the order of the round keys: */
        for (i = 0, j = 4 * aes->rounds; i < j; i += 4, j -= 4) {
            temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
            temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
            temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
            temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
        }
        /* apply the inverse MixColumn transform to all round keys but the
           first and the last: */
        for (i = 1; i < aes->rounds; i++) {
            rk += 4;
            /* in-place AESIMC on one 16-byte round key */
            __asm__ volatile (
                "LD1 {v0.16b}, [%[in]] \n"
                "AESIMC v0.16b, v0.16b \n"
                "ST1 {v0.16b}, [%[out]]\n"
                : [out] "=r" (rk)
                : [in] "0" (rk)
                : "cc", "memory", "v0"
            );
        }
#else
        WOLFSSL_MSG("AES Decryption not compiled in");
        return BAD_FUNC_ARG;
#endif /* HAVE_AES_DECRYPT */
    }

    return wc_AesSetIV(aes, iv);
}
|
||||
|
||||
#if defined(WOLFSSL_AES_DIRECT)
|
||||
int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
|
||||
const byte* iv, int dir)
|
||||
{
|
||||
return wc_AesSetKey(aes, userKey, keylen, iv, dir);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
static int wc_AesSetKeyLocal(Aes* aes, const byte* userKey, word32 keylen,
|
||||
const byte* iv, int dir)
|
||||
@@ -2859,7 +3201,7 @@ static INLINE void IncrementGcmCounter(byte* inOutCtr)
|
||||
}
|
||||
|
||||
|
||||
#if defined(GCM_SMALL) || defined(GCM_TABLE)
|
||||
#if defined(GCM_SMALL) || defined(GCM_TABLE) || defined(WOLFSSL_ARMASM)
|
||||
|
||||
static INLINE void FlattenSzInBits(byte* buf, word32 sz)
|
||||
{
|
||||
@@ -2943,6 +3285,20 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
|
||||
|
||||
if (ret == 0) {
|
||||
wc_AesEncrypt(aes, iv, aes->H);
|
||||
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
|
||||
{
|
||||
word32* pt = (word32*)aes->H;
|
||||
__asm__ volatile (
|
||||
"LD1 {v0.16b}, [%[h]] \n"
|
||||
"RBIT v0.16b, v0.16b \n"
|
||||
"ST1 {v0.16b}, [%[out]] \n"
|
||||
: [out] "=r" (pt)
|
||||
: [h] "0" (pt)
|
||||
: "cc", "memory"
|
||||
);
|
||||
return ret; /* no need to generate GCM_TABLE */
|
||||
}
|
||||
#endif
|
||||
#ifdef GCM_TABLE
|
||||
GenerateM0(aes);
|
||||
#endif /* GCM_TABLE */
|
||||
@@ -3379,8 +3735,118 @@ static int AES_GCM_decrypt(const unsigned char *in,
|
||||
#endif /* WOLFSSL_AESNI */
|
||||
|
||||
|
||||
#if defined(GCM_SMALL)
|
||||
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
|
||||
/* PMULL and RBIT only with AArch64 */
|
||||
/* Use ARM hardware for polynomial multiply */
|
||||
/* Galois-field (GF(2^128)) multiply for GHASH using AArch64 PMULL/PMULL2.
 *
 * X  16-byte input/output block; overwritten with X * Y in GF(2^128)
 * Y  16-byte hash key H; already bit-reflected at set-key time
 *
 * X is bit-reflected on entry and the result reflected back on exit so
 * the carry-less multiply operates in the "reversed" representation.
 */
static void GMULT(byte* X, byte* Y)
{
    word32* Xpt = (word32*)X;
    word32* Ypt = (word32*)Y;

    __asm__ volatile (
        "LD1 {v0.16b}, [%[inX]] \n"
        "LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */
        "RBIT v0.16b, v0.16b \n"

        /* Algorithm 1 from Intel GCM white paper.
           "Carry-Less Multiplication and Its Usage for Computing the GCM
           Mode" */
        "PMULL  v3.1q, v0.1d, v1.1d \n"     /* a0 * b0 = C */
        "PMULL2 v4.1q, v0.2d, v1.2d \n"     /* a1 * b1 = D */
        "EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */
        "PMULL  v6.1q, v0.1d, v5.1d \n"     /* a0 * b1 = E */
        "PMULL2 v5.1q, v0.2d, v5.2d \n"     /* a1 * b0 = F */

        "#Set a register to all 0s using EOR \n"
        "EOR v7.16b, v7.16b, v7.16b \n"
        "EOR v5.16b, v5.16b, v6.16b \n"     /* F ^ E */
        "EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */
        "EOR v3.16b, v3.16b, v6.16b \n"     /* low 128 bits in v3 */
        "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */
        "EOR v4.16b, v4.16b, v6.16b \n"     /* high 128 bits in v4 */

        /* Based from White Paper "Implementing GCM on ARMv8"
           by Conrado P.L. Gouvea and Julio Lopez
           reduction on 256bit value using Algorithm 5 */
        "MOVI v8.16b, #0x87 \n"
        "USHR v8.2d, v8.2d, #56 \n"
        /* v8 is now 0x00000000000000870000000000000087 reflected 0xe1....*/
        "PMULL2 v5.1q, v4.2d, v8.2d \n"
        "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */
        "EOR v4.16b, v4.16b, v6.16b \n"
        "EXT v6.16b, v7.16b, v5.16b, #8 \n"
        "EOR v3.16b, v3.16b, v6.16b \n"
        "PMULL v5.1q, v4.1d, v8.1d \n"
        "EOR v4.16b, v3.16b, v5.16b \n"

        "RBIT v4.16b, v4.16b \n"
        "STR q4, [%[out]] \n"
        : [out] "=r" (Xpt), "=r" (Ypt)
        : [inX] "0" (Xpt), [inY] "1" (Ypt)
        /* v0 and v1 are loaded/modified by the asm (RBIT writes v0) and
         * were previously missing from the clobber list */
        : "cc", "memory", "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v8"
    );
}
|
||||
|
||||
|
||||
/* Currently is a copy from GCM_SMALL wolfSSL version. Duplicated and set
|
||||
* seperate for future optimizations. */
|
||||
/* GHASH over the AAD (a) and ciphertext (c), writing sSz bytes of the
 * authentication tag into s.  Currently mirrors the GCM_SMALL software
 * version, kept separate so the ARM path can be optimized later.
 *
 * Processing order follows the GCM spec: full/partial AAD blocks, then
 * full/partial ciphertext blocks, then the bit lengths of both. */
static void GHASH(Aes* aes, const byte* a, word32 aSz,
        const byte* c, word32 cSz, byte* s, word32 sSz)
{
    byte hashState[AES_BLOCK_SIZE];
    byte padBlock[AES_BLOCK_SIZE];
    word32 fullBlocks, remainder;
    byte* hKey = aes->H;

    XMEMSET(hashState, 0, AES_BLOCK_SIZE);

    /* Fold in A, the Additional Authentication Data */
    if (a != NULL && aSz != 0) {
        fullBlocks = aSz / AES_BLOCK_SIZE;
        remainder  = aSz % AES_BLOCK_SIZE;
        for (; fullBlocks > 0; fullBlocks--) {
            xorbuf(hashState, a, AES_BLOCK_SIZE);
            GMULT(hashState, hKey);
            a += AES_BLOCK_SIZE;
        }
        if (remainder != 0) {
            /* zero-pad the tail block before folding it in */
            XMEMSET(padBlock, 0, AES_BLOCK_SIZE);
            XMEMCPY(padBlock, a, remainder);
            xorbuf(hashState, padBlock, AES_BLOCK_SIZE);
            GMULT(hashState, hKey);
        }
    }

    /* Fold in C, the Ciphertext */
    if (c != NULL && cSz != 0) {
        fullBlocks = cSz / AES_BLOCK_SIZE;
        remainder  = cSz % AES_BLOCK_SIZE;
        for (; fullBlocks > 0; fullBlocks--) {
            xorbuf(hashState, c, AES_BLOCK_SIZE);
            GMULT(hashState, hKey);
            c += AES_BLOCK_SIZE;
        }
        if (remainder != 0) {
            XMEMSET(padBlock, 0, AES_BLOCK_SIZE);
            XMEMCPY(padBlock, c, remainder);
            xorbuf(hashState, padBlock, AES_BLOCK_SIZE);
            GMULT(hashState, hKey);
        }
    }

    /* Fold in the lengths of A and C, expressed in bits */
    FlattenSzInBits(&padBlock[0], aSz);
    FlattenSzInBits(&padBlock[8], cSz);
    xorbuf(hashState, padBlock, AES_BLOCK_SIZE);
    GMULT(hashState, hKey);

    /* Emit the first sSz bytes of the result. */
    XMEMCPY(s, hashState, sSz);
}
|
||||
/* not using ARMASM for multiplication */
|
||||
#elif defined(GCM_SMALL)
|
||||
static void GMULT(byte* X, byte* Y)
|
||||
{
|
||||
byte Z[AES_BLOCK_SIZE];
|
||||
|
||||
Reference in New Issue
Block a user