From d0703f8931066016f05c3dfc55a533935d9448c9 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Wed, 14 Oct 2020 14:08:06 +1000
Subject: [PATCH] AES-GCM: GMULT using 4-bit table

When a 64-bit data type is available and the build is not big-endian,
the 4-bit table code is faster.

--enable-aesgcm=4bit
---
 configure.ac              |  56 ++--
 wolfcrypt/src/aes.c       | 629 +++++++++++++++++++++++++++++++++-----
 wolfcrypt/src/ecc.c       |  20 +-
 wolfcrypt/src/misc.c      |  44 +++
 wolfssl/openssl/aes.h     |   6 +
 wolfssl/wolfcrypt/aes.h   |   6 +
 wolfssl/wolfcrypt/misc.h  |   5 +
 wolfssl/wolfcrypt/types.h |   2 +
 8 files changed, 666 insertions(+), 102 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8f277be6f..263052c88 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1206,27 +1206,9 @@ then
     ENABLED_AESGCM=no
 fi
 
-if test "$ENABLED_AESGCM" != "no"
+if test "$ENABLED_AESGCM" = "yes" && test "$ac_cv_c_bigendian" != "yes"
 then
-    if test "$ENABLED_AESGCM" = "word32"
-    then
-        AM_CFLAGS="$AM_CFLAGS -DGCM_WORD32"
-        ENABLED_AESGCM=yes
-    fi
-
-    if test "$ENABLED_AESGCM" = "small" || test "$ENABLED_LOWRESOURCE" = "yes"
-    then
-        AM_CFLAGS="$AM_CFLAGS -DGCM_SMALL"
-        ENABLED_AESGCM=yes
-    fi
-
-    if test "$ENABLED_AESGCM" = "table"
-    then
-        AM_CFLAGS="$AM_CFLAGS -DGCM_TABLE"
-        ENABLED_AESGCM=yes
-    fi
-
-    AM_CFLAGS="$AM_CFLAGS -DHAVE_AESGCM"
+    ENABLED_AESGCM="4bit"
 fi
 
@@ -5155,6 +5137,11 @@ then
         # These flags are already implied by --enable-aesctr
         AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_AES_COUNTER -DWOLFSSL_AES_DIRECT"
     fi
+    if test "x$ENABLED_AESGCM" != "xyes" && test "x$ENABLED_AESGCM" != "xno"
+    then
+        # Use the smaller object size implementation
+        ENABLED_AESGCM=yes
+    fi
 fi
 
 if test "$ENABLED_MCAPI" = "yes" && test "$ENABLED_SHA512" = "no"
@@ -5519,6 +5506,35 @@ else
     fi
 fi
 
+if test "$ENABLED_AESGCM" != "no"
+then
+    if test "$ENABLED_AESGCM" = "word32"
+    then
+        AM_CFLAGS="$AM_CFLAGS -DGCM_WORD32"
+        ENABLED_AESGCM=yes
+    fi
+
+    if test "$ENABLED_AESGCM" = "small" || test "$ENABLED_LOWRESOURCE" = "yes"
+    then
+        AM_CFLAGS="$AM_CFLAGS -DGCM_SMALL"
+        ENABLED_AESGCM=yes
+    fi
+
+    if test "$ENABLED_AESGCM" = "table"
+    then
+        AM_CFLAGS="$AM_CFLAGS -DGCM_TABLE"
+        ENABLED_AESGCM=yes
+    fi
+
+    if test "$ENABLED_AESGCM" = "4bit"
+    then
+        AM_CFLAGS="$AM_CFLAGS -DGCM_TABLE_4BIT"
+        ENABLED_AESGCM=yes
+    fi
+
+    AM_CFLAGS="$AM_CFLAGS -DHAVE_AESGCM"
+fi
+
 AS_IF([test "x$ENABLED_MAXSTRENGTH" = "xyes"],
       [AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_MAX_STRENGTH"])
 
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 6dedc68a9..eb3a8626e 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -1685,10 +1685,12 @@ static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
     word32 r = aes->rounds >> 1;
     const word32* rk = aes->key;
 
+#ifdef DEBUG_WOLFSSL
     if (r > 7 || r == 0) {
         WOLFSSL_MSG("AesEncrypt encountered improper key, set it up");
         return;  /* stop instead of seg-faulting, set up your keys! */
     }
+#endif
 
 #ifdef WOLFSSL_AESNI
     if (haveAESNI && aes->use_aesni) {
@@ -1755,7 +1757,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
      * and add initial round key:
      */
     XMEMCPY(&s0, inBlock, sizeof(s0));
-    XMEMCPY(&s1, inBlock + sizeof(s0), sizeof(s1));
+    XMEMCPY(&s1, inBlock + sizeof(s0), sizeof(s1));
     XMEMCPY(&s2, inBlock + 2 * sizeof(s0), sizeof(s2));
     XMEMCPY(&s3, inBlock + 3 * sizeof(s0), sizeof(s3));
 
@@ -1773,8 +1775,44 @@ static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
     s3 ^= rk[3];
 
 #ifndef WOLFSSL_AES_SMALL_TABLES
+#ifdef WOLFSSL_X86_64_BUILD
     s0 |= PreFetchTe();
+#endif
+#ifndef WOLFSSL_AES_NO_UNROLL
+/* Unroll the loop.
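+ * Two rounds are computed per ENC_ROUND_T_S/ENC_ROUND_S_T pair: the
+ * first derives t0..t3 from s0..s3 and the second writes s0..s3 back,
+ * so the state stays in registers. r is rounds/2 (5 for AES-128, 6 for
+ * AES-192, 7 for AES-256), hence the r > 5 and r > 6 tails below.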
*/
+#define ENC_ROUND_T_S(o) \
+    t0 = Te[0][GETBYTE(s0, 3)] ^ Te[1][GETBYTE(s1, 2)] ^ \
+         Te[2][GETBYTE(s2, 1)] ^ Te[3][GETBYTE(s3, 0)] ^ rk[o+4]; \
+    t1 = Te[0][GETBYTE(s1, 3)] ^ Te[1][GETBYTE(s2, 2)] ^ \
+         Te[2][GETBYTE(s3, 1)] ^ Te[3][GETBYTE(s0, 0)] ^ rk[o+5]; \
+    t2 = Te[0][GETBYTE(s2, 3)] ^ Te[1][GETBYTE(s3, 2)] ^ \
+         Te[2][GETBYTE(s0, 1)] ^ Te[3][GETBYTE(s1, 0)] ^ rk[o+6]; \
+    t3 = Te[0][GETBYTE(s3, 3)] ^ Te[1][GETBYTE(s0, 2)] ^ \
+         Te[2][GETBYTE(s1, 1)] ^ Te[3][GETBYTE(s2, 0)] ^ rk[o+7]
+#define ENC_ROUND_S_T(o) \
+    s0 = Te[0][GETBYTE(t0, 3)] ^ Te[1][GETBYTE(t1, 2)] ^ \
+         Te[2][GETBYTE(t2, 1)] ^ Te[3][GETBYTE(t3, 0)] ^ rk[o+0]; \
+    s1 = Te[0][GETBYTE(t1, 3)] ^ Te[1][GETBYTE(t2, 2)] ^ \
+         Te[2][GETBYTE(t3, 1)] ^ Te[3][GETBYTE(t0, 0)] ^ rk[o+1]; \
+    s2 = Te[0][GETBYTE(t2, 3)] ^ Te[1][GETBYTE(t3, 2)] ^ \
+         Te[2][GETBYTE(t0, 1)] ^ Te[3][GETBYTE(t1, 0)] ^ rk[o+2]; \
+    s3 = Te[0][GETBYTE(t3, 3)] ^ Te[1][GETBYTE(t0, 2)] ^ \
+         Te[2][GETBYTE(t1, 1)] ^ Te[3][GETBYTE(t2, 0)] ^ rk[o+3]
+
+    ENC_ROUND_T_S( 0);
+    ENC_ROUND_S_T( 8); ENC_ROUND_T_S( 8);
+    ENC_ROUND_S_T(16); ENC_ROUND_T_S(16);
+    ENC_ROUND_S_T(24); ENC_ROUND_T_S(24);
+    ENC_ROUND_S_T(32); ENC_ROUND_T_S(32);
+    if (r > 5) {
+        ENC_ROUND_S_T(40); ENC_ROUND_T_S(40);
+        if (r > 6) {
+            ENC_ROUND_S_T(48); ENC_ROUND_T_S(48);
+        }
+    }
+    rk += r * 8;
+#else
     /*
      * Nr - 1 full rounds:
      */
@@ -1835,6 +1873,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
             Te[3][GETBYTE(t2, 0)] ^
             rk[3];
     }
+#endif
 
     /*
      * apply last round and
@@ -1866,7 +1905,9 @@ static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
         (Te[1][GETBYTE(t2, 0)] & 0x000000ff) ^
         rk[3];
 #else
+#ifdef WOLFSSL_X86_64_BUILD
     s0 |= PreFetchSBox();
+#endif
 
     r *= 2;
     /* Two rounds at a time */
@@ -1953,7 +1994,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
 #endif
 
     XMEMCPY(outBlock, &s0, sizeof(s0));
-    XMEMCPY(outBlock + sizeof(s0), &s1, sizeof(s1));
+    XMEMCPY(outBlock + sizeof(s0), &s1, sizeof(s1));
     XMEMCPY(outBlock + 2 * sizeof(s0), &s2, sizeof(s2));
     XMEMCPY(outBlock + 3 * sizeof(s0), &s3, sizeof(s3));
 
@@ -1999,12 +2040,15 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
     word32 s0, s1, s2, s3;
     word32 t0, t1, t2, t3;
     word32 r = aes->rounds >> 1;
-    const word32* rk = aes->key;
+    const word32* rk = aes->key;
+
+#ifdef DEBUG_WOLFSSL
     if (r > 7 || r == 0) {
         WOLFSSL_MSG("AesDecrypt encountered improper key, set it up");
         return;  /* stop instead of seg-faulting, set up your keys! */
     }
+#endif
 
 #ifdef WOLFSSL_AESNI
     if (haveAESNI && aes->use_aesni) {
     #ifdef DEBUG_AESNI
@@ -2063,7 +2107,44 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
     s3 ^= rk[3];
 
 #ifndef WOLFSSL_AES_SMALL_TABLES
+#ifdef WOLFSSL_X86_64_BUILD
     s0 |= PreFetchTd();
+#endif
+
+#ifndef WOLFSSL_AES_NO_UNROLL
+/* Unroll the loop.
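+ * Mirrors the encrypt unroll above using the Td tables and the inverse
+ * byte rotation: two inverse rounds per DEC_ROUND_T_S/DEC_ROUND_S_T
+ * pair, with the same r > 5 and r > 6 tails for the longer key sizes.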
*/ +#define DEC_ROUND_T_S(o) \ + t0 = Td[0][GETBYTE(s0, 3)] ^ Td[1][GETBYTE(s3, 2)] ^ \ + Td[2][GETBYTE(s2, 1)] ^ Td[3][GETBYTE(s1, 0)] ^ rk[o+4]; \ + t1 = Td[0][GETBYTE(s1, 3)] ^ Td[1][GETBYTE(s0, 2)] ^ \ + Td[2][GETBYTE(s3, 1)] ^ Td[3][GETBYTE(s2, 0)] ^ rk[o+5]; \ + t2 = Td[0][GETBYTE(s2, 3)] ^ Td[1][GETBYTE(s1, 2)] ^ \ + Td[2][GETBYTE(s0, 1)] ^ Td[3][GETBYTE(s3, 0)] ^ rk[o+6]; \ + t3 = Td[0][GETBYTE(s3, 3)] ^ Td[1][GETBYTE(s2, 2)] ^ \ + Td[2][GETBYTE(s1, 1)] ^ Td[3][GETBYTE(s0, 0)] ^ rk[o+7] +#define DEC_ROUND_S_T(o) \ + s0 = Td[0][GETBYTE(t0, 3)] ^ Td[1][GETBYTE(t3, 2)] ^ \ + Td[2][GETBYTE(t2, 1)] ^ Td[3][GETBYTE(t1, 0)] ^ rk[o+0]; \ + s1 = Td[0][GETBYTE(t1, 3)] ^ Td[1][GETBYTE(t0, 2)] ^ \ + Td[2][GETBYTE(t3, 1)] ^ Td[3][GETBYTE(t2, 0)] ^ rk[o+1]; \ + s2 = Td[0][GETBYTE(t2, 3)] ^ Td[1][GETBYTE(t1, 2)] ^ \ + Td[2][GETBYTE(t0, 1)] ^ Td[3][GETBYTE(t3, 0)] ^ rk[o+2]; \ + s3 = Td[0][GETBYTE(t3, 3)] ^ Td[1][GETBYTE(t2, 2)] ^ \ + Td[2][GETBYTE(t1, 1)] ^ Td[3][GETBYTE(t0, 0)] ^ rk[o+3] + + DEC_ROUND_T_S( 0); + DEC_ROUND_S_T( 8); DEC_ROUND_T_S( 8); + DEC_ROUND_S_T(16); DEC_ROUND_T_S(16); + DEC_ROUND_S_T(24); DEC_ROUND_T_S(24); + DEC_ROUND_S_T(32); DEC_ROUND_T_S(32); + if (r > 5) { + DEC_ROUND_S_T(40); DEC_ROUND_T_S(40); + if (r > 6) { + DEC_ROUND_S_T(48); DEC_ROUND_T_S(48); + } + } + rk += r * 8; +#else /* * Nr - 1 full rounds: @@ -2125,12 +2206,15 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) Td[3][GETBYTE(t0, 0)] ^ rk[3]; } +#endif /* * apply last round and * map cipher state to byte array block: */ +#ifdef WOLFSSL_X86_64_BUILD t0 |= PreFetchTd4(); +#endif s0 = ((word32)Td4[GETBYTE(t0, 3)] << 24) ^ @@ -2157,7 +2241,9 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) ((word32)Td4[GETBYTE(t0, 0)]) ^ rk[3]; #else +#ifdef WOLFSSL_X86_64_BUILD s0 |= PreFetchTd4(); +#endif r *= 2; for (rk += 4; r > 1; r--, rk += 4) { @@ -2615,10 +2701,6 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) word32 localSz = 32; #endif - if (aes == NULL) { - return BAD_FUNC_ARG; - } - #ifdef WOLFSSL_IMX6_CAAM_BLOB if (keylen == (16 + WC_CAAM_BLOB_SZ) || keylen == (24 + WC_CAAM_BLOB_SZ) || @@ -2891,6 +2973,13 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, const byte* iv, int dir) { + if (aes == NULL) { + return BAD_FUNC_ARG; + } + if (keylen > sizeof(aes->key)) { + return BAD_FUNC_ARG; + } + return wc_AesSetKeyLocal(aes, userKey, keylen, iv, dir, 1); } @@ -2900,6 +2989,13 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen, const byte* iv, int dir) { + if (aes == NULL) { + return BAD_FUNC_ARG; + } + if (keylen > sizeof(aes->key)) { + return BAD_FUNC_ARG; + } + return wc_AesSetKeyLocal(aes, userKey, keylen, iv, dir, 0); } #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ @@ -4068,7 +4164,7 @@ static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) } #endif /* !FREESCALE_LTC_AES_GCM */ -#if defined(GCM_SMALL) || defined(GCM_TABLE) +#if defined(GCM_SMALL) || defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) { @@ -4103,7 +4199,7 @@ static WC_INLINE void RIGHTSHIFTX(byte* x) if (borrow) x[0] ^= 0xE1; } -#endif /* defined(GCM_SMALL) || defined(GCM_TABLE) */ +#endif /* defined(GCM_SMALL) || defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) */ #ifdef GCM_TABLE @@ -4130,6 +4226,72 @@ static void GenerateM0(Aes* 
aes) XMEMSET(m[0], 0, AES_BLOCK_SIZE); } +#elif defined(GCM_TABLE_4BIT) + +static WC_INLINE void Shift4_M0(byte *r8, byte* z8) +{ + int i; + for (i = 15; i > 0; i--) + r8[i] = (z8[i-1] << 4) | (z8[i] >> 4); + r8[0] = z8[0] >> 4; +} + +static void GenerateM0(Aes* aes) +{ +#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_16BIT_CPU) + int i; +#endif + byte (*m)[AES_BLOCK_SIZE] = aes->M0; + + /* 0 times -> 0x0 */ + XMEMSET(m[0x0], 0, AES_BLOCK_SIZE); + /* 1 times -> 0x8 */ + XMEMCPY(m[0x8], aes->H, AES_BLOCK_SIZE); + /* 2 times -> 0x4 */ + XMEMCPY(m[0x4], m[0x8], AES_BLOCK_SIZE); + RIGHTSHIFTX(m[0x4]); + /* 4 times -> 0x2 */ + XMEMCPY(m[0x2], m[0x4], AES_BLOCK_SIZE); + RIGHTSHIFTX(m[0x2]); + /* 8 times -> 0x1 */ + XMEMCPY(m[0x1], m[0x2], AES_BLOCK_SIZE); + RIGHTSHIFTX(m[0x1]); + + /* 0x3 */ + XMEMCPY(m[0x3], m[0x2], AES_BLOCK_SIZE); + xorbuf (m[0x3], m[0x1], AES_BLOCK_SIZE); + + /* 0x5 -> 0x7 */ + XMEMCPY(m[0x5], m[0x4], AES_BLOCK_SIZE); + xorbuf (m[0x5], m[0x1], AES_BLOCK_SIZE); + XMEMCPY(m[0x6], m[0x4], AES_BLOCK_SIZE); + xorbuf (m[0x6], m[0x2], AES_BLOCK_SIZE); + XMEMCPY(m[0x7], m[0x4], AES_BLOCK_SIZE); + xorbuf (m[0x7], m[0x3], AES_BLOCK_SIZE); + + /* 0x9 -> 0xf */ + XMEMCPY(m[0x9], m[0x8], AES_BLOCK_SIZE); + xorbuf (m[0x9], m[0x1], AES_BLOCK_SIZE); + XMEMCPY(m[0xa], m[0x8], AES_BLOCK_SIZE); + xorbuf (m[0xa], m[0x2], AES_BLOCK_SIZE); + XMEMCPY(m[0xb], m[0x8], AES_BLOCK_SIZE); + xorbuf (m[0xb], m[0x3], AES_BLOCK_SIZE); + XMEMCPY(m[0xc], m[0x8], AES_BLOCK_SIZE); + xorbuf (m[0xc], m[0x4], AES_BLOCK_SIZE); + XMEMCPY(m[0xd], m[0x8], AES_BLOCK_SIZE); + xorbuf (m[0xd], m[0x5], AES_BLOCK_SIZE); + XMEMCPY(m[0xe], m[0x8], AES_BLOCK_SIZE); + xorbuf (m[0xe], m[0x6], AES_BLOCK_SIZE); + XMEMCPY(m[0xf], m[0x8], AES_BLOCK_SIZE); + xorbuf (m[0xf], m[0x7], AES_BLOCK_SIZE); + +#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_16BIT_CPU) + for (i = 0; i < 16; i++) { + Shift4_M0(m[16+i], m[i]); + } +#endif +} + #endif /* GCM_TABLE */ /* Software AES - GCM SetKey */ @@ -4165,7 +4327,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) } #endif XMEMSET(iv, 0, AES_BLOCK_SIZE); - ret = wc_AesSetKey(aes, key, len, iv, AES_ENCRYPTION); + ret = wc_AesSetKeyLocal(aes, key, len, iv, AES_ENCRYPTION, 0); #ifdef WOLFSSL_AESNI /* AES-NI code generates its own H value. 
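     * The software paths below derive H by encrypting the all-zero block
     * (H = E_K(0^128)); the table-based GHASH builds then expand H into
     * the aes->M0 multiplication table via GenerateM0().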
*/
@@ -4176,7 +4338,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
 #if !defined(FREESCALE_LTC_AES_GCM)
     if (ret == 0) {
         wc_AesEncrypt(aes, iv, aes->H);
-    #ifdef GCM_TABLE
+    #if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT)
         GenerateM0(aes);
     #endif /* GCM_TABLE */
     }
@@ -5528,6 +5690,7 @@ static const byte R[256][2] = {
 
 static void GMULT(byte *x, byte m[256][AES_BLOCK_SIZE])
 {
+#if !defined(WORD64_AVAILABLE) || defined(BIG_ENDIAN_ORDER)
     int i, j;
     byte Z[AES_BLOCK_SIZE];
     byte a;
@@ -5542,14 +5705,41 @@ static void GMULT(byte *x, byte m[256][AES_BLOCK_SIZE])
             Z[j] = Z[j-1];
         }
 
-        Z[0] = R[a][0];
+        Z[0] = R[a][0];
         Z[1] ^= R[a][1];
     }
 
     xorbuf(Z, m[x[0]], AES_BLOCK_SIZE);
     XMEMCPY(x, Z, AES_BLOCK_SIZE);
-}
+#else
+    byte Z[AES_BLOCK_SIZE + AES_BLOCK_SIZE];
+    byte a;
+    word64* pZ;
+    word64* pm;
+    word64* px = (word64*)(x);
+    int i;
+    pZ = (word64*)(Z + 15 + 1);
+    pm = (word64*)(m[x[15]]);
+    pZ[0] = pm[0];
+    pZ[1] = pm[1];
+    a = Z[16 + 15];
+    Z[15] = R[a][0];
+    Z[16] ^= R[a][1];
+    for (i = 14; i > 0; i--) {
+        pZ = (word64*)(Z + i + 1);
+        pm = (word64*)(m[x[i]]);
+        pZ[0] ^= pm[0];
+        pZ[1] ^= pm[1];
+        a = Z[16 + i];
+        Z[i] = R[a][0];
+        Z[i+1] ^= R[a][1];
+    }
+    pZ = (word64*)(Z + 1);
+    pm = (word64*)(m[x[0]]);
+    px[0] = pZ[0] ^ pm[0]; px[1] = pZ[1] ^ pm[1];
+#endif
+}
 
 void GHASH(Aes* aes, const byte* a, word32 aSz, const byte* c,
            word32 cSz, byte* s, word32 sSz)
@@ -5605,6 +5795,280 @@ void GHASH(Aes* aes, const byte* a, word32 aSz, const byte* c,
 }
 /* end GCM_TABLE */
 
+#elif defined(GCM_TABLE_4BIT)
+
+/* remainder = x^7 + x^2 + x^1 + 1 => 0xe1
+ * R shifts right a reverse bit pair of bytes such that:
+ *     R(b0, b1) => b1 = (b1 >> 1) | (b0 << 7); b0 >>= 1
+ * 0 => 0, 0, 0, 0 => R(R(R(00,00) ^ 00,00) ^ 00,00) ^ 00,00 = 00,00
+ * 8 => 0, 0, 0, 1 => R(R(R(00,00) ^ 00,00) ^ 00,00) ^ e1,00 = e1,00
+ * 4 => 0, 0, 1, 0 => R(R(R(00,00) ^ 00,00) ^ e1,00) ^ 00,00 = 70,80
+ * 2 => 0, 1, 0, 0 => R(R(R(00,00) ^ e1,00) ^ 00,00) ^ 00,00 = 38,40
+ * 1 => 1, 0, 0, 0 => R(R(R(e1,00) ^ 00,00) ^ 00,00) ^ 00,00 = 1c,20
+ * To calculate the rest, XOR the results of each set bit.
+ * e.g. 6 = 4 ^ 2 => 48,c0
+ *
+ * The second half is the same values rotated by 4 bits.
+ */
+#if defined(BIG_ENDIAN_ORDER) || defined(WC_16BIT_CPU)
+static const byte R[16][2] = {
+    {0x00, 0x00}, {0x1c, 0x20}, {0x38, 0x40}, {0x24, 0x60},
+    {0x70, 0x80}, {0x6c, 0xa0}, {0x48, 0xc0}, {0x54, 0xe0},
+    {0xe1, 0x00}, {0xfd, 0x20}, {0xd9, 0x40}, {0xc5, 0x60},
+    {0x91, 0x80}, {0x8d, 0xa0}, {0xa9, 0xc0}, {0xb5, 0xe0},
+};
+#else
+static const word16 R[32] = {
+    0x0000, 0x201c, 0x4038, 0x6024,
+    0x8070, 0xa06c, 0xc048, 0xe054,
+    0x00e1, 0x20fd, 0x40d9, 0x60c5,
+    0x8091, 0xa08d, 0xc0a9, 0xe0b5,
+
+    0x0000, 0xc201, 0x8403, 0x4602,
+    0x0807, 0xca06, 0x8c04, 0x4e05,
+    0x100e, 0xd20f, 0x940d, 0x560c,
+    0x1809, 0xda08, 0x9c0a, 0x5e0b,
+};
+#endif
+
+/* Multiply in GF(2^128) defined by polynomial:
+ *   x^128 + x^7 + x^2 + x^1 + 1.
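+ *
+ * Sketch of the per-byte step, as implemented by the
+ * BIG_ENDIAN_ORDER/WC_16BIT_CPU variant below (x[0], the most
+ * significant byte, is processed last):
+ *     Z ^= M0[x[i] & 0xf]                   add (low nibble * H)
+ *     Z = (Z >> 4) ^ R[nibble shifted out]  multiply by x^4 and reduce
+ *     Z ^= M0[x[i] >> 4]                    add (high nibble * H)
+ *     Z = (Z >> 4) ^ R[nibble shifted out]  skipped for the final byte
+ * The little-endian variants also keep each multiple pre-rotated by
+ * 4 bits (table entries 16..31), so the two 4-bit rotates per byte
+ * collapse into a single 8-bit rotate.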
+ *
+ * H: hash key = encrypt(key, 0)
+ * x = x * H in field
+ *
+ * x: cumulative result
+ * m: 4-bit table
+ *    [0..15] * H
+ */
+#if defined(BIG_ENDIAN_ORDER) || defined(WC_16BIT_CPU)
+static void GMULT(byte *x, byte m[16][AES_BLOCK_SIZE])
+{
+    int i, j, n;
+    byte Z[AES_BLOCK_SIZE];
+    byte a;
+
+    XMEMSET(Z, 0, sizeof(Z));
+
+    for (i = 15; i >= 0; i--) {
+        for (n = 0; n < 2; n++) {
+            if (n == 0)
+                xorbuf(Z, m[x[i] & 0xf], AES_BLOCK_SIZE);
+            else {
+                xorbuf(Z, m[x[i] >> 4], AES_BLOCK_SIZE);
+                if (i == 0)
+                    break;
+            }
+            a = Z[15] & 0xf;
+
+            for (j = 15; j > 0; j--)
+                Z[j] = (Z[j-1] << 4) | (Z[j] >> 4);
+            Z[0] >>= 4;
+
+            Z[0] ^= R[a][0];
+            Z[1] ^= R[a][1];
+        }
+    }
+
+    XMEMCPY(x, Z, AES_BLOCK_SIZE);
+}
+#elif defined(WC_32BIT_CPU)
+static WC_INLINE void GMULT(byte *x, byte m[32][AES_BLOCK_SIZE])
+{
+    int i;
+    word32 z8[4] = {0, 0, 0, 0};
+    byte a;
+    word32* x8 = (word32*)x;
+    word32* m8;
+    byte xi;
+    word32 n7, n6, n5, n4, n3, n2, n1, n0;
+
+    for (i = 15; i > 0; i--) {
+        xi = x[i];
+
+        /* XOR in (msn * H) */
+        m8 = (word32*)m[xi & 0xf];
+        z8[0] ^= m8[0]; z8[1] ^= m8[1]; z8[2] ^= m8[2]; z8[3] ^= m8[3];
+
+        /* Cache top byte for remainder calculations - lost in rotate. */
+        a = z8[3] >> 24;
+
+        /* Rotate Z by 8-bits */
+        z8[3] = (z8[2] >> 24) | (z8[3] << 8);
+        z8[2] = (z8[1] >> 24) | (z8[2] << 8);
+        z8[1] = (z8[0] >> 24) | (z8[1] << 8);
+        z8[0] <<= 8;
+
+        /* XOR in (msn * remainder) [pre-rotated by 4 bits] */
+        z8[0] ^= (word32)R[16 + (a & 0xf)];
+
+        xi >>= 4;
+        /* XOR in next significant nibble (XORed with H) * remainder */
+        m8 = (word32*)m[xi];
+        a ^= (byte)(m8[3] >> 20);
+        z8[0] ^= (word32)R[a >> 4];
+
+        /* XOR in (next significant nibble * H) [pre-rotated by 4 bits] */
+        m8 = (word32*)m[16 + xi];
+        z8[0] ^= m8[0]; z8[1] ^= m8[1];
+        z8[2] ^= m8[2]; z8[3] ^= m8[3];
+    }
+
+    xi = x[0];
+
+    /* XOR in most significant nibble * H */
+    m8 = (word32*)m[xi & 0xf];
+    z8[0] ^= m8[0]; z8[1] ^= m8[1]; z8[2] ^= m8[2]; z8[3] ^= m8[3];
+
+    /* Cache top byte for remainder calculations - lost in rotate. */
+    a = (z8[3] >> 24) & 0xf;
+
+    /* Rotate z by 4-bits */
+    n7 = z8[3] & 0xf0f0f0f0ULL;
+    n6 = z8[3] & 0x0f0f0f0fULL;
+    n5 = z8[2] & 0xf0f0f0f0ULL;
+    n4 = z8[2] & 0x0f0f0f0fULL;
+    n3 = z8[1] & 0xf0f0f0f0ULL;
+    n2 = z8[1] & 0x0f0f0f0fULL;
+    n1 = z8[0] & 0xf0f0f0f0ULL;
+    n0 = z8[0] & 0x0f0f0f0fULL;
+    z8[3] = (n7 >> 4) | (n6 << 12) | (n4 >> 20);
+    z8[2] = (n5 >> 4) | (n4 << 12) | (n2 >> 20);
+    z8[1] = (n3 >> 4) | (n2 << 12) | (n0 >> 20);
+    z8[0] = (n1 >> 4) | (n0 << 12);
+
+    /* XOR in most significant nibble * remainder */
+    z8[0] ^= (word32)R[a];
+    /* XOR in next significant nibble * H */
+    m8 = (word32*)m[xi >> 4];
+    z8[0] ^= m8[0]; z8[1] ^= m8[1]; z8[2] ^= m8[2]; z8[3] ^= m8[3];
+
+    /* Write back result. */
+    x8[0] = z8[0]; x8[1] = z8[1]; x8[2] = z8[2]; x8[3] = z8[3];
+}
+#else
+static WC_INLINE void GMULT(byte *x, byte m[32][AES_BLOCK_SIZE])
+{
+    int i;
+    word64 z8[2] = {0, 0};
+    byte a;
+    word64* x8 = (word64*)x;
+    word64* m8;
+    word64 n0, n1, n2, n3;
+    byte xi;
+
+    for (i = 15; i > 0; i--) {
+        xi = x[i];
+
+        /* XOR in (msn * H) */
+        m8 = (word64*)m[xi & 0xf];
+        z8[0] ^= m8[0];
+        z8[1] ^= m8[1];
+
+        /* Cache top byte for remainder calculations - lost in rotate.
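+         * Both of its nibbles are shifted out of Z by the 8-bit rotate;
+         * their reduction terms are folded back in by the two R[]
+         * lookups below.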
*/ + a = z8[1] >> 56; + + /* Rotate Z by 8-bits */ + z8[1] = (z8[0] >> 56) | (z8[1] << 8); + z8[0] <<= 8; + + /* XOR in (next significant nibble * H) [pre-rotated by 4 bits] */ + m8 = (word64*)m[16 + (xi >> 4)]; + z8[0] ^= m8[0]; + z8[1] ^= m8[1]; + + /* XOR in (msn * remainder) [pre-rotated by 4 bits] */ + z8[0] ^= (word64)R[16 + (a & 0xf)]; + /* XOR in next significant nibble (XORed with H) * remainder */ + m8 = (word64*)m[xi >> 4]; + a ^= (byte)(m8[1] >> 52); + z8[0] ^= (word64)R[a >> 4]; + } + + xi = x[0]; + + /* XOR in most significant nibble * H */ + m8 = (word64*)m[xi & 0xf]; + z8[0] ^= m8[0]; + z8[1] ^= m8[1]; + + /* Cache top byte for remainder calculations - lost in rotate. */ + a = (z8[1] >> 56) & 0xf; + + /* Rotate z by 4-bits */ + n3 = z8[1] & 0xf0f0f0f0f0f0f0f0ULL; + n2 = z8[1] & 0x0f0f0f0f0f0f0f0fULL; + n1 = z8[0] & 0xf0f0f0f0f0f0f0f0ULL; + n0 = z8[0] & 0x0f0f0f0f0f0f0f0fULL; + z8[1] = (n3 >> 4) | (n2 << 12) | (n0 >> 52); + z8[0] = (n1 >> 4) | (n0 << 12); + + /* XOR in next significant nibble * H */ + m8 = (word64*)m[xi >> 4]; + z8[0] ^= m8[0]; + z8[1] ^= m8[1]; + /* XOR in most significant nibble * remainder */ + z8[0] ^= (word64)R[a]; + + /* Write back result. */ + x8[0] = z8[0]; + x8[1] = z8[1]; +} +#endif + +void GHASH(Aes* aes, const byte* a, word32 aSz, const byte* c, + word32 cSz, byte* s, word32 sSz) +{ + byte x[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + word32 blocks, partial; + + XMEMSET(x, 0, AES_BLOCK_SIZE); + + /* Hash in A, the Additional Authentication Data */ + if (aSz != 0 && a != NULL) { + blocks = aSz / AES_BLOCK_SIZE; + partial = aSz % AES_BLOCK_SIZE; + while (blocks--) { + xorbuf(x, a, AES_BLOCK_SIZE); + GMULT(x, aes->M0); + a += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, a, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->M0); + } + } + + /* Hash in C, the Ciphertext */ + if (cSz != 0 && c != NULL) { + blocks = cSz / AES_BLOCK_SIZE; + partial = cSz % AES_BLOCK_SIZE; + while (blocks--) { + xorbuf(x, c, AES_BLOCK_SIZE); + GMULT(x, aes->M0); + c += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, c, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->M0); + } + } + + /* Hash in the lengths of A and C in bits */ + FlattenSzInBits(&scratch[0], aSz); + FlattenSzInBits(&scratch[8], cSz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->M0); + + /* Copy the result into s. */ + XMEMCPY(s, x, sSz); +} + #elif defined(WORD64_AVAILABLE) && !defined(GCM_WORD32) #if !defined(FREESCALE_LTC_AES_GCM) @@ -5613,6 +6077,9 @@ static void GMULT(word64* X, word64* Y) word64 Z[2] = {0,0}; word64 V[2]; int i, j; +#ifdef AES_GCM_GMULT_CT + word64 v1; +#endif V[0] = X[0]; V[1] = X[1]; for (i = 0; i < 2; i++) @@ -5620,11 +6087,24 @@ static void GMULT(word64* X, word64* Y) word64 y = Y[i]; for (j = 0; j < 64; j++) { +#ifdef AES_GCM_GMULT_CT + word64 mask = 0 - (y >> 63); + Z[0] ^= V[0] & mask; + Z[1] ^= V[1] & mask; +#else if (y & 0x8000000000000000ULL) { Z[0] ^= V[0]; Z[1] ^= V[1]; } +#endif +#ifdef AES_GCM_GMULT_CT + v1 = (0 - (V[1] & 1)) & 0xE100000000000000ULL; + V[1] >>= 1; + V[1] |= V[0] << 63; + V[0] >>= 1; + V[0] ^= v1; +#else if (V[1] & 0x0000000000000001) { V[1] >>= 1; V[1] |= ((V[0] & 0x0000000000000001) ? 
@@ -5638,6 +6118,7 @@ static void GMULT(word64* X, word64* Y) 0x8000000000000000ULL : 0); V[0] >>= 1; } +#endif y <<= 1; } } @@ -6155,36 +6636,34 @@ int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, word32 partial = sz % AES_BLOCK_SIZE; const byte* p = in; byte* c = out; - byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte *ctr; - byte scratch[AES_BLOCK_SIZE]; -#ifdef OPENSSL_EXTRA - word32 aadTemp; -#endif - ctr = counter; - XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); - XMEMSET(scratch, 0, AES_BLOCK_SIZE); + ALIGN32 byte counter[AES_BLOCK_SIZE]; + ALIGN32 byte initialCounter[AES_BLOCK_SIZE]; + ALIGN32 byte scratch[AES_BLOCK_SIZE]; + if (ivSz == GCM_NONCE_MID_SZ) { - XMEMCPY(initialCounter, iv, ivSz); - initialCounter[AES_BLOCK_SIZE - 1] = 1; + /* Counter is IV with bottom 4 bytes set to: 0x00,0x00,0x00,0x01. */ + XMEMCPY(counter, iv, ivSz); + XMEMSET(counter + GCM_NONCE_MID_SZ, 0, + AES_BLOCK_SIZE - GCM_NONCE_MID_SZ - 1); + counter[AES_BLOCK_SIZE - 1] = 1; } else { + /* Counter is GHASH of IV. */ #ifdef OPENSSL_EXTRA - aadTemp = aes->aadLen; + word32 aadTemp = aes->aadLen; aes->aadLen = 0; #endif - GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GHASH(aes, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); #ifdef OPENSSL_EXTRA aes->aadLen = aadTemp; #endif } - XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); + XMEMCPY(initialCounter, counter, AES_BLOCK_SIZE); #ifdef WOLFSSL_PIC32MZ_CRYPT if (blocks) { /* use initial IV for HW, but don't use it below */ - XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE); + XMEMCPY(aes->reg, counter, AES_BLOCK_SIZE); ret = wc_Pic32AesCrypt( aes->key, aes->keylen, aes->reg, AES_BLOCK_SIZE, @@ -6201,8 +6680,8 @@ int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, * of the whole buffer at once */ if (c != p && blocks > 0) { /* can not handle inline encryption */ while (blocks--) { - IncrementGcmCounter(ctr); - XMEMCPY(c, ctr, AES_BLOCK_SIZE); + IncrementGcmCounter(counter); + XMEMCPY(c, counter, AES_BLOCK_SIZE); c += AES_BLOCK_SIZE; } @@ -6214,22 +6693,22 @@ int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, } else #endif /* HAVE_AES_ECB && !WOLFSSL_PIC32MZ_CRYPT */ - while (blocks--) { - IncrementGcmCounter(ctr); - #if !defined(WOLFSSL_PIC32MZ_CRYPT) - wc_AesEncrypt(aes, ctr, scratch); - xorbuf(scratch, p, AES_BLOCK_SIZE); - XMEMCPY(c, scratch, AES_BLOCK_SIZE); - #endif - p += AES_BLOCK_SIZE; - c += AES_BLOCK_SIZE; + { + while (blocks--) { + IncrementGcmCounter(counter); + #if !defined(WOLFSSL_PIC32MZ_CRYPT) + wc_AesEncrypt(aes, counter, scratch); + xorbufout(c, scratch, p, AES_BLOCK_SIZE); + #endif + p += AES_BLOCK_SIZE; + c += AES_BLOCK_SIZE; + } } if (partial != 0) { - IncrementGcmCounter(ctr); - wc_AesEncrypt(aes, ctr, scratch); - xorbuf(scratch, p, partial); - XMEMCPY(c, scratch, partial); + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); + xorbufout(c, scratch, p, partial); } if (authTag) { GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz); @@ -6616,36 +7095,33 @@ int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, word32 partial = sz % AES_BLOCK_SIZE; const byte* c = in; byte* p = out; - byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte *ctr; - byte scratch[AES_BLOCK_SIZE]; - byte Tprime[AES_BLOCK_SIZE]; - byte EKY0[AES_BLOCK_SIZE]; -#ifdef OPENSSL_EXTRA - word32 aadTemp; -#endif - ctr = counter; - XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + ALIGN32 byte counter[AES_BLOCK_SIZE]; + ALIGN32 byte 
scratch[AES_BLOCK_SIZE]; + ALIGN32 byte Tprime[AES_BLOCK_SIZE]; + ALIGN32 byte EKY0[AES_BLOCK_SIZE]; + if (ivSz == GCM_NONCE_MID_SZ) { - XMEMCPY(initialCounter, iv, ivSz); - initialCounter[AES_BLOCK_SIZE - 1] = 1; + /* Counter is IV with bottom 4 bytes set to: 0x00,0x00,0x00,0x01. */ + XMEMCPY(counter, iv, ivSz); + XMEMSET(counter + GCM_NONCE_MID_SZ, 0, + AES_BLOCK_SIZE - GCM_NONCE_MID_SZ - 1); + counter[AES_BLOCK_SIZE - 1] = 1; } else { + /* Counter is GHASH of IV. */ #ifdef OPENSSL_EXTRA - aadTemp = aes->aadLen; + word32 aadTemp = aes->aadLen; aes->aadLen = 0; #endif - GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GHASH(aes, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); #ifdef OPENSSL_EXTRA aes->aadLen = aadTemp; #endif } - XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); - /* Calc the authTag again using the received auth data and the cipher text */ + /* Calc the authTag again using received auth data and the cipher text */ GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime)); - wc_AesEncrypt(aes, ctr, EKY0); + wc_AesEncrypt(aes, counter, EKY0); xorbuf(Tprime, EKY0, sizeof(Tprime)); #ifdef OPENSSL_EXTRA @@ -6662,7 +7138,7 @@ int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, #if defined(WOLFSSL_PIC32MZ_CRYPT) if (blocks) { /* use initial IV for HW, but don't use it below */ - XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE); + XMEMCPY(aes->reg, counter, AES_BLOCK_SIZE); ret = wc_Pic32AesCrypt( aes->key, aes->keylen, aes->reg, AES_BLOCK_SIZE, @@ -6679,8 +7155,8 @@ int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, * of the whole buffer at once */ if (c != p && blocks > 0) { /* can not handle inline decryption */ while (blocks--) { - IncrementGcmCounter(ctr); - XMEMCPY(p, ctr, AES_BLOCK_SIZE); + IncrementGcmCounter(counter); + XMEMCPY(p, counter, AES_BLOCK_SIZE); p += AES_BLOCK_SIZE; } @@ -6693,20 +7169,21 @@ int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, } else #endif /* HAVE_AES_ECB && !PIC32MZ */ - while (blocks--) { - IncrementGcmCounter(ctr); - #if !defined(WOLFSSL_PIC32MZ_CRYPT) - wc_AesEncrypt(aes, ctr, scratch); - xorbuf(scratch, c, AES_BLOCK_SIZE); - XMEMCPY(p, scratch, AES_BLOCK_SIZE); - #endif - p += AES_BLOCK_SIZE; - c += AES_BLOCK_SIZE; + { + while (blocks--) { + IncrementGcmCounter(counter); + #if !defined(WOLFSSL_PIC32MZ_CRYPT) + wc_AesEncrypt(aes, counter, scratch); + xorbufout(p, scratch, c, AES_BLOCK_SIZE); + #endif + p += AES_BLOCK_SIZE; + c += AES_BLOCK_SIZE; + } } if (partial != 0) { - IncrementGcmCounter(ctr); - wc_AesEncrypt(aes, ctr, scratch); + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); xorbuf(scratch, c, partial); XMEMCPY(p, scratch, partial); } diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c index 3319c3914..e1a1dbe5d 100644 --- a/wolfcrypt/src/ecc.c +++ b/wolfcrypt/src/ecc.c @@ -10562,14 +10562,22 @@ int wc_ecc_decrypt(ecc_key* privKey, ecc_key* pubKey, const byte* msg, case ecAES_128_CBC: { Aes aes; - ret = wc_AesSetKey(&aes, encKey, KEY_SIZE_128, encIv, + ret = wc_AesInit(&aes, NULL, INVALID_DEVID); + if (ret == 0) { + ret = wc_AesSetKey(&aes, encKey, KEY_SIZE_128, encIv, AES_DECRYPTION); + if (ret == 0) { + ret = wc_AesCbcDecrypt(&aes, out, msg, + msgSz-digestSz); + #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) + ret = wc_AsyncWait(ret, &aes.asyncDev, + WC_ASYNC_FLAG_NONE); + #endif + } + wc_AesFree(&aes); + } if (ret != 0) - break; - ret = wc_AesCbcDecrypt(&aes, out, msg, msgSz-digestSz); - #if defined(WOLFSSL_ASYNC_CRYPT) && 
defined(WC_ASYNC_ENABLE_AES)
-                ret = wc_AsyncWait(ret, &aes.asyncDev, WC_ASYNC_FLAG_NONE);
-            #endif
+                    break;
             }
             break;
     #endif
diff --git a/wolfcrypt/src/misc.c b/wolfcrypt/src/misc.c
index a2aa6c468..90679197f 100644
--- a/wolfcrypt/src/misc.c
+++ b/wolfcrypt/src/misc.c
@@ -127,6 +127,20 @@ WC_STATIC WC_INLINE word32 ByteReverseWord32(word32 value)
 #elif defined(WOLF_ALLOW_BUILTIN) && \
         defined(__GNUC_PREREQ) && __GNUC_PREREQ(4, 3)
     return (word32)__builtin_bswap32(value);
+#elif defined(__arm__) && defined(__GNUC__)
+    __asm__ volatile (
+        "REV %0, %0 \n"
+        : "+r" (value)
+        :
+    );
+    return value;
+#elif defined(__aarch64__) && defined(__GNUC__)
+    __asm__ volatile (
+        "REV %w0, %w0 \n"
+        : "+r" (value)
+        :
+    );
+    return value;
 #elif defined(FAST_ROTATE)
     /* 5 instructions with rotate instruction, 9 without */
     return (rotrFixed(value, 8U) & 0xff00ff00) |
@@ -193,6 +207,36 @@ WC_STATIC WC_INLINE void ByteReverseWords64(word64* out, const word64* in,
 #endif /* WORD64_AVAILABLE && !WOLFSSL_NO_WORD64_OPS */
 
 #ifndef WOLFSSL_NO_XOR_OPS
+/* This routine performs a bitwise XOR operation of <*a> and <*b> for <n>
+number of wolfssl_words, placing the result in <*r>. */
+WC_STATIC WC_INLINE void XorWordsOut(wolfssl_word* r, const wolfssl_word* a,
+                                     const wolfssl_word* b, word32 n)
+{
+    word32 i;
+
+    for (i = 0; i < n; i++) r[i] = a[i] ^ b[i];
+}
+
+/* This routine performs a bitwise XOR operation of <*buf> and <*mask> for
+<count> bytes, placing the result in <*out>. */
+
+WC_STATIC WC_INLINE void xorbufout(void* out, const void* buf, const void* mask,
+                                   word32 count)
+{
+    if (((wolfssl_word)out | (wolfssl_word)buf | (wolfssl_word)mask | count) %
+                                                        WOLFSSL_WORD_SIZE == 0)
+        XorWordsOut( (wolfssl_word*)out, (wolfssl_word*)buf,
+                     (const wolfssl_word*)mask, count / WOLFSSL_WORD_SIZE);
+    else {
+        word32 i;
+        byte*       o = (byte*)out;
+        byte*       b = (byte*)buf;
+        const byte* m = (const byte*)mask;
+
+        for (i = 0; i < count; i++) o[i] = b[i] ^ m[i];
+    }
+}
+
 /* This routine performs a bitwise XOR operation of <*r> and <*a> for number
 of wolfssl_words, placing the result in <*r>. */
 WC_STATIC WC_INLINE void XorWords(wolfssl_word* r, const wolfssl_word* a, word32 n)
diff --git a/wolfssl/openssl/aes.h b/wolfssl/openssl/aes.h
index de9f5b897..4fdf9fef0 100644
--- a/wolfssl/openssl/aes.h
+++ b/wolfssl/openssl/aes.h
@@ -47,6 +47,12 @@ typedef struct WOLFSSL_AES_KEY {
     #ifdef GCM_TABLE
         /* key-based fast multiplication table. */
         ALIGN16 void* M0[4096 / sizeof(void*)];
+    #elif defined(GCM_TABLE_4BIT)
+        #if defined(BIG_ENDIAN_ORDER) || defined(WC_16BIT_CPU)
+            ALIGN16 byte M0[16][AES_BLOCK_SIZE];
+        #else
+            ALIGN16 byte M0[32][AES_BLOCK_SIZE];
+        #endif
     #endif /* GCM_TABLE */
     #if defined(WOLFSSL_DEVCRYPTO) && \
         (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC))
diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h
index bd704375e..cdd96d02f 100644
--- a/wolfssl/wolfcrypt/aes.h
+++ b/wolfssl/wolfcrypt/aes.h
@@ -171,6 +171,12 @@ struct Aes {
 #ifdef GCM_TABLE
     /* key-based fast multiplication table.
*/ ALIGN16 byte M0[256][AES_BLOCK_SIZE]; +#elif defined(GCM_TABLE_4BIT) + #if defined(BIG_ENDIAN_ORDER) || defined(WC_16BIT_CPU) + ALIGN16 byte M0[16][AES_BLOCK_SIZE]; + #else + ALIGN16 byte M0[32][AES_BLOCK_SIZE]; + #endif #endif /* GCM_TABLE */ #ifdef HAVE_CAVIUM_OCTEON_SYNC word32 y0; diff --git a/wolfssl/wolfcrypt/misc.h b/wolfssl/wolfcrypt/misc.h index 08affe9fb..275ef309d 100644 --- a/wolfssl/wolfcrypt/misc.h +++ b/wolfssl/wolfcrypt/misc.h @@ -55,6 +55,11 @@ word32 ByteReverseWord32(word32); WOLFSSL_LOCAL void ByteReverseWords(word32*, const word32*, word32); +WOLFSSL_LOCAL +void XorWordsOut(wolfssl_word* r, const wolfssl_word* a, const wolfssl_word* b, + word32 n); +WOLFSSL_LOCAL +void xorbufout(void*, const void*, const void*, word32); WOLFSSL_LOCAL void XorWords(wolfssl_word*, const wolfssl_word*, word32); WOLFSSL_LOCAL diff --git a/wolfssl/wolfcrypt/types.h b/wolfssl/wolfcrypt/types.h index 5c071ecb3..d9b1bec04 100644 --- a/wolfssl/wolfcrypt/types.h +++ b/wolfssl/wolfcrypt/types.h @@ -154,6 +154,7 @@ decouple library dependencies with standard string, memory and so on. #ifdef WORD64_AVAILABLE #define WOLFCRYPT_SLOW_WORD64 #endif + #define WC_32BIT_CPU #endif #elif defined(WC_16BIT_CPU) @@ -167,6 +168,7 @@ decouple library dependencies with standard string, memory and so on. typedef word32 wolfssl_word; #define MP_16BIT /* for mp_int, mp_word needs to be twice as big as mp_digit, no 64 bit type so make mp_digit 16 bit */ + #define WC_32BIT_CPU #endif enum {
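
Note: a minimal round trip exercising the new GHASH path. This is a sketch
only; it assumes a build configured with --enable-aesgcm=4bit, the key,
nonce and buffer values are illustrative, and the wc_AesGcm* calls are the
existing public wolfCrypt API:

    #include <wolfssl/wolfcrypt/aes.h>

    static int gcm_4bit_smoke_test(void)
    {
        Aes  aes;
        byte key[16] = {0};                /* illustrative all-zero key */
        byte iv[GCM_NONCE_MID_SZ] = {0};   /* 12-byte nonce: counter fast path */
        byte plain[32] = {0};
        byte cipher[32];
        byte decrypted[32];
        byte tag[AES_BLOCK_SIZE];
        int  ret;

        ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
        if (ret == 0) {
            /* Computes H = E_K(0^128) and fills aes->M0 via GenerateM0(). */
            ret = wc_AesGcmSetKey(&aes, key, sizeof(key));
        }
        if (ret == 0) {
            ret = wc_AesGcmEncrypt(&aes, cipher, plain, sizeof(plain),
                                   iv, sizeof(iv), tag, sizeof(tag),
                                   NULL, 0);
        }
        if (ret == 0) {
            /* Decrypt re-runs GHASH and verifies the tag; 0 means it matched. */
            ret = wc_AesGcmDecrypt(&aes, decrypted, cipher, sizeof(cipher),
                                   iv, sizeof(iv), tag, sizeof(tag),
                                   NULL, 0);
        }
        wc_AesFree(&aes);
        return ret;
    }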