diff --git a/wolfcrypt/src/md5.c b/wolfcrypt/src/md5.c index a3e76eda8..6c2e45d9f 100644 --- a/wolfcrypt/src/md5.c +++ b/wolfcrypt/src/md5.c @@ -1,426 +1,426 @@ -/* md5.c - * - * Copyright (C) 2006-2015 wolfSSL Inc. - * - * This file is part of wolfSSL. (formerly known as CyaSSL) - * - * wolfSSL is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * wolfSSL is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - */ - - -#ifdef HAVE_CONFIG_H - #include <config.h> -#endif - -#include <wolfssl/wolfcrypt/settings.h> - -#if !defined(NO_MD5) && !defined(WOLFSSL_TI_HASH) - -#ifdef WOLFSSL_PIC32MZ_HASH -#define wc_InitMd5 wc_InitMd5_sw -#define wc_Md5Update wc_Md5Update_sw -#define wc_Md5Final wc_Md5Final_sw -#endif - -#include <wolfssl/wolfcrypt/md5.h> -#include <wolfssl/wolfcrypt/error-crypt.h> - -#ifdef NO_INLINE - #include <wolfssl/wolfcrypt/misc.h> -#else - #include <wolfcrypt/src/misc.c> -#endif - -#ifdef FREESCALE_MMCAU - #include "cau_api.h" - #define XTRANSFORM(S,B) cau_md5_hash_n((B), 1, (unsigned char*)(S)->digest) -#else - #define XTRANSFORM(S,B) Transform((S)) -#endif - - -#ifdef STM32F2_HASH - /* - * STM32F2 hardware MD5 support through the STM32F2 standard peripheral - * library. Documentation located in STM32F2xx Standard Peripheral Library - * document (See note in README). - */ - #include "stm32f2xx.h" - - void wc_InitMd5(Md5* md5) - { - /* STM32F2 struct notes: - * md5->buffer = first 4 bytes used to hold partial block if needed - * md5->buffLen = num bytes currently stored in md5->buffer - * md5->loLen = num bytes that have been written to STM32 FIFO - */ - XMEMSET(md5->buffer, 0, MD5_REG_SIZE); - - md5->buffLen = 0; - md5->loLen = 0; - - /* initialize HASH peripheral */ - HASH_DeInit(); - - /* configure algo used, algo mode, datatype */ - HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE); - HASH->CR |= (HASH_AlgoSelection_MD5 | HASH_AlgoMode_HASH - | HASH_DataType_8b); - - /* reset HASH processor */ - HASH->CR |= HASH_CR_INIT; - } - - void wc_Md5Update(Md5* md5, const byte* data, word32 len) - { - word32 i = 0; - word32 fill = 0; - word32 diff = 0; - - /* if saved partial block is available */ - if (md5->buffLen > 0) { - fill = 4 - md5->buffLen; - - /* if enough data to fill, fill and push to FIFO */ - if (fill <= len) { - XMEMCPY((byte*)md5->buffer + md5->buffLen, data, fill); - HASH_DataIn(*(uint32_t*)md5->buffer); - - data += fill; - len -= fill; - md5->loLen += 4; - md5->buffLen = 0; - } else { - /* append partial to existing stored block */ - XMEMCPY((byte*)md5->buffer + md5->buffLen, data, len); - md5->buffLen += len; - return; - } - } - - /* write input block in the IN FIFO */ - for (i = 0; i < len; i += 4) - { - diff = len - i; - if (diff < 4) { - /* store incomplete last block, not yet in FIFO */ - XMEMSET(md5->buffer, 0, MD5_REG_SIZE); - XMEMCPY((byte*)md5->buffer, data, diff); - md5->buffLen = diff; - } else { - HASH_DataIn(*(uint32_t*)data); - data+=4; - } - } - - /* keep track of total data length thus far */ - md5->loLen += (len - md5->buffLen); - } - - void wc_Md5Final(Md5* md5, byte* hash) - { - __IO uint16_t
nbvalidbitsdata = 0; - - /* finish reading any trailing bytes into FIFO */ - if (md5->buffLen > 0) { - HASH_DataIn(*(uint32_t*)md5->buffer); - md5->loLen += md5->buffLen; - } - - /* calculate number of valid bits in last word of input data */ - nbvalidbitsdata = 8 * (md5->loLen % MD5_REG_SIZE); - - /* configure number of valid bits in last word of the data */ - HASH_SetLastWordValidBitsNbr(nbvalidbitsdata); - - /* start HASH processor */ - HASH_StartDigest(); - - /* wait until Busy flag == RESET */ - while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {} - - /* read message digest */ - md5->digest[0] = HASH->HR[0]; - md5->digest[1] = HASH->HR[1]; - md5->digest[2] = HASH->HR[2]; - md5->digest[3] = HASH->HR[3]; - - ByteReverseWords(md5->digest, md5->digest, MD5_DIGEST_SIZE); - - XMEMCPY(hash, md5->digest, MD5_DIGEST_SIZE); - - wc_InitMd5(md5); /* reset state */ - } - -#elif defined(WOLFSSL_IT_HASH) - - /* defined in port/ti_md5.c */ - -#else /* CTaoCrypt software implementation */ - -#ifndef WOLFSSL_HAVE_MIN -#define WOLFSSL_HAVE_MIN - - static INLINE word32 min(word32 a, word32 b) - { - return a > b ? b : a; - } - -#endif /* WOLFSSL_HAVE_MIN */ - -#ifdef TI_HASH_TEST -void wc_InitMd5_ti(Md5* md5) ; -void wc_Md5Update_ti(Md5* md5, const byte* data, word32 len) ; -void wc_Md5Final_ti(Md5* md5, byte* hash) ; -#endif - -void wc_InitMd5(Md5* md5) -{ - md5->digest[0] = 0x67452301L; - md5->digest[1] = 0xefcdab89L; - md5->digest[2] = 0x98badcfeL; - md5->digest[3] = 0x10325476L; - - md5->buffLen = 0; - md5->loLen = 0; - md5->hiLen = 0; - -#ifdef TI_HASH_TEST - wc_InitMd5_ti(md5) ; -#endif -} - -#ifndef FREESCALE_MMCAU - -static void Transform(Md5* md5) -{ -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, z) (y ^ (x | ~z)) - -#define MD5STEP(f, w, x, y, z, data, s) \ - w = rotlFixed(w + f(x, y, z) + data, s) + x - - /* Copy context->state[] to working vars */ - word32 a = md5->digest[0]; - word32 b = md5->digest[1]; - word32 c = md5->digest[2]; - word32 d = md5->digest[3]; - - MD5STEP(F1, a, b, c, d, md5->buffer[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, md5->buffer[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, md5->buffer[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, md5->buffer[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, md5->buffer[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, md5->buffer[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, md5->buffer[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, md5->buffer[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, md5->buffer[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, md5->buffer[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, md5->buffer[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, md5->buffer[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, md5->buffer[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, md5->buffer[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, md5->buffer[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, md5->buffer[15] + 0x49b40821, 22); - - MD5STEP(F2, a, b, c, d, md5->buffer[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, md5->buffer[6] + 0xc040b340, 9); - MD5STEP(F2, c, d, a, b, md5->buffer[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, md5->buffer[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, md5->buffer[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, md5->buffer[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, md5->buffer[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, md5->buffer[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, 
md5->buffer[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, md5->buffer[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, md5->buffer[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, md5->buffer[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, md5->buffer[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, md5->buffer[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, md5->buffer[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, md5->buffer[12] + 0x8d2a4c8a, 20); - - MD5STEP(F3, a, b, c, d, md5->buffer[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, md5->buffer[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, md5->buffer[11] + 0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, md5->buffer[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, md5->buffer[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, md5->buffer[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, md5->buffer[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, md5->buffer[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, md5->buffer[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, md5->buffer[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, md5->buffer[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, md5->buffer[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, md5->buffer[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, md5->buffer[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, md5->buffer[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, md5->buffer[2] + 0xc4ac5665, 23); - - MD5STEP(F4, a, b, c, d, md5->buffer[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, md5->buffer[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, md5->buffer[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, md5->buffer[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, md5->buffer[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, md5->buffer[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, md5->buffer[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, md5->buffer[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, md5->buffer[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, md5->buffer[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, md5->buffer[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, md5->buffer[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, md5->buffer[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, md5->buffer[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, md5->buffer[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, md5->buffer[9] + 0xeb86d391, 21); - - /* Add the working vars back into digest state[] */ - md5->digest[0] += a; - md5->digest[1] += b; - md5->digest[2] += c; - md5->digest[3] += d; -} - -#endif /* FREESCALE_MMCAU */ - - -static INLINE void AddLength(Md5* md5, word32 len) -{ - word32 tmp = md5->loLen; - if ( (md5->loLen += len) < tmp) - md5->hiLen++; /* carry low to high */ -} - - -void wc_Md5Update(Md5* md5, const byte* data, word32 len) -{ - /* do block size increments */ - byte* local = (byte*)md5->buffer; - - while (len) { - word32 add = min(len, MD5_BLOCK_SIZE - md5->buffLen); - XMEMCPY(&local[md5->buffLen], data, add); - - md5->buffLen += add; - data += add; - len -= add; - - if (md5->buffLen == MD5_BLOCK_SIZE) { - #if defined(BIG_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - ByteReverseWords(md5->buffer, md5->buffer, MD5_BLOCK_SIZE); - #endif - XTRANSFORM(md5, local); - AddLength(md5, MD5_BLOCK_SIZE); - md5->buffLen = 0; - } - } -#ifdef TI_HASH_TEST - wc_Md5Update_ti(md5, data, len) ; -#endif - -} - - -void wc_Md5Final(Md5* md5, byte* hash) -{ - byte* local = (byte*)md5->buffer; - - AddLength(md5, md5->buffLen); /* before adding pads */ - - local[md5->buffLen++] = 0x80; /* add 1 */ 
- - /* pad with zeros */ - if (md5->buffLen > MD5_PAD_SIZE) { - XMEMSET(&local[md5->buffLen], 0, MD5_BLOCK_SIZE - md5->buffLen); - md5->buffLen += MD5_BLOCK_SIZE - md5->buffLen; - - #if defined(BIG_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - ByteReverseWords(md5->buffer, md5->buffer, MD5_BLOCK_SIZE); - #endif - XTRANSFORM(md5, local); - md5->buffLen = 0; - } - XMEMSET(&local[md5->buffLen], 0, MD5_PAD_SIZE - md5->buffLen); - - /* put lengths in bits */ - md5->hiLen = (md5->loLen >> (8*sizeof(md5->loLen) - 3)) + - (md5->hiLen << 3); - md5->loLen = md5->loLen << 3; - - /* store lengths */ - #if defined(BIG_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - ByteReverseWords(md5->buffer, md5->buffer, MD5_BLOCK_SIZE); - #endif - /* ! length ordering dependent on digest endian type ! */ - XMEMCPY(&local[MD5_PAD_SIZE], &md5->loLen, sizeof(word32)); - XMEMCPY(&local[MD5_PAD_SIZE + sizeof(word32)], &md5->hiLen, sizeof(word32)); - - XTRANSFORM(md5, local); - #ifdef BIG_ENDIAN_ORDER - ByteReverseWords(md5->digest, md5->digest, MD5_DIGEST_SIZE); - #endif - XMEMCPY(hash, md5->digest, MD5_DIGEST_SIZE); - - wc_InitMd5(md5); /* reset state */ - -#ifdef TI_HASH_TEST - wc_Md5Final_ti(md5, hash) ; -#endif -} - -#endif /* STM32F2_HASH */ - - -int wc_Md5Hash(const byte* data, word32 len, byte* hash) -{ -#ifdef WOLFSSL_SMALL_STACK - Md5* md5; -#else - Md5 md5[1]; -#endif - -#ifdef WOLFSSL_SMALL_STACK - md5 = (Md5*)XMALLOC(sizeof(Md5), NULL, DYNAMIC_TYPE_TMP_BUFFER); - if (md5 == NULL) - return MEMORY_E; -#endif - - wc_InitMd5(md5); - wc_Md5Update(md5, data, len); - wc_Md5Final(md5, hash); - -#ifdef WOLFSSL_SMALL_STACK - XFREE(md5, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return 0; -} - -#if defined(WOLFSSL_TI_HASH)||defined(TI_HASH_TEST) -#include "wolfssl/wolfcrypt/port/ti/ti-hash.h" -#endif -void wc_Md5GetHash(Md5* md5, byte* hash) -{ -#if defined(WOLFSSL_TI_HASH) || defined(TI_HASH_TEST) - wc_Md5GetHash_ti(md5, hash) ; -#else - Md5 save = *md5 ; - wc_Md5Final(md5, hash) ; - *md5 = save ; -#endif -} -#endif /* NO_MD5 */ +/* md5.c + * + * Copyright (C) 2006-2015 wolfSSL Inc. + * + * This file is part of wolfSSL. (formerly known as CyaSSL) + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#if !defined(NO_MD5) && !defined(WOLFSSL_TI_HASH) + +#ifdef WOLFSSL_PIC32MZ_HASH +#define wc_InitMd5 wc_InitMd5_sw +#define wc_Md5Update wc_Md5Update_sw +#define wc_Md5Final wc_Md5Final_sw +#endif + +#include <wolfssl/wolfcrypt/md5.h> +#include <wolfssl/wolfcrypt/error-crypt.h> + +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #include <wolfcrypt/src/misc.c> +#endif + +#ifdef FREESCALE_MMCAU + #include "cau_api.h" + #define XTRANSFORM(S,B) cau_md5_hash_n((B), 1, (unsigned char*)(S)->digest) +#else + #define XTRANSFORM(S,B) Transform((S)) +#endif + + +#ifdef STM32F2_HASH + /* + * STM32F2 hardware MD5 support through the STM32F2 standard peripheral + * library.
Documentation located in STM32F2xx Standard Peripheral Library + * document (See note in README). + */ + #include "stm32f2xx.h" + + void wc_InitMd5(Md5* md5) + { + /* STM32F2 struct notes: + * md5->buffer = first 4 bytes used to hold partial block if needed + * md5->buffLen = num bytes currently stored in md5->buffer + * md5->loLen = num bytes that have been written to STM32 FIFO + */ + XMEMSET(md5->buffer, 0, MD5_REG_SIZE); + + md5->buffLen = 0; + md5->loLen = 0; + + /* initialize HASH peripheral */ + HASH_DeInit(); + + /* configure algo used, algo mode, datatype */ + HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE); + HASH->CR |= (HASH_AlgoSelection_MD5 | HASH_AlgoMode_HASH + | HASH_DataType_8b); + + /* reset HASH processor */ + HASH->CR |= HASH_CR_INIT; + } + + void wc_Md5Update(Md5* md5, const byte* data, word32 len) + { + word32 i = 0; + word32 fill = 0; + word32 diff = 0; + + /* if saved partial block is available */ + if (md5->buffLen > 0) { + fill = 4 - md5->buffLen; + + /* if enough data to fill, fill and push to FIFO */ + if (fill <= len) { + XMEMCPY((byte*)md5->buffer + md5->buffLen, data, fill); + HASH_DataIn(*(uint32_t*)md5->buffer); + + data += fill; + len -= fill; + md5->loLen += 4; + md5->buffLen = 0; + } else { + /* append partial to existing stored block */ + XMEMCPY((byte*)md5->buffer + md5->buffLen, data, len); + md5->buffLen += len; + return; + } + } + + /* write input block in the IN FIFO */ + for (i = 0; i < len; i += 4) + { + diff = len - i; + if (diff < 4) { + /* store incomplete last block, not yet in FIFO */ + XMEMSET(md5->buffer, 0, MD5_REG_SIZE); + XMEMCPY((byte*)md5->buffer, data, diff); + md5->buffLen = diff; + } else { + HASH_DataIn(*(uint32_t*)data); + data+=4; + } + } + + /* keep track of total data length thus far */ + md5->loLen += (len - md5->buffLen); + } + + void wc_Md5Final(Md5* md5, byte* hash) + { + __IO uint16_t nbvalidbitsdata = 0; + + /* finish reading any trailing bytes into FIFO */ + if (md5->buffLen > 0) { + HASH_DataIn(*(uint32_t*)md5->buffer); + md5->loLen += md5->buffLen; + } + + /* calculate number of valid bits in last word of input data */ + nbvalidbitsdata = 8 * (md5->loLen % MD5_REG_SIZE); + + /* configure number of valid bits in last word of the data */ + HASH_SetLastWordValidBitsNbr(nbvalidbitsdata); + + /* start HASH processor */ + HASH_StartDigest(); + + /* wait until Busy flag == RESET */ + while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {} + + /* read message digest */ + md5->digest[0] = HASH->HR[0]; + md5->digest[1] = HASH->HR[1]; + md5->digest[2] = HASH->HR[2]; + md5->digest[3] = HASH->HR[3]; + + ByteReverseWords(md5->digest, md5->digest, MD5_DIGEST_SIZE); + + XMEMCPY(hash, md5->digest, MD5_DIGEST_SIZE); + + wc_InitMd5(md5); /* reset state */ + } + +#elif defined(WOLFSSL_IT_HASH) + + /* defined in port/ti_md5.c */ + +#else /* CTaoCrypt software implementation */ + +#ifndef WOLFSSL_HAVE_MIN +#define WOLFSSL_HAVE_MIN + + static INLINE word32 min(word32 a, word32 b) + { + return a > b ? 
b : a; + } + +#endif /* WOLFSSL_HAVE_MIN */ + +#ifdef TI_HASH_TEST +void wc_InitMd5_ti(Md5* md5) ; +void wc_Md5Update_ti(Md5* md5, const byte* data, word32 len) ; +void wc_Md5Final_ti(Md5* md5, byte* hash) ; +#endif + +void wc_InitMd5(Md5* md5) +{ + md5->digest[0] = 0x67452301L; + md5->digest[1] = 0xefcdab89L; + md5->digest[2] = 0x98badcfeL; + md5->digest[3] = 0x10325476L; + + md5->buffLen = 0; + md5->loLen = 0; + md5->hiLen = 0; + +#ifdef TI_HASH_TEST + wc_InitMd5_ti(md5) ; +#endif +} + +#ifndef FREESCALE_MMCAU + +static void Transform(Md5* md5) +{ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define MD5STEP(f, w, x, y, z, data, s) \ + w = rotlFixed(w + f(x, y, z) + data, s) + x + + /* Copy context->state[] to working vars */ + word32 a = md5->digest[0]; + word32 b = md5->digest[1]; + word32 c = md5->digest[2]; + word32 d = md5->digest[3]; + + MD5STEP(F1, a, b, c, d, md5->buffer[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, md5->buffer[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, md5->buffer[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, md5->buffer[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, md5->buffer[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, md5->buffer[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, md5->buffer[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, md5->buffer[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, md5->buffer[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, md5->buffer[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, md5->buffer[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, md5->buffer[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, md5->buffer[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, md5->buffer[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, md5->buffer[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, md5->buffer[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, md5->buffer[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, md5->buffer[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, md5->buffer[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, md5->buffer[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, md5->buffer[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, md5->buffer[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, md5->buffer[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, md5->buffer[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, md5->buffer[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, md5->buffer[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, md5->buffer[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, md5->buffer[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, md5->buffer[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, md5->buffer[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, md5->buffer[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, md5->buffer[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, md5->buffer[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, md5->buffer[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, md5->buffer[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, md5->buffer[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, md5->buffer[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, md5->buffer[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, md5->buffer[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, md5->buffer[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, md5->buffer[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, md5->buffer[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, md5->buffer[3] + 0xd4ef3085, 16); + MD5STEP(F3, 
b, c, d, a, md5->buffer[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, md5->buffer[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, md5->buffer[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, md5->buffer[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, md5->buffer[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, md5->buffer[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, md5->buffer[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, md5->buffer[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, md5->buffer[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, md5->buffer[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, md5->buffer[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, md5->buffer[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, md5->buffer[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, md5->buffer[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, md5->buffer[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, md5->buffer[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, md5->buffer[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, md5->buffer[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, md5->buffer[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, md5->buffer[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, md5->buffer[9] + 0xeb86d391, 21); + + /* Add the working vars back into digest state[] */ + md5->digest[0] += a; + md5->digest[1] += b; + md5->digest[2] += c; + md5->digest[3] += d; +} + +#endif /* FREESCALE_MMCAU */ + + +static INLINE void AddLength(Md5* md5, word32 len) +{ + word32 tmp = md5->loLen; + if ( (md5->loLen += len) < tmp) + md5->hiLen++; /* carry low to high */ +} + + +void wc_Md5Update(Md5* md5, const byte* data, word32 len) +{ + /* do block size increments */ + byte* local = (byte*)md5->buffer; + + while (len) { + word32 add = min(len, MD5_BLOCK_SIZE - md5->buffLen); + XMEMCPY(&local[md5->buffLen], data, add); + + md5->buffLen += add; + data += add; + len -= add; + + if (md5->buffLen == MD5_BLOCK_SIZE) { + #if defined(BIG_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + ByteReverseWords(md5->buffer, md5->buffer, MD5_BLOCK_SIZE); + #endif + XTRANSFORM(md5, local); + AddLength(md5, MD5_BLOCK_SIZE); + md5->buffLen = 0; + } + } +#ifdef TI_HASH_TEST + wc_Md5Update_ti(md5, data, len) ; +#endif + +} + + +void wc_Md5Final(Md5* md5, byte* hash) +{ + byte* local = (byte*)md5->buffer; + + AddLength(md5, md5->buffLen); /* before adding pads */ + + local[md5->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (md5->buffLen > MD5_PAD_SIZE) { + XMEMSET(&local[md5->buffLen], 0, MD5_BLOCK_SIZE - md5->buffLen); + md5->buffLen += MD5_BLOCK_SIZE - md5->buffLen; + + #if defined(BIG_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + ByteReverseWords(md5->buffer, md5->buffer, MD5_BLOCK_SIZE); + #endif + XTRANSFORM(md5, local); + md5->buffLen = 0; + } + XMEMSET(&local[md5->buffLen], 0, MD5_PAD_SIZE - md5->buffLen); + + /* put lengths in bits */ + md5->hiLen = (md5->loLen >> (8*sizeof(md5->loLen) - 3)) + + (md5->hiLen << 3); + md5->loLen = md5->loLen << 3; + + /* store lengths */ + #if defined(BIG_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + ByteReverseWords(md5->buffer, md5->buffer, MD5_BLOCK_SIZE); + #endif + /* ! length ordering dependent on digest endian type ! 
*/ + XMEMCPY(&local[MD5_PAD_SIZE], &md5->loLen, sizeof(word32)); + XMEMCPY(&local[MD5_PAD_SIZE + sizeof(word32)], &md5->hiLen, sizeof(word32)); + + XTRANSFORM(md5, local); + #ifdef BIG_ENDIAN_ORDER + ByteReverseWords(md5->digest, md5->digest, MD5_DIGEST_SIZE); + #endif + XMEMCPY(hash, md5->digest, MD5_DIGEST_SIZE); + + wc_InitMd5(md5); /* reset state */ + +#ifdef TI_HASH_TEST + wc_Md5Final_ti(md5, hash) ; +#endif +} + +#endif /* STM32F2_HASH */ + + +int wc_Md5Hash(const byte* data, word32 len, byte* hash) +{ +#ifdef WOLFSSL_SMALL_STACK + Md5* md5; +#else + Md5 md5[1]; +#endif + +#ifdef WOLFSSL_SMALL_STACK + md5 = (Md5*)XMALLOC(sizeof(Md5), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (md5 == NULL) + return MEMORY_E; +#endif + + wc_InitMd5(md5); + wc_Md5Update(md5, data, len); + wc_Md5Final(md5, hash); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(md5, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return 0; +} + +#if defined(WOLFSSL_TI_HASH)||defined(TI_HASH_TEST) +#include "wolfssl/wolfcrypt/port/ti/ti-hash.h" +#endif +void wc_Md5GetHash(Md5* md5, byte* hash) +{ +#if defined(WOLFSSL_TI_HASH) || defined(TI_HASH_TEST) + wc_Md5GetHash_ti(md5, hash) ; +#else + Md5 save = *md5 ; + wc_Md5Final(md5, hash) ; + *md5 = save ; +#endif +} +#endif /* NO_MD5 */
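Both halves of the md5.c hunk above carry the same code, so the public MD5 API (wc_InitMd5/wc_Md5Update/wc_Md5Final and the one-shot wc_Md5Hash helper) is unchanged by this rewrite. A quick way to exercise that helper is the RFC 1321 "abc" test vector; the sketch below is illustrative only, assuming <wolfssl/wolfcrypt/md5.h> is on the include path (md5_selftest is a hypothetical name):

    #include <wolfssl/wolfcrypt/md5.h>

    /* Hypothetical self-test: MD5("abc") must match the RFC 1321 vector. */
    int md5_selftest(void)
    {
        const byte msg[] = { 'a', 'b', 'c' };
        const byte expect[MD5_DIGEST_SIZE] = {
            0x90,0x01,0x50,0x98,0x3c,0xd2,0x4f,0xb0,
            0xd6,0x96,0x3f,0x7d,0x28,0xe1,0x7f,0x72
        };
        byte digest[MD5_DIGEST_SIZE];

        if (wc_Md5Hash(msg, sizeof(msg), digest) != 0)
            return -1;   /* only fails as MEMORY_E under WOLFSSL_SMALL_STACK */
        return XMEMCMP(digest, expect, MD5_DIGEST_SIZE) == 0 ? 0 : -1;
    }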
diff --git a/wolfcrypt/src/port/ti/ti-aes.c b/wolfcrypt/src/port/ti/ti-aes.c index 11a0680b6..8ca83385c 100644 --- a/wolfcrypt/src/port/ti/ti-aes.c +++ b/wolfcrypt/src/port/ti/ti-aes.c @@ -1,548 +1,548 @@ -/* port/ti/ti-aes.c - * - * Copyright (C) 2006-2015 wolfSSL Inc. - * - * This file is part of wolfSSL. (formerly known as CyaSSL) - * - * wolfSSL is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * wolfSSL is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details.
 - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - */ - -#ifdef HAVE_CONFIG_H - #include <config.h> -#endif - -#include <wolfssl/wolfcrypt/settings.h> - -#ifndef NO_AES - - -#if defined(WOLFSSL_TI_CRYPT) -#include <stdbool.h> -#include <stdint.h> - -#include <wolfssl/wolfcrypt/aes.h> -#include <wolfssl/wolfcrypt/error-crypt.h> -#include <wolfssl/wolfcrypt/port/ti/ti-ccm.h> - -#include "inc/hw_aes.h" -#include "inc/hw_memmap.h" -#include "inc/hw_ints.h" -#include "driverlib/aes.h" -#include "driverlib/sysctl.h" -#include "driverlib/rom_map.h" -#include "driverlib/rom.h" - -static int AesSetIV(Aes* aes, const byte* iv) -{ - if (aes == NULL) - return BAD_FUNC_ARG; - - if (iv) - XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE); - else - XMEMSET(aes->reg, 0, AES_BLOCK_SIZE); - - return 0; -} - -WOLFSSL_API int wc_AesSetKey(Aes* aes, const byte* key, word32 len, const byte* iv, - int dir) -{ - if(!wolfSSL_TI_CCMInit())return 1 ; - if ((aes == NULL) || (key == NULL) || (iv == NULL)) - return BAD_FUNC_ARG; - if(!((dir == AES_ENCRYPTION) || (dir == AES_DECRYPTION))) - return BAD_FUNC_ARG; - - switch(len) { - case 16: aes->keylen = AES_CFG_KEY_SIZE_128BIT ; break ; - case 24: aes->keylen = AES_CFG_KEY_SIZE_192BIT ; break ; - case 32: aes->keylen = AES_CFG_KEY_SIZE_256BIT ; break ; - default: return BAD_FUNC_ARG; - } - - XMEMCPY(aes->key, key, len) ; - #ifdef WOLFSSL_AES_COUNTER - aes->left = 0; - #endif /* WOLFSSL_AES_COUNTER */ - return AesSetIV(aes, iv); -} - -#define AES_CFG_MODE_CTR_NOCTR AES_CFG_MODE_CTR+100 -#define IS_ALIGN16(p) (((unsigned int)(p)&0xf) == 0) - -static int AesAlign16(Aes* aes, byte* out, const byte* in, word32 sz, word32 dir, word32 mode) -{ - wolfSSL_TI_lockCCM() ; - ROM_AESReset(AES_BASE); - ROM_AESConfigSet(AES_BASE, (aes->keylen | dir | - (mode==AES_CFG_MODE_CTR_NOCTR ? AES_CFG_MODE_CTR : mode))); - AESIVSet(AES_BASE, aes->reg); - ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen); - if((dir == AES_CFG_DIR_DECRYPT)&& (mode == AES_CFG_MODE_CBC)) - /* if input and output same will overwrite input iv */ - XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); - ROM_AESDataProcess(AES_BASE, (uint32_t *)in, (uint32_t *)out, sz); - wolfSSL_TI_unlockCCM() ; - - /* store iv for next call */ - if(mode == AES_CFG_MODE_CBC){ - if(dir == AES_CFG_DIR_ENCRYPT) - XMEMCPY(aes->reg, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); - else - XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE); - } - - if(mode == AES_CFG_MODE_CTR) { - do { - int i ; - for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) { - if (++((byte *)aes->reg)[i]) - break ; - } - sz -= AES_BLOCK_SIZE ; - } while((int)sz > 0) ; - } - - return 0 ; -} - -static int AesProcess(Aes* aes, byte* out, const byte* in, word32 sz, word32 dir, word32 mode) -{ - const byte * in_p ; byte * out_p ; - word32 size ; - #define TI_BUFFSIZE 1024 - byte buff[TI_BUFFSIZE] ; - - if ((aes == NULL) || (in == NULL) || (out == NULL)) - return BAD_FUNC_ARG; - if(sz % AES_BLOCK_SIZE) - return BAD_FUNC_ARG; - - while(sz > 0) { - size = sz ; in_p = in ; out_p = out ; - if(!IS_ALIGN16(in)){ - size = sz>TI_BUFFSIZE ? TI_BUFFSIZE : sz ; - XMEMCPY(buff, in, size) ; - in_p = (const byte *)buff ; - } - if(!IS_ALIGN16(out)){ - size = sz>TI_BUFFSIZE ?
TI_BUFFSIZE : sz ; - out_p = buff ; - } - - AesAlign16(aes, out_p, in_p, size, dir, mode) ; - - if(!IS_ALIGN16(out)){ - XMEMCPY(out, buff, size) ; - } - sz -= size ; in += size ; out += size ; - } - - return 0 ; -} - -WOLFSSL_API int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) -{ - return AesProcess(aes, out, in, sz, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CBC) ; -} - -WOLFSSL_API int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) -{ - return AesProcess(aes, out, in, sz, AES_CFG_DIR_DECRYPT, AES_CFG_MODE_CBC) ; -} - -#ifdef WOLFSSL_AES_COUNTER -WOLFSSL_API void wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) -{ - char out_block[AES_BLOCK_SIZE] ; - int odd ; - int even ; - char *tmp ; /* (char *)aes->tmp, for short */ - - tmp = (char *)aes->tmp ; - if(aes->left) { - if((aes->left + sz) >= AES_BLOCK_SIZE){ - odd = AES_BLOCK_SIZE - aes->left ; - } else { - odd = sz ; - } - XMEMCPY(tmp+aes->left, in, odd) ; - if((odd+aes->left) == AES_BLOCK_SIZE){ - AesProcess(aes, (byte *)out_block, (byte const *)tmp, AES_BLOCK_SIZE, - AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CTR) ; - XMEMCPY(out, out_block+aes->left, odd) ; - aes->left = 0 ; - XMEMSET(tmp, 0x0, AES_BLOCK_SIZE) ; - } - in += odd ; - out+= odd ; - sz -= odd ; - } - odd = sz % AES_BLOCK_SIZE ; /* if there is tail flagment */ - if(sz / AES_BLOCK_SIZE) { - even = (sz/AES_BLOCK_SIZE)*AES_BLOCK_SIZE ; - AesProcess(aes, out, in, even, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CTR); - out += even ; - in += even ; - } - if(odd) { - XMEMSET(tmp+aes->left, 0x0, AES_BLOCK_SIZE - aes->left) ; - XMEMCPY(tmp+aes->left, in, odd) ; - AesProcess(aes, (byte *)out_block, (byte const *)tmp, AES_BLOCK_SIZE, - AES_CFG_DIR_ENCRYPT, - AES_CFG_MODE_CTR_NOCTR /* Counter mode without counting IV */ - ); - XMEMCPY(out, out_block+aes->left,odd) ; - aes->left += odd ; - } -} -#endif - -/* AES-DIRECT */ -#if defined(WOLFSSL_AES_DIRECT) -WOLFSSL_API void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in) -{ - AesProcess(aes, out, in, AES_BLOCK_SIZE, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CBC) ; -} -WOLFSSL_API void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in) -{ - AesProcess(aes, out, in, AES_BLOCK_SIZE, AES_CFG_DIR_DECRYPT, AES_CFG_MODE_CBC) ; -} -WOLFSSL_API int wc_AesSetKeyDirect(Aes* aes, const byte* key, word32 len, - const byte* iv, int dir) -{ - return(wc_AesSetKey(aes, key, len, iv, dir)) ; -} -#endif - - -#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) - -static int AesAuthSetKey(Aes* aes, const byte* key, word32 keySz) -{ - byte nonce[AES_BLOCK_SIZE]; - - if ((aes == NULL) || (key == NULL)) - return BAD_FUNC_ARG ; - if (!((keySz == 16) || (keySz == 24) || (keySz == 32))) - return BAD_FUNC_ARG ; - - XMEMSET(nonce, 0, sizeof(nonce)); - return wc_AesSetKey(aes, key, keySz, nonce, AES_ENCRYPTION); -} - - -static int AesAuthArgCheck(Aes* aes, byte* out, const byte* in, word32 inSz, - const byte* nonce, word32 nonceSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz, word32 *M, word32 *L) -{ - if((aes == NULL)||(nonce == NULL)||(authTag== NULL)||(authIn == NULL)) - return BAD_FUNC_ARG; - if((inSz != 0) && ((out == NULL)||(in == NULL))) - return BAD_FUNC_ARG; - - switch(authTagSz){ - case 4: - *M = AES_CFG_CCM_M_4; break ; - case 6: - *M = AES_CFG_CCM_M_6; break ; - case 8: - *M = AES_CFG_CCM_M_8; break ; - case 10: - *M = AES_CFG_CCM_M_10; break ; - case 12: - *M = AES_CFG_CCM_M_12; break ; - case 14: - *M = AES_CFG_CCM_M_14; break ; - case 16: - *M = AES_CFG_CCM_M_16; break ; - default: - 
return 1 ; - } - - switch(nonceSz){ - case 7: - *L = AES_CFG_CCM_L_8; break ; - case 8: - *L = AES_CFG_CCM_L_7; break ; - case 9: - *L = AES_CFG_CCM_L_6; break ; - case 10: - *L = AES_CFG_CCM_L_5; break ; - case 11: - *L = AES_CFG_CCM_L_4; break ; - case 12: - *L = AES_CFG_CCM_L_3; break ; - case 13: - *L = AES_CFG_CCM_L_2; break ; - case 14: - *L = AES_CFG_CCM_L_1; break ; - default: - return 1; - } - return 0 ; -} - -static void AesAuthSetIv(Aes *aes, const byte *nonce, word32 len, word32 L, int mode) { - - if(mode == AES_CFG_MODE_CCM){ - XMEMSET(aes->reg, 0, 16) ; - switch(L){ - case AES_CFG_CCM_L_8: - aes->reg[0] = 0x7; break ; - case AES_CFG_CCM_L_7: - aes->reg[0] = 0x6; break ; - case AES_CFG_CCM_L_6: - aes->reg[0] = 0x5; break ; - case AES_CFG_CCM_L_5: - aes->reg[0] = 0x4; break ; - case AES_CFG_CCM_L_4: - aes->reg[0] = 0x3; break ; - case AES_CFG_CCM_L_3: - aes->reg[0] = 0x2; break ; - case AES_CFG_CCM_L_2: - aes->reg[0] = 0x1; break ; - case AES_CFG_CCM_L_1: - aes->reg[0] = 0x0; break ; - } - XMEMCPY(((byte *)aes->reg)+1, nonce, len) ; - } else { - byte *b = (byte *)aes->reg ; - XMEMSET(aes->reg, 0, AES_BLOCK_SIZE); - XMEMCPY(aes->reg, nonce, len); - b[AES_BLOCK_SIZE-4] = 0 ; - b[AES_BLOCK_SIZE-3] = 0 ; - b[AES_BLOCK_SIZE-2] = 0 ; - b[AES_BLOCK_SIZE-1] = 1 ; - } -} - -#define RoundUp16(n) ((n+15)&0xfffffff0) -#define FREE_ALL \ - if(in_save) XFREE(in_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\ - if(out_save) XFREE(out_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\ - if(authIn_save)XFREE(authIn_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\ - if(nonce_save) XFREE(nonce_save, NULL, DYNAMIC_TYPE_TMP_BUFFER); - -static int AesAuthEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, - const byte* nonce, word32 nonceSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz, int mode) -{ - word32 M, L ; - byte *in_a, *in_save ; - byte *out_a, *out_save ; - byte *authIn_a, *authIn_save ; - byte *nonce_a, *nonce_save ; - word32 tmpTag[4] ; - int ret ; - - if(AesAuthArgCheck(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, authIn, authInSz, &M, &L) - == BAD_FUNC_ARG)return BAD_FUNC_ARG ; - - /* 16 byte padding */ - in_save = NULL ; out_save = NULL ; authIn_save = NULL ; nonce_save = NULL ; - if((inSz%16)==0){ - in_save = NULL ; in_a = (byte *)in ; - out_save = NULL ; out_a = out ; - } else { - if((in_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E ; } - in_a = in_save ; XMEMSET(in_a, 0, RoundUp16(inSz)) ; XMEMCPY(in_a, in, inSz) ; - - if((out_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E ; } - out_a = out_save ; - } - - if((authInSz%16)==0){ - authIn_save = NULL ; authIn_a = (byte *)authIn ; - } else { - if((authIn_save = XMALLOC(RoundUp16(authInSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E ; } - authIn_a = authIn_save ; XMEMSET(authIn_a, 0, RoundUp16(authInSz)) ; XMEMCPY(authIn_a, authIn, authInSz) ; - } - - if((nonceSz%16)==0){ - nonce_save = NULL ; nonce_a = (byte *)nonce ; - } else { - if((nonce_save = XMALLOC(RoundUp16(nonceSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E; } - nonce_a = nonce_save ; XMEMSET(nonce_a, 0, RoundUp16(nonceSz)) ; XMEMCPY(nonce_a, nonce, nonceSz) ; - } - - /* do aes-ccm */ - AesAuthSetIv(aes, nonce, nonceSz, L, mode) ; - ROM_AESReset(AES_BASE); - ROM_AESConfigSet(AES_BASE, (aes->keylen | AES_CFG_DIR_ENCRYPT | - AES_CFG_CTR_WIDTH_128 | - mode | ((mode== AES_CFG_MODE_CCM) ? 
(L | M) : 0 ))) ; - ROM_AESIVSet(AES_BASE, aes->reg); - ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen); - ret = ROM_AESDataProcessAuth(AES_BASE, (unsigned int*)in_a, (unsigned int *)out_a, inSz, - (unsigned int*)authIn_a, authInSz, (unsigned int *)tmpTag); - if(ret == false){ - XMEMSET(out, 0, inSz) ; - XMEMSET(authTag, 0, authTagSz) ; - } else { - XMEMCPY(out, out_a, inSz) ; - XMEMCPY(authTag, tmpTag, authTagSz) ; - } - - FREE_ALL; - return 0 ; -} - -static int AesAuthDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, - const byte* nonce, word32 nonceSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz, int mode) -{ - word32 M, L ; - byte *in_a, *in_save ; - byte *out_a, *out_save ; - byte *authIn_a, *authIn_save ; - byte *nonce_a, *nonce_save ; - word32 tmpTag[4] ; - bool ret ; - - if(AesAuthArgCheck(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, authIn, authInSz, &M, &L) - == BAD_FUNC_ARG)return BAD_FUNC_ARG ; - - /* 16 byte padding */ - in_save = NULL ; out_save = NULL ; authIn_save = NULL ; nonce_save = NULL ; - if((inSz%16)==0){ - in_save = NULL ; in_a = (byte *)in ; - out_save = NULL ; out_a = out ; - } else { - if((in_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E;} - in_a = in_save ; XMEMSET(in_a, 0, RoundUp16(inSz)) ; XMEMCPY(in_a, in, inSz) ; - - if((out_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E;} - out_a = out_save ; - } - - if((authInSz%16)==0){ - authIn_save = NULL ; authIn_a = (byte *)authIn ; - } else { - if((authIn_save = XMALLOC(RoundUp16(authInSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E; } - authIn_a = authIn_save ; XMEMSET(authIn_a, 0, RoundUp16(authInSz)) ; XMEMCPY(authIn_a, authIn, authInSz) ; - } - - if((nonceSz%16)==0){ - nonce_save = NULL ; nonce_a = (byte *)nonce ; - } else { - if((nonce_save = XMALLOC(RoundUp16(nonceSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ - FREE_ALL; return MEMORY_E; } - nonce_a = nonce_save ; XMEMSET(nonce_a, 0, RoundUp16(nonceSz)) ; XMEMCPY(nonce_a, nonce, nonceSz) ; - } - - /* do aes-ccm */ - AesAuthSetIv(aes, nonce, nonceSz, L, mode) ; - ROM_AESReset(AES_BASE); - ROM_AESConfigSet(AES_BASE, (aes->keylen | AES_CFG_DIR_DECRYPT | - AES_CFG_CTR_WIDTH_128 | - mode | ((mode== AES_CFG_MODE_CCM) ? (L | M) : 0 ))) ; - ROM_AESIVSet(AES_BASE, aes->reg); - ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen); - ret = ROM_AESDataProcessAuth(AES_BASE, (unsigned int*)in_a, (unsigned int *)out_a, inSz, - (unsigned int*)authIn_a, authInSz, (unsigned int *)tmpTag); - if((ret == false) || (XMEMCMP(authTag, tmpTag, authTagSz) != 0)){ - XMEMSET(out, 0, inSz) ; - ret = false ; - } else { - XMEMCPY(out, out_a, inSz) ; - } - - FREE_ALL ; - return ret==true ? 
0 : 1 ; -} -#endif - - -#ifdef HAVE_AESGCM -WOLFSSL_API int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) -{ - return AesAuthSetKey(aes, key, len) ; -} - -WOLFSSL_API int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - return AesAuthEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ; -} -WOLFSSL_API int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - return AesAuthDecrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ; -} - -WOLFSSL_API int wc_GmacSetKey(Gmac* gmac, const byte* key, word32 len) -{ - return AesAuthSetKey(&gmac->aes, key, len) ; -} - -WOLFSSL_API int wc_GmacUpdate(Gmac* gmac, const byte* iv, word32 ivSz, - const byte* authIn, word32 authInSz, - byte* authTag, word32 authTagSz) -{ - return AesAuthEncrypt(&gmac->aes, NULL, NULL, 0, iv, ivSz, authTag, authTagSz, - authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ; -} - -#endif /* HAVE_AESGCM */ - -#ifdef HAVE_AESCCM -WOLFSSL_API void wc_AesCcmSetKey(Aes* aes, const byte* key, word32 keySz) -{ - AesAuthSetKey(aes, key, keySz) ; -} - -WOLFSSL_API void wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, - const byte* nonce, word32 nonceSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - AesAuthEncrypt(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, - authIn, authInSz, AES_CFG_MODE_CCM) ; -} - -WOLFSSL_API int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, - const byte* nonce, word32 nonceSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - return AesAuthDecrypt(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, - authIn, authInSz, AES_CFG_MODE_CCM) ; -} -#endif /* HAVE_AESCCM */ - -#endif /* WOLFSSL_TI_CRYPT */ - -#endif /* NO_AES */ - - - +/* port/ti/ti-aes.c + * + * Copyright (C) 2006-2015 wolfSSL Inc. + * + * This file is part of wolfSSL. (formerly known as CyaSSL) + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifndef NO_AES + + +#if defined(WOLFSSL_TI_CRYPT) +#include <stdbool.h> +#include <stdint.h> + +#include <wolfssl/wolfcrypt/aes.h> +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/port/ti/ti-ccm.h> + +#include "inc/hw_aes.h" +#include "inc/hw_memmap.h" +#include "inc/hw_ints.h" +#include "driverlib/aes.h" +#include "driverlib/sysctl.h" +#include "driverlib/rom_map.h" +#include "driverlib/rom.h" + +static int AesSetIV(Aes* aes, const byte* iv) +{ + if (aes == NULL) + return BAD_FUNC_ARG; + + if (iv) + XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE); + else + XMEMSET(aes->reg, 0, AES_BLOCK_SIZE); + + return 0; +} + +WOLFSSL_API int wc_AesSetKey(Aes* aes, const byte* key, word32 len, const byte* iv, + int dir) +{ + if(!wolfSSL_TI_CCMInit())return 1 ; + if ((aes == NULL) || (key == NULL) || (iv == NULL)) + return BAD_FUNC_ARG; + if(!((dir == AES_ENCRYPTION) || (dir == AES_DECRYPTION))) + return BAD_FUNC_ARG; + + switch(len) { + case 16: aes->keylen = AES_CFG_KEY_SIZE_128BIT ; break ; + case 24: aes->keylen = AES_CFG_KEY_SIZE_192BIT ; break ; + case 32: aes->keylen = AES_CFG_KEY_SIZE_256BIT ; break ; + default: return BAD_FUNC_ARG; + } + + XMEMCPY(aes->key, key, len) ; + #ifdef WOLFSSL_AES_COUNTER + aes->left = 0; + #endif /* WOLFSSL_AES_COUNTER */ + return AesSetIV(aes, iv); +} + +#define AES_CFG_MODE_CTR_NOCTR AES_CFG_MODE_CTR+100 +#define IS_ALIGN16(p) (((unsigned int)(p)&0xf) == 0) + +static int AesAlign16(Aes* aes, byte* out, const byte* in, word32 sz, word32 dir, word32 mode) +{ + wolfSSL_TI_lockCCM() ; + ROM_AESReset(AES_BASE); + ROM_AESConfigSet(AES_BASE, (aes->keylen | dir | + (mode==AES_CFG_MODE_CTR_NOCTR ? AES_CFG_MODE_CTR : mode))); + AESIVSet(AES_BASE, aes->reg); + ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen); + if((dir == AES_CFG_DIR_DECRYPT)&& (mode == AES_CFG_MODE_CBC)) + /* if input and output same will overwrite input iv */ + XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); + ROM_AESDataProcess(AES_BASE, (uint32_t *)in, (uint32_t *)out, sz); + wolfSSL_TI_unlockCCM() ; + + /* store iv for next call */ + if(mode == AES_CFG_MODE_CBC){ + if(dir == AES_CFG_DIR_ENCRYPT) + XMEMCPY(aes->reg, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); + else + XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE); + } + + if(mode == AES_CFG_MODE_CTR) { + do { + int i ; + for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) { + if (++((byte *)aes->reg)[i]) + break ; + } + sz -= AES_BLOCK_SIZE ; + } while((int)sz > 0) ; + } + + return 0 ; +} + +static int AesProcess(Aes* aes, byte* out, const byte* in, word32 sz, word32 dir, word32 mode) +{ + const byte * in_p ; byte * out_p ; + word32 size ; + #define TI_BUFFSIZE 1024 + byte buff[TI_BUFFSIZE] ; + + if ((aes == NULL) || (in == NULL) || (out == NULL)) + return BAD_FUNC_ARG; + if(sz % AES_BLOCK_SIZE) + return BAD_FUNC_ARG; + + while(sz > 0) { + size = sz ; in_p = in ; out_p = out ; + if(!IS_ALIGN16(in)){ + size = sz>TI_BUFFSIZE ? TI_BUFFSIZE : sz ; + XMEMCPY(buff, in, size) ; + in_p = (const byte *)buff ; + } + if(!IS_ALIGN16(out)){ + size = sz>TI_BUFFSIZE ?
TI_BUFFSIZE : sz ; + out_p = buff ; + } + + AesAlign16(aes, out_p, in_p, size, dir, mode) ; + + if(!IS_ALIGN16(out)){ + XMEMCPY(out, buff, size) ; + } + sz -= size ; in += size ; out += size ; + } + + return 0 ; +} + +WOLFSSL_API int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) +{ + return AesProcess(aes, out, in, sz, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CBC) ; +} + +WOLFSSL_API int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) +{ + return AesProcess(aes, out, in, sz, AES_CFG_DIR_DECRYPT, AES_CFG_MODE_CBC) ; +} + +#ifdef WOLFSSL_AES_COUNTER +WOLFSSL_API void wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) +{ + char out_block[AES_BLOCK_SIZE] ; + int odd ; + int even ; + char *tmp ; /* (char *)aes->tmp, for short */ + + tmp = (char *)aes->tmp ; + if(aes->left) { + if((aes->left + sz) >= AES_BLOCK_SIZE){ + odd = AES_BLOCK_SIZE - aes->left ; + } else { + odd = sz ; + } + XMEMCPY(tmp+aes->left, in, odd) ; + if((odd+aes->left) == AES_BLOCK_SIZE){ + AesProcess(aes, (byte *)out_block, (byte const *)tmp, AES_BLOCK_SIZE, + AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CTR) ; + XMEMCPY(out, out_block+aes->left, odd) ; + aes->left = 0 ; + XMEMSET(tmp, 0x0, AES_BLOCK_SIZE) ; + } + in += odd ; + out+= odd ; + sz -= odd ; + } + odd = sz % AES_BLOCK_SIZE ; /* if there is tail flagment */ + if(sz / AES_BLOCK_SIZE) { + even = (sz/AES_BLOCK_SIZE)*AES_BLOCK_SIZE ; + AesProcess(aes, out, in, even, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CTR); + out += even ; + in += even ; + } + if(odd) { + XMEMSET(tmp+aes->left, 0x0, AES_BLOCK_SIZE - aes->left) ; + XMEMCPY(tmp+aes->left, in, odd) ; + AesProcess(aes, (byte *)out_block, (byte const *)tmp, AES_BLOCK_SIZE, + AES_CFG_DIR_ENCRYPT, + AES_CFG_MODE_CTR_NOCTR /* Counter mode without counting IV */ + ); + XMEMCPY(out, out_block+aes->left,odd) ; + aes->left += odd ; + } +} +#endif + +/* AES-DIRECT */ +#if defined(WOLFSSL_AES_DIRECT) +WOLFSSL_API void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in) +{ + AesProcess(aes, out, in, AES_BLOCK_SIZE, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CBC) ; +} +WOLFSSL_API void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in) +{ + AesProcess(aes, out, in, AES_BLOCK_SIZE, AES_CFG_DIR_DECRYPT, AES_CFG_MODE_CBC) ; +} +WOLFSSL_API int wc_AesSetKeyDirect(Aes* aes, const byte* key, word32 len, + const byte* iv, int dir) +{ + return(wc_AesSetKey(aes, key, len, iv, dir)) ; +} +#endif + + +#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) + +static int AesAuthSetKey(Aes* aes, const byte* key, word32 keySz) +{ + byte nonce[AES_BLOCK_SIZE]; + + if ((aes == NULL) || (key == NULL)) + return BAD_FUNC_ARG ; + if (!((keySz == 16) || (keySz == 24) || (keySz == 32))) + return BAD_FUNC_ARG ; + + XMEMSET(nonce, 0, sizeof(nonce)); + return wc_AesSetKey(aes, key, keySz, nonce, AES_ENCRYPTION); +} + + +static int AesAuthArgCheck(Aes* aes, byte* out, const byte* in, word32 inSz, + const byte* nonce, word32 nonceSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz, word32 *M, word32 *L) +{ + if((aes == NULL)||(nonce == NULL)||(authTag== NULL)||(authIn == NULL)) + return BAD_FUNC_ARG; + if((inSz != 0) && ((out == NULL)||(in == NULL))) + return BAD_FUNC_ARG; + + switch(authTagSz){ + case 4: + *M = AES_CFG_CCM_M_4; break ; + case 6: + *M = AES_CFG_CCM_M_6; break ; + case 8: + *M = AES_CFG_CCM_M_8; break ; + case 10: + *M = AES_CFG_CCM_M_10; break ; + case 12: + *M = AES_CFG_CCM_M_12; break ; + case 14: + *M = AES_CFG_CCM_M_14; break ; + case 16: + *M = AES_CFG_CCM_M_16; break ; + default: + 
return 1 ; + } + + switch(nonceSz){ + case 7: + *L = AES_CFG_CCM_L_8; break ; + case 8: + *L = AES_CFG_CCM_L_7; break ; + case 9: + *L = AES_CFG_CCM_L_6; break ; + case 10: + *L = AES_CFG_CCM_L_5; break ; + case 11: + *L = AES_CFG_CCM_L_4; break ; + case 12: + *L = AES_CFG_CCM_L_3; break ; + case 13: + *L = AES_CFG_CCM_L_2; break ; + case 14: + *L = AES_CFG_CCM_L_1; break ; + default: + return 1; + } + return 0 ; +} + +static void AesAuthSetIv(Aes *aes, const byte *nonce, word32 len, word32 L, int mode) { + + if(mode == AES_CFG_MODE_CCM){ + XMEMSET(aes->reg, 0, 16) ; + switch(L){ + case AES_CFG_CCM_L_8: + aes->reg[0] = 0x7; break ; + case AES_CFG_CCM_L_7: + aes->reg[0] = 0x6; break ; + case AES_CFG_CCM_L_6: + aes->reg[0] = 0x5; break ; + case AES_CFG_CCM_L_5: + aes->reg[0] = 0x4; break ; + case AES_CFG_CCM_L_4: + aes->reg[0] = 0x3; break ; + case AES_CFG_CCM_L_3: + aes->reg[0] = 0x2; break ; + case AES_CFG_CCM_L_2: + aes->reg[0] = 0x1; break ; + case AES_CFG_CCM_L_1: + aes->reg[0] = 0x0; break ; + } + XMEMCPY(((byte *)aes->reg)+1, nonce, len) ; + } else { + byte *b = (byte *)aes->reg ; + XMEMSET(aes->reg, 0, AES_BLOCK_SIZE); + XMEMCPY(aes->reg, nonce, len); + b[AES_BLOCK_SIZE-4] = 0 ; + b[AES_BLOCK_SIZE-3] = 0 ; + b[AES_BLOCK_SIZE-2] = 0 ; + b[AES_BLOCK_SIZE-1] = 1 ; + } +} + +#define RoundUp16(n) ((n+15)&0xfffffff0) +#define FREE_ALL \ + if(in_save) XFREE(in_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\ + if(out_save) XFREE(out_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\ + if(authIn_save)XFREE(authIn_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\ + if(nonce_save) XFREE(nonce_save, NULL, DYNAMIC_TYPE_TMP_BUFFER); + +static int AesAuthEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, + const byte* nonce, word32 nonceSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz, int mode) +{ + word32 M, L ; + byte *in_a, *in_save ; + byte *out_a, *out_save ; + byte *authIn_a, *authIn_save ; + byte *nonce_a, *nonce_save ; + word32 tmpTag[4] ; + int ret ; + + if(AesAuthArgCheck(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, authIn, authInSz, &M, &L) + == BAD_FUNC_ARG)return BAD_FUNC_ARG ; + + /* 16 byte padding */ + in_save = NULL ; out_save = NULL ; authIn_save = NULL ; nonce_save = NULL ; + if((inSz%16)==0){ + in_save = NULL ; in_a = (byte *)in ; + out_save = NULL ; out_a = out ; + } else { + if((in_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E ; } + in_a = in_save ; XMEMSET(in_a, 0, RoundUp16(inSz)) ; XMEMCPY(in_a, in, inSz) ; + + if((out_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E ; } + out_a = out_save ; + } + + if((authInSz%16)==0){ + authIn_save = NULL ; authIn_a = (byte *)authIn ; + } else { + if((authIn_save = XMALLOC(RoundUp16(authInSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E ; } + authIn_a = authIn_save ; XMEMSET(authIn_a, 0, RoundUp16(authInSz)) ; XMEMCPY(authIn_a, authIn, authInSz) ; + } + + if((nonceSz%16)==0){ + nonce_save = NULL ; nonce_a = (byte *)nonce ; + } else { + if((nonce_save = XMALLOC(RoundUp16(nonceSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E; } + nonce_a = nonce_save ; XMEMSET(nonce_a, 0, RoundUp16(nonceSz)) ; XMEMCPY(nonce_a, nonce, nonceSz) ; + } + + /* do aes-ccm */ + AesAuthSetIv(aes, nonce, nonceSz, L, mode) ; + ROM_AESReset(AES_BASE); + ROM_AESConfigSet(AES_BASE, (aes->keylen | AES_CFG_DIR_ENCRYPT | + AES_CFG_CTR_WIDTH_128 | + mode | ((mode== AES_CFG_MODE_CCM) ? 
(L | M) : 0 ))) ; + ROM_AESIVSet(AES_BASE, aes->reg); + ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen); + ret = ROM_AESDataProcessAuth(AES_BASE, (unsigned int*)in_a, (unsigned int *)out_a, inSz, + (unsigned int*)authIn_a, authInSz, (unsigned int *)tmpTag); + if(ret == false){ + XMEMSET(out, 0, inSz) ; + XMEMSET(authTag, 0, authTagSz) ; + } else { + XMEMCPY(out, out_a, inSz) ; + XMEMCPY(authTag, tmpTag, authTagSz) ; + } + + FREE_ALL; + return 0 ; +} + +static int AesAuthDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, + const byte* nonce, word32 nonceSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz, int mode) +{ + word32 M, L ; + byte *in_a, *in_save ; + byte *out_a, *out_save ; + byte *authIn_a, *authIn_save ; + byte *nonce_a, *nonce_save ; + word32 tmpTag[4] ; + bool ret ; + + if(AesAuthArgCheck(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, authIn, authInSz, &M, &L) + == BAD_FUNC_ARG)return BAD_FUNC_ARG ; + + /* 16 byte padding */ + in_save = NULL ; out_save = NULL ; authIn_save = NULL ; nonce_save = NULL ; + if((inSz%16)==0){ + in_save = NULL ; in_a = (byte *)in ; + out_save = NULL ; out_a = out ; + } else { + if((in_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E;} + in_a = in_save ; XMEMSET(in_a, 0, RoundUp16(inSz)) ; XMEMCPY(in_a, in, inSz) ; + + if((out_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E;} + out_a = out_save ; + } + + if((authInSz%16)==0){ + authIn_save = NULL ; authIn_a = (byte *)authIn ; + } else { + if((authIn_save = XMALLOC(RoundUp16(authInSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E; } + authIn_a = authIn_save ; XMEMSET(authIn_a, 0, RoundUp16(authInSz)) ; XMEMCPY(authIn_a, authIn, authInSz) ; + } + + if((nonceSz%16)==0){ + nonce_save = NULL ; nonce_a = (byte *)nonce ; + } else { + if((nonce_save = XMALLOC(RoundUp16(nonceSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){ + FREE_ALL; return MEMORY_E; } + nonce_a = nonce_save ; XMEMSET(nonce_a, 0, RoundUp16(nonceSz)) ; XMEMCPY(nonce_a, nonce, nonceSz) ; + } + + /* do aes-ccm */ + AesAuthSetIv(aes, nonce, nonceSz, L, mode) ; + ROM_AESReset(AES_BASE); + ROM_AESConfigSet(AES_BASE, (aes->keylen | AES_CFG_DIR_DECRYPT | + AES_CFG_CTR_WIDTH_128 | + mode | ((mode== AES_CFG_MODE_CCM) ? (L | M) : 0 ))) ; + ROM_AESIVSet(AES_BASE, aes->reg); + ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen); + ret = ROM_AESDataProcessAuth(AES_BASE, (unsigned int*)in_a, (unsigned int *)out_a, inSz, + (unsigned int*)authIn_a, authInSz, (unsigned int *)tmpTag); + if((ret == false) || (XMEMCMP(authTag, tmpTag, authTagSz) != 0)){ + XMEMSET(out, 0, inSz) ; + ret = false ; + } else { + XMEMCPY(out, out_a, inSz) ; + } + + FREE_ALL ; + return ret==true ? 
0 : 1 ; +} +#endif + + +#ifdef HAVE_AESGCM +WOLFSSL_API int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) +{ + return AesAuthSetKey(aes, key, len) ; +} + +WOLFSSL_API int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + return AesAuthEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ; +} +WOLFSSL_API int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + return AesAuthDecrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ; +} + +WOLFSSL_API int wc_GmacSetKey(Gmac* gmac, const byte* key, word32 len) +{ + return AesAuthSetKey(&gmac->aes, key, len) ; +} + +WOLFSSL_API int wc_GmacUpdate(Gmac* gmac, const byte* iv, word32 ivSz, + const byte* authIn, word32 authInSz, + byte* authTag, word32 authTagSz) +{ + return AesAuthEncrypt(&gmac->aes, NULL, NULL, 0, iv, ivSz, authTag, authTagSz, + authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ; +} + +#endif /* HAVE_AESGCM */ + +#ifdef HAVE_AESCCM +WOLFSSL_API void wc_AesCcmSetKey(Aes* aes, const byte* key, word32 keySz) +{ + AesAuthSetKey(aes, key, keySz) ; +} + +WOLFSSL_API void wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, + const byte* nonce, word32 nonceSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + AesAuthEncrypt(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, + authIn, authInSz, AES_CFG_MODE_CCM) ; +} + +WOLFSSL_API int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, + const byte* nonce, word32 nonceSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + return AesAuthDecrypt(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, + authIn, authInSz, AES_CFG_MODE_CCM) ; +} +#endif /* HAVE_AESCCM */ + +#endif /* WOLFSSL_TI_CRYPT */ + +#endif /* NO_AES */ + + +
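A non-obvious detail in the CCM paths of the ti-aes.c hunk above: the nonceSz switch in AesAuthArgCheck() encodes the CCM rule that the nonce length N and the length-field width L must add up to 15 bytes (one 16-byte block minus the flags byte), so e.g. a 13-byte nonce leaves a 2-byte length field. A minimal round-trip through the CCM wrappers at the end of the hunk might look like the sketch below (illustrative only; the all-zero key/nonce and the ccm_roundtrip name are hypothetical, and the sizes are chosen to satisfy the argument checks above):

    #include <wolfssl/wolfcrypt/aes.h>

    /* Illustrative sketch: 128-bit key, 13-byte nonce (so L = 2), 8-byte tag.
     * The 16-byte message and AAD keep the driver off its padding/XMALLOC path. */
    int ccm_roundtrip(void)
    {
        Aes aes;
        const byte key[16]    = { 0 };   /* hypothetical key               */
        const byte nonce[13]  = { 0 };   /* 13 + L(2) = 15                 */
        const byte authIn[16] = { 0 };   /* additional authenticated data  */
        byte msg[AES_BLOCK_SIZE] = { 0 };
        byte enc[AES_BLOCK_SIZE], dec[AES_BLOCK_SIZE], tag[8];

        wc_AesCcmSetKey(&aes, key, sizeof(key));
        wc_AesCcmEncrypt(&aes, enc, msg, sizeof(msg), nonce, sizeof(nonce),
                         tag, sizeof(tag), authIn, sizeof(authIn));

        /* decrypt re-verifies the tag; non-zero return means auth failure */
        return wc_AesCcmDecrypt(&aes, dec, enc, sizeof(enc), nonce, sizeof(nonce),
                                tag, sizeof(tag), authIn, sizeof(authIn));
    }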
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - */ - - -#ifdef HAVE_CONFIG_H - #include -#endif - -#include - -#if !defined(NO_SHA) && !defined(WOLFSSL_TI_HASH) - -#include -#include -#include - -#ifdef NO_INLINE - #include -#else - #include -#endif - -/* fips wrapper calls, user can call direct */ -#ifdef HAVE_FIPS - int wc_InitSha(Sha* sha) - { - return InitSha_fips(sha); - } - - - int wc_ShaUpdate(Sha* sha, const byte* data, word32 len) - { - return ShaUpdate_fips(sha, data, len); - } - - - int wc_ShaFinal(Sha* sha, byte* out) - { - return ShaFinal_fips(sha,out); - } - - int wc_ShaHash(const byte* data, word32 sz, byte* out) - { - return ShaHash(data, sz, out); - } - -#else /* else build without fips */ - -#ifdef WOLFSSL_PIC32MZ_HASH -#define wc_InitSha wc_InitSha_sw -#define wc_ShaUpdate wc_ShaUpdate_sw -#define wc_ShaFinal wc_ShaFinal_sw -#endif - - -#ifdef FREESCALE_MMCAU - #include "cau_api.h" - #define XTRANSFORM(S,B) cau_sha1_hash_n((B), 1, ((S))->digest) -#else - #define XTRANSFORM(S,B) Transform((S)) -#endif - -#ifdef STM32F2_HASH -/* - * STM32F2 hardware SHA1 support through the STM32F2 standard peripheral - * library. Documentation located in STM32F2xx Standard Peripheral Library - * document (See note in README). - */ -#include "stm32f2xx.h" -#include "stm32f2xx_hash.h" - -int wc_InitSha(Sha* sha) -{ - /* STM32F2 struct notes: - * sha->buffer = first 4 bytes used to hold partial block if needed - * sha->buffLen = num bytes currently stored in sha->buffer - * sha->loLen = num bytes that have been written to STM32 FIFO - */ - XMEMSET(sha->buffer, 0, SHA_REG_SIZE); - sha->buffLen = 0; - sha->loLen = 0; - - /* initialize HASH peripheral */ - HASH_DeInit(); - - /* configure algo used, algo mode, datatype */ - HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE); - HASH->CR |= (HASH_AlgoSelection_SHA1 | HASH_AlgoMode_HASH - | HASH_DataType_8b); - - /* reset HASH processor */ - HASH->CR |= HASH_CR_INIT; - - return 0; -} - -int wc_ShaUpdate(Sha* sha, const byte* data, word32 len) -{ - word32 i = 0; - word32 fill = 0; - word32 diff = 0; - - /* if saved partial block is available */ - if (sha->buffLen) { - fill = 4 - sha->buffLen; - - /* if enough data to fill, fill and push to FIFO */ - if (fill <= len) { - XMEMCPY((byte*)sha->buffer + sha->buffLen, data, fill); - HASH_DataIn(*(uint32_t*)sha->buffer); - - data += fill; - len -= fill; - sha->loLen += 4; - sha->buffLen = 0; - } else { - /* append partial to existing stored block */ - XMEMCPY((byte*)sha->buffer + sha->buffLen, data, len); - sha->buffLen += len; - return; - } - } - - /* write input block in the IN FIFO */ - for(i = 0; i < len; i += 4) - { - diff = len - i; - if ( diff < 4) { - /* store incomplete last block, not yet in FIFO */ - XMEMSET(sha->buffer, 0, SHA_REG_SIZE); - XMEMCPY((byte*)sha->buffer, data, diff); - sha->buffLen = diff; - } else { - HASH_DataIn(*(uint32_t*)data); - data+=4; - } - } - - /* keep track of total data length thus far */ - sha->loLen += (len - sha->buffLen); - - return 0; -} - -int wc_ShaFinal(Sha* sha, byte* hash) -{ - __IO uint16_t nbvalidbitsdata = 0; - - /* finish reading any trailing bytes into FIFO */ - if (sha->buffLen) { - HASH_DataIn(*(uint32_t*)sha->buffer); - sha->loLen += sha->buffLen; - } - - /* calculate number of valid bits in last word of input data */ - nbvalidbitsdata = 8 * (sha->loLen % 
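/* For reference, SHA_REG_SIZE appears to be the 4-byte FIFO word (the struct
 * notes above describe 4-byte staging), so the expression completed below
 * yields 0, 8, 16 or 24: the number of valid bits in the final partial word.
 * A standalone restatement of that arithmetic: */

#include <stdint.h>

/* e.g. a 7-byte message leaves 7 % 4 = 3 bytes, so 24 valid bits */
static uint16_t last_word_valid_bits(uint32_t total_len)
{
    return (uint16_t)(8 * (total_len % 4));
}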
SHA_REG_SIZE); - - /* configure number of valid bits in last word of the data */ - HASH_SetLastWordValidBitsNbr(nbvalidbitsdata); - - /* start HASH processor */ - HASH_StartDigest(); - - /* wait until Busy flag == RESET */ - while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {} - - /* read message digest */ - sha->digest[0] = HASH->HR[0]; - sha->digest[1] = HASH->HR[1]; - sha->digest[2] = HASH->HR[2]; - sha->digest[3] = HASH->HR[3]; - sha->digest[4] = HASH->HR[4]; - - ByteReverseWords(sha->digest, sha->digest, SHA_DIGEST_SIZE); - - XMEMCPY(hash, sha->digest, SHA_DIGEST_SIZE); - - return wc_InitSha(sha); /* reset state */ -} - -#elif defined(WOLFSSL_TI_HASH) - - /* defined in port/ti/ti_sha.c */ - -#else /* wc_ software implementation */ - -#ifndef WOLFSSL_HAVE_MIN -#define WOLFSSL_HAVE_MIN - - static INLINE word32 min(word32 a, word32 b) - { - return a > b ? b : a; - } - -#endif /* WOLFSSL_HAVE_MIN */ - - -int wc_InitSha(Sha* sha) -{ -#ifdef FREESCALE_MMCAU - cau_sha1_initialize_output(sha->digest); -#else - sha->digest[0] = 0x67452301L; - sha->digest[1] = 0xEFCDAB89L; - sha->digest[2] = 0x98BADCFEL; - sha->digest[3] = 0x10325476L; - sha->digest[4] = 0xC3D2E1F0L; -#endif - - sha->buffLen = 0; - sha->loLen = 0; - sha->hiLen = 0; - - return 0; -} - -#ifndef FREESCALE_MMCAU - -#define blk0(i) (W[i] = sha->buffer[i]) -#define blk1(i) (W[(i)&15] = \ -rotlFixed(W[((i)+13)&15]^W[((i)+8)&15]^W[((i)+2)&15]^W[(i)&15],1)) - -#define f1(x,y,z) ((z)^((x) &((y)^(z)))) -#define f2(x,y,z) ((x)^(y)^(z)) -#define f3(x,y,z) (((x)&(y))|((z)&((x)|(y)))) -#define f4(x,y,z) ((x)^(y)^(z)) - -/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ -#define R0(v,w,x,y,z,i) (z)+= f1((w),(x),(y)) + blk0((i)) + 0x5A827999+ \ -rotlFixed((v),5); (w) = rotlFixed((w),30); -#define R1(v,w,x,y,z,i) (z)+= f1((w),(x),(y)) + blk1((i)) + 0x5A827999+ \ -rotlFixed((v),5); (w) = rotlFixed((w),30); -#define R2(v,w,x,y,z,i) (z)+= f2((w),(x),(y)) + blk1((i)) + 0x6ED9EBA1+ \ -rotlFixed((v),5); (w) = rotlFixed((w),30); -#define R3(v,w,x,y,z,i) (z)+= f3((w),(x),(y)) + blk1((i)) + 0x8F1BBCDC+ \ -rotlFixed((v),5); (w) = rotlFixed((w),30); -#define R4(v,w,x,y,z,i) (z)+= f4((w),(x),(y)) + blk1((i)) + 0xCA62C1D6+ \ -rotlFixed((v),5); (w) = rotlFixed((w),30); - -static void Transform(Sha* sha) -{ - word32 W[SHA_BLOCK_SIZE / sizeof(word32)]; - - /* Copy context->state[] to working vars */ - word32 a = sha->digest[0]; - word32 b = sha->digest[1]; - word32 c = sha->digest[2]; - word32 d = sha->digest[3]; - word32 e = sha->digest[4]; - -#ifdef USE_SLOW_SHA - word32 t, i; - - for (i = 0; i < 16; i++) { - R0(a, b, c, d, e, i); - t = e; e = d; d = c; c = b; b = a; a = t; - } - - for (; i < 20; i++) { - R1(a, b, c, d, e, i); - t = e; e = d; d = c; c = b; b = a; a = t; - } - - for (; i < 40; i++) { - R2(a, b, c, d, e, i); - t = e; e = d; d = c; c = b; b = a; a = t; - } - - for (; i < 60; i++) { - R3(a, b, c, d, e, i); - t = e; e = d; d = c; c = b; b = a; a = t; - } - - for (; i < 80; i++) { - R4(a, b, c, d, e, i); - t = e; e = d; d = c; c = b; b = a; a = t; - } -#else - /* nearly 1 K bigger in code size but 25% faster */ - /* 4 rounds of 20 operations each. Loop unrolled. 
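/* The schedule/round macros above are easier to audit in plain C.  rotl32
 * below stands in for wolfSSL's rotlFixed; everything else follows the
 * definitions in this hunk (blk1 is W[t] = rotl(W[t-3]^W[t-8]^W[t-14]^W[t-16], 1)
 * with indices taken mod 16, and f1 is the usual SHA-1 Ch function). */

#include <stdint.h>

static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* message schedule step over a 16-word circular buffer (the blk1 macro) */
static uint32_t sha1_sched(uint32_t W[16], int i)
{
    W[i & 15] = rotl32(W[(i + 13) & 15] ^ W[(i + 8) & 15] ^
                       W[(i + 2) & 15] ^ W[i & 15], 1);
    return W[i & 15];
}

/* one F1 round, the body of R0/R1 */
static void sha1_round_f1(uint32_t v, uint32_t* w, uint32_t x, uint32_t y,
                          uint32_t* z, uint32_t wi)
{
    *z += (y ^ (*w & (x ^ y))) + wi + 0x5A827999UL + rotl32(v, 5);
    *w = rotl32(*w, 30);
}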
*/ - R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); - R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); - R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); - R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); - - R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); - - R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); - R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); - R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); - R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); - R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); - - R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); - R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); - R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); - R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); - R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); - - R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); - R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); - R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); - R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); - R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); -#endif - - /* Add the working vars back into digest state[] */ - sha->digest[0] += a; - sha->digest[1] += b; - sha->digest[2] += c; - sha->digest[3] += d; - sha->digest[4] += e; -} - -#endif /* FREESCALE_MMCAU */ - - -static INLINE void AddLength(Sha* sha, word32 len) -{ - word32 tmp = sha->loLen; - if ( (sha->loLen += len) < tmp) - sha->hiLen++; /* carry low to high */ -} - - -int wc_ShaUpdate(Sha* sha, const byte* data, word32 len) -{ - /* do block size increments */ - byte* local = (byte*)sha->buffer; - - while (len) { - word32 add = min(len, SHA_BLOCK_SIZE - sha->buffLen); - XMEMCPY(&local[sha->buffLen], data, add); - - sha->buffLen += add; - data += add; - len -= add; - - if (sha->buffLen == SHA_BLOCK_SIZE) { -#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - ByteReverseWords(sha->buffer, sha->buffer, SHA_BLOCK_SIZE); -#endif - XTRANSFORM(sha, local); - AddLength(sha, SHA_BLOCK_SIZE); - sha->buffLen = 0; - } - } - - return 0; -} - - -int wc_ShaFinal(Sha* sha, byte* hash) -{ - byte* local = (byte*)sha->buffer; - - AddLength(sha, sha->buffLen); /* before adding pads */ - - local[sha->buffLen++] = 0x80; /* add 1 */ - - /* pad with zeros */ - if (sha->buffLen > SHA_PAD_SIZE) { - XMEMSET(&local[sha->buffLen], 0, SHA_BLOCK_SIZE - sha->buffLen); - sha->buffLen += SHA_BLOCK_SIZE - sha->buffLen; - -#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - ByteReverseWords(sha->buffer, sha->buffer, SHA_BLOCK_SIZE); -#endif - XTRANSFORM(sha, local); - sha->buffLen = 0; - } - XMEMSET(&local[sha->buffLen], 0, SHA_PAD_SIZE - sha->buffLen); - - /* put lengths in bits */ - sha->hiLen = (sha->loLen >> (8*sizeof(sha->loLen) - 3)) + - (sha->hiLen << 3); - sha->loLen = sha->loLen << 3; - - /* store lengths */ -#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - ByteReverseWords(sha->buffer, sha->buffer, SHA_BLOCK_SIZE); -#endif - /* ! length ordering dependent on digest endian type ! 
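/* The finalization above is standard Merkle-Damgard padding: append 0x80,
 * zero-fill up to SHA_PAD_SIZE (56) bytes mod 64, then store the bit count
 * in the last 8 bytes.  A small standalone check of the arithmetic: */

#include <stdint.h>
#include <stdio.h>

/* zero bytes between the 0x80 marker and the 8-byte length field */
static uint32_t sha1_zero_pad(uint32_t msg_len)
{
    uint32_t used = (msg_len + 1) % 64;
    return (used <= 56) ? 56 - used : 56 + 64 - used;
}

int main(void)
{
    /* a 55-byte message fits one block; 56 bytes forces a second block */
    printf("%u %u\n", sha1_zero_pad(55), sha1_zero_pad(56)); /* prints: 0 63 */
    return 0;
}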
*/ - XMEMCPY(&local[SHA_PAD_SIZE], &sha->hiLen, sizeof(word32)); - XMEMCPY(&local[SHA_PAD_SIZE + sizeof(word32)], &sha->loLen, sizeof(word32)); - -#ifdef FREESCALE_MMCAU - /* Kinetis requires only these bytes reversed */ - ByteReverseWords(&sha->buffer[SHA_PAD_SIZE/sizeof(word32)], - &sha->buffer[SHA_PAD_SIZE/sizeof(word32)], - 2 * sizeof(word32)); -#endif - - XTRANSFORM(sha, local); -#ifdef LITTLE_ENDIAN_ORDER - ByteReverseWords(sha->digest, sha->digest, SHA_DIGEST_SIZE); -#endif - XMEMCPY(hash, sha->digest, SHA_DIGEST_SIZE); - - return wc_InitSha(sha); /* reset state */ -} - -#endif /* STM32F2_HASH */ - - -int wc_ShaHash(const byte* data, word32 len, byte* hash) -{ - int ret = 0; -#ifdef WOLFSSL_SMALL_STACK - Sha* sha; -#else - Sha sha[1]; -#endif - -#ifdef WOLFSSL_SMALL_STACK - sha = (Sha*)XMALLOC(sizeof(Sha), NULL, DYNAMIC_TYPE_TMP_BUFFER); - if (sha == NULL) - return MEMORY_E; -#endif - - if ((ret = wc_InitSha(sha)) != 0) { - WOLFSSL_MSG("wc_InitSha failed"); - } - else { - wc_ShaUpdate(sha, data, len); - wc_ShaFinal(sha, hash); - } - -#ifdef WOLFSSL_SMALL_STACK - XFREE(sha, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return ret; - -} - -#ifdef WOLFSSL_TI_HASH -#include "wolfssl/wolfcrypt/port/ti/ti-hash.h" -#endif -int wc_ShaGetHash(Sha* sha, byte* hash) -{ -#if defined(WOLFSS_TI_HASH) - wc_ShaGetHash_TI(sha, hash) ; -#else - int ret ; - Sha save = *sha ; - ret = wc_ShaFinal(sha, hash) ; - *sha = save ; - return ret ; -#endif -} - -#endif /* HAVE_FIPS */ -#endif /* NO_SHA */ - +/* sha.c + * + * Copyright (C) 2006-2015 wolfSSL Inc. + * + * This file is part of wolfSSL. (formerly known as CyaSSL) + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include + +#if !defined(NO_SHA) && !defined(WOLFSSL_TI_HASH) + +#include +#include +#include + +#ifdef NO_INLINE + #include +#else + #include +#endif + +/* fips wrapper calls, user can call direct */ +#ifdef HAVE_FIPS + int wc_InitSha(Sha* sha) + { + return InitSha_fips(sha); + } + + + int wc_ShaUpdate(Sha* sha, const byte* data, word32 len) + { + return ShaUpdate_fips(sha, data, len); + } + + + int wc_ShaFinal(Sha* sha, byte* out) + { + return ShaFinal_fips(sha,out); + } + + int wc_ShaHash(const byte* data, word32 sz, byte* out) + { + return ShaHash(data, sz, out); + } + +#else /* else build without fips */ + +#ifdef WOLFSSL_PIC32MZ_HASH +#define wc_InitSha wc_InitSha_sw +#define wc_ShaUpdate wc_ShaUpdate_sw +#define wc_ShaFinal wc_ShaFinal_sw +#endif + + +#ifdef FREESCALE_MMCAU + #include "cau_api.h" + #define XTRANSFORM(S,B) cau_sha1_hash_n((B), 1, ((S))->digest) +#else + #define XTRANSFORM(S,B) Transform((S)) +#endif + +#ifdef STM32F2_HASH +/* + * STM32F2 hardware SHA1 support through the STM32F2 standard peripheral + * library. Documentation located in STM32F2xx Standard Peripheral Library + * document (See note in README). 
+ */ +#include "stm32f2xx.h" +#include "stm32f2xx_hash.h" + +int wc_InitSha(Sha* sha) +{ + /* STM32F2 struct notes: + * sha->buffer = first 4 bytes used to hold partial block if needed + * sha->buffLen = num bytes currently stored in sha->buffer + * sha->loLen = num bytes that have been written to STM32 FIFO + */ + XMEMSET(sha->buffer, 0, SHA_REG_SIZE); + sha->buffLen = 0; + sha->loLen = 0; + + /* initialize HASH peripheral */ + HASH_DeInit(); + + /* configure algo used, algo mode, datatype */ + HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE); + HASH->CR |= (HASH_AlgoSelection_SHA1 | HASH_AlgoMode_HASH + | HASH_DataType_8b); + + /* reset HASH processor */ + HASH->CR |= HASH_CR_INIT; + + return 0; +} + +int wc_ShaUpdate(Sha* sha, const byte* data, word32 len) +{ + word32 i = 0; + word32 fill = 0; + word32 diff = 0; + + /* if saved partial block is available */ + if (sha->buffLen) { + fill = 4 - sha->buffLen; + + /* if enough data to fill, fill and push to FIFO */ + if (fill <= len) { + XMEMCPY((byte*)sha->buffer + sha->buffLen, data, fill); + HASH_DataIn(*(uint32_t*)sha->buffer); + + data += fill; + len -= fill; + sha->loLen += 4; + sha->buffLen = 0; + } else { + /* append partial to existing stored block */ + XMEMCPY((byte*)sha->buffer + sha->buffLen, data, len); + sha->buffLen += len; + return; + } + } + + /* write input block in the IN FIFO */ + for(i = 0; i < len; i += 4) + { + diff = len - i; + if ( diff < 4) { + /* store incomplete last block, not yet in FIFO */ + XMEMSET(sha->buffer, 0, SHA_REG_SIZE); + XMEMCPY((byte*)sha->buffer, data, diff); + sha->buffLen = diff; + } else { + HASH_DataIn(*(uint32_t*)data); + data+=4; + } + } + + /* keep track of total data length thus far */ + sha->loLen += (len - sha->buffLen); + + return 0; +} + +int wc_ShaFinal(Sha* sha, byte* hash) +{ + __IO uint16_t nbvalidbitsdata = 0; + + /* finish reading any trailing bytes into FIFO */ + if (sha->buffLen) { + HASH_DataIn(*(uint32_t*)sha->buffer); + sha->loLen += sha->buffLen; + } + + /* calculate number of valid bits in last word of input data */ + nbvalidbitsdata = 8 * (sha->loLen % SHA_REG_SIZE); + + /* configure number of valid bits in last word of the data */ + HASH_SetLastWordValidBitsNbr(nbvalidbitsdata); + + /* start HASH processor */ + HASH_StartDigest(); + + /* wait until Busy flag == RESET */ + while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {} + + /* read message digest */ + sha->digest[0] = HASH->HR[0]; + sha->digest[1] = HASH->HR[1]; + sha->digest[2] = HASH->HR[2]; + sha->digest[3] = HASH->HR[3]; + sha->digest[4] = HASH->HR[4]; + + ByteReverseWords(sha->digest, sha->digest, SHA_DIGEST_SIZE); + + XMEMCPY(hash, sha->digest, SHA_DIGEST_SIZE); + + return wc_InitSha(sha); /* reset state */ +} + +#elif defined(WOLFSSL_TI_HASH) + + /* defined in port/ti/ti_sha.c */ + +#else /* wc_ software implementation */ + +#ifndef WOLFSSL_HAVE_MIN +#define WOLFSSL_HAVE_MIN + + static INLINE word32 min(word32 a, word32 b) + { + return a > b ? 
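/* Note on the STM32 wc_ShaUpdate above: the function is declared to return
 * int, but the partial-block branch exits with a bare `return;`.  The
 * corrected exit, matching the function's other paths, is `return 0;`. */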
b : a; + } + +#endif /* WOLFSSL_HAVE_MIN */ + + +int wc_InitSha(Sha* sha) +{ +#ifdef FREESCALE_MMCAU + cau_sha1_initialize_output(sha->digest); +#else + sha->digest[0] = 0x67452301L; + sha->digest[1] = 0xEFCDAB89L; + sha->digest[2] = 0x98BADCFEL; + sha->digest[3] = 0x10325476L; + sha->digest[4] = 0xC3D2E1F0L; +#endif + + sha->buffLen = 0; + sha->loLen = 0; + sha->hiLen = 0; + + return 0; +} + +#ifndef FREESCALE_MMCAU + +#define blk0(i) (W[i] = sha->buffer[i]) +#define blk1(i) (W[(i)&15] = \ +rotlFixed(W[((i)+13)&15]^W[((i)+8)&15]^W[((i)+2)&15]^W[(i)&15],1)) + +#define f1(x,y,z) ((z)^((x) &((y)^(z)))) +#define f2(x,y,z) ((x)^(y)^(z)) +#define f3(x,y,z) (((x)&(y))|((z)&((x)|(y)))) +#define f4(x,y,z) ((x)^(y)^(z)) + +/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ +#define R0(v,w,x,y,z,i) (z)+= f1((w),(x),(y)) + blk0((i)) + 0x5A827999+ \ +rotlFixed((v),5); (w) = rotlFixed((w),30); +#define R1(v,w,x,y,z,i) (z)+= f1((w),(x),(y)) + blk1((i)) + 0x5A827999+ \ +rotlFixed((v),5); (w) = rotlFixed((w),30); +#define R2(v,w,x,y,z,i) (z)+= f2((w),(x),(y)) + blk1((i)) + 0x6ED9EBA1+ \ +rotlFixed((v),5); (w) = rotlFixed((w),30); +#define R3(v,w,x,y,z,i) (z)+= f3((w),(x),(y)) + blk1((i)) + 0x8F1BBCDC+ \ +rotlFixed((v),5); (w) = rotlFixed((w),30); +#define R4(v,w,x,y,z,i) (z)+= f4((w),(x),(y)) + blk1((i)) + 0xCA62C1D6+ \ +rotlFixed((v),5); (w) = rotlFixed((w),30); + +static void Transform(Sha* sha) +{ + word32 W[SHA_BLOCK_SIZE / sizeof(word32)]; + + /* Copy context->state[] to working vars */ + word32 a = sha->digest[0]; + word32 b = sha->digest[1]; + word32 c = sha->digest[2]; + word32 d = sha->digest[3]; + word32 e = sha->digest[4]; + +#ifdef USE_SLOW_SHA + word32 t, i; + + for (i = 0; i < 16; i++) { + R0(a, b, c, d, e, i); + t = e; e = d; d = c; c = b; b = a; a = t; + } + + for (; i < 20; i++) { + R1(a, b, c, d, e, i); + t = e; e = d; d = c; c = b; b = a; a = t; + } + + for (; i < 40; i++) { + R2(a, b, c, d, e, i); + t = e; e = d; d = c; c = b; b = a; a = t; + } + + for (; i < 60; i++) { + R3(a, b, c, d, e, i); + t = e; e = d; d = c; c = b; b = a; a = t; + } + + for (; i < 80; i++) { + R4(a, b, c, d, e, i); + t = e; e = d; d = c; c = b; b = a; a = t; + } +#else + /* nearly 1 K bigger in code size but 25% faster */ + /* 4 rounds of 20 operations each. Loop unrolled. 
*/ + R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); + R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); + R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); + R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); + + R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); + + R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); + R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); + R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); + R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); + R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); + + R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); + R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); + R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); + R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); + R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); + + R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); + R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); + R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); + R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); + R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); +#endif + + /* Add the working vars back into digest state[] */ + sha->digest[0] += a; + sha->digest[1] += b; + sha->digest[2] += c; + sha->digest[3] += d; + sha->digest[4] += e; +} + +#endif /* FREESCALE_MMCAU */ + + +static INLINE void AddLength(Sha* sha, word32 len) +{ + word32 tmp = sha->loLen; + if ( (sha->loLen += len) < tmp) + sha->hiLen++; /* carry low to high */ +} + + +int wc_ShaUpdate(Sha* sha, const byte* data, word32 len) +{ + /* do block size increments */ + byte* local = (byte*)sha->buffer; + + while (len) { + word32 add = min(len, SHA_BLOCK_SIZE - sha->buffLen); + XMEMCPY(&local[sha->buffLen], data, add); + + sha->buffLen += add; + data += add; + len -= add; + + if (sha->buffLen == SHA_BLOCK_SIZE) { +#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + ByteReverseWords(sha->buffer, sha->buffer, SHA_BLOCK_SIZE); +#endif + XTRANSFORM(sha, local); + AddLength(sha, SHA_BLOCK_SIZE); + sha->buffLen = 0; + } + } + + return 0; +} + + +int wc_ShaFinal(Sha* sha, byte* hash) +{ + byte* local = (byte*)sha->buffer; + + AddLength(sha, sha->buffLen); /* before adding pads */ + + local[sha->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha->buffLen > SHA_PAD_SIZE) { + XMEMSET(&local[sha->buffLen], 0, SHA_BLOCK_SIZE - sha->buffLen); + sha->buffLen += SHA_BLOCK_SIZE - sha->buffLen; + +#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + ByteReverseWords(sha->buffer, sha->buffer, SHA_BLOCK_SIZE); +#endif + XTRANSFORM(sha, local); + sha->buffLen = 0; + } + XMEMSET(&local[sha->buffLen], 0, SHA_PAD_SIZE - sha->buffLen); + + /* put lengths in bits */ + sha->hiLen = (sha->loLen >> (8*sizeof(sha->loLen) - 3)) + + (sha->hiLen << 3); + sha->loLen = sha->loLen << 3; + + /* store lengths */ +#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + ByteReverseWords(sha->buffer, sha->buffer, SHA_BLOCK_SIZE); +#endif + /* ! length ordering dependent on digest endian type ! 
*/ + XMEMCPY(&local[SHA_PAD_SIZE], &sha->hiLen, sizeof(word32)); + XMEMCPY(&local[SHA_PAD_SIZE + sizeof(word32)], &sha->loLen, sizeof(word32)); + +#ifdef FREESCALE_MMCAU + /* Kinetis requires only these bytes reversed */ + ByteReverseWords(&sha->buffer[SHA_PAD_SIZE/sizeof(word32)], + &sha->buffer[SHA_PAD_SIZE/sizeof(word32)], + 2 * sizeof(word32)); +#endif + + XTRANSFORM(sha, local); +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords(sha->digest, sha->digest, SHA_DIGEST_SIZE); +#endif + XMEMCPY(hash, sha->digest, SHA_DIGEST_SIZE); + + return wc_InitSha(sha); /* reset state */ +} + +#endif /* STM32F2_HASH */ + + +int wc_ShaHash(const byte* data, word32 len, byte* hash) +{ + int ret = 0; +#ifdef WOLFSSL_SMALL_STACK + Sha* sha; +#else + Sha sha[1]; +#endif + +#ifdef WOLFSSL_SMALL_STACK + sha = (Sha*)XMALLOC(sizeof(Sha), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (sha == NULL) + return MEMORY_E; +#endif + + if ((ret = wc_InitSha(sha)) != 0) { + WOLFSSL_MSG("wc_InitSha failed"); + } + else { + wc_ShaUpdate(sha, data, len); + wc_ShaFinal(sha, hash); + } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(sha, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return ret; + +} + +#ifdef WOLFSSL_TI_HASH +#include "wolfssl/wolfcrypt/port/ti/ti-hash.h" +#endif +int wc_ShaGetHash(Sha* sha, byte* hash) +{ +#if defined(WOLFSS_TI_HASH) + wc_ShaGetHash_TI(sha, hash) ; +#else + int ret ; + Sha save = *sha ; + ret = wc_ShaFinal(sha, hash) ; + *sha = save ; + return ret ; +#endif +} + +#endif /* HAVE_FIPS */ +#endif /* NO_SHA */ + diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index ec4eb918b..90f99a35b 100644 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -1,1781 +1,1781 @@ -/* sha256.c - * - * Copyright (C) 2006-2015 wolfSSL Inc. - * - * This file is part of wolfSSL. (formerly known as CyaSSL) - * - * wolfSSL is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * wolfSSL is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - */ - -/* code submitted by raphael.huck@efixo.com */ - -#ifdef HAVE_CONFIG_H - #include -#endif - -#include -#include - -#if !defined(NO_SHA256) -#ifdef HAVE_FIPS - -int wc_InitSha256(Sha256* sha) -{ - return InitSha256_fips(sha); -} - - -int wc_Sha256Update(Sha256* sha, const byte* data, word32 len) -{ - return Sha256Update_fips(sha, data, len); -} - - -int wc_Sha256Final(Sha256* sha, byte* out) -{ - return Sha256Final_fips(sha, out); -} - - -int wc_Sha256Hash(const byte* data, word32 len, byte* out) -{ - return Sha256Hash(data, len, out); -} - -#else /* else build without fips */ - -#if !defined(NO_SHA256) && !defined(WOLFSSL_TI_HASH) - /* defined in port/ti/ti_sha256.c */ - -#if !defined (ALIGN32) - #if defined (__GNUC__) - #define ALIGN32 __attribute__ ( (aligned (32))) - #elif defined(_MSC_VER) - /* disable align warning, we want alignment ! 
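/* Two defects in the wc_ShaGetHash added above are worth flagging: the guard
 * tests WOLFSS_TI_HASH (missing an "L", so the TI branch can never be
 * selected) and that branch also drops the int return value.  A corrected
 * sketch, assuming wc_ShaGetHash_TI returns an int status like its SHA-256
 * counterpart: */

int wc_ShaGetHash(Sha* sha, byte* hash)
{
#if defined(WOLFSSL_TI_HASH)
    return wc_ShaGetHash_TI(sha, hash);
#else
    int ret;
    Sha save = *sha;              /* snapshot so Final doesn't reset state */
    ret = wc_ShaFinal(sha, hash);
    *sha = save;                  /* restore: caller can keep updating */
    return ret;
#endif
}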
*/ - #pragma warning(disable: 4324) - #define ALIGN32 __declspec (align (32)) - #else - #define ALIGN32 - #endif -#endif - -#ifdef WOLFSSL_PIC32MZ_HASH -#define wc_InitSha256 wc_InitSha256_sw -#define wc_Sha256Update wc_Sha256Update_sw -#define wc_Sha256Final wc_Sha256Final_sw -#endif - -#ifdef HAVE_FIPS - /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ - #define FIPS_NO_WRAPPERS -#endif - -#if defined(USE_INTEL_SPEEDUP) -#define HAVE_INTEL_AVX1 -#define HAVE_INTEL_AVX2 - -#if defined(DEBUG_XMM) -#include "stdio.h" -#endif - -#endif - -#if defined(HAVE_INTEL_AVX2) -#define HAVE_INTEL_RORX -#endif - - -/***** -Intel AVX1/AVX2 Macro Control Structure - -#define HAVE_INTEL_AVX1 -#define HAVE_INTEL_AVX2 - -#define HAVE_INTEL_RORX - - -int InitSha256(Sha256* sha256) { - Save/Recover XMM, YMM - ... -} - -#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) - Transform() ; Function prototype -#else - Transform() { } - int Sha256Final() { - Save/Recover XMM, YMM - ... - } -#endif - -#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) - #if defined(HAVE_INTEL_RORX - #define RND with rorx instuction - #else - #define RND - #endif -#endif - -#if defined(HAVE_INTEL_AVX1) - - #define XMM Instructions/inline asm - - int Transform() { - Stitched Message Sched/Round - } - -#elif defined(HAVE_INTEL_AVX2) - - #define YMM Instructions/inline asm - - int Transform() { - More granural Stitched Message Sched/Round - } - -*/ - - -#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - -/* Each platform needs to query info type 1 from cpuid to see if aesni is - * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts - */ - -#ifndef _MSC_VER - #define cpuid(reg, leaf, sub)\ - __asm__ __volatile__ ("cpuid":\ - "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ - "a" (leaf), "c"(sub)); - - #define XASM_LINK(f) asm(f) -#else - - #include - #define cpuid(a,b) __cpuid((int*)a,b) - - #define XASM_LINK(f) - -#endif /* _MSC_VER */ - -#define EAX 0 -#define EBX 1 -#define ECX 2 -#define EDX 3 - -#define CPUID_AVX1 0x1 -#define CPUID_AVX2 0x2 -#define CPUID_RDRAND 0x4 -#define CPUID_RDSEED 0x8 -#define CPUID_BMI2 0x10 /* MULX, RORX */ - -#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1) -#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2) -#define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2) -#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND) -#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED) - -static word32 cpuid_check = 0 ; -static word32 cpuid_flags = 0 ; - -static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { - int got_intel_cpu=0; - unsigned int reg[5]; - - reg[4] = '\0' ; - cpuid(reg, 0, 0); - if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 && - memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 && - memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) { - got_intel_cpu = 1; - } - if (got_intel_cpu) { - cpuid(reg, leaf, sub); - return((reg[num]>>bit)&0x1) ; - } - return 0 ; -} - -static int set_cpuid_flags(void) { - if(cpuid_check==0) { - if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} - if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } - if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; } - if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } - if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } - cpuid_check = 1 ; - return 0 ; - } - return 1 ; -} - - -/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */ -static int Transform(Sha256* sha256); - -#if defined(HAVE_INTEL_AVX1) -static int 
Transform_AVX1(Sha256 *sha256) ; -#endif -#if defined(HAVE_INTEL_AVX2) -static int Transform_AVX2(Sha256 *sha256) ; -static int Transform_AVX1_RORX(Sha256 *sha256) ; -#endif - -static int (*Transform_p)(Sha256* sha256) /* = _Transform */; - -#define XTRANSFORM(sha256, B) (*Transform_p)(sha256) - -static void set_Transform(void) { - if(set_cpuid_flags())return ; - -#if defined(HAVE_INTEL_AVX2) - if(IS_INTEL_AVX2 && IS_INTEL_BMI2){ - Transform_p = Transform_AVX1_RORX; return ; - Transform_p = Transform_AVX2 ; - /* for avoiding warning,"not used" */ - } -#endif -#if defined(HAVE_INTEL_AVX1) - Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ; -#endif - Transform_p = Transform ; return ; -} - -#else - #if defined(FREESCALE_MMCAU) - #define XTRANSFORM(sha256, B) Transform(sha256, B) - #else - #define XTRANSFORM(sha256, B) Transform(sha256) - #endif -#endif - -/* Dummy for saving MM_REGs on behalf of Transform */ -#if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1) -#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\ - "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15") -#elif defined(HAVE_INTEL_AVX1) -#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\ - "xmm11","xmm12","xmm13","xmm14","xmm15") -#else -#define SAVE_XMM_YMM -#endif - -#ifdef WOLFSSL_PIC32MZ_HASH -#define InitSha256 InitSha256_sw -#define Sha256Update Sha256Update_sw -#define Sha256Final Sha256Final_sw -#endif - -#include -#include - -#ifdef NO_INLINE - #include -#else - #include -#endif - -#ifdef FREESCALE_MMCAU - #include "cau_api.h" -#endif - -#ifndef WOLFSSL_HAVE_MIN -#define WOLFSSL_HAVE_MIN - - static INLINE word32 min(word32 a, word32 b) - { - return a > b ? 
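/* set_Transform above selects an implementation once at init through a
 * function pointer; the unreachable `Transform_p = Transform_AVX2` after the
 * `return` exists only to keep the symbol referenced, per its own comment.
 * The pattern in miniature (names here are hypothetical): */

#include <stdint.h>

typedef int (*transform_fn)(void* ctx);

static int transform_c(void* ctx)   { (void)ctx; return 0; }  /* portable */
static int transform_avx(void* ctx) { (void)ctx; return 0; }  /* cpu-tuned */

static transform_fn transform_p = transform_c;

/* decided once; every later Update/Final call goes through the pointer */
static void pick_transform(int have_avx)
{
    transform_p = have_avx ? transform_avx : transform_c;
}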
b : a; - } - -#endif /* WOLFSSL_HAVE_MIN */ - - -int wc_InitSha256(Sha256* sha256) -{ - #ifdef FREESCALE_MMCAU - cau_sha256_initialize_output(sha256->digest); - #else - sha256->digest[0] = 0x6A09E667L; - sha256->digest[1] = 0xBB67AE85L; - sha256->digest[2] = 0x3C6EF372L; - sha256->digest[3] = 0xA54FF53AL; - sha256->digest[4] = 0x510E527FL; - sha256->digest[5] = 0x9B05688CL; - sha256->digest[6] = 0x1F83D9ABL; - sha256->digest[7] = 0x5BE0CD19L; - #endif - - sha256->buffLen = 0; - sha256->loLen = 0; - sha256->hiLen = 0; - -#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) - set_Transform() ; /* choose best Transform function under this runtime environment */ -#endif - - return 0; -} - - -#if !defined(FREESCALE_MMCAU) -static const ALIGN32 word32 K[64] = { - 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, - 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, - 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, - 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, - 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L, - 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L, - 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, - 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, - 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L, - 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L, - 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, - 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, - 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L -}; - -#endif - -#if defined(FREESCALE_MMCAU) - -static int Transform(Sha256* sha256, byte* buf) -{ - cau_sha256_hash_n(buf, 1, sha256->digest); - - return 0; -} - -#endif /* FREESCALE_MMCAU */ - -#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) -#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) -#define R(x, n) (((x)&0xFFFFFFFFU)>>(n)) - -#define S(x, n) rotrFixed(x, n) -#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -#define RND(a,b,c,d,e,f,g,h,i) \ - t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \ - t1 = Sigma0((a)) + Maj((a), (b), (c)); \ - (d) += t0; \ - (h) = t0 + t1; - -#if !defined(FREESCALE_MMCAU) -static int Transform(Sha256* sha256) -{ - word32 S[8], t0, t1; - int i; - -#ifdef WOLFSSL_SMALL_STACK - word32* W; - - W = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); - if (W == NULL) - return MEMORY_E; -#else - word32 W[64]; -#endif - - /* Copy context->state[] to working vars */ - for (i = 0; i < 8; i++) - S[i] = sha256->digest[i]; - - for (i = 0; i < 16; i++) - W[i] = sha256->buffer[i]; - - for (i = 16; i < 64; i++) - W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; - - for (i = 0; i < 64; i += 8) { - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7); - } - - /* Add the working vars back into digest state[] */ - for (i = 0; i < 8; i++) { - sha256->digest[i] += S[i]; - } - -#ifdef WOLFSSL_SMALL_STACK - XFREE(W, NULL, 
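/* The RND macro above is one standard SHA-256 round.  Spelled out in plain
 * C, with rotr32 standing in for rotrFixed (helper names are ours, suffixed
 * to avoid colliding with the file's macros): */

#include <stdint.h>

static uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

static uint32_t Ch_(uint32_t x, uint32_t y, uint32_t z)  { return z ^ (x & (y ^ z)); }
static uint32_t Maj_(uint32_t x, uint32_t y, uint32_t z) { return ((x | y) & z) | (x & y); }
static uint32_t Sigma0_(uint32_t x) { return rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22); }
static uint32_t Sigma1_(uint32_t x) { return rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25); }

/* updates d and h exactly as RND(a,b,c,d,e,f,g,h,i) does */
static void sha256_round(uint32_t a, uint32_t b, uint32_t c, uint32_t* d,
                         uint32_t e, uint32_t f, uint32_t g, uint32_t* h,
                         uint32_t k, uint32_t w)
{
    uint32_t t0 = *h + Sigma1_(e) + Ch_(e, f, g) + k + w;
    uint32_t t1 = Sigma0_(a) + Maj_(a, b, c);
    *d += t0;
    *h  = t0 + t1;
}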
DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return 0; -} - -#endif /* #if !defined(FREESCALE_MMCAU) */ - -static INLINE void AddLength(Sha256* sha256, word32 len) -{ - word32 tmp = sha256->loLen; - if ( (sha256->loLen += len) < tmp) - sha256->hiLen++; /* carry low to high */ -} - -int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) -{ - - /* do block size increments */ - byte* local = (byte*)sha256->buffer; - - SAVE_XMM_YMM ; /* for Intel AVX */ - - while (len) { - word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen); - XMEMCPY(&local[sha256->buffLen], data, add); - - sha256->buffLen += add; - data += add; - len -= add; - - if (sha256->buffLen == SHA256_BLOCK_SIZE) { - int ret; - - #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) - #endif - ByteReverseWords(sha256->buffer, sha256->buffer, - SHA256_BLOCK_SIZE); - #endif - ret = XTRANSFORM(sha256, local); - if (ret != 0) - return ret; - - AddLength(sha256, SHA256_BLOCK_SIZE); - sha256->buffLen = 0; - } - } - - return 0; -} - -int wc_Sha256Final(Sha256* sha256, byte* hash) -{ - byte* local = (byte*)sha256->buffer; - int ret; - - SAVE_XMM_YMM ; /* for Intel AVX */ - - AddLength(sha256, sha256->buffLen); /* before adding pads */ - - local[sha256->buffLen++] = 0x80; /* add 1 */ - - /* pad with zeros */ - if (sha256->buffLen > SHA256_PAD_SIZE) { - XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen); - sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; - - #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) - #endif - ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); - #endif - - ret = XTRANSFORM(sha256, local); - if (ret != 0) - return ret; - - sha256->buffLen = 0; - } - XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen); - - /* put lengths in bits */ - sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) + - (sha256->hiLen << 3); - sha256->loLen = sha256->loLen << 3; - - /* store lengths */ - #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) - #endif - ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); - #endif - /* ! length ordering dependent on digest endian type ! 
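/* AddLength above detects 32-bit overflow by checking whether the sum
 * wrapped below its previous value, the usual carry idiom.  A quick check: */

#include <stdint.h>
#include <assert.h>

static void add_length(uint32_t* lo, uint32_t* hi, uint32_t len)
{
    uint32_t tmp = *lo;
    if ((*lo += len) < tmp)
        (*hi)++;                 /* carry the low word into the high word */
}

int main(void)
{
    uint32_t lo = 0xFFFFFFF0u, hi = 0;
    add_length(&lo, &hi, 0x20);  /* wraps around: lo ends up at 0x10 */
    assert(lo == 0x10 && hi == 1);
    return 0;
}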
*/ - XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); - XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, - sizeof(word32)); - - #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - /* Kinetis requires only these bytes reversed */ - #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if(IS_INTEL_AVX1 || IS_INTEL_AVX2) - #endif - ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)], - &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)], - 2 * sizeof(word32)); - #endif - - ret = XTRANSFORM(sha256, local); - if (ret != 0) - return ret; - - #if defined(LITTLE_ENDIAN_ORDER) - ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE); - #endif - XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE); - - return wc_InitSha256(sha256); /* reset state */ -} - - - -int wc_Sha256Hash(const byte* data, word32 len, byte* hash) -{ - int ret = 0; -#ifdef WOLFSSL_SMALL_STACK - Sha256* sha256; -#else - Sha256 sha256[1]; -#endif - -#ifdef WOLFSSL_SMALL_STACK - sha256 = (Sha256*)XMALLOC(sizeof(Sha256), NULL, DYNAMIC_TYPE_TMP_BUFFER); - if (sha256 == NULL) - return MEMORY_E; -#endif - - if ((ret = wc_InitSha256(sha256)) != 0) { - WOLFSSL_MSG("InitSha256 failed"); - } - else if ((ret = wc_Sha256Update(sha256, data, len)) != 0) { - WOLFSSL_MSG("Sha256Update failed"); - } - else if ((ret = wc_Sha256Final(sha256, hash)) != 0) { - WOLFSSL_MSG("Sha256Final failed"); - } - -#ifdef WOLFSSL_SMALL_STACK - XFREE(sha256, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return ret; -} - -#ifdef WOLFSSL_TI_HASH -#include "wolfssl/wolfcrypt/port/ti/ti-hash.h" -#endif -int wc_Sha256GetHash(Sha256* sha256, byte* hash) -{ -#if defined(WOLFSS_TI_HASH) - return wc_Sha256GetHash_TI(sha256, hash) ; -#else - int ret ; - Sha256 save = *sha256 ; - ret = wc_Sha256Final(sha256, hash) ; - *sha256 = save ; - return ret ; -#endif -} - -#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - -#define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - { word32 d ;\ - d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\ - d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\ - d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\ - d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\ - d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\ - d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\ - d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\ - d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\ -} - -#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - { word32 d ; \ - __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\ - __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\ - __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\ - __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\ - __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\ - __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\ - __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\ - __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ; sha256->digest[7] += d;\ -} - - -#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, 
S_6, S_7 ) - -#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) - - - - -#define S_0 %r15d -#define S_1 %r10d -#define S_2 %r11d -#define S_3 %r12d -#define S_4 %r13d -#define S_5 %r14d -#define S_6 %ebx -#define S_7 %r9d - -#define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15" - -#if defined(HAVE_INTEL_RORX) -#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ - -#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ -__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ -__asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ - -#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ -__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ -__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ -__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ -__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ - -#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\ -/*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\ -__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ -__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ -__asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ -__asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\ - -#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ -__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\ -__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\ - -#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ -__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ -__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\ -__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ - -#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ -__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ -__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ - -#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ -__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ -__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \ -__asm__ volatile("movl %r8d, "#h"\n\t"); - -#endif - -#define RND_STEP_1(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\ -__asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ -__asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\ - -#define RND_STEP_2(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ -__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = 
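/* On the two round-step variants above: the BMI2 path uses rorx, a
 * flags-free rotate-right, while the fallback gets the same result with
 * roll by (32 - n), since rotating a 32-bit word left by 32-n equals
 * rotating it right by n.  So `roll $26` computes e >>> 6: */

#include <stdint.h>
#include <assert.h>

static uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

int main(void)
{
    uint32_t e = 0xDEADBEEFu;
    assert(rotl32(e, 26) == rotr32(e, 6));   /* roll $26  ==  rorx $6  */
    assert(rotl32(e, 21) == rotr32(e, 11));  /* roll $21  ==  rorx $11 */
    return 0;
}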
(e>>11) ^ (e>>6) */\ -__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\ -__asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ - -#define RND_STEP_3(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ -__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ -__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ -__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ -__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ - -#define RND_STEP_4(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ -__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ -__asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\ -__asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ -__asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\ -__asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\ -__asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\ - -#define RND_STEP_5(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ -__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\ -__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\ - -#define RND_STEP_6(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ -__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ -__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\ -__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ - -#define RND_STEP_7(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ -__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ -__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ - -#define RND_STEP_8(a,b,c,d,e,f,g,h,i)\ -__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ -__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ - /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\ -__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\ - /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\ -__asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \ - /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ - -#define RND_X(a,b,c,d,e,f,g,h,i) \ - RND_STEP_1(a,b,c,d,e,f,g,h,i); \ - RND_STEP_2(a,b,c,d,e,f,g,h,i); \ - RND_STEP_3(a,b,c,d,e,f,g,h,i); \ - RND_STEP_4(a,b,c,d,e,f,g,h,i); \ - RND_STEP_5(a,b,c,d,e,f,g,h,i); \ - RND_STEP_6(a,b,c,d,e,f,g,h,i); \ - RND_STEP_7(a,b,c,d,e,f,g,h,i); \ - RND_STEP_8(a,b,c,d,e,f,g,h,i); - -#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); -#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); -#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); -#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); -#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); -#define 
RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); -#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); -#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); - - -#define RND_1_3(a,b,c,d,e,f,g,h,i) {\ - RND_STEP_1(a,b,c,d,e,f,g,h,i); \ - RND_STEP_2(a,b,c,d,e,f,g,h,i); \ - RND_STEP_3(a,b,c,d,e,f,g,h,i); \ -} - -#define RND_4_6(a,b,c,d,e,f,g,h,i) {\ - RND_STEP_4(a,b,c,d,e,f,g,h,i); \ - RND_STEP_5(a,b,c,d,e,f,g,h,i); \ - RND_STEP_6(a,b,c,d,e,f,g,h,i); \ -} - -#define RND_7_8(a,b,c,d,e,f,g,h,i) {\ - RND_STEP_7(a,b,c,d,e,f,g,h,i); \ - RND_STEP_8(a,b,c,d,e,f,g,h,i); \ -} - -#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); -#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); -#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); -#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); -#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); -#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); -#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); -#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); - - -#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); -#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); -#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); -#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); -#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); -#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); -#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); -#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); - -#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); -#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); -#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); -#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); -#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); -#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); -#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); -#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); - -#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); -#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); -#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); -#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); -#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); -#define 
RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); -#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); -#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); - -#define FOR(cnt, init, max, inc, loop) \ - __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):) -#define END(cnt, init, max, inc, loop) \ - __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::) ; - -#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ - -#if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ - -#define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs) -#define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs) - -#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\ - a,b,c,d,e,f,g,h,_i)\ - RND_STEP_1(a,b,c,d,e,f,g,h,_i);\ - VPALIGNR (XTMP0, X3, X2, 4) ;\ - RND_STEP_2(a,b,c,d,e,f,g,h,_i);\ - VPADDD (XTMP0, XTMP0, X0) ;\ - RND_STEP_3(a,b,c,d,e,f,g,h,_i);\ - VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ - RND_STEP_4(a,b,c,d,e,f,g,h,_i);\ - VPSRLD (XTMP2, XTMP1, 7) ;\ - RND_STEP_5(a,b,c,d,e,f,g,h,_i);\ - VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ - RND_STEP_6(a,b,c,d,e,f,g,h,_i);\ - VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ - RND_STEP_7(a,b,c,d,e,f,g,h,_i);\ - VPSRLD (XTMP2, XTMP1,18) ;\ - RND_STEP_8(a,b,c,d,e,f,g,h,_i);\ -\ - RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\ - VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ - RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\ - VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ - RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\ - VPXOR (XTMP3, XTMP3, XTMP1) ;\ - RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\ - VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ - RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\ - VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ - RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\ - VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ - RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\ - VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ - RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\ -\ - RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\ - VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ - RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\ - VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ - RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\ - VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ - RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\ - VPXOR (XTMP2, XTMP2, XTMP3) ;\ - RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\ - VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\ - RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\ - VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ - 
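/* The RND_0..RND_7 (and RND_*_0/1/2) wrappers above implement round-robin
 * register renaming: rather than shifting values after every round
 * (t = e; e = d; ... as in the USE_SLOW_SHA loop earlier in this diff),
 * each successive round permutes the argument order, so the fully unrolled
 * code needs no move instructions between rounds. */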
RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\ - VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ - RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\ -\ - RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\ - VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ - RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\ - VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ - RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\ - VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ - RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\ - VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ - RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\ - VPXOR (XTMP2, XTMP2, XTMP3) ;\ - RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\ - VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ - RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\ - VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ - RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\ - VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ - -#if defined(HAVE_INTEL_RORX) - -#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \ - XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\ - RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\ - VPALIGNR (XTMP0, X3, X2, 4) ;\ - RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\ - VPADDD (XTMP0, XTMP0, X0) ;\ - RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\ - VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ - RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\ - VPSRLD (XTMP2, XTMP1, 7) ;\ - RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\ - VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ - RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\ - VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ - RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\ - VPSRLD (XTMP2, XTMP1,18) ;\ - RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\ -\ - RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\ - VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ - RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\ - VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ - RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\ - VPXOR (XTMP3, XTMP3, XTMP1) ;\ - RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\ - VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ - RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\ - VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ - RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\ - VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ - RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\ - VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ - RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\ -\ - RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\ - VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ - RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\ - VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ - RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\ - VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ - RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\ - VPXOR (XTMP2, XTMP2, XTMP3) ;\ - RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\ - VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\ - RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\ - VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ - RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\ - VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ - RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\ -\ - RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\ - VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ - RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\ - VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ - RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\ - VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] 
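/* The vector MessageSched above computes four W values per block of four
 * rounds; the VPSLLD/VPSRLD pairs emulate rotates, which AVX1 lacks.  The
 * scalar recurrence it implements is the standard SHA-256 message schedule
 * (helper names below are ours): */

#include <stdint.h>

static uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

static uint32_t gamma0(uint32_t x) { return rotr32(x, 7)  ^ rotr32(x, 18) ^ (x >> 3);  }
static uint32_t gamma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

/* W[t] = gamma1(W[t-2]) + W[t-7] + gamma0(W[t-15]) + W[t-16], t = 16..63 */
static void sha256_schedule(uint32_t W[64])
{
    int t;
    for (t = 16; t < 64; t++)
        W[t] = gamma1(W[t-2]) + W[t-7] + gamma0(W[t-15]) + W[t-16];
}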
MY_ROR 19 {xDxC} */\ - RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\ - VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ - RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\ - VPXOR (XTMP2, XTMP2, XTMP3) ;\ - RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\ - VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ - RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\ - VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ - RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\ - VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ - -#endif - - -#define W_K_from_buff\ - __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\ - "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\ - :: "m"(sha256->buffer[0]):"%xmm4") ;\ - __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\ - "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\ - ::"m"(sha256->buffer[4]):"%xmm5") ;\ - __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\ - "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\ - ::"m"(sha256->buffer[8]):"%xmm6") ;\ - __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\ - "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\ - ::"m"(sha256->buffer[12]):"%xmm7") ;\ - -#define _SET_W_K_XFER(reg, i)\ - __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\ - __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ; - -#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i) - -static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */ -static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */ -static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ; - - -#define _Init_Masks(mask1, mask2, mask3)\ -__asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\ -__asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\ -__asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ; - -#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\ - _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) - -#define X0 %xmm4 -#define X1 %xmm5 -#define X2 %xmm6 -#define X3 %xmm7 -#define X_ X0 - -#define XTMP0 %xmm0 -#define XTMP1 %xmm1 -#define XTMP2 %xmm2 -#define XTMP3 %xmm3 -#define XTMP4 %xmm8 -#define XTMP5 %xmm9 -#define XFER %xmm10 - -#define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */ -#define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */ -#define BYTE_FLIP_MASK %xmm13 - -#define XMM_REGs /* Registers are saved in Sha256Update/Finel */ - /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */ - -static int Transform_AVX1(Sha256* sha256) -{ - - word32 W_K[64] ; /* temp for W+K */ - - #if defined(DEBUG_XMM) - int i, j ; - word32 xmm[29][4*15] ; - #endif - - Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; - W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ - - DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; - - SET_W_K_XFER(X0, 0) ; - MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; - SET_W_K_XFER(X1, 4) ; - MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; - SET_W_K_XFER(X2, 8) ; - MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; - SET_W_K_XFER(X3, 12) ; - MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; - SET_W_K_XFER(X0, 16) ; - MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, 
XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; - SET_W_K_XFER(X1, 20) ; - MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; - SET_W_K_XFER(X2, 24) ; - MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; - SET_W_K_XFER(X3, 28) ; - MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; - SET_W_K_XFER(X0, 32) ; - MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; - SET_W_K_XFER(X1, 36) ; - MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; - SET_W_K_XFER(X2, 40) ; - MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; - SET_W_K_XFER(X3, 44) ; - MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, - SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; - - SET_W_K_XFER(X0, 48) ; - SET_W_K_XFER(X1, 52) ; - SET_W_K_XFER(X2, 56) ; - SET_W_K_XFER(X3, 60) ; - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; - - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; - - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; - - RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; - - #if defined(DEBUG_XMM) - for(i=0; i<29; i++) { - for(j=0; j<4*14; j+=4) - printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i, - xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; - printf("\n") ; - } - - for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; - #endif - - return 0; -} - -#if defined(HAVE_INTEL_RORX) -static int Transform_AVX1_RORX(Sha256* sha256) -{ - - word32 W_K[64] ; /* temp for W+K */ - - #if defined(DEBUG_XMM) - int i, j ; - word32 xmm[29][4*15] ; - #endif - - Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; - W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ - - DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; - SET_W_K_XFER(X0, 0) ; - MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; - SET_W_K_XFER(X1, 4) ; - MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; - SET_W_K_XFER(X2, 8) ; - MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; - SET_W_K_XFER(X3, 12) ; - MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; - SET_W_K_XFER(X0, 16) ; - MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, 
XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; - SET_W_K_XFER(X1, 20) ; - MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; - SET_W_K_XFER(X2, 24) ; - MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; - SET_W_K_XFER(X3, 28) ; - MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; - SET_W_K_XFER(X0, 32) ; - MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; - SET_W_K_XFER(X1, 36) ; - MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; - SET_W_K_XFER(X2, 40) ; - MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; - SET_W_K_XFER(X3, 44) ; - MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, - XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; - - SET_W_K_XFER(X0, 48) ; - SET_W_K_XFER(X1, 52) ; - SET_W_K_XFER(X2, 56) ; - SET_W_K_XFER(X3, 60) ; - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; - - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; - - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; - - RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; - - #if defined(DEBUG_XMM) - for(i=0; i<29; i++) { - for(j=0; j<4*14; j+=4) - printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i, - xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; - printf("\n") ; - } - - for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; - #endif - - return 0; -} -#endif /* HAVE_INTEL_RORX */ - -#endif /* HAVE_INTEL_AVX1 */ - - -#if defined(HAVE_INTEL_AVX2) - -#define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ; -#define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ; -#define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\ - :: "m"(map):YMM_REGs) ; -#define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\ - #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ; -#define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\ - #ymm0"\n\t":: "m"(map):YMM_REGs) ; -#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\ - #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\ - #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ; -#define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\ - #src", %%"#dest" ":::YMM_REGs) ; -#define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ - #src2", %%"#dest" ":::YMM_REGs) ; -#define 
_OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\ - #src2", %%"#dest" ":::YMM_REGs) ; -#define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\ - #src2", %%"#dest" ":::YMM_REGs) ; -#define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\ - #dest" "::"m"(mem):YMM_REGs) ; -#define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\ - #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ; - -#define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; -#define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; -#define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; -#define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; -#define _EXTRACT_XMM_4(ymm, xmm, mem)\ - __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\ - __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; -#define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; -#define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; -#define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; - -#define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ; -#define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm) - -#define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem) -#define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm) -#define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map) -#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map) -#define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map) -#define XOR(dest, src1, src2) _XOR(dest, src1, src2) -#define OR(dest, src1, src2) _OR(dest, src1, src2) -#define ADD(dest, src1, src2) _ADD(dest, src1, src2) -#define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem) -#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2) - -#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp); -#define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) -#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) - -#define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \ - XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest) ; -#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); -#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); \ - XOR(dest, G_TEMP, dest) ; - -#define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \ - XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ; -#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); -#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \ - XOR(dest, G_TEMP, dest) ; - -#define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \ - BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ; -#define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ; \ - MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ; -#define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \ - BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ; - -#define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\ - MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ; BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ; - -#undef voitle - -#define 
W_I_16 ymm8 -#define W_I_15 ymm9 -#define W_I_7 ymm10 -#define W_I_2 ymm11 -#define W_I ymm12 -#define G_TEMP ymm13 -#define S_TEMP ymm14 -#define YMM_TEMP0 ymm15 -#define YMM_TEMP0x xmm15 -#define W_I_TEMP ymm7 -#define W_K_TEMP ymm15 -#define W_K_TEMPx xmm15 - -#define YMM_REGs /* Registers are saved in Sha256Update/Finel */ - /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/ - - -#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\ - __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\ - __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\ - __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\ - __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ - __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ - -#define MOVE_7_to_15(w_i_15, w_i_7)\ - __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\ - -#define MOVE_I_to_7(w_i_7, w_i)\ - __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ - __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ - __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\ - -#define MOVE_I_to_2(w_i_2, w_i)\ - __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\ - __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\ - -#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\ - MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \ - MOVE_7_to_15(w_i_15, w_i_7) ; \ - MOVE_I_to_7(w_i_7, w_i) ; \ - MOVE_I_to_2(w_i_2, w_i) ;\ - -#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - { word32 d ;\ - __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[0] += d;\ - __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[1] += d;\ - __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[2] += d;\ - __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[3] += d;\ - __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[4] += d;\ - __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[5] += d;\ - __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[6] += d;\ - __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\ - sha256->digest[7] += d;\ -} - -#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - { word32 d[8] ;\ - __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\ - __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\ - __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\ - __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\ - __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\ - __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\ - __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\ - __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\ - printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\ - __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\ - __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\ - __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\ - __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\ - __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs) ;\ - __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\ - __asm__ volatile("movl %0, 
%"#S_6::"r"(d[6]):SSE_REGs) ;\ - __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\ -} - - -#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) - -#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) - -#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ - _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) - - - /* Byte swap Masks to ensure that rest of the words are filled with zero's. */ - static const unsigned long mBYTE_FLIP_MASK_16[] = - { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ; - static const unsigned long mBYTE_FLIP_MASK_15[] = - { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ; - static const unsigned long mBYTE_FLIP_MASK_7 [] = - { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b } ; - static const unsigned long mBYTE_FLIP_MASK_2 [] = - { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 } ; - - static const unsigned long mMAPtoW_I_7[] = - { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 } ; - static const unsigned long mMAP1toW_I_2[] = - { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 } ; - static const unsigned long mMAP2toW_I_2[] = - { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 } ; - static const unsigned long mMAP3toW_I_2[] = - { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 } ; - -static int Transform_AVX2(Sha256* sha256) -{ - - #ifdef WOLFSSL_SMALL_STACK - word32* W_K; - W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); - if (W_K == NULL) - return MEMORY_E; - #else - word32 W_K[64] ; - #endif - - MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ; - MOVE_to_REG(W_I_15, sha256->buffer[1]); BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ; - MOVE_to_REG(W_I, sha256->buffer[8]) ; BYTE_SWAP(W_I, mBYTE_FLIP_MASK_16[0]) ; - MOVE_to_REG(W_I_7, sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]) ; - MOVE_to_REG(W_I_2, sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]) ; - - DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; - - ADD_MEM(W_K_TEMP, W_I_16, K[0]) ; - MOVE_to_MEM(W_K[0], W_K_TEMP) ; - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ; - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ; - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ; - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ; - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ; - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ; - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ; - - ADD_MEM(YMM_TEMP0, W_I, K[8]) ; - MOVE_to_MEM(W_K[8], YMM_TEMP0) ; - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; - GAMMA0_1(W_I_TEMP, W_I_15) ; - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; - GAMMA0_2(W_I_TEMP, W_I_15) ; - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; - ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; - ADD(W_I, W_I_7, W_I_TEMP); - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; - ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ - 
RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; - FEEDBACK1_to_W_I_2 ; - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; - FEEDBACK_to_W_I_7 ; - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; - ADD(W_I_TEMP, W_I_7, W_I_TEMP); - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; - FEEDBACK2_to_W_I_2 ; - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; - FEEDBACK3_to_W_I_2 ; - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; - GAMMA1(YMM_TEMP0, W_I_2) ; - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; - - MOVE_to_REG(YMM_TEMP0, K[16]) ; - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; - ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; - MOVE_to_MEM(W_K[16], YMM_TEMP0) ; - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; - GAMMA0_1(W_I_TEMP, W_I_15) ; - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; - GAMMA0_2(W_I_TEMP, W_I_15) ; - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; - ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; - ADD(W_I, W_I_7, W_I_TEMP); - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; - ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; - FEEDBACK1_to_W_I_2 ; - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; - FEEDBACK_to_W_I_7 ; - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; - ADD(W_I_TEMP, W_I_7, W_I_TEMP); - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; - GAMMA1(YMM_TEMP0, W_I_2) ; - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; - FEEDBACK2_to_W_I_2 ; - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; - FEEDBACK3_to_W_I_2 ; - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; - - MOVE_to_REG(YMM_TEMP0, K[24]) ; - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; - 
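/* descriptive note: ROTATE_W advances the message-schedule window by eight words; the register roles are reshuffled so the eight words just finished in W_I seed the new W_I_7/W_I_2 views while the older registers supply the new W_I_16/W_I_15 */ -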
RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; - ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; - MOVE_to_MEM(W_K[24], YMM_TEMP0) ; - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; - GAMMA0_1(W_I_TEMP, W_I_15) ; - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; - GAMMA0_2(W_I_TEMP, W_I_15) ; - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; - ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; - ADD(W_I, W_I_7, W_I_TEMP); - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; - ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; - FEEDBACK1_to_W_I_2 ; - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; - FEEDBACK_to_W_I_7 ; - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; - ADD(W_I_TEMP, W_I_7, W_I_TEMP); - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; - FEEDBACK2_to_W_I_2 ; - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; - FEEDBACK3_to_W_I_2 ; - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; - GAMMA1(YMM_TEMP0, W_I_2) ; - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; - - MOVE_to_REG(YMM_TEMP0, K[32]) ; - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; - ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; - MOVE_to_MEM(W_K[32], YMM_TEMP0) ; - - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; - GAMMA0_1(W_I_TEMP, W_I_15) ; - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; - GAMMA0_2(W_I_TEMP, W_I_15) ; - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; - ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; - ADD(W_I, W_I_7, W_I_TEMP); - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; - ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; - FEEDBACK1_to_W_I_2 ; - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; - FEEDBACK_to_W_I_7 ; - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; - ADD(W_I_TEMP, W_I_7, W_I_TEMP); - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; - FEEDBACK2_to_W_I_2 ; - 
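/* descriptive note: FEEDBACK2_to_W_I_2 blends the most recently completed pair of schedule words into the upper lanes of W_I_2, so the Gamma1 that follows sees current data rather than stale lanes */ -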
RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; - FEEDBACK3_to_W_I_2 ; - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; - - MOVE_to_REG(YMM_TEMP0, K[40]) ; - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; - ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; - MOVE_to_MEM(W_K[40], YMM_TEMP0) ; - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; - GAMMA0_1(W_I_TEMP, W_I_15) ; - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; - GAMMA0_2(W_I_TEMP, W_I_15) ; - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; - ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; - ADD(W_I, W_I_7, W_I_TEMP); - RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; - ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; - FEEDBACK1_to_W_I_2 ; - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; - FEEDBACK_to_W_I_7 ; - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; - ADD(W_I_TEMP, W_I_7, W_I_TEMP); - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; - FEEDBACK2_to_W_I_2 ; - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; - FEEDBACK3_to_W_I_2 ; - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; - - MOVE_to_REG(YMM_TEMP0, K[48]) ; - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; - ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; - MOVE_to_MEM(W_K[48], YMM_TEMP0) ; - - /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ - RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; - GAMMA0_1(W_I_TEMP, W_I_15) ; - RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; - GAMMA0_2(W_I_TEMP, W_I_15) ; - RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; - ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ - RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; - ADD(W_I, W_I_7, W_I_TEMP); - 
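/* descriptive note: the top lane of W_I_7 still lies inside the batch being computed here; W_I_TEMP keeps the partial sum so the add can be redone once FEEDBACK_to_W_I_7 below patches that lane */ -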
RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; - ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ - RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; - FEEDBACK1_to_W_I_2 ; - RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; - FEEDBACK_to_W_I_7 ; - RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; - ADD(W_I_TEMP, W_I_7, W_I_TEMP); - RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ - RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; - FEEDBACK2_to_W_I_2 ; - RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ - RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; - FEEDBACK3_to_W_I_2 ; - RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; - GAMMA1_1(YMM_TEMP0, W_I_2) ; - RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; - GAMMA1_2(YMM_TEMP0, W_I_2) ; - RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; - ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ - RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; - - MOVE_to_REG(YMM_TEMP0, K[56]) ; - RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; - ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; - RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; - ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; - MOVE_to_MEM(W_K[56], YMM_TEMP0) ; - - RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; - RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; - RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; - RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; - - RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; - RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; - RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; - RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; - - RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; - - #ifdef WOLFSSL_SMALL_STACK - XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); - #endif - - return 0; -} - -#endif /* HAVE_INTEL_AVX2 */ - -#endif /* WOLFSSL_TI_HAHS */ - -#endif /* HAVE_FIPS */ - -#endif /* NO_SHA256 */ - +/* sha256.c + * + * Copyright (C) 2006-2015 wolfSSL Inc. + * + * This file is part of wolfSSL. (formerly known as CyaSSL) + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+/* code submitted by raphael.huck@efixo.com */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+
+#if !defined(NO_SHA256)
+#ifdef HAVE_FIPS
+
+int wc_InitSha256(Sha256* sha)
+{
+    return InitSha256_fips(sha);
+}
+
+
+int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
+{
+    return Sha256Update_fips(sha, data, len);
+}
+
+
+int wc_Sha256Final(Sha256* sha, byte* out)
+{
+    return Sha256Final_fips(sha, out);
+}
+
+
+int wc_Sha256Hash(const byte* data, word32 len, byte* out)
+{
+    return Sha256Hash(data, len, out);
+}
+
+#else /* else build without fips */
+
+#if !defined(NO_SHA256) && !defined(WOLFSSL_TI_HASH)
+    /* defined in port/ti/ti_sha256.c */
+
+#if !defined (ALIGN32)
+    #if defined (__GNUC__)
+        #define ALIGN32 __attribute__ ( (aligned (32)))
+    #elif defined(_MSC_VER)
+        /* disable align warning, we want alignment ! */
+        #pragma warning(disable: 4324)
+        #define ALIGN32 __declspec (align (32))
+    #else
+        #define ALIGN32
+    #endif
+#endif
+
+#ifdef WOLFSSL_PIC32MZ_HASH
+#define wc_InitSha256   wc_InitSha256_sw
+#define wc_Sha256Update wc_Sha256Update_sw
+#define wc_Sha256Final  wc_Sha256Final_sw
+#endif
+
+#ifdef HAVE_FIPS
+    /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
+    #define FIPS_NO_WRAPPERS
+#endif
+
+#if defined(USE_INTEL_SPEEDUP)
+#define HAVE_INTEL_AVX1
+#define HAVE_INTEL_AVX2
+
+#if defined(DEBUG_XMM)
+#include "stdio.h"
+#endif
+
+#endif
+
+#if defined(HAVE_INTEL_AVX2)
+#define HAVE_INTEL_RORX
+#endif
+
+
+/*****
+Intel AVX1/AVX2 Macro Control Structure
+
+#define HAVE_INTEL_AVX1
+#define HAVE_INTEL_AVX2
+
+#define HAVE_INTEL_RORX
+
+
+int InitSha256(Sha256* sha256) {
+     Save/Recover XMM, YMM
+     ...
+}
+
+#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
+  Transform() ; Function prototype
+#else
+  Transform() { }
+  int Sha256Final() {
+     Save/Recover XMM, YMM
+     ...
+  }
+#endif
+
+#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
+    #if defined(HAVE_INTEL_RORX)
+        #define RND with rorx instruction
+    #else
+        #define RND
+    #endif
+#endif
+
+#if defined(HAVE_INTEL_AVX1)
+
+   #define XMM Instructions/inline asm
+
+   int Transform() {
+       Stitched Message Sched/Round
+   }
+
+#elif defined(HAVE_INTEL_AVX2)
+
+   #define YMM Instructions/inline asm
+
+   int Transform() {
+       More granular Stitched Message Sched/Round
+   }
+
+*/
+
+
+#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
+
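+/* A minimal sketch of the dispatch scheme described above (illustrative
+ * only, not additional API; the real selection logic is set_Transform()
+ * below):
+ *
+ *     static int (*Transform_p)(Sha256*);     // chosen once at init
+ *
+ *     if (IS_INTEL_AVX2 && IS_INTEL_BMI2) Transform_p = Transform_AVX1_RORX;
+ *     else if (IS_INTEL_AVX1)             Transform_p = Transform_AVX1;
+ *     else                                Transform_p = Transform;  // C fallback
+ *
+ * wc_InitSha256() runs the selection once, so every later XTRANSFORM()
+ * call is an indirect call to the best routine for the current CPU. */
+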
Also, let's setup a macro for proper linkage w/o ABI conflicts + */ + +#ifndef _MSC_VER + #define cpuid(reg, leaf, sub)\ + __asm__ __volatile__ ("cpuid":\ + "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ + "a" (leaf), "c"(sub)); + + #define XASM_LINK(f) asm(f) +#else + + #include + #define cpuid(a,b) __cpuid((int*)a,b) + + #define XASM_LINK(f) + +#endif /* _MSC_VER */ + +#define EAX 0 +#define EBX 1 +#define ECX 2 +#define EDX 3 + +#define CPUID_AVX1 0x1 +#define CPUID_AVX2 0x2 +#define CPUID_RDRAND 0x4 +#define CPUID_RDSEED 0x8 +#define CPUID_BMI2 0x10 /* MULX, RORX */ + +#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1) +#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2) +#define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2) +#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND) +#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED) + +static word32 cpuid_check = 0 ; +static word32 cpuid_flags = 0 ; + +static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { + int got_intel_cpu=0; + unsigned int reg[5]; + + reg[4] = '\0' ; + cpuid(reg, 0, 0); + if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 && + memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 && + memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) { + got_intel_cpu = 1; + } + if (got_intel_cpu) { + cpuid(reg, leaf, sub); + return((reg[num]>>bit)&0x1) ; + } + return 0 ; +} + +static int set_cpuid_flags(void) { + if(cpuid_check==0) { + if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} + if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } + if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; } + if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } + if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } + cpuid_check = 1 ; + return 0 ; + } + return 1 ; +} + + +/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */ +static int Transform(Sha256* sha256); + +#if defined(HAVE_INTEL_AVX1) +static int Transform_AVX1(Sha256 *sha256) ; +#endif +#if defined(HAVE_INTEL_AVX2) +static int Transform_AVX2(Sha256 *sha256) ; +static int Transform_AVX1_RORX(Sha256 *sha256) ; +#endif + +static int (*Transform_p)(Sha256* sha256) /* = _Transform */; + +#define XTRANSFORM(sha256, B) (*Transform_p)(sha256) + +static void set_Transform(void) { + if(set_cpuid_flags())return ; + +#if defined(HAVE_INTEL_AVX2) + if(IS_INTEL_AVX2 && IS_INTEL_BMI2){ + Transform_p = Transform_AVX1_RORX; return ; + Transform_p = Transform_AVX2 ; + /* for avoiding warning,"not used" */ + } +#endif +#if defined(HAVE_INTEL_AVX1) + Transform_p = ((IS_INTEL_AVX1) ? 
+
+/* the #if defined(HAVE_INTEL_AVX1/2) Transform implementations are at the
+   tail of this file, matching the layout of sha512.c */
+static int Transform(Sha256* sha256);
+
+#if defined(HAVE_INTEL_AVX1)
+static int Transform_AVX1(Sha256 *sha256) ;
+#endif
+#if defined(HAVE_INTEL_AVX2)
+static int Transform_AVX2(Sha256 *sha256) ;
+static int Transform_AVX1_RORX(Sha256 *sha256) ;
+#endif
+
+static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
+
+#define XTRANSFORM(sha256, B)  (*Transform_p)(sha256)
+
+static void set_Transform(void) {
+    if (set_cpuid_flags()) return ;
+
+#if defined(HAVE_INTEL_AVX2)
+    if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
+        Transform_p = Transform_AVX1_RORX; return ;
+        Transform_p = Transform_AVX2 ;
+        /* unreachable; kept only to avoid an "unused" warning for
+           Transform_AVX2 */
+    }
+#endif
+#if defined(HAVE_INTEL_AVX1)
+    Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ;
+#endif
+    Transform_p = Transform ; return ;
+}
+
+#else
+    #if defined(FREESCALE_MMCAU)
+        #define XTRANSFORM(sha256, B) Transform(sha256, B)
+    #else
+        #define XTRANSFORM(sha256, B) Transform(sha256)
+    #endif
+#endif
+
+/* Dummy for saving MM_REGs on behalf of Transform */
+#if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1)
+#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
+  "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
+#elif defined(HAVE_INTEL_AVX1)
+#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
+    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
+    "xmm11","xmm12","xmm13","xmm14","xmm15")
+#else
+#define SAVE_XMM_YMM
+#endif
+
+#ifdef WOLFSSL_PIC32MZ_HASH
+#define InitSha256   InitSha256_sw
+#define Sha256Update Sha256Update_sw
+#define Sha256Final  Sha256Final_sw
+#endif
+
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef FREESCALE_MMCAU
+    #include "cau_api.h"
+#endif
+
+#ifndef WOLFSSL_HAVE_MIN
+#define WOLFSSL_HAVE_MIN
+
+    static INLINE word32 min(word32 a, word32 b)
+    {
+        return a > b ? b : a;
+    }
+
+#endif /* WOLFSSL_HAVE_MIN */
+
+
+int wc_InitSha256(Sha256* sha256)
+{
+    #ifdef FREESCALE_MMCAU
+        cau_sha256_initialize_output(sha256->digest);
+    #else
+        sha256->digest[0] = 0x6A09E667L;
+        sha256->digest[1] = 0xBB67AE85L;
+        sha256->digest[2] = 0x3C6EF372L;
+        sha256->digest[3] = 0xA54FF53AL;
+        sha256->digest[4] = 0x510E527FL;
+        sha256->digest[5] = 0x9B05688CL;
+        sha256->digest[6] = 0x1F83D9ABL;
+        sha256->digest[7] = 0x5BE0CD19L;
+    #endif
+
+    sha256->buffLen = 0;
+    sha256->loLen   = 0;
+    sha256->hiLen   = 0;
+
+#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
+    set_Transform() ; /* choose the best Transform function for this runtime environment */
+#endif
+
+    return 0;
+}
+
+
+#if !defined(FREESCALE_MMCAU)
+static const ALIGN32 word32 K[64] = {
+    0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
+    0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
+    0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
+    0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
+    0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
+    0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
+    0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
+    0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
+    0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
+    0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
+    0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
+    0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
+    0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
+};
+
+#endif
+
+#if defined(FREESCALE_MMCAU)
+
+static int Transform(Sha256* sha256, byte* buf)
+{
+    cau_sha256_hash_n(buf, 1, sha256->digest);
+
+    return 0;
+}
+
+#endif /* FREESCALE_MMCAU */
+
+#define Ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x,y,z)      ((((x) | (y)) & (z)) | ((x) & (y)))
+#define R(x, n)         (((x)&0xFFFFFFFFU)>>(n))
+
+#define S(x, n)         rotrFixed(x, n)
+#define Sigma0(x)       (S(x, 2) ^ S(x, 13) ^ S(x, 22))
+#define Sigma1(x)       (S(x, 6) ^ S(x, 11) ^ S(x, 25))
+#define Gamma0(x)       (S(x, 7) ^ S(x, 18) ^ R(x, 3))
+#define Gamma1(x)       (S(x, 17) ^ S(x, 19) ^ R(x, 10))
+
+#define RND(a,b,c,d,e,f,g,h,i) \
+     t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
+     t1 = Sigma0((a)) + Maj((a), (b), (c)); \
+     (d)
+= t0; \ + (h) = t0 + t1; + +#if !defined(FREESCALE_MMCAU) +static int Transform(Sha256* sha256) +{ + word32 S[8], t0, t1; + int i; + +#ifdef WOLFSSL_SMALL_STACK + word32* W; + + W = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (W == NULL) + return MEMORY_E; +#else + word32 W[64]; +#endif + + /* Copy context->state[] to working vars */ + for (i = 0; i < 8; i++) + S[i] = sha256->digest[i]; + + for (i = 0; i < 16; i++) + W[i] = sha256->buffer[i]; + + for (i = 16; i < 64; i++) + W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; + + for (i = 0; i < 64; i += 8) { + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2); + RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7); + } + + /* Add the working vars back into digest state[] */ + for (i = 0; i < 8; i++) { + sha256->digest[i] += S[i]; + } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return 0; +} + +#endif /* #if !defined(FREESCALE_MMCAU) */ + +static INLINE void AddLength(Sha256* sha256, word32 len) +{ + word32 tmp = sha256->loLen; + if ( (sha256->loLen += len) < tmp) + sha256->hiLen++; /* carry low to high */ +} + +int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) +{ + + /* do block size increments */ + byte* local = (byte*)sha256->buffer; + + SAVE_XMM_YMM ; /* for Intel AVX */ + + while (len) { + word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen); + XMEMCPY(&local[sha256->buffLen], data, add); + + sha256->buffLen += add; + data += add; + len -= add; + + if (sha256->buffLen == SHA256_BLOCK_SIZE) { + int ret; + + #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + ByteReverseWords(sha256->buffer, sha256->buffer, + SHA256_BLOCK_SIZE); + #endif + ret = XTRANSFORM(sha256, local); + if (ret != 0) + return ret; + + AddLength(sha256, SHA256_BLOCK_SIZE); + sha256->buffLen = 0; + } + } + + return 0; +} + +int wc_Sha256Final(Sha256* sha256, byte* hash) +{ + byte* local = (byte*)sha256->buffer; + int ret; + + SAVE_XMM_YMM ; /* for Intel AVX */ + + AddLength(sha256, sha256->buffLen); /* before adding pads */ + + local[sha256->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha256->buffLen > SHA256_PAD_SIZE) { + XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen); + sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; + + #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); + #endif + + ret = XTRANSFORM(sha256, local); + if (ret != 0) + return ret; + + sha256->buffLen = 0; + } + XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen); + + /* put lengths in bits */ + sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) + + (sha256->hiLen << 3); + sha256->loLen = sha256->loLen << 3; + + /* store lengths */ + #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + 
+            ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
+    #endif
+    /* ! length ordering dependent on digest endian type ! */
+    XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
+    XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
+            sizeof(word32));
+
+    #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
+        /* Kinetis requires only these bytes reversed */
+        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
+        if(IS_INTEL_AVX1 || IS_INTEL_AVX2)
+        #endif
+            ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
+                             &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
+                             2 * sizeof(word32));
+    #endif
+
+    ret = XTRANSFORM(sha256, local);
+    if (ret != 0)
+        return ret;
+
+    #if defined(LITTLE_ENDIAN_ORDER)
+        ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
+    #endif
+    XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
+
+    return wc_InitSha256(sha256);  /* reset state */
+}
+
+
+
+int wc_Sha256Hash(const byte* data, word32 len, byte* hash)
+{
+    int ret = 0;
+#ifdef WOLFSSL_SMALL_STACK
+    Sha256* sha256;
+#else
+    Sha256 sha256[1];
+#endif
+
+#ifdef WOLFSSL_SMALL_STACK
+    sha256 = (Sha256*)XMALLOC(sizeof(Sha256), NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    if (sha256 == NULL)
+        return MEMORY_E;
+#endif
+
+    if ((ret = wc_InitSha256(sha256)) != 0) {
+        WOLFSSL_MSG("InitSha256 failed");
+    }
+    else if ((ret = wc_Sha256Update(sha256, data, len)) != 0) {
+        WOLFSSL_MSG("Sha256Update failed");
+    }
+    else if ((ret = wc_Sha256Final(sha256, hash)) != 0) {
+        WOLFSSL_MSG("Sha256Final failed");
+    }
+
+#ifdef WOLFSSL_SMALL_STACK
+    XFREE(sha256, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+#endif
+
+    return ret;
+}
+
+#ifdef WOLFSSL_TI_HASH
+#include "wolfssl/wolfcrypt/port/ti/ti-hash.h"
+#endif
+int wc_Sha256GetHash(Sha256* sha256, byte* hash)
+{
+#if defined(WOLFSSL_TI_HASH)
+    return wc_Sha256GetHash_TI(sha256, hash) ;
+#else
+    int ret ;
+    Sha256 save = *sha256 ;
+    ret = wc_Sha256Final(sha256, hash) ;
+    *sha256 = save ;
+    return ret ;
+#endif
+}
+
+#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
+
+#define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
+    { word32 d ;\
+    d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\
+    d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\
+    d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\
+    d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\
+    d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\
+    d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\
+    d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\
+    d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\
+}
+
+#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
+    { word32 d ; \
+    __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\
+    __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\
+    __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\
+    __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\
+    __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\
+    __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\
+    __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\
+    __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;
sha256->digest[7] += d;\ +} + + +#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + +#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + + + + +#define S_0 %r15d +#define S_1 %r10d +#define S_2 %r11d +#define S_3 %r12d +#define S_4 %r13d +#define S_5 %r14d +#define S_6 %ebx +#define S_7 %r9d + +#define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15" + +#if defined(HAVE_INTEL_RORX) +#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ + +#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ +__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ +__asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ + +#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ +__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ + +#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\ +/*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\ +__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ +__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ +__asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ +__asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\ + +#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ +__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\ + +#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ +__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ +__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\ +__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ + +#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ +__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ +__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ + +#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ +__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ +__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \ +__asm__ volatile("movl %r8d, "#h"\n\t"); + +#endif + +#define RND_STEP_1(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\ +__asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ +__asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\ + +#define RND_STEP_2(a,b,c,d,e,f,g,h,i)\ +__asm__ 
volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ +__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ +__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\ +__asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ + +#define RND_STEP_3(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ +__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ + +#define RND_STEP_4(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ +__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ +__asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\ +__asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ +__asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\ +__asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\ +__asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\ + +#define RND_STEP_5(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ +__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\ + +#define RND_STEP_6(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ +__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ +__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\ +__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ + +#define RND_STEP_7(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ +__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ +__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ + +#define RND_STEP_8(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ +__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ + /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\ +__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\ + /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\ +__asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \ + /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ + +#define RND_X(a,b,c,d,e,f,g,h,i) \ + RND_STEP_1(a,b,c,d,e,f,g,h,i); \ + RND_STEP_2(a,b,c,d,e,f,g,h,i); \ + RND_STEP_3(a,b,c,d,e,f,g,h,i); \ + RND_STEP_4(a,b,c,d,e,f,g,h,i); \ + RND_STEP_5(a,b,c,d,e,f,g,h,i); \ + RND_STEP_6(a,b,c,d,e,f,g,h,i); \ + RND_STEP_7(a,b,c,d,e,f,g,h,i); \ + RND_STEP_8(a,b,c,d,e,f,g,h,i); + +#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); +#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); +#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); +#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 
RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
+#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
+#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
+#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
+#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
+
+
+#define RND_1_3(a,b,c,d,e,f,g,h,i) {\
+       RND_STEP_1(a,b,c,d,e,f,g,h,i); \
+       RND_STEP_2(a,b,c,d,e,f,g,h,i); \
+       RND_STEP_3(a,b,c,d,e,f,g,h,i); \
+}
+
+#define RND_4_6(a,b,c,d,e,f,g,h,i) {\
+       RND_STEP_4(a,b,c,d,e,f,g,h,i); \
+       RND_STEP_5(a,b,c,d,e,f,g,h,i); \
+       RND_STEP_6(a,b,c,d,e,f,g,h,i); \
+}
+
+#define RND_7_8(a,b,c,d,e,f,g,h,i) {\
+       RND_STEP_7(a,b,c,d,e,f,g,h,i); \
+       RND_STEP_8(a,b,c,d,e,f,g,h,i); \
+}
+
+
+#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
+#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
+#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
+#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
+#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
+#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
+#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
+#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
+
+#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
+#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
+#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
+#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
+#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
+#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
+#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
+#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
+
+#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
+#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
+#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
+#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
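+
+/* For reference: what one full round (the RND_STEP_1..8 stages that the
+ * RND_* wrappers above and below dispatch) computes, written as plain C.
+ * A sketch only, using the word32 type and rotrFixed() that wolfCrypt
+ * already provides; it is not part of the build:
+ *
+ *   Ch(e,f,g)  = g ^ (e & (f ^ g))
+ *   Maj(a,b,c) = (a & b) | (c & (a | b))
+ *   Sigma0(a)  = rotrFixed(a, 2) ^ rotrFixed(a, 13) ^ rotrFixed(a, 22)
+ *   Sigma1(e)  = rotrFixed(e, 6) ^ rotrFixed(e, 11) ^ rotrFixed(e, 25)
+ *
+ *   t1 = h + Sigma1(e) + Ch(e,f,g) + W_K[i];   W_K[i] holds W[i] + K[i]
+ *   d += t1;
+ *   h  = t1 + Sigma0(a) + Maj(a,b,c);
+ */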
+#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); +#define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); +#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); +#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); + +#define FOR(cnt, init, max, inc, loop) \ + __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):) +#define END(cnt, init, max, inc, loop) \ + __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::) ; + +#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ + +#if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ + +#define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs) + +#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\ + a,b,c,d,e,f,g,h,_i)\ + RND_STEP_1(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP0, X3, X2, 4) ;\ + RND_STEP_2(a,b,c,d,e,f,g,h,_i);\ + VPADDD (XTMP0, XTMP0, X0) ;\ + RND_STEP_3(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ + RND_STEP_4(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1, 7) ;\ + RND_STEP_5(a,b,c,d,e,f,g,h,_i);\ + VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ + RND_STEP_6(a,b,c,d,e,f,g,h,_i);\ + VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ + RND_STEP_7(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1,18) ;\ + RND_STEP_8(a,b,c,d,e,f,g,h,_i);\ +\ + RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\ + VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ + RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\ + VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ + RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP1) ;\ + RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ + RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ + RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\ + VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ + RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\ + VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ + RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\ +\ + RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ + RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ + RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ + RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\ + 
RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\ + VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ + RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\ + VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ + RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\ +\ + RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ + RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ + RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ + RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ + RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ + RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ + RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\ + VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ + +#if defined(HAVE_INTEL_RORX) + +#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \ + XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\ + RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP0, X3, X2, 4) ;\ + RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\ + VPADDD (XTMP0, XTMP0, X0) ;\ + RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ + RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1, 7) ;\ + RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\ + VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ + RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\ + VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ + RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1,18) ;\ + RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\ +\ + RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\ + VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ + RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\ + VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ + RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP1) ;\ + RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ + RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ + RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\ + VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ + RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\ + VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ + RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\ +\ + RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ + RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ + RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ + RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\ + RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\ + VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ + RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\ + VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ + RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\ +\ + RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ + RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 
{DDCC} */\ + RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ + RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ + RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ + RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ + RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\ + VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ + +#endif + + +#define W_K_from_buff\ + __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\ + "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\ + :: "m"(sha256->buffer[0]):"%xmm4") ;\ + __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\ + "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\ + ::"m"(sha256->buffer[4]):"%xmm5") ;\ + __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\ + "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\ + ::"m"(sha256->buffer[8]):"%xmm6") ;\ + __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\ + "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\ + ::"m"(sha256->buffer[12]):"%xmm7") ;\ + +#define _SET_W_K_XFER(reg, i)\ + __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\ + __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ; + +#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i) + +static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */ +static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */ +static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ; + + +#define _Init_Masks(mask1, mask2, mask3)\ +__asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\ +__asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\ +__asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ; + +#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\ + _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) + +#define X0 %xmm4 +#define X1 %xmm5 +#define X2 %xmm6 +#define X3 %xmm7 +#define X_ X0 + +#define XTMP0 %xmm0 +#define XTMP1 %xmm1 +#define XTMP2 %xmm2 +#define XTMP3 %xmm3 +#define XTMP4 %xmm8 +#define XTMP5 %xmm9 +#define XFER %xmm10 + +#define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */ +#define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */ +#define BYTE_FLIP_MASK %xmm13 + +#define XMM_REGs /* Registers are saved in Sha256Update/Finel */ + /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */ + +static int Transform_AVX1(Sha256* sha256) +{ + + word32 W_K[64] ; /* temp for W+K */ + + #if defined(DEBUG_XMM) + int i, j ; + word32 xmm[29][4*15] ; + #endif + + Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; + W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ + + DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + SET_W_K_XFER(X0, 0) ; + MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; + SET_W_K_XFER(X1, 4) ; + MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; + SET_W_K_XFER(X2, 8) ; + MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + SET_W_K_XFER(X3, 12) ; + MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, 
S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; + SET_W_K_XFER(X0, 16) ; + MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + SET_W_K_XFER(X1, 20) ; + MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; + SET_W_K_XFER(X2, 24) ; + MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + SET_W_K_XFER(X3, 28) ; + MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; + SET_W_K_XFER(X0, 32) ; + MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + SET_W_K_XFER(X1, 36) ; + MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; + SET_W_K_XFER(X2, 40) ; + MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + SET_W_K_XFER(X3, 44) ; + MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; + + SET_W_K_XFER(X0, 48) ; + SET_W_K_XFER(X1, 52) ; + SET_W_K_XFER(X2, 56) ; + SET_W_K_XFER(X3, 60) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; + + RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + #if defined(DEBUG_XMM) + for(i=0; i<29; i++) { + for(j=0; j<4*14; j+=4) + printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i, + xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; + printf("\n") ; + } + + for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; + #endif + + return 0; +} + +#if defined(HAVE_INTEL_RORX) +static int Transform_AVX1_RORX(Sha256* sha256) +{ + + word32 W_K[64] ; /* temp for W+K */ + + #if defined(DEBUG_XMM) + int i, j ; + word32 xmm[29][4*15] ; + #endif + + Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; + W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ + + DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + SET_W_K_XFER(X0, 0) ; + MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; + SET_W_K_XFER(X1, 4) ; + MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; + SET_W_K_XFER(X2, 8) ; + MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + SET_W_K_XFER(X3, 12) ; + MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, 
S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; + SET_W_K_XFER(X0, 16) ; + MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + SET_W_K_XFER(X1, 20) ; + MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; + SET_W_K_XFER(X2, 24) ; + MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + SET_W_K_XFER(X3, 28) ; + MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; + SET_W_K_XFER(X0, 32) ; + MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + SET_W_K_XFER(X1, 36) ; + MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; + SET_W_K_XFER(X2, 40) ; + MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + SET_W_K_XFER(X3, 44) ; + MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; + + SET_W_K_XFER(X0, 48) ; + SET_W_K_XFER(X1, 52) ; + SET_W_K_XFER(X2, 56) ; + SET_W_K_XFER(X3, 60) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; + + RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + #if defined(DEBUG_XMM) + for(i=0; i<29; i++) { + for(j=0; j<4*14; j+=4) + printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i, + xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; + printf("\n") ; + } + + for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; + #endif + + return 0; +} +#endif /* HAVE_INTEL_RORX */ + +#endif /* HAVE_INTEL_AVX1 */ + + +#if defined(HAVE_INTEL_AVX2) + +#define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ; +#define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ; +#define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\ + :: "m"(map):YMM_REGs) ; +#define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\ + #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ; +#define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\ + #ymm0"\n\t":: "m"(map):YMM_REGs) ; +#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\ + #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\ + #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ; +#define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\ + #src", %%"#dest" ":::YMM_REGs) ; 
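+
+/* Note: AVX2 has no packed 32-bit rotate instruction, so _S_TEMP builds
+ * ROTR(src, bits) per lane as (src >> bits) | (src << (32 - bits)),
+ * with 'temp' holding the left-shifted half before the vpor merge.
+ * _AVX2_R is the plain logical right shift used for the ">> 3" and
+ * ">> 10" terms of Gamma0/Gamma1 below. */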
+#define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\ + #dest" "::"m"(mem):YMM_REGs) ; +#define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\ + #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ; + +#define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_4(ymm, xmm, mem)\ + __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\ + __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; + +#define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ; +#define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm) + +#define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem) +#define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm) +#define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map) +#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map) +#define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map) +#define XOR(dest, src1, src2) _XOR(dest, src1, src2) +#define OR(dest, src1, src2) _OR(dest, src1, src2) +#define ADD(dest, src1, src2) _ADD(dest, src1, src2) +#define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem) +#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2) + +#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp); +#define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) +#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) + +#define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \ + XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest) ; +#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); +#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); \ + XOR(dest, G_TEMP, dest) ; + +#define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \ + XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ; +#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); +#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \ + XOR(dest, G_TEMP, dest) ; + +#define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \ + BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ; +#define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ; \ + MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ; +#define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \ + BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ; + +#define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\ + 
MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ;  BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ;
+
+#define W_I_16  ymm8
+#define W_I_15  ymm9
+#define W_I_7  ymm10
+#define W_I_2  ymm11
+#define W_I    ymm12
+#define G_TEMP     ymm13
+#define S_TEMP     ymm14
+#define YMM_TEMP0  ymm15
+#define YMM_TEMP0x xmm15
+#define W_I_TEMP   ymm7
+#define W_K_TEMP   ymm15
+#define W_K_TEMPx  xmm15
+
+#define YMM_REGs /* Registers are saved in Sha256Update/Final */
+ /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
+
+
+#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
+    __asm__ volatile("vperm2i128  $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\
+    __asm__ volatile("vpblendd    $0x08, %%"#w_i_15", %%"#w_i_7",  %%"#w_i_16" ":::YMM_REGs) ;\
+    __asm__ volatile("vperm2i128  $0x01, %%"#w_i_7",  %%"#w_i_7",  %%"#w_i_15" ":::YMM_REGs) ;\
+    __asm__ volatile("vpblendd    $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
+    __asm__ volatile("vpshufd     $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
+
+#define MOVE_7_to_15(w_i_15, w_i_7)\
+    __asm__ volatile("vmovdqu                 %%"#w_i_7",  %%"#w_i_15" ":::YMM_REGs) ;\
+
+#define MOVE_I_to_7(w_i_7, w_i)\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i",   %%"#w_i",   %%"#w_i_7" ":::YMM_REGs) ;\
+    __asm__ volatile("vpblendd   $0x01, %%"#w_i_7", %%"#w_i",   %%"#w_i_7" ":::YMM_REGs) ;\
+    __asm__ volatile("vpshufd    $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\
+
+#define MOVE_I_to_2(w_i_2, w_i)\
+    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\
+    __asm__ volatile("vpshufd    $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\
+
+#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
+    MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \
+    MOVE_7_to_15(w_i_15, w_i_7) ; \
+    MOVE_I_to_7(w_i_7, w_i) ; \
+    MOVE_I_to_2(w_i_2, w_i) ;\
+
+#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
+    { word32 d ;\
+    __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[0] += d;\
+    __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[1] += d;\
+    __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[2] += d;\
+    __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[3] += d;\
+    __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[4] += d;\
+    __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[5] += d;\
+    __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[6] += d;\
+    __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\
+    sha256->digest[7] += d;\
+}
+
+#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
+    { word32 d[8] ;\
+    __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\
+    __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\
+    __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\
+    __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\
+    __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\
+    __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\
+    __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\
+    __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\
+    printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
+    __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\
+    __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\
+    __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\
+    __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\
+    __asm__ volatile("movl %0, 
%"#S_4::"r"(d[4]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\ +} + + +#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + +#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + +#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + + + /* Byte swap Masks to ensure that rest of the words are filled with zero's. */ + static const unsigned long mBYTE_FLIP_MASK_16[] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ; + static const unsigned long mBYTE_FLIP_MASK_15[] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ; + static const unsigned long mBYTE_FLIP_MASK_7 [] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b } ; + static const unsigned long mBYTE_FLIP_MASK_2 [] = + { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 } ; + + static const unsigned long mMAPtoW_I_7[] = + { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 } ; + static const unsigned long mMAP1toW_I_2[] = + { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 } ; + static const unsigned long mMAP2toW_I_2[] = + { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 } ; + static const unsigned long mMAP3toW_I_2[] = + { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 } ; + +static int Transform_AVX2(Sha256* sha256) +{ + + #ifdef WOLFSSL_SMALL_STACK + word32* W_K; + W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (W_K == NULL) + return MEMORY_E; + #else + word32 W_K[64] ; + #endif + + MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ; + MOVE_to_REG(W_I_15, sha256->buffer[1]); BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ; + MOVE_to_REG(W_I, sha256->buffer[8]) ; BYTE_SWAP(W_I, mBYTE_FLIP_MASK_16[0]) ; + MOVE_to_REG(W_I_7, sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]) ; + MOVE_to_REG(W_I_2, sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]) ; + + DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + ADD_MEM(W_K_TEMP, W_I_16, K[0]) ; + MOVE_to_MEM(W_K[0], W_K_TEMP) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ; + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ; + + ADD_MEM(YMM_TEMP0, W_I, K[8]) ; + MOVE_to_MEM(W_K[8], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; + GAMMA1_2(YMM_TEMP0, 
W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; + GAMMA1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; + + MOVE_to_REG(YMM_TEMP0, K[16]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[16], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; + GAMMA1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; + + MOVE_to_REG(YMM_TEMP0, K[24]) ; + 
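+    /* As with the K[16] block above: interleaved with round 23, K[24..31]
+     * is loaded, ROTATE_W advances the message-schedule window, and the
+     * next eight round inputs are stored as W_K[24..31] = W[24..31] +
+     * K[24..31]. The recurrence the vector code evaluates is
+     *   W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]
+     * (Gamma0 is applied to W[i-15] alone). */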
RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[24], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; + GAMMA1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; + + MOVE_to_REG(YMM_TEMP0, K[32]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[32], YMM_TEMP0) ; + + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now 
W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; + + MOVE_to_REG(YMM_TEMP0, K[40]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[40], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; + + MOVE_to_REG(YMM_TEMP0, K[48]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[48], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding 
incomplete W_I_7 */
+    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
+    ADD(W_I, W_I_7, W_I_TEMP);
+    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
+    GAMMA1_1(YMM_TEMP0, W_I_2) ;
+    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
+    GAMMA1_2(YMM_TEMP0, W_I_2) ;
+    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
+    ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
+    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
+    FEEDBACK1_to_W_I_2 ;
+    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
+    FEEDBACK_to_W_I_7 ;
+    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
+    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
+    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
+    GAMMA1_1(YMM_TEMP0, W_I_2) ;
+    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
+    GAMMA1_2(YMM_TEMP0, W_I_2) ;
+    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
+    ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
+    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
+    FEEDBACK2_to_W_I_2 ;
+    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
+    GAMMA1_1(YMM_TEMP0, W_I_2) ;
+    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
+    GAMMA1_2(YMM_TEMP0, W_I_2) ;
+    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
+    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
+    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
+    FEEDBACK3_to_W_I_2 ;
+    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
+    GAMMA1_1(YMM_TEMP0, W_I_2) ;
+    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
+    GAMMA1_2(YMM_TEMP0, W_I_2) ;
+    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
+    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
+    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
+
+    MOVE_to_REG(YMM_TEMP0, K[56]) ;
+    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
+    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
+    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
+    ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
+    MOVE_to_MEM(W_K[56], YMM_TEMP0) ;
+
+    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
+    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
+    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
+    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
+
+    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
+    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
+    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
+    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
+
+    RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
+
+    #ifdef WOLFSSL_SMALL_STACK
+    XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    #endif
+
+    return 0;
+}
+
+#endif   /* HAVE_INTEL_AVX2 */
+
+#endif   /* WOLFSSL_TI_HASH */
+
+#endif /* HAVE_FIPS */
+
+#endif /* NO_SHA256 */
+
diff --git a/wolfssl/wolfcrypt/md5.h b/wolfssl/wolfcrypt/md5.h
index 94e4466f7..f7d7c150a 100644
--- a/wolfssl/wolfcrypt/md5.h
+++ b/wolfssl/wolfcrypt/md5.h
@@ -1,100 +1,100 @@
-/* md5.h
- *
- * Copyright (C) 2006-2015 wolfSSL Inc.
- *
- * This file is part of wolfSSL. (formerly known as CyaSSL)
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - */ - -#ifndef WOLF_CRYPT_MD5_H -#define WOLF_CRYPT_MD5_H - -#include - -#ifndef NO_MD5 - -#ifdef HAVE_FIPS - #define wc_InitMd5 InitMd5 - #define wc_Md5Update Md5Update - #define wc_Md5Final Md5Final - #define wc_Md5Hash Md5Hash -#endif - -#ifdef __cplusplus - extern "C" { -#endif - -/* in bytes */ -enum { -#ifdef STM32F2_HASH - MD5_REG_SIZE = 4, /* STM32 register size, bytes */ -#endif - MD5 = 0, /* hash type unique */ - MD5_BLOCK_SIZE = 64, - MD5_DIGEST_SIZE = 16, - MD5_PAD_SIZE = 56 -}; - -#if defined(WOLFSSL_PIC32MZ_HASH) -#include "port/pic32/pic32mz-crypt.h" -#endif - -#ifdef TI_HASH_TEST -#include "wolfssl/wolfcrypt/port/ti/ti-hash.h" -#endif - - -#ifndef WOLFSSL_TI_HASH - -/* MD5 digest */ -typedef struct Md5 { - word32 buffLen; /* in bytes */ - word32 loLen; /* length in bytes */ - word32 hiLen; /* length in bytes */ - word32 buffer[MD5_BLOCK_SIZE / sizeof(word32)]; - #if !defined(WOLFSSL_PIC32MZ_HASH) - word32 digest[MD5_DIGEST_SIZE / sizeof(word32)]; - #else - word32 digest[PIC32_HASH_SIZE / sizeof(word32)]; - pic32mz_desc desc ; /* Crypt Engine descripter */ - #endif - -#ifdef TI_HASH_TEST - wolfssl_TI_Hash ti ; -#endif - -} Md5; - -#if defined(TI_HASH_TEST) -void wc_Md5GetHash_ti(Md5* md5, byte* hash) ; -#endif - -#else /* WOLFSSL_TI_HASH */ - #include "wolfssl/wolfcrypt/port/ti/ti-hash.h" -#endif - -WOLFSSL_API void wc_InitMd5(Md5*); -WOLFSSL_API void wc_Md5Update(Md5*, const byte*, word32); -WOLFSSL_API void wc_Md5Final(Md5*, byte*); -WOLFSSL_API int wc_Md5Hash(const byte*, word32, byte*); -WOLFSSL_API void wc_Md5GetHash(Md5*, byte*); - -#ifdef __cplusplus - } /* extern "C" */ -#endif - -#endif /* NO_MD5 */ -#endif /* WOLF_CRYPT_MD5_H */ +/* md5.h + * + * Copyright (C) 2006-2015 wolfSSL Inc. + * + * This file is part of wolfSSL. (formerly known as CyaSSL) + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + +#ifndef WOLF_CRYPT_MD5_H +#define WOLF_CRYPT_MD5_H + +#include + +#ifndef NO_MD5 + +#ifdef HAVE_FIPS + #define wc_InitMd5 InitMd5 + #define wc_Md5Update Md5Update + #define wc_Md5Final Md5Final + #define wc_Md5Hash Md5Hash +#endif + +#ifdef __cplusplus + extern "C" { +#endif + +/* in bytes */ +enum { +#ifdef STM32F2_HASH + MD5_REG_SIZE = 4, /* STM32 register size, bytes */ +#endif + MD5 = 0, /* hash type unique */ + MD5_BLOCK_SIZE = 64, + MD5_DIGEST_SIZE = 16, + MD5_PAD_SIZE = 56 +}; + +#if defined(WOLFSSL_PIC32MZ_HASH) +#include "port/pic32/pic32mz-crypt.h" +#endif + +#ifdef TI_HASH_TEST +#include "wolfssl/wolfcrypt/port/ti/ti-hash.h" +#endif + + +#ifndef WOLFSSL_TI_HASH + +/* MD5 digest */ +typedef struct Md5 { + word32 buffLen; /* in bytes */ + word32 loLen; /* length in bytes */ + word32 hiLen; /* length in bytes */ + word32 buffer[MD5_BLOCK_SIZE / sizeof(word32)]; + #if !defined(WOLFSSL_PIC32MZ_HASH) + word32 digest[MD5_DIGEST_SIZE / sizeof(word32)]; + #else + word32 digest[PIC32_HASH_SIZE / sizeof(word32)]; + pic32mz_desc desc ; /* Crypt Engine descripter */ + #endif + +#ifdef TI_HASH_TEST + wolfssl_TI_Hash ti ; +#endif + +} Md5; + +#if defined(TI_HASH_TEST) +void wc_Md5GetHash_ti(Md5* md5, byte* hash) ; +#endif + +#else /* WOLFSSL_TI_HASH */ + #include "wolfssl/wolfcrypt/port/ti/ti-hash.h" +#endif + +WOLFSSL_API void wc_InitMd5(Md5*); +WOLFSSL_API void wc_Md5Update(Md5*, const byte*, word32); +WOLFSSL_API void wc_Md5Final(Md5*, byte*); +WOLFSSL_API int wc_Md5Hash(const byte*, word32, byte*); +WOLFSSL_API void wc_Md5GetHash(Md5*, byte*); + +#ifdef __cplusplus + } /* extern "C" */ +#endif + +#endif /* NO_MD5 */ +#endif /* WOLF_CRYPT_MD5_H */ diff --git a/wolfssl/wolfcrypt/sha.h b/wolfssl/wolfcrypt/sha.h index 46dce03d0..b5ff4908d 100644 --- a/wolfssl/wolfcrypt/sha.h +++ b/wolfssl/wolfcrypt/sha.h @@ -1,88 +1,88 @@ -/* sha.h - * - * Copyright (C) 2006-2015 wolfSSL Inc. - * - * This file is part of wolfSSL. (formerly known as CyaSSL) - * - * wolfSSL is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * wolfSSL is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - */ - -#ifndef WOLF_CRYPT_SHA_H -#define WOLF_CRYPT_SHA_H - -#include - -#ifndef NO_SHA - -#ifdef HAVE_FIPS -/* for fips @wc_fips */ -#include -#endif - -#ifdef __cplusplus - extern "C" { -#endif - -#ifndef HAVE_FIPS /* avoid redefining structs */ -/* in bytes */ -enum { -#ifdef STM32F2_HASH - SHA_REG_SIZE = 4, /* STM32 register size, bytes */ -#endif - SHA = 1, /* hash type unique */ - SHA_BLOCK_SIZE = 64, - SHA_DIGEST_SIZE = 20, - SHA_PAD_SIZE = 56 -}; - -#ifdef WOLFSSL_PIC32MZ_HASH -#include "port/pic32/pic32mz-crypt.h" -#endif - -#ifndef WOLFSSL_TI_HASH - -/* Sha digest */ -typedef struct Sha { - word32 buffLen; /* in bytes */ - word32 loLen; /* length in bytes */ - word32 hiLen; /* length in bytes */ - word32 buffer[SHA_BLOCK_SIZE / sizeof(word32)]; - #ifndef WOLFSSL_PIC32MZ_HASH - word32 digest[SHA_DIGEST_SIZE / sizeof(word32)]; - #else - word32 digest[PIC32_HASH_SIZE / sizeof(word32)]; - pic32mz_desc desc; /* Crypt Engine descripter */ - #endif -} Sha; - -#else /* WOLFSSL_TI_HASH */ - #include "wolfssl/wolfcrypt/port/ti/ti-hash.h" -#endif - -#endif /* HAVE_FIPS */ - -WOLFSSL_API int wc_InitSha(Sha*); -WOLFSSL_API int wc_ShaUpdate(Sha*, const byte*, word32); -WOLFSSL_API int wc_ShaFinal(Sha*, byte*); -WOLFSSL_API int wc_ShaHash(const byte*, word32, byte*); -WOLFSSL_API int wc_ShaGetHash(Sha*, byte*); - -#ifdef __cplusplus - } /* extern "C" */ -#endif - -#endif /* NO_SHA */ -#endif /* WOLF_CRYPT_SHA_H */ - +/* sha.h + * + * Copyright (C) 2006-2015 wolfSSL Inc. + * + * This file is part of wolfSSL. (formerly known as CyaSSL) + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + +#ifndef WOLF_CRYPT_SHA_H +#define WOLF_CRYPT_SHA_H + +#include + +#ifndef NO_SHA + +#ifdef HAVE_FIPS +/* for fips @wc_fips */ +#include +#endif + +#ifdef __cplusplus + extern "C" { +#endif + +#ifndef HAVE_FIPS /* avoid redefining structs */ +/* in bytes */ +enum { +#ifdef STM32F2_HASH + SHA_REG_SIZE = 4, /* STM32 register size, bytes */ +#endif + SHA = 1, /* hash type unique */ + SHA_BLOCK_SIZE = 64, + SHA_DIGEST_SIZE = 20, + SHA_PAD_SIZE = 56 +}; + +#ifdef WOLFSSL_PIC32MZ_HASH +#include "port/pic32/pic32mz-crypt.h" +#endif + +#ifndef WOLFSSL_TI_HASH + +/* Sha digest */ +typedef struct Sha { + word32 buffLen; /* in bytes */ + word32 loLen; /* length in bytes */ + word32 hiLen; /* length in bytes */ + word32 buffer[SHA_BLOCK_SIZE / sizeof(word32)]; + #ifndef WOLFSSL_PIC32MZ_HASH + word32 digest[SHA_DIGEST_SIZE / sizeof(word32)]; + #else + word32 digest[PIC32_HASH_SIZE / sizeof(word32)]; + pic32mz_desc desc; /* Crypt Engine descripter */ + #endif +} Sha; + +#else /* WOLFSSL_TI_HASH */ + #include "wolfssl/wolfcrypt/port/ti/ti-hash.h" +#endif + +#endif /* HAVE_FIPS */ + +WOLFSSL_API int wc_InitSha(Sha*); +WOLFSSL_API int wc_ShaUpdate(Sha*, const byte*, word32); +WOLFSSL_API int wc_ShaFinal(Sha*, byte*); +WOLFSSL_API int wc_ShaHash(const byte*, word32, byte*); +WOLFSSL_API int wc_ShaGetHash(Sha*, byte*); + +#ifdef __cplusplus + } /* extern "C" */ +#endif + +#endif /* NO_SHA */ +#endif /* WOLF_CRYPT_SHA_H */ + diff --git a/wolfssl/wolfcrypt/sha256.h b/wolfssl/wolfcrypt/sha256.h index 86616e320..d022d3ac5 100644 --- a/wolfssl/wolfcrypt/sha256.h +++ b/wolfssl/wolfcrypt/sha256.h @@ -1,86 +1,86 @@ -/* sha256.h - * - * Copyright (C) 2006-2015 wolfSSL Inc. - * - * This file is part of wolfSSL. (formerly known as CyaSSL) - * - * wolfSSL is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * wolfSSL is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - */ - -/* code submitted by raphael.huck@efixo.com */ - -#ifndef WOLF_CRYPT_SHA256_H -#define WOLF_CRYPT_SHA256_H - -#include - -#ifndef NO_SHA256 - -#ifdef HAVE_FIPS - /* for fips @wc_fips */ - #include -#endif - - -#ifdef __cplusplus - extern "C" { -#endif - -#ifndef HAVE_FIPS /* avoid redefinition of structs */ -#ifdef WOLFSSL_PIC32MZ_HASH - #include "port/pic32/pic32mz-crypt.h" -#endif - -/* in bytes */ -enum { - SHA256 = 2, /* hash type unique */ - SHA256_BLOCK_SIZE = 64, - SHA256_DIGEST_SIZE = 32, - SHA256_PAD_SIZE = 56 -}; - -#ifndef WOLFSSL_TI_HASH - -/* Sha256 digest */ -typedef struct Sha256 { - word32 buffLen; /* in bytes */ - word32 loLen; /* length in bytes */ - word32 hiLen; /* length in bytes */ - word32 digest[SHA256_DIGEST_SIZE / sizeof(word32)]; - word32 buffer[SHA256_BLOCK_SIZE / sizeof(word32)]; - #ifdef WOLFSSL_PIC32MZ_HASH - pic32mz_desc desc ; /* Crypt Engine descripter */ - #endif -} Sha256; - -#else /* WOLFSSL_TI_HASH */ - #include "wolfssl/wolfcrypt/port/ti/ti-hash.h" -#endif - -#endif /* HAVE_FIPS */ - -WOLFSSL_API int wc_InitSha256(Sha256*); -WOLFSSL_API int wc_Sha256Update(Sha256*, const byte*, word32); -WOLFSSL_API int wc_Sha256Final(Sha256*, byte*); -WOLFSSL_API int wc_Sha256Hash(const byte*, word32, byte*); -WOLFSSL_API int wc_Sha256GetHash(Sha256*, byte*); - -#ifdef __cplusplus - } /* extern "C" */ -#endif - -#endif /* NO_SHA256 */ -#endif /* WOLF_CRYPT_SHA256_H */ - +/* sha256.h + * + * Copyright (C) 2006-2015 wolfSSL Inc. + * + * This file is part of wolfSSL. (formerly known as CyaSSL) + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + +/* code submitted by raphael.huck@efixo.com */ + +#ifndef WOLF_CRYPT_SHA256_H +#define WOLF_CRYPT_SHA256_H + +#include + +#ifndef NO_SHA256 + +#ifdef HAVE_FIPS + /* for fips @wc_fips */ + #include +#endif + + +#ifdef __cplusplus + extern "C" { +#endif + +#ifndef HAVE_FIPS /* avoid redefinition of structs */ +#ifdef WOLFSSL_PIC32MZ_HASH + #include "port/pic32/pic32mz-crypt.h" +#endif + +/* in bytes */ +enum { + SHA256 = 2, /* hash type unique */ + SHA256_BLOCK_SIZE = 64, + SHA256_DIGEST_SIZE = 32, + SHA256_PAD_SIZE = 56 +}; + +#ifndef WOLFSSL_TI_HASH + +/* Sha256 digest */ +typedef struct Sha256 { + word32 buffLen; /* in bytes */ + word32 loLen; /* length in bytes */ + word32 hiLen; /* length in bytes */ + word32 digest[SHA256_DIGEST_SIZE / sizeof(word32)]; + word32 buffer[SHA256_BLOCK_SIZE / sizeof(word32)]; + #ifdef WOLFSSL_PIC32MZ_HASH + pic32mz_desc desc ; /* Crypt Engine descripter */ + #endif +} Sha256; + +#else /* WOLFSSL_TI_HASH */ + #include "wolfssl/wolfcrypt/port/ti/ti-hash.h" +#endif + +#endif /* HAVE_FIPS */ + +WOLFSSL_API int wc_InitSha256(Sha256*); +WOLFSSL_API int wc_Sha256Update(Sha256*, const byte*, word32); +WOLFSSL_API int wc_Sha256Final(Sha256*, byte*); +WOLFSSL_API int wc_Sha256Hash(const byte*, word32, byte*); +WOLFSSL_API int wc_Sha256GetHash(Sha256*, byte*); + +#ifdef __cplusplus + } /* extern "C" */ +#endif + +#endif /* NO_SHA256 */ +#endif /* WOLF_CRYPT_SHA256_H */ +
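For a quick end-to-end check of the API declared above, a minimal caller can hash a known vector and compare against the published digest. The following is a sketch, not part of the patch: it assumes a standard wolfSSL build with the wolfssl/wolfcrypt headers on the include path, and it abbreviates error handling.

    #include <wolfssl/wolfcrypt/sha256.h>
    #include <stdio.h>

    int main(void)
    {
        Sha256     sha256;
        byte       digest[SHA256_DIGEST_SIZE];
        const byte msg[] = "abc";
        int        i;

        if (wc_InitSha256(&sha256) != 0)
            return 1;
        if (wc_Sha256Update(&sha256, msg, sizeof(msg) - 1) != 0)
            return 1;
        if (wc_Sha256Final(&sha256, digest) != 0)
            return 1;

        /* expected for "abc":
         * ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad */
        for (i = 0; i < SHA256_DIGEST_SIZE; i++)
            printf("%02x", digest[i]);
        printf("\n");
        return 0;
    }

The same caller exercises whichever Transform variant (plain C, AVX1, AVX1+RORX, or AVX2) the library ends up using, so it serves as a sanity check for the assembly paths introduced in this patch as well.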