diff --git a/IDE/STM32Cube/STM32_Benchmarks.md b/IDE/STM32Cube/STM32_Benchmarks.md index a84a624bb..4c45a45e0 100644 --- a/IDE/STM32Cube/STM32_Benchmarks.md +++ b/IDE/STM32Cube/STM32_Benchmarks.md @@ -580,6 +580,51 @@ CPU: Cortex-M33 at 160 MHz IDE: STM32CubeIDE RTOS: FreeRTOS +### STM32U585 (STM Symmetric AES/SHA acceleration, STM PKA w/Fast Math) + +This test uses `WOLFSSL_SMALL_STACK_CACHE`, which slightly improves the DRBG RNG performance. + +Only the ECC sign and verify are currently being accelerated by PKA. + +``` +------------------------------------------------------------------------------ + wolfSSL version 5.3.1 +------------------------------------------------------------------------------ +Running wolfCrypt Benchmarks... +wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each) +RNG 575 KB took 1.039 seconds, 553.417 KB/s +AES-128-CBC-enc 6 MB took 1.000 seconds, 6.274 MB/s +AES-128-CBC-dec 6 MB took 1.000 seconds, 6.128 MB/s +AES-256-CBC-enc 6 MB took 1.000 seconds, 6.274 MB/s +AES-256-CBC-dec 6 MB took 1.000 seconds, 6.152 MB/s +AES-128-GCM-enc 6 MB took 1.000 seconds, 5.640 MB/s +AES-128-GCM-dec 6 MB took 1.000 seconds, 5.566 MB/s +AES-256-GCM-enc 6 MB took 1.000 seconds, 5.615 MB/s +AES-256-GCM-dec 6 MB took 1.000 seconds, 5.542 MB/s +GMAC Small 11 MB took 1.000 seconds, 11.499 MB/s +CHACHA 4 MB took 1.000 seconds, 3.882 MB/s +CHA-POLY 2 MB took 1.008 seconds, 2.470 MB/s +3DES 200 KB took 1.071 seconds, 186.741 KB/s +MD5 6 MB took 1.000 seconds, 6.299 MB/s +POLY1305 10 MB took 1.000 seconds, 10.449 MB/s +SHA 6 MB took 1.000 seconds, 6.299 MB/s +SHA-256 6 MB took 1.000 seconds, 6.250 MB/s +HMAC-MD5 6 MB took 1.000 seconds, 6.177 MB/s +HMAC-SHA 6 MB took 1.000 seconds, 6.177 MB/s +HMAC-SHA256 6 MB took 1.000 seconds, 6.104 MB/s +RSA 2048 public 28 ops took 1.031 sec, avg 36.821 ms, 27.158 ops/sec +RSA 2048 private 2 ops took 4.310 sec, avg 2155.000 ms, 0.464 ops/sec +DH 2048 key gen 3 ops took 1.197 sec, avg 399.000 ms, 2.506 ops/sec +DH 2048 
agree 2 ops took 1.525 sec, avg 762.500 ms, 1.311 ops/sec +ECC [ SECP256R1] 256 key gen 50 ops took 1.019 sec, avg 20.380 ms, 49.068 ops/sec +ECDHE [ SECP256R1] 256 agree 52 ops took 1.008 sec, avg 19.385 ms, 51.587 ops/sec +ECDSA [ SECP256R1] 256 sign 56 ops took 1.000 sec, avg 17.857 ms, 56.000 ops/sec +ECDSA [ SECP256R1] 256 verify 56 ops took 1.008 sec, avg 18.000 ms, 55.556 ops/sec +Benchmark complete +Benchmark Test: Return code 0 +``` + + ### STM32U585 (STM Symmetric AES/SHA acceleration, SP Math ASM Cortex M) ``` diff --git a/IDE/STM32Cube/default_conf.ftl b/IDE/STM32Cube/default_conf.ftl index 97759570d..9bc4d9bdf 100644 --- a/IDE/STM32Cube/default_conf.ftl +++ b/IDE/STM32Cube/default_conf.ftl @@ -135,12 +135,18 @@ extern ${variable.value} ${variable.name}; #define HAL_CONSOLE_UART huart1 #define WOLFSSL_STM32U5 #define STM32_HAL_V2 + #ifdef STM32U585xx + #undef NO_STM32_HASH + #undef NO_STM32_CRYPTO + #define WOLFSSL_STM32_PKA + #endif #else #warning Please define a hardware platform! 
/* This means there is not a pre-defined platform for your board/CPU */ /* You need to define a CPU type, HW crypto and debug UART */ /* CPU Type: WOLFSSL_STM32F1, WOLFSSL_STM32F2, WOLFSSL_STM32F4, - WOLFSSL_STM32F7, WOLFSSL_STM32H7, WOLFSSL_STM32L4 and WOLFSSL_STM32L5 */ + WOLFSSL_STM32F7, WOLFSSL_STM32H7, WOLFSSL_STM32L4, WOLFSSL_STM32L5, + WOLFSSL_STM32G0, WOLFSSL_STM32WB and WOLFSSL_STM32U5 */ #define WOLFSSL_STM32F4 /* Debug UART used for printf */ diff --git a/wolfcrypt/src/des3.c b/wolfcrypt/src/des3.c index 917b49f9d..26a1ae444 100644 --- a/wolfcrypt/src/des3.c +++ b/wolfcrypt/src/des3.c @@ -1824,7 +1824,7 @@ void wc_Des_SetIV(Des* des, const byte* iv) { if (des && iv) { XMEMCPY(des->reg, iv, DES_BLOCK_SIZE); - #ifdef STM32_HAL_V2 + #if defined(STM32_CRYPTO) && !defined(STM32_CRYPTO_AES_ONLY) && defined(STM32_HAL_V2) ByteReverseWords(des->reg, des->reg, DES_BLOCK_SIZE); #endif } @@ -1839,7 +1839,7 @@ int wc_Des3_SetIV(Des3* des, const byte* iv) } if (iv) { XMEMCPY(des->reg, iv, DES_BLOCK_SIZE); - #ifdef STM32_HAL_V2 + #if defined(STM32_CRYPTO) && !defined(STM32_CRYPTO_AES_ONLY) && defined(STM32_HAL_V2) ByteReverseWords(des->reg, des->reg, DES_BLOCK_SIZE); #endif } diff --git a/wolfcrypt/src/md5.c b/wolfcrypt/src/md5.c index f5d85c28b..d01542aa6 100644 --- a/wolfcrypt/src/md5.c +++ b/wolfcrypt/src/md5.c @@ -551,6 +551,7 @@ int wc_Md5Copy(wc_Md5* src, wc_Md5* dst) return ret; } + #ifdef OPENSSL_EXTRA /* Apply MD5 transformation to the data */ /* @param md5 a pointer to wc_MD5 structure */ @@ -562,9 +563,14 @@ int wc_Md5Transform(wc_Md5* md5, const byte* data) if (md5 == NULL || data == NULL) { return BAD_FUNC_ARG; } +#ifndef HAVE_MD5_CUST_API return Transform(md5, data); -} +#else + return NOT_COMPILED_IN; #endif +} +#endif /* OPENSSL_EXTRA */ + #ifdef WOLFSSL_HASH_FLAGS int wc_Md5SetFlags(wc_Md5* md5, word32 flags) { diff --git a/wolfcrypt/src/port/st/stm32.c b/wolfcrypt/src/port/st/stm32.c index ef34afdc6..c2f0dbcb4 100644 --- 
a/wolfcrypt/src/port/st/stm32.c +++ b/wolfcrypt/src/port/st/stm32.c @@ -205,7 +205,7 @@ static int wc_Stm32_Hash_WaitDone(STM32_HASH_Context* stmCtx) /* wait until hash digest is complete */ while ((HASH->SR & HASH_SR_BUSY) && #ifdef HASH_IMR_DCIE - (HASH->SR & HASH_SR_DCIS) == 0 && + (HASH->SR & HASH_SR_DCIS) == 0 && #endif ++timeout < STM32_HASH_TIMEOUT) { }; @@ -477,18 +477,29 @@ int wc_Stm32_Aes_Init(Aes* aes, CRYP_InitTypeDef* cryptInit, #if defined(WOLFSSL_STM32L5) #include #include -#else +#elif defined(WOLFSSL_STM32U5) +#include +#include +#elif defined(WOLFSSL_STM32WB) #include #include +#else +#error Please add the hal_pk.h include #endif extern PKA_HandleTypeDef hpka; +#if !defined(WOLFSSL_STM32_PKA_V2) && defined(PKA_ECC_SCALAR_MUL_IN_B_COEFF) +/* PKA hardware like in U5 added coefB and primeOrder */ +#define WOLFSSL_STM32_PKA_V2 +#endif + /* Reverse array in memory (in place) */ #ifdef HAVE_ECC #include /* convert from mp_int to STM32 PKA HAL integer, as array of bytes of size sz. - * if mp_int has less bytes than sz, add zero bytes at most significant byte positions. + * if mp_int has less bytes than sz, add zero bytes at most significant byte + * positions. * This is when for example modulus is 32 bytes (P-256 curve) * and mp_int has only 31 bytes, we add leading zeros * so that result array has 32 bytes, same as modulus (sz). 
@@ -523,7 +534,8 @@ static int stm32_get_from_mp_int(uint8_t *dst, const mp_int *a, int sz) return res; } -/* ECC specs in lsbyte at lowest address format for direct use by STM32_PKA PKHA driver functions */ +/* ECC specs in lsbyte at lowest address format for direct use by + * STM32_PKA PKHA driver functions */ #if defined(HAVE_ECC192) || defined(HAVE_ALL_CURVES) #define ECC192 #endif @@ -555,6 +567,11 @@ static const uint8_t stm32_ecc192_coef[ECC192_KEYSIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 }; +static const uint8_t stm32_ecc192_coefB[ECC192_KEYSIZE] = { + 0x64, 0x21, 0x05, 0x19, 0xe5, 0x9c, 0x80, 0xe7, + 0x0f, 0xa7, 0xe9, 0xab, 0x72, 0x24, 0x30, 0x49, + 0xfe, 0xb8, 0xde, 0xec, 0xc1, 0x46, 0xb9, 0xb1 +}; static const uint8_t stm32_ecc192_pointX[ECC192_KEYSIZE] = { 0x18, 0x8D, 0xA8, 0x0E, 0xB0, 0x30, 0x90, 0xF6, 0x7C, 0xBF, 0x20, 0xEB, 0x43, 0xA1, 0x88, 0x00, @@ -588,6 +605,12 @@ static const uint8_t stm32_ecc224_coef[ECC224_KEYSIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 }; +static const uint8_t stm32_ecc224_coefB[ECC224_KEYSIZE] = { + 0xb4, 0x05, 0x0a, 0x85, 0x0c, 0x04, 0xb3, 0xab, + 0xf5, 0x41, 0x32, 0x56, 0x50, 0x44, 0xb0, 0xb7, + 0xd7, 0xbf, 0xd8, 0xba, 0x27, 0x0b, 0x39, 0x43, + 0x23, 0x55, 0xff, 0xb4 +}; static const uint8_t stm32_ecc224_pointX[ECC224_KEYSIZE] = { 0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, 0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, @@ -601,9 +624,9 @@ static const uint8_t stm32_ecc224_pointY[ECC224_KEYSIZE] = { 0x85, 0x00, 0x7E, 0x34 }; static const uint8_t stm32_ecc224_order[ECC224_KEYSIZE] = { - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x16, 0xA2, - 0xE0, 0xB8, 0xF0, 0x3E, 0x13, 0xDD, 0x29, 0x45, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x16, 0xA2, + 0xE0, 0xB8, 0xF0, 0x3E, 0x13, 0xDD, 0x29, 0x45, 0x5C, 0x5C, 0x2A, 0x3D }; #endif /* ECC224 
*/ @@ -624,6 +647,12 @@ static const uint8_t stm32_ecc256_coef[ECC256_KEYSIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 }; +static const uint8_t stm32_ecc256_coefB[ECC256_KEYSIZE] = { + 0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, + 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, + 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, + 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b +}; static const uint8_t stm32_ecc256_pointX[ECC256_KEYSIZE] = { 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, @@ -664,6 +693,14 @@ static const uint8_t stm32_ecc384_coef[ECC384_KEYSIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 }; +static const uint8_t stm32_ecc384_coefB[ECC384_KEYSIZE] = { + 0xb3, 0x31, 0x2f, 0xa7, 0xe2, 0x3e, 0xe7, 0xe4, + 0x98, 0x8e, 0x05, 0x6b, 0xe3, 0xf8, 0x2d, 0x19, + 0x18, 0x1d, 0x9c, 0x6e, 0xfe, 0x81, 0x41, 0x12, + 0x03, 0x14, 0x08, 0x8f, 0x50, 0x13, 0x87, 0x5a, + 0xc6, 0x56, 0x39, 0x8d, 0x8a, 0x2e, 0xd1, 0x9d, + 0x2a, 0x85, 0xc8, 0xed, 0xd3, 0xec, 0x2a, 0xef +}; static const uint8_t stm32_ecc384_pointX[ECC384_KEYSIZE] = { 0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, 0xF3, 0x20, 0xAD, 0x74, @@ -691,13 +728,15 @@ static const uint8_t stm32_ecc384_order[ECC384_KEYSIZE] = { #endif /* ECC384 */ static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef, - const uint32_t **coef_sign, const uint8_t **GenPointX, const uint8_t **GenPointY, - const uint8_t **order, int size) + const uint8_t **coefB, const uint32_t **coef_sign, + const uint8_t **GenPointX, const uint8_t **GenPointY, const uint8_t **order, + int size) { switch(size) { case 32: *prime = stm32_ecc256_prime; *coef = stm32_ecc256_coef; + if (coefB) *coefB = stm32_ecc256_coefB; *GenPointX = stm32_ecc256_pointX; *GenPointY = stm32_ecc256_pointY; *coef_sign = &stm32_ecc256_coef_sign; @@ -707,6 +746,7 @@ static 
int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef, case 28: *prime = stm32_ecc224_prime; *coef = stm32_ecc224_coef; + if (coefB) *coefB = stm32_ecc224_coefB; *GenPointX = stm32_ecc224_pointX; *GenPointY = stm32_ecc224_pointY; *coef_sign = &stm32_ecc224_coef; @@ -717,6 +757,7 @@ static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef, case 24: *prime = stm32_ecc192_prime; *coef = stm32_ecc192_coef; + if (coefB) *coefB = stm32_ecc192_coefB; *GenPointX = stm32_ecc192_pointX; *GenPointY = stm32_ecc192_pointY; *coef_sign = &stm32_ecc192_coef; @@ -727,6 +768,7 @@ static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef, case 48: *prime = stm32_ecc384_prime; *coef = stm32_ecc384_coef; + if (coefB) *coefB = stm32_ecc384_coefB; *GenPointX = stm32_ecc384_pointX; *GenPointY = stm32_ecc384_pointY; *coef_sign = &stm32_ecc384_coef; @@ -765,7 +807,7 @@ int wc_ecc_mulmod_ex(const mp_int *k, ecc_point *G, ecc_point *R, mp_int* a, uint8_t kbin[STM32_MAX_ECC_SIZE]; uint8_t PtXbin[STM32_MAX_ECC_SIZE]; uint8_t PtYbin[STM32_MAX_ECC_SIZE]; - const uint8_t *prime, *coef, *gen_x, *gen_y; + const uint8_t *prime, *coef, *coefB, *gen_x, *gen_y, *order; const uint32_t *coef_sign; (void)a; (void)heap; @@ -792,7 +834,8 @@ int wc_ecc_mulmod_ex(const mp_int *k, ecc_point *G, ecc_point *R, mp_int* a, size = (uint8_t)szModulus; /* find STM32_PKA friendly parameters for the selected curve */ - if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, NULL, size)) { + if (0 != stm32_get_ecc_specs(&prime, &coef, &coefB, &coef_sign, + &gen_x, &gen_y, &order, size)) { return ECC_BAD_ARG_E; } @@ -804,6 +847,13 @@ int wc_ecc_mulmod_ex(const mp_int *k, ecc_point *G, ecc_point *R, mp_int* a, pka_mul.pointY = Gybin; pka_mul.scalarMulSize = size; pka_mul.scalarMul = kbin; +#ifdef WOLFSSL_STM32_PKA_V2 + pka_mul.coefB = coefB; + pka_mul.primeOrder = order; +#else + (void)order; + (void)coefB; +#endif status = HAL_PKA_ECCMul(&hpka, &pka_mul, 
HAL_MAX_DELAY); if (status != HAL_OK) { @@ -887,11 +937,11 @@ int stm32_ecc_verify_hash_ex(mp_int *r, mp_int *s, const byte* hash, size = (uint8_t)szModulus; /* find parameters for the selected curve */ - if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, &order, size)) { + if (0 != stm32_get_ecc_specs(&prime, &coef, NULL, &coef_sign, + &gen_x, &gen_y, &order, size)) { return ECC_BAD_ARG_E; } - pka_ecc.primeOrderSize = size; pka_ecc.modulusSize = size; pka_ecc.coefSign = *coef_sign; @@ -933,7 +983,7 @@ int stm32_ecc_sign_hash_ex(const byte* hash, word32 hashlen, WC_RNG* rng, uint8_t Rbin[STM32_MAX_ECC_SIZE]; uint8_t Sbin[STM32_MAX_ECC_SIZE]; uint8_t Hashbin[STM32_MAX_ECC_SIZE]; - const uint8_t *prime, *coef, *gen_x, *gen_y, *order; + const uint8_t *prime, *coef, *coefB, *gen_x, *gen_y, *order; const uint32_t *coef_sign; XMEMSET(&pka_ecc, 0x00, sizeof(PKA_ECDSASignInTypeDef)); XMEMSET(&pka_ecc_out, 0x00, sizeof(PKA_ECDSASignOutTypeDef)); @@ -952,7 +1002,8 @@ int stm32_ecc_sign_hash_ex(const byte* hash, word32 hashlen, WC_RNG* rng, return status; /* find parameters for the selected curve */ - if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, &order, size)) { + if (0 != stm32_get_ecc_specs(&prime, &coef, &coefB, &coef_sign, + &gen_x, &gen_y, &order, size)) { return ECC_BAD_ARG_E; } @@ -968,6 +1019,11 @@ int stm32_ecc_sign_hash_ex(const byte* hash, word32 hashlen, WC_RNG* rng, pka_ecc.modulusSize = size; pka_ecc.coefSign = *coef_sign; pka_ecc.coef = coef; +#ifdef WOLFSSL_STM32_PKA_V2 + pka_ecc.coefB = coefB; +#else + (void)coefB; +#endif pka_ecc.modulus = prime; pka_ecc.basePointX = gen_x; pka_ecc.basePointY = gen_y; diff --git a/wolfssl/openssl/sha.h b/wolfssl/openssl/sha.h index f85702aee..b3cfc0bf6 100644 --- a/wolfssl/openssl/sha.h +++ b/wolfssl/openssl/sha.h @@ -95,7 +95,7 @@ typedef WOLFSSL_SHA_CTX SHA_CTX; /* adder for HW crypto */ #ifdef STM32_HASH -#define CTX_SHA2_HW_ADDER 30 +#define CTX_SHA2_HW_ADDER 34 #else 
#define CTX_SHA2_HW_ADDER 0 #endif