diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index 48effdaf0..6703a3fc7 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -654,7 +654,6 @@
 #define BENCH_RSA                0x00000002
 #define BENCH_RSA_SZ             0x00000004
 #define BENCH_DH                 0x00000010
-#define BENCH_KYBER              0x00000020
 #define BENCH_ECC_MAKEKEY        0x00001000
 #define BENCH_ECC                0x00002000
 #define BENCH_ECC_ENCRYPT        0x00004000
@@ -681,11 +680,22 @@
 #define BENCH_SAKKE              0x80000000
 
 /* Post-Quantum Asymmetric algorithms. */
+#define BENCH_KYBER512           0x00000020
+#define BENCH_KYBER768           0x00000040
+#define BENCH_KYBER1024          0x00000080
+#define BENCH_KYBER              (BENCH_KYBER512 | BENCH_KYBER768 | \
+                                  BENCH_KYBER1024)
 #define BENCH_FALCON_LEVEL1_SIGN 0x00000001
 #define BENCH_FALCON_LEVEL5_SIGN 0x00000002
 #define BENCH_DILITHIUM_LEVEL2_SIGN 0x04000000
 #define BENCH_DILITHIUM_LEVEL3_SIGN 0x08000000
 #define BENCH_DILITHIUM_LEVEL5_SIGN 0x10000000
+#define BENCH_ML_DSA_44_SIGN     0x04000000
+#define BENCH_ML_DSA_65_SIGN     0x08000000
+#define BENCH_ML_DSA_87_SIGN     0x10000000
+#define BENCH_ML_DSA_SIGN        (BENCH_ML_DSA_44_SIGN | \
+                                  BENCH_ML_DSA_65_SIGN | \
+                                  BENCH_ML_DSA_87_SIGN)
 
 /* Post-Quantum Asymmetric algorithms. (Part 2) */
 #define BENCH_SPHINCS_FAST_LEVEL1_SIGN 0x00000001
@@ -959,9 +969,6 @@ static const bench_alg bench_asym_opt[] = {
 #ifndef NO_DH
     { "-dh",                 BENCH_DH },
 #endif
-#ifdef WOLFSSL_HAVE_KYBER
-    { "-kyber",              BENCH_KYBER },
-#endif
 #ifdef HAVE_ECC
     { "-ecc-kg",             BENCH_ECC_MAKEKEY },
     { "-ecc",                BENCH_ECC },
@@ -1060,7 +1067,8 @@ static const bench_pq_hash_sig_alg bench_pq_hash_sig_opt[] = {
 };
 #endif /* BENCH_PQ_STATEFUL_HBS */
 
-#if defined(HAVE_FALCON) || defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
+#if defined(WOLFSSL_HAVE_KYBER) || defined(HAVE_FALCON) || \
+    defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
 /* The post-quantum-specific mapping of command line option to bit values and
  * OQS name. */
 typedef struct bench_pq_alg {
@@ -1073,18 +1081,25 @@ typedef struct bench_pq_alg {
 /* All recognized post-quantum asymmetric algorithm choosing command line
  * options.
  */
 static const bench_pq_alg bench_pq_asym_opt[] = {
-    { "-pq",                 0xffffffff },
+    { "-pq",                0xffffffff },
+#ifdef WOLFSSL_HAVE_KYBER
+    { "-kyber",             BENCH_KYBER },
+    { "-kyber512",          BENCH_KYBER512 },
+    { "-kyber768",          BENCH_KYBER768 },
+    { "-kyber1024",         BENCH_KYBER1024 },
+#endif
 #if defined(HAVE_FALCON)
-    { "-falcon_level1",      BENCH_FALCON_LEVEL1_SIGN },
-    { "-falcon_level5",      BENCH_FALCON_LEVEL5_SIGN },
+    { "-falcon_level1",     BENCH_FALCON_LEVEL1_SIGN },
+    { "-falcon_level5",     BENCH_FALCON_LEVEL5_SIGN },
 #endif
 #if defined(HAVE_DILITHIUM)
-    { "-dilithium_level2",   BENCH_DILITHIUM_LEVEL2_SIGN },
-    { "-dilithium_level3",   BENCH_DILITHIUM_LEVEL3_SIGN },
-    { "-dilithium_level5",   BENCH_DILITHIUM_LEVEL5_SIGN },
-    { "-ml-dsa-44",          BENCH_DILITHIUM_LEVEL2_SIGN },
-    { "-ml-dsa-65",          BENCH_DILITHIUM_LEVEL3_SIGN },
-    { "-ml-dsa-87",          BENCH_DILITHIUM_LEVEL5_SIGN },
+    { "-dilithium_level2",  BENCH_DILITHIUM_LEVEL2_SIGN },
+    { "-dilithium_level3",  BENCH_DILITHIUM_LEVEL3_SIGN },
+    { "-dilithium_level5",  BENCH_DILITHIUM_LEVEL5_SIGN },
+    { "-ml-dsa",            BENCH_ML_DSA_SIGN },
+    { "-ml-dsa-44",         BENCH_ML_DSA_44_SIGN },
+    { "-ml-dsa-65",         BENCH_ML_DSA_65_SIGN },
+    { "-ml-dsa-87",         BENCH_ML_DSA_87_SIGN },
 #endif
     { NULL, 0 }
 };
@@ -3576,15 +3591,21 @@ static void* benchmarks_do(void* args)
 #endif
 
 #ifdef WOLFSSL_HAVE_KYBER
-    if (bench_all || (bench_asym_algs & BENCH_KYBER)) {
+    if (bench_all || (bench_pq_asym_algs & BENCH_KYBER)) {
     #ifdef WOLFSSL_KYBER512
-        bench_kyber(KYBER512);
+        if (bench_pq_asym_algs & BENCH_KYBER512) {
+            bench_kyber(KYBER512);
+        }
     #endif
     #ifdef WOLFSSL_KYBER768
-        bench_kyber(KYBER768);
+        if (bench_pq_asym_algs & BENCH_KYBER768) {
+            bench_kyber(KYBER768);
+        }
    #endif
     #ifdef WOLFSSL_KYBER1024
-        bench_kyber(KYBER1024);
+        if (bench_pq_asym_algs & BENCH_KYBER1024) {
+            bench_kyber(KYBER1024);
+        }
     #endif
     }
 #endif
@@ -14523,7 +14544,8 @@ static void Usage(void)
         print_alg(bench_asym_opt[i].str, &line);
     for (i=0; bench_other_opt[i].str != NULL; i++)
         print_alg(bench_other_opt[i].str, &line);
-#if defined(HAVE_FALCON) || defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
+#if defined(WOLFSSL_HAVE_KYBER) || defined(HAVE_FALCON) || \
+    defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
     for (i=0; bench_pq_asym_opt[i].str != NULL; i++)
         print_alg(bench_pq_asym_opt[i].str, &line);
 #if defined(HAVE_SPHINCS)
@@ -14799,8 +14821,8 @@ int wolfcrypt_benchmark_main(int argc, char** argv)
                 optMatched = 1;
             }
         }
-    #if defined(HAVE_FALCON) || defined(HAVE_DILITHIUM) || \
-        defined(HAVE_SPHINCS)
+    #if defined(WOLFSSL_HAVE_KYBER) || defined(HAVE_FALCON) || \
+        defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
         /* Known asymmetric post-quantum algorithms */
         for (i=0; !optMatched && bench_pq_asym_opt[i].str != NULL; i++) {
            if (string_matches(argv[1], bench_pq_asym_opt[i].str)) {
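Side note on the option masks above: grouping the per-parameter-set bits under the single BENCH_KYBER union is what lets "-kyber" run every enabled parameter set while "-kyber512" runs just one. A minimal standalone sketch of that pattern, outside the patch (the BENCH_* values mirror the defines above; run() and main() are hypothetical stand-ins for benchmarks_do() and the option parser):

#include <stdio.h>

#define BENCH_KYBER512  0x00000020
#define BENCH_KYBER768  0x00000040
#define BENCH_KYBER1024 0x00000080
#define BENCH_KYBER     (BENCH_KYBER512 | BENCH_KYBER768 | BENCH_KYBER1024)

/* Same gating pattern as benchmarks_do(): each level tests its own bit. */
static void run(unsigned int algs)
{
    if (algs & BENCH_KYBER512)  printf("bench kyber512\n");
    if (algs & BENCH_KYBER768)  printf("bench kyber768\n");
    if (algs & BENCH_KYBER1024) printf("bench kyber1024\n");
}

int main(void)
{
    run(BENCH_KYBER);    /* "-kyber": union mask, all three levels */
    run(BENCH_KYBER768); /* "-kyber768": one level only */
    return 0;
}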
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
index f667eb6e4..00d694adb 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
@@ -32,6 +32,8 @@
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
 #ifndef WOLFSSL_ARMASM_INLINE
+#ifdef WOLFSSL_SHA3
+#ifndef WOLFSSL_ARMASM_NO_NEON
     .text
     .type L_sha3_arm2_neon_rt, %object
     .size L_sha3_arm2_neon_rt, 192
@@ -85,60 +87,6 @@ L_sha3_arm2_neon_rt:
     .word 0x0
     .word 0x80008008
     .word 0x80000000
-    .text
-    .type L_sha3_arm2_rt, %object
-    .size L_sha3_arm2_rt, 192
-    .align 4
-L_sha3_arm2_rt:
-    .word 0x1
-    .word 0x0
-    .word 0x8082
-    .word 0x0
-    .word 0x808a
-    .word 0x80000000
-    .word 0x80008000
-    .word 0x80000000
-    .word 0x808b
-    .word 0x0
-    .word 0x80000001
-    .word 0x0
-    .word 0x80008081
-    .word 0x80000000
-    .word 0x8009
-    .word 0x80000000
-    .word 0x8a
-    .word 0x0
-    .word 0x88
-    .word 0x0
-    .word 0x80008009
-    .word 0x0
-    .word 0x8000000a
-    .word 0x0
-    .word 0x8000808b
-    .word 0x0
-    .word 0x8b
-    .word 0x80000000
-    .word 0x8089
-    .word 0x80000000
-    .word 0x8003
-    .word 0x80000000
-    .word 0x8002
-    .word 0x80000000
-    .word 0x80
-    .word 0x80000000
-    .word 0x800a
-    .word 0x0
-    .word 0x8000000a
-    .word 0x80000000
-    .word 0x80008081
-    .word 0x80000000
-    .word 0x8080
-    .word 0x80000000
-    .word 0x80000001
-    .word 0x0
-    .word 0x80008008
-    .word 0x80000000
-#ifndef WOLFSSL_ARMASM_NO_NEON
     .text
     .align 4
     .globl BlockSha3
@@ -407,6 +355,59 @@ L_sha3_arm32_neon_begin:
     .size BlockSha3,.-BlockSha3
 #endif /* WOLFSSL_ARMASM_NO_NEON */
 #ifdef WOLFSSL_ARMASM_NO_NEON
+    .text
+    .type L_sha3_arm2_rt, %object
+    .size L_sha3_arm2_rt, 192
+    .align 4
+L_sha3_arm2_rt:
+    .word 0x1
+    .word 0x0
+    .word 0x8082
+    .word 0x0
+    .word 0x808a
+    .word 0x80000000
+    .word 0x80008000
+    .word 0x80000000
+    .word 0x808b
+    .word 0x0
+    .word 0x80000001
+    .word 0x0
+    .word 0x80008081
+    .word 0x80000000
+    .word 0x8009
+    .word 0x80000000
+    .word 0x8a
+    .word 0x0
+    .word 0x88
+    .word 0x0
+    .word 0x80008009
+    .word 0x0
+    .word 0x8000000a
+    .word 0x0
+    .word 0x8000808b
+    .word 0x0
+    .word 0x8b
+    .word 0x80000000
+    .word 0x8089
+    .word 0x80000000
+    .word 0x8003
+    .word 0x80000000
+    .word 0x8002
+    .word 0x80000000
+    .word 0x80
+    .word 0x80000000
+    .word 0x800a
+    .word 0x0
+    .word 0x8000000a
+    .word 0x80000000
+    .word 0x80008081
+    .word 0x80000000
+    .word 0x8080
+    .word 0x80000000
+    .word 0x80000001
+    .word 0x0
+    .word 0x80008008
+    .word 0x80000000
     .text
     .align 4
     .globl BlockSha3
@@ -2391,6 +2392,7 @@ L_sha3_arm32_begin:
     pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     .size BlockSha3,.-BlockSha3
 #endif /* WOLFSSL_ARMASM_NO_NEON */
+#endif /* WOLFSSL_SHA3 */
 #endif /* !__aarch64__ && __arm__ && !__thumb__ */
 #endif /* WOLFSSL_ARMASM */
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
index 388ec457a..98b74e3f3 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
@@ -51,6 +51,8 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef WOLFSSL_SHA3
+#ifndef WOLFSSL_ARMASM_NO_NEON
 static const uint64_t L_sha3_arm2_neon_rt[] = {
     0x0000000000000001UL, 0x0000000000008082UL,
     0x800000000000808aUL, 0x8000000080008000UL,
@@ -66,29 +68,12 @@ static const uint64_t L_sha3_arm2_neon_rt[] = {
     0x0000000080000001UL, 0x8000000080008008UL,
 };
 
-static const uint64_t L_sha3_arm2_rt[] = {
-    0x0000000000000001UL, 0x0000000000008082UL,
-    0x800000000000808aUL, 0x8000000080008000UL,
-    0x000000000000808bUL, 0x0000000080000001UL,
-    0x8000000080008081UL, 0x8000000000008009UL,
-    0x000000000000008aUL, 0x0000000000000088UL,
-    0x0000000080008009UL, 0x000000008000000aUL,
-    0x000000008000808bUL, 0x800000000000008bUL,
-    0x8000000000008089UL, 0x8000000000008003UL,
-    0x8000000000008002UL, 0x8000000000000080UL,
-    0x000000000000800aUL, 0x800000008000000aUL,
-    0x8000000080008081UL, 0x8000000000008080UL,
-    0x0000000080000001UL, 0x8000000080008008UL,
-};
-
 #include <wolfssl/wolfcrypt/sha3.h>
 
-#ifndef WOLFSSL_ARMASM_NO_NEON
 void BlockSha3(word64* state_p)
 {
     register word64* state asm ("r0") = (word64*)state_p;
     register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt;
-    register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt;
 
    __asm__ __volatile__ (
        "sub sp, sp, #16\n\t"
@@ -348,16 +333,31 @@
         "vst1.8 {d20-d23}, [%[state]]!\n\t"
         "vst1.8 {d24}, [%[state]]\n\t"
         "add sp, sp, #16\n\t"
-        : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c)
+        : [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c)
         :
-        : "memory", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc"
+        : "memory", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc"
     );
 }
 
 #endif /* WOLFSSL_ARMASM_NO_NEON */
+#ifdef WOLFSSL_ARMASM_NO_NEON
+static const uint64_t L_sha3_arm2_rt[] = {
+    0x0000000000000001UL, 0x0000000000008082UL,
+    0x800000000000808aUL, 0x8000000080008000UL,
+    0x000000000000808bUL, 0x0000000080000001UL,
+    0x8000000080008081UL, 0x8000000000008009UL,
+    0x000000000000008aUL, 0x0000000000000088UL,
+    0x0000000080008009UL, 0x000000008000000aUL,
+    0x000000008000808bUL, 0x800000000000008bUL,
+    0x8000000000008089UL, 0x8000000000008003UL,
+    0x8000000000008002UL, 0x8000000000000080UL,
+    0x000000000000800aUL, 0x800000008000000aUL,
+    0x8000000080008081UL, 0x8000000000008080UL,
+    0x0000000080000001UL, 0x8000000080008008UL,
+};
+
 #include <wolfssl/wolfcrypt/sha3.h>
 
-#ifdef WOLFSSL_ARMASM_NO_NEON
 void BlockSha3(word64* state_p)
 {
     register word64* state asm ("r0") = (word64*)state_p;
@@ -2348,6 +2348,7 @@
 }
 
 #endif /* WOLFSSL_ARMASM_NO_NEON */
+#endif /* WOLFSSL_SHA3 */
 #endif /* !__aarch64__ && __arm__ && !__thumb__ */
 #endif /* WOLFSSL_ARMASM */
 #endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */
diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
index 0c31ad3e6..399157589 100644
--- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
@@ -28,19 +28,12 @@
     #include <config.h>
 #endif /* HAVE_CONFIG_H */
 #include <wolfssl/wolfcrypt/settings.h>
+#include <stdint.h>
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && defined(__thumb__)
-#include <stdint.h>
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif /* HAVE_CONFIG_H */
-#include <wolfssl/wolfcrypt/settings.h>
 
 #ifdef WOLFSSL_ARMASM_INLINE
-#ifdef WOLFSSL_ARMASM
-#if !defined(__aarch64__) && defined(__thumb__)
-
 #ifdef __IAR_SYSTEMS_ICC__
 #define __asm__ asm
 #define __volatile__ volatile
@@ -3056,7 +3049,4 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long
 #endif /* !NO_AES */
 #endif /* !__aarch64__ && __thumb__ */
 #endif /* WOLFSSL_ARMASM */
-#endif /* !defined(__aarch64__) && defined(__thumb__) */
-#endif /* WOLFSSL_ARMASM */
-
 #endif /* WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
index 9b683527f..884b9089d 100644
--- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
@@ -28,19 +28,12 @@
     #include <config.h>
 #endif /* HAVE_CONFIG_H */
 #include <wolfssl/wolfcrypt/settings.h>
+#include <stdint.h>
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && defined(__thumb__)
-#include <stdint.h>
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif /* HAVE_CONFIG_H */
-#include <wolfssl/wolfcrypt/settings.h>
 
 #ifdef WOLFSSL_ARMASM_INLINE
-#ifdef WOLFSSL_ARMASM
-#if !defined(__aarch64__) && defined(__thumb__)
-
 #ifdef __IAR_SYSTEMS_ICC__
 #define __asm__ asm
 #define __volatile__ volatile
@@ -6904,7 +6897,4 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
 #endif /* HAVE_CURVE25519 || HAVE_ED25519 */
 #endif /* !__aarch64__ && __thumb__ */
 #endif /* WOLFSSL_ARMASM */
-#endif /* !defined(__aarch64__) && defined(__thumb__) */
-#endif /* WOLFSSL_ARMASM */
-
 #endif /* WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
index 61cafed4f..ed496b0b5 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
@@ -28,19 +28,12 @@
     #include <config.h>
 #endif /* HAVE_CONFIG_H */
 #include <wolfssl/wolfcrypt/settings.h>
+#include <stdint.h>
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && defined(__thumb__)
-#include <stdint.h>
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif /* HAVE_CONFIG_H */
-#include <wolfssl/wolfcrypt/settings.h>
 
 #ifdef WOLFSSL_ARMASM_INLINE
-#ifdef WOLFSSL_ARMASM
-#if !defined(__aarch64__) && defined(__thumb__)
-
 #ifdef __IAR_SYSTEMS_ICC__
 #define __asm__ asm
 #define __volatile__ volatile
@@ -1472,7 +1465,4 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
 #endif /* !NO_SHA256 */
 #endif /* !__aarch64__ && __thumb__ */
 #endif /* WOLFSSL_ARMASM */
-#endif /* !defined(__aarch64__) && defined(__thumb__) */
-#endif /* WOLFSSL_ARMASM */
-
 #endif /* WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha3-asm.S b/wolfcrypt/src/port/arm/thumb2-sha3-asm.S
index bc3ebecf2..106905594 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha3-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-sha3-asm.S
@@ -34,6 +34,7 @@
 #ifndef WOLFSSL_ARMASM_INLINE
     .thumb
     .syntax unified
+#ifdef WOLFSSL_SHA3
     .text
     .type L_sha3_thumb2_rt, %object
     .size L_sha3_thumb2_rt, 192
@@ -1165,6 +1166,7 @@ L_sha3_thumb2_begin:
     POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
     /* Cycle Count = 1505 */
     .size BlockSha3,.-BlockSha3
+#endif /* WOLFSSL_SHA3 */
 #endif /* !__aarch64__ && __thumb__ */
 #endif /* WOLFSSL_ARMASM */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c
index 174f8aa33..53fa09646 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c
@@ -28,19 +28,12 @@
     #include <config.h>
 #endif /* HAVE_CONFIG_H */
 #include <wolfssl/wolfcrypt/settings.h>
+#include <stdint.h>
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && defined(__thumb__)
-#include <stdint.h>
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif /* HAVE_CONFIG_H */
-#include <wolfssl/wolfcrypt/settings.h>
 
 #ifdef WOLFSSL_ARMASM_INLINE
-#ifdef WOLFSSL_ARMASM
-#if !defined(__aarch64__) && defined(__thumb__)
-
 #ifdef __IAR_SYSTEMS_ICC__
 #define __asm__ asm
 #define __volatile__ volatile
@@ -50,6 +43,7 @@
 #define __asm__ __asm
 #define __volatile__ volatile
 #endif /* __KEIL__ */
+#ifdef WOLFSSL_SHA3
 static const uint64_t L_sha3_thumb2_rt[] = {
     0x0000000000000001UL, 0x0000000000008082UL,
     0x800000000000808aUL, 0x8000000080008000UL,
@@ -1162,9 +1156,7 @@ void BlockSha3(word64* state)
     );
 }
 
+#endif /* WOLFSSL_SHA3 */
 #endif /* !__aarch64__ && __thumb__ */
 #endif /* WOLFSSL_ARMASM */
-#endif /* !defined(__aarch64__) && defined(__thumb__) */
-#endif /* WOLFSSL_ARMASM */
-
 #endif /* WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
index 840dd7fde..35363ba3a 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
@@ -28,19 +28,12 @@
     #include <config.h>
 #endif /* HAVE_CONFIG_H */
 #include <wolfssl/wolfcrypt/settings.h>
+#include <stdint.h>
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && defined(__thumb__)
-#include <stdint.h>
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif /* HAVE_CONFIG_H */
-#include <wolfssl/wolfcrypt/settings.h>
 
 #ifdef WOLFSSL_ARMASM_INLINE
-#ifdef WOLFSSL_ARMASM
-#if !defined(__aarch64__) && defined(__thumb__)
-
 #ifdef __IAR_SYSTEMS_ICC__
 #define __asm__ asm
 #define __volatile__ volatile
@@ -3587,7 +3580,4 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
 #endif /* WOLFSSL_SHA512 */
 #endif /* !__aarch64__ && __thumb__ */
 #endif /* WOLFSSL_ARMASM */
-#endif /* !defined(__aarch64__) && defined(__thumb__) */
-#endif /* WOLFSSL_ARMASM */
-
 #endif /* WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/wc_kyber_poly.c b/wolfcrypt/src/wc_kyber_poly.c
index ba839f50a..6cc52d45b 100644
--- a/wolfcrypt/src/wc_kyber_poly.c
+++ b/wolfcrypt/src/wc_kyber_poly.c
@@ -2056,6 +2056,8 @@ static void kyber_cbd_eta3(sword16* p, const byte* r)
 {
     unsigned int i;
 
+#if defined(WOLFSSL_SMALL_STACK) || defined(WOLFSSL_KYBER_NO_LARGE_CODE) || \
+    defined(BIG_ENDIAN_ORDER)
 #ifndef WORD64_AVAILABLE
     /* Calculate four integer coefficients at a time. */
     for (i = 0; i < KYBER_N; i += 4) {
@@ -2129,7 +2131,59 @@ static void kyber_cbd_eta3(sword16* p, const byte* r)
 
         /* Move over used bytes. */
         r += 6;
     }
-#endif
+#endif /* WORD64_AVAILABLE */
+#else
+    /* Calculate sixteen integer coefficients at a time. */
+    for (i = 0; i < KYBER_N; i += 16) {
+        const word32* r32 = (const word32*)r;
+        /* Take the next 12 bytes, little endian, as 24 bit values. */
+        word32 t0 = r32[0] & 0xffffff;
+        word32 t1 = ((r32[0] >> 24) | (r32[1] << 8)) & 0xffffff;
+        word32 t2 = ((r32[1] >> 16) | (r32[2] << 16)) & 0xffffff;
+        word32 t3 = r32[2] >> 8;
+        word32 d0;
+        word32 d1;
+        word32 d2;
+        word32 d3;
+
+        /* Add second and third bits to first. */
+        d0  = (t0 >> 0) & 0x00249249;
+        d0 += (t0 >> 1) & 0x00249249;
+        d0 += (t0 >> 2) & 0x00249249;
+        d1  = (t1 >> 0) & 0x00249249;
+        d1 += (t1 >> 1) & 0x00249249;
+        d1 += (t1 >> 2) & 0x00249249;
+        d2  = (t2 >> 0) & 0x00249249;
+        d2 += (t2 >> 1) & 0x00249249;
+        d2 += (t2 >> 2) & 0x00249249;
+        d3  = (t3 >> 0) & 0x00249249;
+        d3 += (t3 >> 1) & 0x00249249;
+        d3 += (t3 >> 2) & 0x00249249;
+        /* Values 0, 1, 2 or 3 in consecutive 3 bits.
+         * 0 - 1/8, 1 - 3/8, 2 - 3/8, 3 - 1/8. */
+
+        p[i + 0] = ETA3_SUB(d0, 0);
+        p[i + 1] = ETA3_SUB(d0, 1);
+        p[i + 2] = ETA3_SUB(d0, 2);
+        p[i + 3] = ETA3_SUB(d0, 3);
+        p[i + 4] = ETA3_SUB(d1, 0);
+        p[i + 5] = ETA3_SUB(d1, 1);
+        p[i + 6] = ETA3_SUB(d1, 2);
+        p[i + 7] = ETA3_SUB(d1, 3);
+        p[i + 8] = ETA3_SUB(d2, 0);
+        p[i + 9] = ETA3_SUB(d2, 1);
+        p[i + 10] = ETA3_SUB(d2, 2);
+        p[i + 11] = ETA3_SUB(d2, 3);
+        p[i + 12] = ETA3_SUB(d3, 0);
+        p[i + 13] = ETA3_SUB(d3, 1);
+        p[i + 14] = ETA3_SUB(d3, 2);
+        p[i + 15] = ETA3_SUB(d3, 3);
+        /* -3-1/64, -2-6/64, -1-15/64, 0-20/64, 1-15/64, 2-6/64, 3-1/64 */
+
+        /* Move over used bytes. */
+        r += 12;
+    }
+#endif /* WOLFSSL_SMALL_STACK || WOLFSSL_KYBER_NO_LARGE_CODE || BIG_ENDIAN_ORDER */
 }
 #endif
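A note on the bit-slicing in the new CBD(eta = 3) branch above: the mask 0x00249249 keeps every third bit of a 24-bit chunk, so the three shifted-and-masked additions leave, in each 3-bit lane of d, the popcount of one 3-bit group of random input, and each output coefficient is the difference of two adjacent lanes. A standalone sketch under that reading (eta3_sub() is a hypothetical stand-in for the ETA3_SUB macro, which is defined elsewhere in wc_kyber_poly.c and not shown in this patch); exhausting all 2^24 inputs reproduces the 1/64, 6/64, 15/64, 20/64, 15/64, 6/64, 1/64 frequencies quoted in the comment:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for ETA3_SUB: lane 2i minus lane 2i+1 of d. */
static int16_t eta3_sub(uint32_t d, int i)
{
    return (int16_t)((d >> (6 * i)) & 0x7) -
           (int16_t)((d >> (6 * i + 3)) & 0x7);
}

int main(void)
{
    int hist[7] = {0}; /* counts for coefficient values -3..+3 */
    uint32_t t;
    int c;

    for (t = 0; t < (1UL << 24); t++) {
        /* Sum each group of 3 random bits into its own 3-bit lane. */
        uint32_t d = ((t >> 0) & 0x249249) + ((t >> 1) & 0x249249) +
                     ((t >> 2) & 0x249249);
        hist[eta3_sub(d, 0) + 3]++; /* coefficient from lanes 0 and 1 */
    }
    /* Each count is a multiple of 2^18; expect ratios 1:6:15:20:15:6:1. */
    for (c = -3; c <= 3; c++) {
        printf("%+d: %d/64\n", c, hist[c + 3] >> 18);
    }
    return 0;
}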
@@ -2677,6 +2731,8 @@ static void kyber_vec_compress_10_c(byte* r, sword16* v, unsigned int kp)
 
     /* Each polynomial. */
     for (i = 0; i < kp; i++) {
+#if defined(WOLFSSL_SMALL_STACK) || defined(WOLFSSL_KYBER_NO_LARGE_CODE) || \
+    defined(BIG_ENDIAN_ORDER)
         /* Each 4 polynomial coefficients. */
         for (j = 0; j < KYBER_N; j += 4) {
     #ifdef WOLFSSL_KYBER_SMALL
@@ -2710,6 +2766,44 @@ static void kyber_vec_compress_10_c(byte* r, sword16* v, unsigned int kp)
             /* Move over set bytes. */
             r += 5;
         }
+#else
+        /* Each 16 polynomial coefficients. */
+        for (j = 0; j < KYBER_N; j += 16) {
+            /* Compress sixteen polynomial values to 10 bits each. */
+            sword16 t0  = TO_COMP_WORD_10(v, i, j, 0);
+            sword16 t1  = TO_COMP_WORD_10(v, i, j, 1);
+            sword16 t2  = TO_COMP_WORD_10(v, i, j, 2);
+            sword16 t3  = TO_COMP_WORD_10(v, i, j, 3);
+            sword16 t4  = TO_COMP_WORD_10(v, i, j, 4);
+            sword16 t5  = TO_COMP_WORD_10(v, i, j, 5);
+            sword16 t6  = TO_COMP_WORD_10(v, i, j, 6);
+            sword16 t7  = TO_COMP_WORD_10(v, i, j, 7);
+            sword16 t8  = TO_COMP_WORD_10(v, i, j, 8);
+            sword16 t9  = TO_COMP_WORD_10(v, i, j, 9);
+            sword16 t10 = TO_COMP_WORD_10(v, i, j, 10);
+            sword16 t11 = TO_COMP_WORD_10(v, i, j, 11);
+            sword16 t12 = TO_COMP_WORD_10(v, i, j, 12);
+            sword16 t13 = TO_COMP_WORD_10(v, i, j, 13);
+            sword16 t14 = TO_COMP_WORD_10(v, i, j, 14);
+            sword16 t15 = TO_COMP_WORD_10(v, i, j, 15);
+
+            word32* r32 = (word32*)r;
+            /* Pack sixteen 10-bit values into byte array. */
+            r32[0] = t0 | ((word32)t1 << 10) | ((word32)t2 << 20) |
+                     ((word32)t3 << 30);
+            r32[1] = (t3 >> 2) | ((word32)t4 << 8) | ((word32)t5 << 18) |
+                     ((word32)t6 << 28);
+            r32[2] = (t6 >> 4) | ((word32)t7 << 6) | ((word32)t8 << 16) |
+                     ((word32)t9 << 26);
+            r32[3] = (t9 >> 6) | ((word32)t10 << 4) | ((word32)t11 << 14) |
+                     ((word32)t12 << 24);
+            r32[4] = (t12 >> 8) | ((word32)t13 << 2) | ((word32)t14 << 12) |
+                     ((word32)t15 << 22);
+
+            /* Move over set bytes. */
+            r += 20;
+        }
+#endif
     }
 }
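On the packing order in the unrolled branch above: sixteen 10-bit values are exactly 160 bits, i.e. five 32-bit words (20 bytes) per 16 coefficients, and each value whose 10 bits straddle a word boundary (t3, t6, t9, t12) contributes its high bits to the start of the next word. A standalone round-trip sketch of that layout (pack16x10() mirrors the shift pattern from the patch; the unpack loop and test values are mine), assuming little-endian words as the guarded wolfSSL path does:

#include <stdio.h>
#include <stdint.h>

/* Pack sixteen 10-bit values into five 32-bit words, patch order. */
static void pack16x10(uint32_t r32[5], const uint16_t t[16])
{
    r32[0] = t[0] | ((uint32_t)t[1] << 10) | ((uint32_t)t[2] << 20) |
             ((uint32_t)t[3] << 30);
    r32[1] = (t[3] >> 2) | ((uint32_t)t[4] << 8) | ((uint32_t)t[5] << 18) |
             ((uint32_t)t[6] << 28);
    r32[2] = (t[6] >> 4) | ((uint32_t)t[7] << 6) | ((uint32_t)t[8] << 16) |
             ((uint32_t)t[9] << 26);
    r32[3] = (t[9] >> 6) | ((uint32_t)t[10] << 4) | ((uint32_t)t[11] << 14) |
             ((uint32_t)t[12] << 24);
    r32[4] = (t[12] >> 8) | ((uint32_t)t[13] << 2) | ((uint32_t)t[14] << 12) |
             ((uint32_t)t[15] << 22);
}

int main(void)
{
    uint16_t t[16];
    uint32_t r32[5];
    int i;

    for (i = 0; i < 16; i++) {
        t[i] = (uint16_t)((i * 61) & 0x3ff); /* arbitrary 10-bit values */
    }
    pack16x10(r32, t);

    /* Unpack by bit offset and verify the round trip: value i sits at
     * bit 10*i, spilling into the next word when (10*i % 32) > 22. */
    for (i = 0; i < 16; i++) {
        int bit = i * 10;
        uint32_t lo = r32[bit / 32] >> (bit % 32);
        uint32_t hi = ((bit % 32) > 22) ?
                      (r32[bit / 32 + 1] << (32 - (bit % 32))) : 0;
        uint32_t vval = (lo | hi) & 0x3ff;
        printf("%2d: %3u %s\n", i, (unsigned)vval,
               (vval == t[i]) ? "ok" : "MISMATCH");
    }
    return 0;
}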