Kyber: Improve performance

Unroll loops and use larger types.
Allow the benchmark to run each Kyber parameter set separately.
Allow the benchmark to accept -ml-dsa, which runs all ML-DSA parameter sets.
Fix thumb2 ASM C code to not have duplicate includes and ifdef checks.
Fix thumb2 ASM C code to include error-crypt.h to ensure no empty
translation unit.
Check for WOLFSSL_SHA3 before including Thumb2 SHA-3 assembly code.
This commit is contained in:
Sean Parkinson
2024-06-25 18:53:53 +10:00
parent 5793f626ac
commit aa61f98955
10 changed files with 224 additions and 151 deletions

View File

@ -654,7 +654,6 @@
#define BENCH_RSA 0x00000002
#define BENCH_RSA_SZ 0x00000004
#define BENCH_DH 0x00000010
#define BENCH_KYBER 0x00000020
#define BENCH_ECC_MAKEKEY 0x00001000
#define BENCH_ECC 0x00002000
#define BENCH_ECC_ENCRYPT 0x00004000
@ -681,11 +680,22 @@
#define BENCH_SAKKE 0x80000000
/* Post-Quantum Asymmetric algorithms. */
#define BENCH_KYBER512 0x00000020
#define BENCH_KYBER768 0x00000040
#define BENCH_KYBER1024 0x00000080
#define BENCH_KYBER (BENCH_KYBER512 | BENCH_KYBER768 | \
BENCH_KYBER1024)
#define BENCH_FALCON_LEVEL1_SIGN 0x00000001
#define BENCH_FALCON_LEVEL5_SIGN 0x00000002
#define BENCH_DILITHIUM_LEVEL2_SIGN 0x04000000
#define BENCH_DILITHIUM_LEVEL3_SIGN 0x08000000
#define BENCH_DILITHIUM_LEVEL5_SIGN 0x10000000
#define BENCH_ML_DSA_44_SIGN 0x04000000
#define BENCH_ML_DSA_65_SIGN 0x08000000
#define BENCH_ML_DSA_87_SIGN 0x10000000
#define BENCH_ML_DSA_SIGN (BENCH_ML_DSA_44_SIGN | \
BENCH_ML_DSA_65_SIGN | \
BENCH_ML_DSA_87_SIGN)
/* Post-Quantum Asymmetric algorithms. (Part 2) */
#define BENCH_SPHINCS_FAST_LEVEL1_SIGN 0x00000001
@ -959,9 +969,6 @@ static const bench_alg bench_asym_opt[] = {
#ifndef NO_DH
{ "-dh", BENCH_DH },
#endif
#ifdef WOLFSSL_HAVE_KYBER
{ "-kyber", BENCH_KYBER },
#endif
#ifdef HAVE_ECC
{ "-ecc-kg", BENCH_ECC_MAKEKEY },
{ "-ecc", BENCH_ECC },
@ -1060,7 +1067,8 @@ static const bench_pq_hash_sig_alg bench_pq_hash_sig_opt[] = {
};
#endif /* BENCH_PQ_STATEFUL_HBS */
#if defined(HAVE_FALCON) || defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
#if defined(WOLFSSL_HAVE_KYBER) || defined(HAVE_FALCON) || \
defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
/* The post-quantum-specific mapping of command line option to bit values and
* OQS name. */
typedef struct bench_pq_alg {
@ -1073,18 +1081,25 @@ typedef struct bench_pq_alg {
/* All recognized post-quantum asymmetric algorithm choosing command line
* options. */
static const bench_pq_alg bench_pq_asym_opt[] = {
{ "-pq", 0xffffffff },
{ "-pq", 0xffffffff },
#ifdef WOLFSSL_HAVE_KYBER
{ "-kyber", BENCH_KYBER },
{ "-kyber512", BENCH_KYBER512 },
{ "-kyber768", BENCH_KYBER768 },
{ "-kyber1024", BENCH_KYBER1024 },
#endif
#if defined(HAVE_FALCON)
{ "-falcon_level1", BENCH_FALCON_LEVEL1_SIGN },
{ "-falcon_level5", BENCH_FALCON_LEVEL5_SIGN },
{ "-falcon_level1", BENCH_FALCON_LEVEL1_SIGN },
{ "-falcon_level5", BENCH_FALCON_LEVEL5_SIGN },
#endif
#if defined(HAVE_DILITHIUM)
{ "-dilithium_level2", BENCH_DILITHIUM_LEVEL2_SIGN },
{ "-dilithium_level3", BENCH_DILITHIUM_LEVEL3_SIGN },
{ "-dilithium_level5", BENCH_DILITHIUM_LEVEL5_SIGN },
{ "-ml-dsa-44", BENCH_DILITHIUM_LEVEL2_SIGN },
{ "-ml-dsa-65", BENCH_DILITHIUM_LEVEL3_SIGN },
{ "-ml-dsa-87", BENCH_DILITHIUM_LEVEL5_SIGN },
{ "-dilithium_level2", BENCH_DILITHIUM_LEVEL2_SIGN },
{ "-dilithium_level3", BENCH_DILITHIUM_LEVEL3_SIGN },
{ "-dilithium_level5", BENCH_DILITHIUM_LEVEL5_SIGN },
{ "-ml-dsa", BENCH_ML_DSA_SIGN },
{ "-ml-dsa-44", BENCH_ML_DSA_44_SIGN },
{ "-ml-dsa-65", BENCH_ML_DSA_65_SIGN },
{ "-ml-dsa-87", BENCH_ML_DSA_87_SIGN },
#endif
{ NULL, 0 }
};
@ -3576,15 +3591,21 @@ static void* benchmarks_do(void* args)
#endif
#ifdef WOLFSSL_HAVE_KYBER
if (bench_all || (bench_asym_algs & BENCH_KYBER)) {
if (bench_all || (bench_pq_asym_algs & BENCH_KYBER)) {
#ifdef WOLFSSL_KYBER512
bench_kyber(KYBER512);
if (bench_pq_asym_algs & BENCH_KYBER512) {
bench_kyber(KYBER512);
}
#endif
#ifdef WOLFSSL_KYBER768
bench_kyber(KYBER768);
if (bench_pq_asym_algs & BENCH_KYBER768) {
bench_kyber(KYBER768);
}
#endif
#ifdef WOLFSSL_KYBER1024
bench_kyber(KYBER1024);
if (bench_pq_asym_algs & BENCH_KYBER1024) {
bench_kyber(KYBER1024);
}
#endif
}
#endif
@ -14523,7 +14544,8 @@ static void Usage(void)
print_alg(bench_asym_opt[i].str, &line);
for (i=0; bench_other_opt[i].str != NULL; i++)
print_alg(bench_other_opt[i].str, &line);
#if defined(HAVE_FALCON) || defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
#if defined(WOLFSSL_HAVE_KYBER) || defined(HAVE_FALCON) || \
defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
for (i=0; bench_pq_asym_opt[i].str != NULL; i++)
print_alg(bench_pq_asym_opt[i].str, &line);
#if defined(HAVE_SPHINCS)
@ -14799,8 +14821,8 @@ int wolfcrypt_benchmark_main(int argc, char** argv)
optMatched = 1;
}
}
#if defined(HAVE_FALCON) || defined(HAVE_DILITHIUM) || \
defined(HAVE_SPHINCS)
#if defined(WOLFSSL_HAVE_KYBER) || defined(HAVE_FALCON) || \
defined(HAVE_DILITHIUM) || defined(HAVE_SPHINCS)
/* Known asymmetric post-quantum algorithms */
for (i=0; !optMatched && bench_pq_asym_opt[i].str != NULL; i++) {
if (string_matches(argv[1], bench_pq_asym_opt[i].str)) {

View File

@ -32,6 +32,8 @@
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_SHA3
#ifndef WOLFSSL_ARMASM_NO_NEON
.text
.type L_sha3_arm2_neon_rt, %object
.size L_sha3_arm2_neon_rt, 192
@ -85,60 +87,6 @@ L_sha3_arm2_neon_rt:
.word 0x0
.word 0x80008008
.word 0x80000000
.text
.type L_sha3_arm2_rt, %object
.size L_sha3_arm2_rt, 192
.align 4
L_sha3_arm2_rt:
.word 0x1
.word 0x0
.word 0x8082
.word 0x0
.word 0x808a
.word 0x80000000
.word 0x80008000
.word 0x80000000
.word 0x808b
.word 0x0
.word 0x80000001
.word 0x0
.word 0x80008081
.word 0x80000000
.word 0x8009
.word 0x80000000
.word 0x8a
.word 0x0
.word 0x88
.word 0x0
.word 0x80008009
.word 0x0
.word 0x8000000a
.word 0x0
.word 0x8000808b
.word 0x0
.word 0x8b
.word 0x80000000
.word 0x8089
.word 0x80000000
.word 0x8003
.word 0x80000000
.word 0x8002
.word 0x80000000
.word 0x80
.word 0x80000000
.word 0x800a
.word 0x0
.word 0x8000000a
.word 0x80000000
.word 0x80008081
.word 0x80000000
.word 0x8080
.word 0x80000000
.word 0x80000001
.word 0x0
.word 0x80008008
.word 0x80000000
#ifndef WOLFSSL_ARMASM_NO_NEON
.text
.align 4
.globl BlockSha3
@ -407,6 +355,59 @@ L_sha3_arm32_neon_begin:
.size BlockSha3,.-BlockSha3
#endif /* WOLFSSL_ARMASM_NO_NEON */
#ifdef WOLFSSL_ARMASM_NO_NEON
/* L_sha3_arm2_rt: round-constant table for the non-NEON BlockSha3
 * implementation (placed here, inside WOLFSSL_ARMASM_NO_NEON, so it is
 * only emitted when the non-NEON path is built). 24 64-bit constants
 * (192 bytes, one per round), each emitted as two 32-bit words:
 * low word first, then high word. */
.text
.type L_sha3_arm2_rt, %object
.size L_sha3_arm2_rt, 192
.align 4
L_sha3_arm2_rt:
.word 0x1
.word 0x0
.word 0x8082
.word 0x0
.word 0x808a
.word 0x80000000
.word 0x80008000
.word 0x80000000
.word 0x808b
.word 0x0
.word 0x80000001
.word 0x0
.word 0x80008081
.word 0x80000000
.word 0x8009
.word 0x80000000
.word 0x8a
.word 0x0
.word 0x88
.word 0x0
.word 0x80008009
.word 0x0
.word 0x8000000a
.word 0x0
.word 0x8000808b
.word 0x0
.word 0x8b
.word 0x80000000
.word 0x8089
.word 0x80000000
.word 0x8003
.word 0x80000000
.word 0x8002
.word 0x80000000
.word 0x80
.word 0x80000000
.word 0x800a
.word 0x0
.word 0x8000000a
.word 0x80000000
.word 0x80008081
.word 0x80000000
.word 0x8080
.word 0x80000000
.word 0x80000001
.word 0x0
.word 0x80008008
.word 0x80000000
.text
.align 4
.globl BlockSha3
@ -2391,6 +2392,7 @@ L_sha3_arm32_begin:
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size BlockSha3,.-BlockSha3
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */

View File

@ -51,6 +51,8 @@
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef WOLFSSL_SHA3
#ifndef WOLFSSL_ARMASM_NO_NEON
static const uint64_t L_sha3_arm2_neon_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
@ -66,29 +68,12 @@ static const uint64_t L_sha3_arm2_neon_rt[] = {
0x0000000080000001UL, 0x8000000080008008UL,
};
/* Round-constant table used by the non-NEON BlockSha3 routine
 * (passed to the inline assembly as L_sha3_arm2_rt_c): 24 64-bit
 * constants, one per round. Identical values to L_sha3_arm2_neon_rt
 * above. */
static const uint64_t L_sha3_arm2_rt[] = {
    0x0000000000000001UL, 0x0000000000008082UL,
    0x800000000000808aUL, 0x8000000080008000UL,
    0x000000000000808bUL, 0x0000000080000001UL,
    0x8000000080008081UL, 0x8000000000008009UL,
    0x000000000000008aUL, 0x0000000000000088UL,
    0x0000000080008009UL, 0x000000008000000aUL,
    0x000000008000808bUL, 0x800000000000008bUL,
    0x8000000000008089UL, 0x8000000000008003UL,
    0x8000000000008002UL, 0x8000000000000080UL,
    0x000000000000800aUL, 0x800000008000000aUL,
    0x8000000080008081UL, 0x8000000000008080UL,
    0x0000000080000001UL, 0x8000000080008008UL,
};
#include <wolfssl/wolfcrypt/sha3.h>
#ifndef WOLFSSL_ARMASM_NO_NEON
void BlockSha3(word64* state_p)
{
register word64* state asm ("r0") = (word64*)state_p;
register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt;
register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt;
__asm__ __volatile__ (
"sub sp, sp, #16\n\t"
@ -348,16 +333,31 @@ void BlockSha3(word64* state_p)
"vst1.8 {d20-d23}, [%[state]]!\n\t"
"vst1.8 {d24}, [%[state]]\n\t"
"add sp, sp, #16\n\t"
: [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c)
: [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c)
:
: "memory", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc"
: "memory", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc"
);
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#ifdef WOLFSSL_ARMASM_NO_NEON
/* Round-constant table for the non-NEON BlockSha3 routine: 24 64-bit
 * constants, one per round. Defined here, inside the
 * WOLFSSL_ARMASM_NO_NEON guard, so it is only compiled in when the
 * non-NEON path is built. */
static const uint64_t L_sha3_arm2_rt[] = {
    0x0000000000000001UL, 0x0000000000008082UL,
    0x800000000000808aUL, 0x8000000080008000UL,
    0x000000000000808bUL, 0x0000000080000001UL,
    0x8000000080008081UL, 0x8000000000008009UL,
    0x000000000000008aUL, 0x0000000000000088UL,
    0x0000000080008009UL, 0x000000008000000aUL,
    0x000000008000808bUL, 0x800000000000008bUL,
    0x8000000000008089UL, 0x8000000000008003UL,
    0x8000000000008002UL, 0x8000000000000080UL,
    0x000000000000800aUL, 0x800000008000000aUL,
    0x8000000080008081UL, 0x8000000000008080UL,
    0x0000000080000001UL, 0x8000000080008008UL,
};
#include <wolfssl/wolfcrypt/sha3.h>
#ifdef WOLFSSL_ARMASM_NO_NEON
void BlockSha3(word64* state_p)
{
register word64* state asm ("r0") = (word64*)state_p;
@ -2348,6 +2348,7 @@ void BlockSha3(word64* state_p)
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */

View File

@ -28,19 +28,12 @@
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
@ -3056,7 +3049,4 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long
#endif /* !NO_AES */
#endif /* !__aarch64__ && __thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */

View File

@ -28,19 +28,12 @@
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
@ -6904,7 +6897,4 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
#endif /* HAVE_CURVE25519 || HAVE_ED25519 */
#endif /* !__aarch64__ && __thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */

View File

@ -28,19 +28,12 @@
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
@ -1472,7 +1465,4 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
#endif /* !NO_SHA256 */
#endif /* !__aarch64__ && __thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */

View File

@ -34,6 +34,7 @@
#ifndef WOLFSSL_ARMASM_INLINE
.thumb
.syntax unified
#ifdef WOLFSSL_SHA3
.text
.type L_sha3_thumb2_rt, %object
.size L_sha3_thumb2_rt, 192
@ -1165,6 +1166,7 @@ L_sha3_thumb2_begin:
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
/* Cycle Count = 1505 */
.size BlockSha3,.-BlockSha3
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __thumb__ */
#endif /* WOLFSSL_ARMASM */

View File

@ -28,19 +28,12 @@
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
@ -50,6 +43,7 @@
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef WOLFSSL_SHA3
static const uint64_t L_sha3_thumb2_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
@ -1162,9 +1156,7 @@ void BlockSha3(word64* state)
);
}
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */

View File

@ -28,19 +28,12 @@
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
@ -3587,7 +3580,4 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
#endif /* WOLFSSL_SHA512 */
#endif /* !__aarch64__ && __thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */

View File

@ -2056,6 +2056,8 @@ static void kyber_cbd_eta3(sword16* p, const byte* r)
{
unsigned int i;
#if defined(WOLFSSL_SMALL_STACK) || defined(WOLFSSL_KYBER_NO_LARGE_CODE) || \
defined(BIG_ENDIAN_ORDER)
#ifndef WORD64_AVAILABLE
/* Calculate four integer coefficients at a time. */
for (i = 0; i < KYBER_N; i += 4) {
@ -2129,7 +2131,59 @@ static void kyber_cbd_eta3(sword16* p, const byte* r)
/* Move over used bytes. */
r += 6;
}
#endif
#endif /* WORD64_AVAILABLE */
#else
/* Calculate eight integer coefficients at a time. */
for (i = 0; i < KYBER_N; i += 16) {
const word32* r32 = (const word32*)r;
/* Take the next 12 bytes, little endian, as 24 bit values. */
word32 t0 = r32[0] & 0xffffff;
word32 t1 = ((r32[0] >> 24) | (r32[1] << 8)) & 0xffffff;
word32 t2 = ((r32[1] >> 16) | (r32[2] << 16)) & 0xffffff;
word32 t3 = r32[2] >> 8 ;
word32 d0;
word32 d1;
word32 d2;
word32 d3;
/* Add second and third bits to first. */
d0 = (t0 >> 0) & 0x00249249;
d0 += (t0 >> 1) & 0x00249249;
d0 += (t0 >> 2) & 0x00249249;
d1 = (t1 >> 0) & 0x00249249;
d1 += (t1 >> 1) & 0x00249249;
d1 += (t1 >> 2) & 0x00249249;
d2 = (t2 >> 0) & 0x00249249;
d2 += (t2 >> 1) & 0x00249249;
d2 += (t2 >> 2) & 0x00249249;
d3 = (t3 >> 0) & 0x00249249;
d3 += (t3 >> 1) & 0x00249249;
d3 += (t3 >> 2) & 0x00249249;
/* Values 0, 1, 2 or 3 in consecutive 3 bits.
* 0 - 1/8, 1 - 3/8, 2 - 3/8, 3 - 1/8. */
p[i + 0] = ETA3_SUB(d0, 0);
p[i + 1] = ETA3_SUB(d0, 1);
p[i + 2] = ETA3_SUB(d0, 2);
p[i + 3] = ETA3_SUB(d0, 3);
p[i + 4] = ETA3_SUB(d1, 0);
p[i + 5] = ETA3_SUB(d1, 1);
p[i + 6] = ETA3_SUB(d1, 2);
p[i + 7] = ETA3_SUB(d1, 3);
p[i + 8] = ETA3_SUB(d2, 0);
p[i + 9] = ETA3_SUB(d2, 1);
p[i + 10] = ETA3_SUB(d2, 2);
p[i + 11] = ETA3_SUB(d2, 3);
p[i + 12] = ETA3_SUB(d3, 0);
p[i + 13] = ETA3_SUB(d3, 1);
p[i + 14] = ETA3_SUB(d3, 2);
p[i + 15] = ETA3_SUB(d3, 3);
/* -3-1/64, -2-6/64, -1-15/64, 0-20/64, 1-15/64, 2-6/64, 3-1/64 */
/* Move over used bytes. */
r += 12;
}
#endif /* WOLFSSL_SMALL_STACK || WOLFSSL_KYBER_NO_LARGE_CODE || BIG_ENDIAN_ORDER */
}
#endif
@ -2677,6 +2731,8 @@ static void kyber_vec_compress_10_c(byte* r, sword16* v, unsigned int kp)
/* Each polynomial. */
for (i = 0; i < kp; i++) {
#if defined(WOLFSSL_SMALL_STACK) || defined(WOLFSSL_KYBER_NO_LARGE_CODE) || \
defined(BIG_ENDIAN_ORDER)
/* Each 4 polynomial coefficients. */
for (j = 0; j < KYBER_N; j += 4) {
#ifdef WOLFSSL_KYBER_SMALL
@ -2710,6 +2766,44 @@ static void kyber_vec_compress_10_c(byte* r, sword16* v, unsigned int kp)
/* Move over set bytes. */
r += 5;
}
#else
/* Each 16 polynomial coefficients. */
for (j = 0; j < KYBER_N; j += 16) {
/* Compress four polynomial values to 10 bits each. */
sword16 t0 = TO_COMP_WORD_10(v, i, j, 0);
sword16 t1 = TO_COMP_WORD_10(v, i, j, 1);
sword16 t2 = TO_COMP_WORD_10(v, i, j, 2);
sword16 t3 = TO_COMP_WORD_10(v, i, j, 3);
sword16 t4 = TO_COMP_WORD_10(v, i, j, 4);
sword16 t5 = TO_COMP_WORD_10(v, i, j, 5);
sword16 t6 = TO_COMP_WORD_10(v, i, j, 6);
sword16 t7 = TO_COMP_WORD_10(v, i, j, 7);
sword16 t8 = TO_COMP_WORD_10(v, i, j, 8);
sword16 t9 = TO_COMP_WORD_10(v, i, j, 9);
sword16 t10 = TO_COMP_WORD_10(v, i, j, 10);
sword16 t11 = TO_COMP_WORD_10(v, i, j, 11);
sword16 t12 = TO_COMP_WORD_10(v, i, j, 12);
sword16 t13 = TO_COMP_WORD_10(v, i, j, 13);
sword16 t14 = TO_COMP_WORD_10(v, i, j, 14);
sword16 t15 = TO_COMP_WORD_10(v, i, j, 15);
word32* r32 = (word32*)r;
/* Pack sixteen 10-bit values into byte array. */
r32[0] = t0 | ((word32)t1 << 10) | ((word32)t2 << 20) |
((word32)t3 << 30);
r32[1] = (t3 >> 2) | ((word32)t4 << 8) | ((word32)t5 << 18) |
((word32)t6 << 28);
r32[2] = (t6 >> 4) | ((word32)t7 << 6) | ((word32)t8 << 16) |
((word32)t9 << 26);
r32[3] = (t9 >> 6) | ((word32)t10 << 4) | ((word32)t11 << 14) |
((word32)t12 << 24);
r32[4] = (t12 >> 8) | ((word32)t13 << 2) | ((word32)t14 << 12) |
((word32)t15 << 22);
/* Move over set bytes. */
r += 20;
}
#endif
}
}