diff --git a/configure.ac b/configure.ac index 343ab5173..bd987bca6 100644 --- a/configure.ac +++ b/configure.ac @@ -508,6 +508,14 @@ AS_CASE([$ENABLED_FIPS], ENABLED_FIPS="yes" # for dev, DEF_SP_MATH and DEF_FAST_MATH follow non-FIPS defaults (currently sp-math-all) ], + [v5-kcapi],[ + FIPS_VERSION="v5-dev" + HAVE_FIPS_VERSION_MAJOR=5 + HAVE_FIPS_VERSION_MINOR=3 + HAVE_FIPS_VERSION_PATCH=0 + ENABLED_FIPS="yes" + # for dev, DEF_SP_MATH and DEF_FAST_MATH follow non-FIPS defaults (currently sp-math-all) + ], [v6],[ FIPS_VERSION="v6" HAVE_FIPS_VERSION=6 diff --git a/tests/api.c b/tests/api.c index 2abcb912f..3dcd639b1 100644 --- a/tests/api.c +++ b/tests/api.c @@ -4933,11 +4933,13 @@ static int test_wolfSSL_FPKI(void) DYNAMIC_TYPE_TMP_BUFFER)); ExpectIntEQ(wc_GetFASCNFromCert(&cert, fascn, &fascnSz), 0); XFREE(fascn, NULL, DYNAMIC_TYPE_TMP_BUFFER); + fascn = NULL; ExpectIntEQ(wc_GetUUIDFromCert(&cert, NULL, &uuidSz), WC_NO_ERR_TRACE(LENGTH_ONLY_E)); ExpectNotNull(uuid = (byte*)XMALLOC(uuidSz, NULL, DYNAMIC_TYPE_TMP_BUFFER)); ExpectIntEQ(wc_GetUUIDFromCert(&cert, uuid, &uuidSz), 0); XFREE(uuid, NULL, DYNAMIC_TYPE_TMP_BUFFER); + uuid = NULL; wc_FreeDecodedCert(&cert); XMEMSET(buf, 0, 4096); diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c index 1bcc76cd4..98fc8cf47 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -1908,7 +1908,7 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, register word32* L_AES_Thumb2_td_ecb_c __asm__ ("r5") = (word32*)L_AES_Thumb2_td_ecb; - register byte L_AES_Thumb2_td4_c __asm__ ("r6") = (byte)&L_AES_Thumb2_td4; + register byte L_AES_Thumb2_td4_c __asm__ ("r6") = (byte)(word32)&L_AES_Thumb2_td4; #else register word32* L_AES_Thumb2_td_ecb_c = (word32*)L_AES_Thumb2_td_ecb; @@ -2134,7 +2134,7 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, register word32* L_AES_Thumb2_td_ecb_c __asm__ ("r6") = (word32*)L_AES_Thumb2_td_ecb; - register byte L_AES_Thumb2_td4_c __asm__ ("r7") = (byte)&L_AES_Thumb2_td4; + register byte L_AES_Thumb2_td4_c __asm__ ("r7") = (byte)(word32)&L_AES_Thumb2_td4; #else register word32* L_AES_Thumb2_td_ecb_c = (word32*)L_AES_Thumb2_td_ecb; diff --git a/wolfcrypt/src/wc_mlkem_poly.c b/wolfcrypt/src/wc_mlkem_poly.c index 39f6b9439..3d83a425a 100644 --- a/wolfcrypt/src/wc_mlkem_poly.c +++ b/wolfcrypt/src/wc_mlkem_poly.c @@ -540,308 +540,6 @@ static void mlkem_ntt(sword16* r) #endif } -#if !defined(WOLFSSL_MLKEM_NO_MAKE_KEY) && \ - !defined(WOLFSSL_MLKEM_SMALL) && !defined(WOLFSSL_MLKEM_NO_LARGE_CODE) -/* Number-Theoretic Transform. - * - * FIPS 203, Algorithm 9: NTT(f) - * Computes the NTT representation f_hat of the given polynomial f element of - * R_q. - * 1: f_hat <- f - * 2: i <- 1 - * 3: for (len <- 128; len >= 2; len <- len/2) - * 4: for (start <- 0; start < 256; start <- start + 2.len) - * 5: zeta <- zetas^BitRev_7(i) mod q - * 6: i <- i + 1 - * 7: for (j <- start; j < start + len; j++) - * 8: t <- zeta.f[j+len] - * 9: f_hat[j+len] <- f_hat[j] - t - * 10: f_hat[j] <- f_hat[j] - t - * 11: end for - * 12: end for - * 13: end for - * 14: return f_hat - * - * @param [in, out] r Polynomial to transform. - */ -static void mlkem_ntt_add_to(sword16* r, sword16* a) -{ -#if defined(WOLFSSL_MLKEM_NTT_UNROLL) - /* Unroll len loop (Step 3). */ - unsigned int k = 1; - unsigned int j; - unsigned int start; - sword16 zeta = zetas[k++]; - - /* len = 128 */ - for (j = 0; j < MLKEM_N / 2; ++j) { - sword32 p = (sword32)zeta * r[j + MLKEM_N / 2]; - sword16 t = MLKEM_MONT_RED(p); - sword16 rj = r[j]; - r[j + MLKEM_N / 2] = rj - t; - r[j] = rj + t; - } - /* len = 64 */ - for (start = 0; start < MLKEM_N; start += 2 * 64) { - zeta = zetas[k++]; - for (j = 0; j < 64; ++j) { - sword32 p = (sword32)zeta * r[start + j + 64]; - sword16 t = MLKEM_MONT_RED(p); - sword16 rj = r[start + j]; - r[start + j + 64] = rj - t; - r[start + j] = rj + t; - } - } - /* len = 32 */ - for (start = 0; start < MLKEM_N; start += 2 * 32) { - zeta = zetas[k++]; - for (j = 0; j < 32; ++j) { - sword32 p = (sword32)zeta * r[start + j + 32]; - sword16 t = MLKEM_MONT_RED(p); - sword16 rj = r[start + j]; - r[start + j + 32] = rj - t; - r[start + j] = rj + t; - } - } - /* len = 16 */ - for (start = 0; start < MLKEM_N; start += 2 * 16) { - zeta = zetas[k++]; - for (j = 0; j < 16; ++j) { - sword32 p = (sword32)zeta * r[start + j + 16]; - sword16 t = MLKEM_MONT_RED(p); - sword16 rj = r[start + j]; - r[start + j + 16] = rj - t; - r[start + j] = rj + t; - } - } - /* len = 8 */ - for (start = 0; start < MLKEM_N; start += 2 * 8) { - zeta = zetas[k++]; - for (j = 0; j < 8; ++j) { - sword32 p = (sword32)zeta * r[start + j + 8]; - sword16 t = MLKEM_MONT_RED(p); - sword16 rj = r[start + j]; - r[start + j + 8] = rj - t; - r[start + j] = rj + t; - } - } - /* len = 4 */ - for (start = 0; start < MLKEM_N; start += 2 * 4) { - zeta = zetas[k++]; - for (j = 0; j < 4; ++j) { - sword32 p = (sword32)zeta * r[start + j + 4]; - sword16 t = MLKEM_MONT_RED(p); - sword16 rj = r[start + j]; - r[start + j + 4] = rj - t; - r[start + j] = rj + t; - } - } - /* len = 2 */ - for (start = 0; start < MLKEM_N; start += 2 * 2) { - zeta = zetas[k++]; - for (j = 0; j < 2; ++j) { - sword32 p = (sword32)zeta * r[start + j + 2]; - sword16 t = MLKEM_MONT_RED(p); - sword16 rj = r[start + j]; - r[start + j + 2] = rj - t; - r[start + j] = rj + t; - } - } - /* Reduce coefficients with quick algorithm. */ - for (j = 0; j < MLKEM_N; ++j) { - sword16 t = a[j] + r[j]; - a[j] = MLKEM_BARRETT_RED(t); - } -#else - /* Unroll len (2, 3, 2) and start loops. */ - unsigned int j; - sword16 t0; - sword16 t1; - sword16 t2; - sword16 t3; - - /* len = 128,64 */ - sword16 zeta128 = zetas[1]; - sword16 zeta64_0 = zetas[2]; - sword16 zeta64_1 = zetas[3]; - for (j = 0; j < MLKEM_N / 8; j++) { - sword16 r0 = r[j + 0]; - sword16 r1 = r[j + 32]; - sword16 r2 = r[j + 64]; - sword16 r3 = r[j + 96]; - sword16 r4 = r[j + 128]; - sword16 r5 = r[j + 160]; - sword16 r6 = r[j + 192]; - sword16 r7 = r[j + 224]; - - t0 = MLKEM_MONT_RED((sword32)zeta128 * r4); - t1 = MLKEM_MONT_RED((sword32)zeta128 * r5); - t2 = MLKEM_MONT_RED((sword32)zeta128 * r6); - t3 = MLKEM_MONT_RED((sword32)zeta128 * r7); - r4 = r0 - t0; - r5 = r1 - t1; - r6 = r2 - t2; - r7 = r3 - t3; - r0 += t0; - r1 += t1; - r2 += t2; - r3 += t3; - - t0 = MLKEM_MONT_RED((sword32)zeta64_0 * r2); - t1 = MLKEM_MONT_RED((sword32)zeta64_0 * r3); - t2 = MLKEM_MONT_RED((sword32)zeta64_1 * r6); - t3 = MLKEM_MONT_RED((sword32)zeta64_1 * r7); - r2 = r0 - t0; - r3 = r1 - t1; - r6 = r4 - t2; - r7 = r5 - t3; - r0 += t0; - r1 += t1; - r4 += t2; - r5 += t3; - - r[j + 0] = r0; - r[j + 32] = r1; - r[j + 64] = r2; - r[j + 96] = r3; - r[j + 128] = r4; - r[j + 160] = r5; - r[j + 192] = r6; - r[j + 224] = r7; - } - - /* len = 32,16,8 */ - for (j = 0; j < MLKEM_N; j += 64) { - int i; - sword16 zeta32 = zetas[ 4 + j / 64 + 0]; - sword16 zeta16_0 = zetas[ 8 + j / 32 + 0]; - sword16 zeta16_1 = zetas[ 8 + j / 32 + 1]; - sword16 zeta8_0 = zetas[16 + j / 16 + 0]; - sword16 zeta8_1 = zetas[16 + j / 16 + 1]; - sword16 zeta8_2 = zetas[16 + j / 16 + 2]; - sword16 zeta8_3 = zetas[16 + j / 16 + 3]; - for (i = 0; i < 8; i++) { - sword16 r0 = r[j + i + 0]; - sword16 r1 = r[j + i + 8]; - sword16 r2 = r[j + i + 16]; - sword16 r3 = r[j + i + 24]; - sword16 r4 = r[j + i + 32]; - sword16 r5 = r[j + i + 40]; - sword16 r6 = r[j + i + 48]; - sword16 r7 = r[j + i + 56]; - - t0 = MLKEM_MONT_RED((sword32)zeta32 * r4); - t1 = MLKEM_MONT_RED((sword32)zeta32 * r5); - t2 = MLKEM_MONT_RED((sword32)zeta32 * r6); - t3 = MLKEM_MONT_RED((sword32)zeta32 * r7); - r4 = r0 - t0; - r5 = r1 - t1; - r6 = r2 - t2; - r7 = r3 - t3; - r0 += t0; - r1 += t1; - r2 += t2; - r3 += t3; - - t0 = MLKEM_MONT_RED((sword32)zeta16_0 * r2); - t1 = MLKEM_MONT_RED((sword32)zeta16_0 * r3); - t2 = MLKEM_MONT_RED((sword32)zeta16_1 * r6); - t3 = MLKEM_MONT_RED((sword32)zeta16_1 * r7); - r2 = r0 - t0; - r3 = r1 - t1; - r6 = r4 - t2; - r7 = r5 - t3; - r0 += t0; - r1 += t1; - r4 += t2; - r5 += t3; - - t0 = MLKEM_MONT_RED((sword32)zeta8_0 * r1); - t1 = MLKEM_MONT_RED((sword32)zeta8_1 * r3); - t2 = MLKEM_MONT_RED((sword32)zeta8_2 * r5); - t3 = MLKEM_MONT_RED((sword32)zeta8_3 * r7); - r1 = r0 - t0; - r3 = r2 - t1; - r5 = r4 - t2; - r7 = r6 - t3; - r0 += t0; - r2 += t1; - r4 += t2; - r6 += t3; - - r[j + i + 0] = r0; - r[j + i + 8] = r1; - r[j + i + 16] = r2; - r[j + i + 24] = r3; - r[j + i + 32] = r4; - r[j + i + 40] = r5; - r[j + i + 48] = r6; - r[j + i + 56] = r7; - } - } - - /* len = 4,2 and Final reduction */ - for (j = 0; j < MLKEM_N; j += 8) { - sword16 zeta4 = zetas[32 + j / 8 + 0]; - sword16 zeta2_0 = zetas[64 + j / 4 + 0]; - sword16 zeta2_1 = zetas[64 + j / 4 + 1]; - sword16 r0 = r[j + 0]; - sword16 r1 = r[j + 1]; - sword16 r2 = r[j + 2]; - sword16 r3 = r[j + 3]; - sword16 r4 = r[j + 4]; - sword16 r5 = r[j + 5]; - sword16 r6 = r[j + 6]; - sword16 r7 = r[j + 7]; - - t0 = MLKEM_MONT_RED((sword32)zeta4 * r4); - t1 = MLKEM_MONT_RED((sword32)zeta4 * r5); - t2 = MLKEM_MONT_RED((sword32)zeta4 * r6); - t3 = MLKEM_MONT_RED((sword32)zeta4 * r7); - r4 = r0 - t0; - r5 = r1 - t1; - r6 = r2 - t2; - r7 = r3 - t3; - r0 += t0; - r1 += t1; - r2 += t2; - r3 += t3; - - t0 = MLKEM_MONT_RED((sword32)zeta2_0 * r2); - t1 = MLKEM_MONT_RED((sword32)zeta2_0 * r3); - t2 = MLKEM_MONT_RED((sword32)zeta2_1 * r6); - t3 = MLKEM_MONT_RED((sword32)zeta2_1 * r7); - r2 = r0 - t0; - r3 = r1 - t1; - r6 = r4 - t2; - r7 = r5 - t3; - r0 += t0; - r1 += t1; - r4 += t2; - r5 += t3; - - r0 += a[j + 0]; - r1 += a[j + 1]; - r2 += a[j + 2]; - r3 += a[j + 3]; - r4 += a[j + 4]; - r5 += a[j + 5]; - r6 += a[j + 6]; - r7 += a[j + 7]; - - a[j + 0] = MLKEM_BARRETT_RED(r0); - a[j + 1] = MLKEM_BARRETT_RED(r1); - a[j + 2] = MLKEM_BARRETT_RED(r2); - a[j + 3] = MLKEM_BARRETT_RED(r3); - a[j + 4] = MLKEM_BARRETT_RED(r4); - a[j + 5] = MLKEM_BARRETT_RED(r5); - a[j + 6] = MLKEM_BARRETT_RED(r6); - a[j + 7] = MLKEM_BARRETT_RED(r7); - } -#endif -} -#endif - #if !defined(WOLFSSL_MLKEM_NO_ENCAPSULATE) || \ !defined(WOLFSSL_MLKEM_NO_DECAPSULATE) /* Zetas for inverse NTT. */ @@ -1783,6 +1481,308 @@ void mlkem_decapsulate(const sword16* s, sword16* w, sword16* u, #else #ifndef WOLFSSL_MLKEM_NO_MAKE_KEY + +#if !defined(WOLFSSL_MLKEM_SMALL) && !defined(WOLFSSL_MLKEM_NO_LARGE_CODE) +/* Number-Theoretic Transform. + * + * FIPS 203, Algorithm 9: NTT(f) + * Computes the NTT representation f_hat of the given polynomial f element of + * R_q. + * 1: f_hat <- f + * 2: i <- 1 + * 3: for (len <- 128; len >= 2; len <- len/2) + * 4: for (start <- 0; start < 256; start <- start + 2.len) + * 5: zeta <- zetas^BitRev_7(i) mod q + * 6: i <- i + 1 + * 7: for (j <- start; j < start + len; j++) + * 8: t <- zeta.f[j+len] + * 9: f_hat[j+len] <- f_hat[j] - t + * 10: f_hat[j] <- f_hat[j] - t + * 11: end for + * 12: end for + * 13: end for + * 14: return f_hat + * + * @param [in, out] r Polynomial to transform. + */ +static void mlkem_ntt_add_to(sword16* r, sword16* a) +{ +#if defined(WOLFSSL_MLKEM_NTT_UNROLL) + /* Unroll len loop (Step 3). */ + unsigned int k = 1; + unsigned int j; + unsigned int start; + sword16 zeta = zetas[k++]; + + /* len = 128 */ + for (j = 0; j < MLKEM_N / 2; ++j) { + sword32 p = (sword32)zeta * r[j + MLKEM_N / 2]; + sword16 t = MLKEM_MONT_RED(p); + sword16 rj = r[j]; + r[j + MLKEM_N / 2] = rj - t; + r[j] = rj + t; + } + /* len = 64 */ + for (start = 0; start < MLKEM_N; start += 2 * 64) { + zeta = zetas[k++]; + for (j = 0; j < 64; ++j) { + sword32 p = (sword32)zeta * r[start + j + 64]; + sword16 t = MLKEM_MONT_RED(p); + sword16 rj = r[start + j]; + r[start + j + 64] = rj - t; + r[start + j] = rj + t; + } + } + /* len = 32 */ + for (start = 0; start < MLKEM_N; start += 2 * 32) { + zeta = zetas[k++]; + for (j = 0; j < 32; ++j) { + sword32 p = (sword32)zeta * r[start + j + 32]; + sword16 t = MLKEM_MONT_RED(p); + sword16 rj = r[start + j]; + r[start + j + 32] = rj - t; + r[start + j] = rj + t; + } + } + /* len = 16 */ + for (start = 0; start < MLKEM_N; start += 2 * 16) { + zeta = zetas[k++]; + for (j = 0; j < 16; ++j) { + sword32 p = (sword32)zeta * r[start + j + 16]; + sword16 t = MLKEM_MONT_RED(p); + sword16 rj = r[start + j]; + r[start + j + 16] = rj - t; + r[start + j] = rj + t; + } + } + /* len = 8 */ + for (start = 0; start < MLKEM_N; start += 2 * 8) { + zeta = zetas[k++]; + for (j = 0; j < 8; ++j) { + sword32 p = (sword32)zeta * r[start + j + 8]; + sword16 t = MLKEM_MONT_RED(p); + sword16 rj = r[start + j]; + r[start + j + 8] = rj - t; + r[start + j] = rj + t; + } + } + /* len = 4 */ + for (start = 0; start < MLKEM_N; start += 2 * 4) { + zeta = zetas[k++]; + for (j = 0; j < 4; ++j) { + sword32 p = (sword32)zeta * r[start + j + 4]; + sword16 t = MLKEM_MONT_RED(p); + sword16 rj = r[start + j]; + r[start + j + 4] = rj - t; + r[start + j] = rj + t; + } + } + /* len = 2 */ + for (start = 0; start < MLKEM_N; start += 2 * 2) { + zeta = zetas[k++]; + for (j = 0; j < 2; ++j) { + sword32 p = (sword32)zeta * r[start + j + 2]; + sword16 t = MLKEM_MONT_RED(p); + sword16 rj = r[start + j]; + r[start + j + 2] = rj - t; + r[start + j] = rj + t; + } + } + /* Reduce coefficients with quick algorithm. */ + for (j = 0; j < MLKEM_N; ++j) { + sword16 t = a[j] + r[j]; + a[j] = MLKEM_BARRETT_RED(t); + } +#else /* !WOLFSSL_MLKEM_NTT_UNROLL */ + /* Unroll len (2, 3, 2) and start loops. */ + unsigned int j; + sword16 t0; + sword16 t1; + sword16 t2; + sword16 t3; + + /* len = 128,64 */ + sword16 zeta128 = zetas[1]; + sword16 zeta64_0 = zetas[2]; + sword16 zeta64_1 = zetas[3]; + for (j = 0; j < MLKEM_N / 8; j++) { + sword16 r0 = r[j + 0]; + sword16 r1 = r[j + 32]; + sword16 r2 = r[j + 64]; + sword16 r3 = r[j + 96]; + sword16 r4 = r[j + 128]; + sword16 r5 = r[j + 160]; + sword16 r6 = r[j + 192]; + sword16 r7 = r[j + 224]; + + t0 = MLKEM_MONT_RED((sword32)zeta128 * r4); + t1 = MLKEM_MONT_RED((sword32)zeta128 * r5); + t2 = MLKEM_MONT_RED((sword32)zeta128 * r6); + t3 = MLKEM_MONT_RED((sword32)zeta128 * r7); + r4 = r0 - t0; + r5 = r1 - t1; + r6 = r2 - t2; + r7 = r3 - t3; + r0 += t0; + r1 += t1; + r2 += t2; + r3 += t3; + + t0 = MLKEM_MONT_RED((sword32)zeta64_0 * r2); + t1 = MLKEM_MONT_RED((sword32)zeta64_0 * r3); + t2 = MLKEM_MONT_RED((sword32)zeta64_1 * r6); + t3 = MLKEM_MONT_RED((sword32)zeta64_1 * r7); + r2 = r0 - t0; + r3 = r1 - t1; + r6 = r4 - t2; + r7 = r5 - t3; + r0 += t0; + r1 += t1; + r4 += t2; + r5 += t3; + + r[j + 0] = r0; + r[j + 32] = r1; + r[j + 64] = r2; + r[j + 96] = r3; + r[j + 128] = r4; + r[j + 160] = r5; + r[j + 192] = r6; + r[j + 224] = r7; + } + + /* len = 32,16,8 */ + for (j = 0; j < MLKEM_N; j += 64) { + int i; + sword16 zeta32 = zetas[ 4 + j / 64 + 0]; + sword16 zeta16_0 = zetas[ 8 + j / 32 + 0]; + sword16 zeta16_1 = zetas[ 8 + j / 32 + 1]; + sword16 zeta8_0 = zetas[16 + j / 16 + 0]; + sword16 zeta8_1 = zetas[16 + j / 16 + 1]; + sword16 zeta8_2 = zetas[16 + j / 16 + 2]; + sword16 zeta8_3 = zetas[16 + j / 16 + 3]; + for (i = 0; i < 8; i++) { + sword16 r0 = r[j + i + 0]; + sword16 r1 = r[j + i + 8]; + sword16 r2 = r[j + i + 16]; + sword16 r3 = r[j + i + 24]; + sword16 r4 = r[j + i + 32]; + sword16 r5 = r[j + i + 40]; + sword16 r6 = r[j + i + 48]; + sword16 r7 = r[j + i + 56]; + + t0 = MLKEM_MONT_RED((sword32)zeta32 * r4); + t1 = MLKEM_MONT_RED((sword32)zeta32 * r5); + t2 = MLKEM_MONT_RED((sword32)zeta32 * r6); + t3 = MLKEM_MONT_RED((sword32)zeta32 * r7); + r4 = r0 - t0; + r5 = r1 - t1; + r6 = r2 - t2; + r7 = r3 - t3; + r0 += t0; + r1 += t1; + r2 += t2; + r3 += t3; + + t0 = MLKEM_MONT_RED((sword32)zeta16_0 * r2); + t1 = MLKEM_MONT_RED((sword32)zeta16_0 * r3); + t2 = MLKEM_MONT_RED((sword32)zeta16_1 * r6); + t3 = MLKEM_MONT_RED((sword32)zeta16_1 * r7); + r2 = r0 - t0; + r3 = r1 - t1; + r6 = r4 - t2; + r7 = r5 - t3; + r0 += t0; + r1 += t1; + r4 += t2; + r5 += t3; + + t0 = MLKEM_MONT_RED((sword32)zeta8_0 * r1); + t1 = MLKEM_MONT_RED((sword32)zeta8_1 * r3); + t2 = MLKEM_MONT_RED((sword32)zeta8_2 * r5); + t3 = MLKEM_MONT_RED((sword32)zeta8_3 * r7); + r1 = r0 - t0; + r3 = r2 - t1; + r5 = r4 - t2; + r7 = r6 - t3; + r0 += t0; + r2 += t1; + r4 += t2; + r6 += t3; + + r[j + i + 0] = r0; + r[j + i + 8] = r1; + r[j + i + 16] = r2; + r[j + i + 24] = r3; + r[j + i + 32] = r4; + r[j + i + 40] = r5; + r[j + i + 48] = r6; + r[j + i + 56] = r7; + } + } + + /* len = 4,2 and Final reduction */ + for (j = 0; j < MLKEM_N; j += 8) { + sword16 zeta4 = zetas[32 + j / 8 + 0]; + sword16 zeta2_0 = zetas[64 + j / 4 + 0]; + sword16 zeta2_1 = zetas[64 + j / 4 + 1]; + sword16 r0 = r[j + 0]; + sword16 r1 = r[j + 1]; + sword16 r2 = r[j + 2]; + sword16 r3 = r[j + 3]; + sword16 r4 = r[j + 4]; + sword16 r5 = r[j + 5]; + sword16 r6 = r[j + 6]; + sword16 r7 = r[j + 7]; + + t0 = MLKEM_MONT_RED((sword32)zeta4 * r4); + t1 = MLKEM_MONT_RED((sword32)zeta4 * r5); + t2 = MLKEM_MONT_RED((sword32)zeta4 * r6); + t3 = MLKEM_MONT_RED((sword32)zeta4 * r7); + r4 = r0 - t0; + r5 = r1 - t1; + r6 = r2 - t2; + r7 = r3 - t3; + r0 += t0; + r1 += t1; + r2 += t2; + r3 += t3; + + t0 = MLKEM_MONT_RED((sword32)zeta2_0 * r2); + t1 = MLKEM_MONT_RED((sword32)zeta2_0 * r3); + t2 = MLKEM_MONT_RED((sword32)zeta2_1 * r6); + t3 = MLKEM_MONT_RED((sword32)zeta2_1 * r7); + r2 = r0 - t0; + r3 = r1 - t1; + r6 = r4 - t2; + r7 = r5 - t3; + r0 += t0; + r1 += t1; + r4 += t2; + r5 += t3; + + r0 += a[j + 0]; + r1 += a[j + 1]; + r2 += a[j + 2]; + r3 += a[j + 3]; + r4 += a[j + 4]; + r5 += a[j + 5]; + r6 += a[j + 6]; + r7 += a[j + 7]; + + a[j + 0] = MLKEM_BARRETT_RED(r0); + a[j + 1] = MLKEM_BARRETT_RED(r1); + a[j + 2] = MLKEM_BARRETT_RED(r2); + a[j + 3] = MLKEM_BARRETT_RED(r3); + a[j + 4] = MLKEM_BARRETT_RED(r4); + a[j + 5] = MLKEM_BARRETT_RED(r5); + a[j + 6] = MLKEM_BARRETT_RED(r6); + a[j + 7] = MLKEM_BARRETT_RED(r7); + } +#endif /* !WOLFSSL_MLKEM_NTT_UNROLL */ +} +#endif /* !WOLFSSL_MLKEM_SMALL && !WOLFSSL_MLKEM_NO_LARGE_CODE */ + #ifndef WOLFSSL_MLKEM_MAKEKEY_SMALL_MEM /* Generate a public-private key pair from randomly generated data. * @@ -1873,7 +1873,7 @@ void mlkem_keygen(sword16* s, sword16* t, sword16* e, const sword16* a, int k) } } -#else +#else /* WOLFSSL_MLKEM_MAKEKEY_SMALL_MEM */ /* Generate a public-private key pair from randomly generated data. * @@ -1959,7 +1959,7 @@ int mlkem_keygen_seeds(sword16* s, sword16* t, MLKEM_PRF_T* prf, return ret; } -#endif +#endif /* WOLFSSL_MLKEM_MAKEKEY_SMALL_MEM */ #endif /* !WOLFSSL_MLKEM_NO_MAKE_KEY */ #if !defined(WOLFSSL_MLKEM_NO_ENCAPSULATE) || \