diff --git a/linuxkm/Kbuild b/linuxkm/Kbuild
index 3f807d18b..93a440bd7 100644
--- a/linuxkm/Kbuild
+++ b/linuxkm/Kbuild
@@ -64,18 +64,38 @@
 $(obj)/wolfcrypt/test/test.o: ccflags-y += -DNO_MAIN_DRIVER
 
 $(obj)/wolfcrypt/src/aes.o: ccflags-y = $(WOLFSSL_CFLAGS) $(WOLFSSL_CFLAGS_YES_VECTOR_INSNS)
 
-asflags-y := $(WOLFSSL_ASFLAGS)
+asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPUSIMD_DISABLE)
 
-# these two _asms are kernel-compatible (they don't reference the pic-related _GLOBAL_OFFSET_TABLE_)
-# but they still irritate objtool: "unannotated intra-function call" and "BP used as a scratch register"
+# after the C wrapper for a vectorized algorithm has been equipped with {SAVE,RESTORE}_VECTOR_REGISTERS(),
+# it can be safely included here:
+$(obj)/wolfcrypt/src/aes_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+$(obj)/wolfcrypt/src/aes_gcm_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+$(obj)/wolfcrypt/src/chacha_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+$(obj)/wolfcrypt/src/poly1305_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+$(obj)/wolfcrypt/src/sha256_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+$(obj)/wolfcrypt/src/sha512_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+$(obj)/wolfcrypt/src/fe_x25519_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+
+# these _asms are kernel-compatible, but they still irritate objtool:
 $(obj)/wolfcrypt/src/aes_asm.o: OBJECT_FILES_NON_STANDARD := y
 $(obj)/wolfcrypt/src/aes_gcm_asm.o: OBJECT_FILES_NON_STANDARD := y
+$(obj)/wolfcrypt/src/chacha_asm.o: OBJECT_FILES_NON_STANDARD := y
+$(obj)/wolfcrypt/src/poly1305_asm.o: OBJECT_FILES_NON_STANDARD := y
+$(obj)/wolfcrypt/src/sha256_asm.o: OBJECT_FILES_NON_STANDARD := y
+$(obj)/wolfcrypt/src/sha512_asm.o: OBJECT_FILES_NON_STANDARD := y
+$(obj)/wolfcrypt/src/fe_x25519_asm.o: OBJECT_FILES_NON_STANDARD := y
 
 # auto-generate the exported symbol list, leveraging the WOLFSSL_API visibility tags.
 # exclude symbols that don't match wc_* or wolf*.
 $(src)/linuxkm/module_exports.c: $(src)/linuxkm/module_exports.c.template $(WOLFSSL_OBJ_TARGETS)
 	@cp $< $@
-	@readelf --symbols --wide $(WOLFSSL_OBJ_TARGETS) | awk '/^ *[0-9]+: /{if ($$8 !~ /^(wc_|wolf)/){next;} if (($$4 == "FUNC") && ($$5 == "GLOBAL") && ($$6 == "DEFAULT")) { print "EXPORT_SYMBOL(" $$8 ");"; }}' >> $@
+	@readelf --symbols --wide $(WOLFSSL_OBJ_TARGETS) | \
+		awk '/^ *[0-9]+: / {                                                      \
+			if ($$8 !~ /^(wc_|wolf)/){next;}                                  \
+			if (($$4 == "FUNC") && ($$5 == "GLOBAL") && ($$6 == "DEFAULT")) { \
+				print "EXPORT_SYMBOL(" $$8 ");";                          \
+			}                                                                 \
+		}' >> $@
 	@echo -e '#ifndef NO_CRYPT_TEST\nEXPORT_SYMBOL(wolfcrypt_test);\n#endif' >> $@
 
 clean-files := module_exports.c
diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c
index a7ca6ef95..1a9b634d0 100644
--- a/wolfcrypt/src/chacha.c
+++ b/wolfcrypt/src/chacha.c
@@ -339,12 +339,16 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
 
 #ifdef HAVE_INTEL_AVX2
     if (IS_INTEL_AVX2(cpuidFlags)) {
+        SAVE_VECTOR_REGISTERS();
         chacha_encrypt_avx2(ctx, input, output, msglen);
+        RESTORE_VECTOR_REGISTERS();
         return 0;
     }
 #endif
     if (IS_INTEL_AVX1(cpuidFlags)) {
+        SAVE_VECTOR_REGISTERS();
         chacha_encrypt_avx1(ctx, input, output, msglen);
+        RESTORE_VECTOR_REGISTERS();
         return 0;
     }
     else {
diff --git a/wolfcrypt/src/curve25519.c b/wolfcrypt/src/curve25519.c
index f5a80ffc1..a0e0b19cb 100644
--- a/wolfcrypt/src/curve25519.c
+++ b/wolfcrypt/src/curve25519.c
@@ -85,7 +85,16 @@ int wc_curve25519_make_pub(int public_size, byte* pub, int private_size,
     }
 #else
     fe_init();
+
+    #if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
+    SAVE_VECTOR_REGISTERS();
+    #endif
+
     ret = curve25519(pub, priv, kCurve25519BasePoint);
+
+    #if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
+    RESTORE_VECTOR_REGISTERS();
+    #endif
 #endif
 
     return ret;
@@ -148,7 +157,15 @@ int wc_curve25519_shared_secret_ex(curve25519_key* private_key,
 #ifdef FREESCALE_LTC_ECC
     ret = nxp_ltc_curve25519(&o, private_key->k.point, &public_key->p, kLTC_Curve25519 /* input point P on Curve25519 */);
 #else
+    #if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
+    SAVE_VECTOR_REGISTERS();
+    #endif
+
     ret = curve25519(o, private_key->k.point, public_key->p.point);
+
+    #if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
+    RESTORE_VECTOR_REGISTERS();
+    #endif
 #endif
     if (ret != 0) {
 #ifdef FREESCALE_LTC_ECC
diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c
index d76a27a23..2fdd51436 100644
--- a/wolfcrypt/src/poly1305.c
+++ b/wolfcrypt/src/poly1305.c
@@ -267,7 +267,9 @@ static void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
 {
 #ifdef USE_INTEL_SPEEDUP
     /* AVX2 is handled in wc_Poly1305Update. */
+    SAVE_VECTOR_REGISTERS();
     poly1305_blocks_avx(ctx, m, bytes);
+    RESTORE_VECTOR_REGISTERS();
 #elif defined(POLY130564)
     const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
     word64 r0,r1,r2;
@@ -394,7 +396,9 @@ static void poly1305_block(Poly1305* ctx, const unsigned char *m)
 {
 #ifdef USE_INTEL_SPEEDUP
     /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
+    SAVE_VECTOR_REGISTERS();
     poly1305_block_avx(ctx, m);
+    RESTORE_VECTOR_REGISTERS();
 #else
     poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
 #endif
@@ -430,12 +434,14 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
         intel_flags = cpuid_get_flags();
         cpu_flags_set = 1;
     }
+    SAVE_VECTOR_REGISTERS();
     #ifdef HAVE_INTEL_AVX2
     if (IS_INTEL_AVX2(intel_flags))
         poly1305_setkey_avx2(ctx, key);
     else
     #endif
         poly1305_setkey_avx(ctx, key);
+    RESTORE_VECTOR_REGISTERS();
 #elif defined(POLY130564)
 
     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
@@ -510,12 +516,14 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
         return BAD_FUNC_ARG;
 
 #ifdef USE_INTEL_SPEEDUP
+    SAVE_VECTOR_REGISTERS();
     #ifdef HAVE_INTEL_AVX2
     if (IS_INTEL_AVX2(intel_flags))
         poly1305_final_avx2(ctx, mac);
     else
     #endif
         poly1305_final_avx(ctx, mac);
+    RESTORE_VECTOR_REGISTERS();
 #elif defined(POLY130564)
 
     /* process the remaining block */
@@ -712,11 +720,13 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
             if (ctx->leftover < sizeof(ctx->buffer))
                 return 0;
 
+            SAVE_VECTOR_REGISTERS();
             if (!ctx->started)
                 poly1305_calc_powers_avx2(ctx);
             poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
             ctx->leftover = 0;
-        }
+        } else
+            SAVE_VECTOR_REGISTERS();
 
         /* process full blocks */
         if (bytes >= sizeof(ctx->buffer)) {
@@ -735,6 +745,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
                 ctx->buffer[ctx->leftover + i] = m[i];
             ctx->leftover += bytes;
         }
+        RESTORE_VECTOR_REGISTERS();
     }
     else
 #endif
diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c
index 3b9d2f169..41b01712b 100644
--- a/wolfcrypt/src/sha256.c
+++ b/wolfcrypt/src/sha256.c
@@ -309,9 +309,26 @@ static int InitSha256(wc_Sha256* sha256)
                                                              /* = NULL */
     static int transform_check = 0;
     static word32 intel_flags;
+    static int Transform_Sha256_is_vectorized = 0;
 
-    #define XTRANSFORM(S, D)        (*Transform_Sha256_p)((S),(D))
-    #define XTRANSFORM_LEN(S, D, L) (*Transform_Sha256_Len_p)((S),(D),(L))
+    #define XTRANSFORM(S, D) ({                                 \
+        int _ret;                                               \
+        if (Transform_Sha256_is_vectorized)                     \
+            SAVE_VECTOR_REGISTERS();                            \
+        _ret = (*Transform_Sha256_p)((S),(D));                  \
+        if (Transform_Sha256_is_vectorized)                     \
+            RESTORE_VECTOR_REGISTERS();                         \
+        _ret;                                                   \
+    })
+    #define XTRANSFORM_LEN(S, D, L) ({                          \
+        int _ret;                                               \
+        if (Transform_Sha256_is_vectorized)                     \
+            SAVE_VECTOR_REGISTERS();                            \
+        _ret = (*Transform_Sha256_Len_p)((S),(D),(L));          \
+        if (Transform_Sha256_is_vectorized)                     \
+            RESTORE_VECTOR_REGISTERS();                         \
+        _ret;                                                   \
+    })
 
     static void Sha256_SetTransform(void)
     {
@@ -327,6 +344,7 @@ static int InitSha256(wc_Sha256* sha256)
             if (IS_INTEL_BMI2(intel_flags)) {
                 Transform_Sha256_p = Transform_Sha256_AVX2_RORX;
                 Transform_Sha256_Len_p = Transform_Sha256_AVX2_RORX_Len;
+                Transform_Sha256_is_vectorized = 1;
             }
             else
         #endif
@@ -334,11 +352,13 @@ static int InitSha256(wc_Sha256* sha256)
             {
                 Transform_Sha256_p = Transform_Sha256_AVX2;
                 Transform_Sha256_Len_p = Transform_Sha256_AVX2_Len;
+                Transform_Sha256_is_vectorized = 1;
             }
         #ifdef HAVE_INTEL_RORX
             else {
                 Transform_Sha256_p = Transform_Sha256_AVX1_RORX;
                 Transform_Sha256_Len_p = Transform_Sha256_AVX1_RORX_Len;
+                Transform_Sha256_is_vectorized = 1;
             }
         #endif
         }
@@ -348,12 +368,14 @@ static int InitSha256(wc_Sha256* sha256)
         if (IS_INTEL_AVX1(intel_flags)) {
             Transform_Sha256_p = Transform_Sha256_AVX1;
             Transform_Sha256_Len_p = Transform_Sha256_AVX1_Len;
+            Transform_Sha256_is_vectorized = 1;
         }
         else
     #endif
         {
             Transform_Sha256_p = Transform_Sha256;
             Transform_Sha256_Len_p = NULL;
+            Transform_Sha256_is_vectorized = 0;
         }
 
         transform_check = 1;
diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c
index 5bceb44b6..aa0df73f9 100644
--- a/wolfcrypt/src/sha512.c
+++ b/wolfcrypt/src/sha512.c
@@ -337,9 +337,31 @@ static int InitSha512(wc_Sha512* sha512)
     static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL;
     static int transform_check = 0;
     static int intel_flags;
+    static int Transform_Sha512_is_vectorized = 0;
+#if 0
     #define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512)
     #define Transform_Sha512_Len(sha512, len) \
         (*Transform_Sha512_Len_p)(sha512, len)
+#endif
+
+    #define Transform_Sha512(sha512) ({                         \
+        int _ret;                                               \
+        if (Transform_Sha512_is_vectorized)                     \
+            SAVE_VECTOR_REGISTERS();                            \
+        _ret = (*Transform_Sha512_p)(sha512);                   \
+        if (Transform_Sha512_is_vectorized)                     \
+            RESTORE_VECTOR_REGISTERS();                         \
+        _ret;                                                   \
+    })
+#define Transform_Sha512_Len(sha512, len) ({                    \
+        int _ret;                                               \
+        if (Transform_Sha512_is_vectorized)                     \
+            SAVE_VECTOR_REGISTERS();                            \
+        _ret = (*Transform_Sha512_Len_p)(sha512, len);          \
+        if (Transform_Sha512_is_vectorized)                     \
+            RESTORE_VECTOR_REGISTERS();                         \
+        _ret;                                                   \
+    })
 
     static void Sha512_SetTransform(void)
     {
@@ -354,17 +376,20 @@ static int InitSha512(wc_Sha512* sha512)
         if (IS_INTEL_BMI2(intel_flags)) {
             Transform_Sha512_p = Transform_Sha512_AVX2_RORX;
             Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len;
+            Transform_Sha512_is_vectorized = 1;
         }
         else
     #endif
         if (1) {
             Transform_Sha512_p = Transform_Sha512_AVX2;
             Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len;
+            Transform_Sha512_is_vectorized = 1;
         }
     #ifdef HAVE_INTEL_RORX
         else {
             Transform_Sha512_p = Transform_Sha512_AVX1_RORX;
             Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len;
+            Transform_Sha512_is_vectorized = 1;
         }
     #endif
     }
@@ -374,10 +399,14 @@ static int InitSha512(wc_Sha512* sha512)
         if (IS_INTEL_AVX1(intel_flags)) {
             Transform_Sha512_p = Transform_Sha512_AVX1;
             Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len;
+            Transform_Sha512_is_vectorized = 1;
         }
         else
     #endif
+        {
             Transform_Sha512_p = _Transform_Sha512;
+            Transform_Sha512_is_vectorized = 1;
+        }
 
         transform_check = 1;
     }
diff --git a/wolfssl/wolfcrypt/wc_port.h b/wolfssl/wolfcrypt/wc_port.h
index 8ebc0808a..e876cdc4d 100644
--- a/wolfssl/wolfcrypt/wc_port.h
+++ b/wolfssl/wolfcrypt/wc_port.h
@@ -88,12 +88,21 @@
     #endif
     #include
    #include
-    #if defined(WOLFSSL_AESNI) || defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
-        #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
-            #include
-        #else
-            #include
-        #endif
+    #if defined(WOLFSSL_AESNI) || defined(USE_INTEL_SPEEDUP)
+        #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
+            #include
+        #else
+            #include
+        #endif
+        #define SAVE_VECTOR_REGISTERS() kernel_fpu_begin()
+        #define RESTORE_VECTOR_REGISTERS() kernel_fpu_end()
+    #elif defined(WOLFSSL_ARMASM)
+        #include
+#define SAVE_VECTOR_REGISTERS() ({ preempt_disable(); fpsimd_preserve_current_state(); })
+#define RESTORE_VECTOR_REGISTERS() ({ fpsimd_restore_current_state(); preempt_enable(); })
+    #else
+        #define SAVE_VECTOR_REGISTERS() ({})
+        #define RESTORE_VECTOR_REGISTERS() ({})
     #endif
 
     _Pragma("GCC diagnostic pop");
@@ -122,13 +131,10 @@
 
     /* the rigmarole around kstrtol() here is to accommodate its warn-unused-result attribute. */
     #define XATOI(s) ({ long _xatoi_res = 0; int _xatoi_ret = kstrtol(s, 10, &_xatoi_res); if (_xatoi_ret != 0) { _xatoi_res = 0; } (int)_xatoi_res; })
 
-    #define SAVE_VECTOR_REGISTERS() kernel_fpu_begin()
-    #define RESTORE_VECTOR_REGISTERS() kernel_fpu_end()
-
 #else /* !WOLFSSL_LINUXKM */
 
-    #define SAVE_VECTOR_REGISTERS()
-    #define RESTORE_VECTOR_REGISTERS()
+    #define SAVE_VECTOR_REGISTERS() ({})
+    #define RESTORE_VECTOR_REGISTERS() ({})
 
 #endif /* WOLFSSL_LINUXKM */
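
Note on the pattern applied above (illustrative, not part of the patch): every kernel-mode entry into vectorized code is bracketed by SAVE_VECTOR_REGISTERS()/RESTORE_VECTOR_REGISTERS(), which map to kernel_fpu_begin()/kernel_fpu_end() on x86, to preempt_disable() + fpsimd_preserve_current_state() and fpsimd_restore_current_state() + preempt_enable() on ARM, and to empty statement expressions everywhere else. A minimal sketch of a new C wrapper following that convention is below; the wrapper and assembly routine names (my_alg_transform, my_alg_transform_avx) are hypothetical, and only the two macros come from wc_port.h as patched above.

    /* hypothetical example -- my_alg_transform_avx() stands in for any vectorized asm routine */
    #include <wolfssl/wolfcrypt/wc_port.h>

    extern void my_alg_transform_avx(unsigned char* out, const unsigned char* in);

    void my_alg_transform(unsigned char* out, const unsigned char* in)
    {
        SAVE_VECTOR_REGISTERS();        /* protect the kernel's FPU/SIMD state */
        my_alg_transform_avx(out, in);  /* vector instructions are safe inside this window */
        RESTORE_VECTOR_REGISTERS();     /* hand the vector registers back to the kernel */
    }

Once a wrapper is guarded this way, its corresponding *_asm.o can be added to the ASFLAGS_FPU_DISABLE_SIMD_ENABLE list in linuxkm/Kbuild, as the comment in the first hunk describes.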