linuxkm: enable the rest of the _asm implementations for x86, wrapped in {SAVE,RESTORE}_VECTOR_REGISTERS().

This commit is contained in:
Daniel Pouzzner
2020-09-08 23:11:03 -05:00
parent 331fe47eb6
commit 7c2aefcfdd
7 changed files with 127 additions and 18 deletions

View File

@@ -64,18 +64,38 @@ $(obj)/wolfcrypt/test/test.o: ccflags-y += -DNO_MAIN_DRIVER
$(obj)/wolfcrypt/src/aes.o: ccflags-y = $(WOLFSSL_CFLAGS) $(WOLFSSL_CFLAGS_YES_VECTOR_INSNS)
asflags-y := $(WOLFSSL_ASFLAGS)
asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPUSIMD_DISABLE)
# these two _asms are kernel-compatible (they don't reference the pic-related _GLOBAL_OFFSET_TABLE_)
# but they still irritate objtool: "unannotated intra-function call" and "BP used as a scratch register"
# after the C wrapper for a vectorized algorithm has been equipped with {SAVE,RESTORE}_VECTOR_REGISTERS(),
# it can be safely included here:
$(obj)/wolfcrypt/src/aes_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/aes_gcm_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/chacha_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/poly1305_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/sha256_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/sha512_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/fe_x25519_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
# these _asms are kernel-compatible, but they still irritate objtool:
$(obj)/wolfcrypt/src/aes_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/aes_gcm_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/chacha_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/poly1305_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/sha256_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/sha512_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/fe_x25519_asm.o: OBJECT_FILES_NON_STANDARD := y
# auto-generate the exported symbol list, leveraging the WOLFSSL_API visibility tags.
# exclude symbols that don't match wc_* or wolf*.
$(src)/linuxkm/module_exports.c: $(src)/linuxkm/module_exports.c.template $(WOLFSSL_OBJ_TARGETS)
@cp $< $@
@readelf --symbols --wide $(WOLFSSL_OBJ_TARGETS) | awk '/^ *[0-9]+: /{if ($$8 !~ /^(wc_|wolf)/){next;} if (($$4 == "FUNC") && ($$5 == "GLOBAL") && ($$6 == "DEFAULT")) { print "EXPORT_SYMBOL(" $$8 ");"; }}' >> $@
@readelf --symbols --wide $(WOLFSSL_OBJ_TARGETS) | \
awk '/^ *[0-9]+: / { \
if ($$8 !~ /^(wc_|wolf)/){next;} \
if (($$4 == "FUNC") && ($$5 == "GLOBAL") && ($$6 == "DEFAULT")) { \
print "EXPORT_SYMBOL(" $$8 ");"; \
} \
}' >> $@
	@printf '#ifndef NO_CRYPT_TEST\nEXPORT_SYMBOL(wolfcrypt_test);\n#endif\n' >> $@
clean-files := module_exports.c

View File

@@ -339,12 +339,16 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(cpuidFlags)) {
SAVE_VECTOR_REGISTERS();
chacha_encrypt_avx2(ctx, input, output, msglen);
RESTORE_VECTOR_REGISTERS();
return 0;
}
#endif
if (IS_INTEL_AVX1(cpuidFlags)) {
SAVE_VECTOR_REGISTERS();
chacha_encrypt_avx1(ctx, input, output, msglen);
RESTORE_VECTOR_REGISTERS();
return 0;
}
else {

View File

@@ -85,7 +85,16 @@ int wc_curve25519_make_pub(int public_size, byte* pub, int private_size,
}
#else
fe_init();
#if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
SAVE_VECTOR_REGISTERS();
#endif
ret = curve25519(pub, priv, kCurve25519BasePoint);
#if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
RESTORE_VECTOR_REGISTERS();
#endif
#endif
return ret;
@@ -148,7 +157,15 @@ int wc_curve25519_shared_secret_ex(curve25519_key* private_key,
#ifdef FREESCALE_LTC_ECC
ret = nxp_ltc_curve25519(&o, private_key->k.point, &public_key->p, kLTC_Curve25519 /* input point P on Curve25519 */);
#else
#if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
SAVE_VECTOR_REGISTERS();
#endif
ret = curve25519(o, private_key->k.point, public_key->p.point);
#if defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
RESTORE_VECTOR_REGISTERS();
#endif
#endif
if (ret != 0) {
#ifdef FREESCALE_LTC_ECC

View File

@@ -267,7 +267,9 @@ static void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
{
#ifdef USE_INTEL_SPEEDUP
/* AVX2 is handled in wc_Poly1305Update. */
SAVE_VECTOR_REGISTERS();
poly1305_blocks_avx(ctx, m, bytes);
RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)
const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
word64 r0,r1,r2;
@@ -394,7 +396,9 @@ static void poly1305_block(Poly1305* ctx, const unsigned char *m)
{
#ifdef USE_INTEL_SPEEDUP
/* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
SAVE_VECTOR_REGISTERS();
poly1305_block_avx(ctx, m);
RESTORE_VECTOR_REGISTERS();
#else
poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
#endif
@@ -430,12 +434,14 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
intel_flags = cpuid_get_flags();
cpu_flags_set = 1;
}
SAVE_VECTOR_REGISTERS();
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags))
poly1305_setkey_avx2(ctx, key);
else
#endif
poly1305_setkey_avx(ctx, key);
RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
@@ -510,12 +516,14 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
return BAD_FUNC_ARG;
#ifdef USE_INTEL_SPEEDUP
SAVE_VECTOR_REGISTERS();
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags))
poly1305_final_avx2(ctx, mac);
else
#endif
poly1305_final_avx(ctx, mac);
RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)
/* process the remaining block */
@@ -712,11 +720,13 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
if (ctx->leftover < sizeof(ctx->buffer))
return 0;
SAVE_VECTOR_REGISTERS();
if (!ctx->started)
poly1305_calc_powers_avx2(ctx);
poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
ctx->leftover = 0;
}
} else
SAVE_VECTOR_REGISTERS();
/* process full blocks */
if (bytes >= sizeof(ctx->buffer)) {
@@ -735,6 +745,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
ctx->buffer[ctx->leftover + i] = m[i];
ctx->leftover += bytes;
}
RESTORE_VECTOR_REGISTERS();
}
else
#endif

View File

@@ -309,9 +309,26 @@ static int InitSha256(wc_Sha256* sha256)
/* = NULL */
static int transform_check = 0;
static word32 intel_flags;
static int Transform_Sha256_is_vectorized = 0;
#define XTRANSFORM(S, D) (*Transform_Sha256_p)((S),(D))
#define XTRANSFORM_LEN(S, D, L) (*Transform_Sha256_Len_p)((S),(D),(L))
#define XTRANSFORM(S, D) ({ \
int _ret; \
if (Transform_Sha256_is_vectorized) \
SAVE_VECTOR_REGISTERS(); \
_ret = (*Transform_Sha256_p)((S),(D)); \
if (Transform_Sha256_is_vectorized) \
RESTORE_VECTOR_REGISTERS(); \
_ret; \
})
#define XTRANSFORM_LEN(S, D, L) ({ \
int _ret; \
if (Transform_Sha256_is_vectorized) \
SAVE_VECTOR_REGISTERS(); \
_ret = (*Transform_Sha256_Len_p)((S),(D),(L)); \
if (Transform_Sha256_is_vectorized) \
RESTORE_VECTOR_REGISTERS(); \
_ret; \
})
static void Sha256_SetTransform(void)
{
@@ -327,6 +344,7 @@ static int InitSha256(wc_Sha256* sha256)
if (IS_INTEL_BMI2(intel_flags)) {
Transform_Sha256_p = Transform_Sha256_AVX2_RORX;
Transform_Sha256_Len_p = Transform_Sha256_AVX2_RORX_Len;
Transform_Sha256_is_vectorized = 1;
}
else
#endif
@@ -334,11 +352,13 @@ static int InitSha256(wc_Sha256* sha256)
{
Transform_Sha256_p = Transform_Sha256_AVX2;
Transform_Sha256_Len_p = Transform_Sha256_AVX2_Len;
Transform_Sha256_is_vectorized = 1;
}
#ifdef HAVE_INTEL_RORX
else {
Transform_Sha256_p = Transform_Sha256_AVX1_RORX;
Transform_Sha256_Len_p = Transform_Sha256_AVX1_RORX_Len;
Transform_Sha256_is_vectorized = 1;
}
#endif
}
@@ -348,12 +368,14 @@ static int InitSha256(wc_Sha256* sha256)
if (IS_INTEL_AVX1(intel_flags)) {
Transform_Sha256_p = Transform_Sha256_AVX1;
Transform_Sha256_Len_p = Transform_Sha256_AVX1_Len;
Transform_Sha256_is_vectorized = 1;
}
else
#endif
{
Transform_Sha256_p = Transform_Sha256;
Transform_Sha256_Len_p = NULL;
Transform_Sha256_is_vectorized = 0;
}
transform_check = 1;

View File

@@ -337,9 +337,31 @@ static int InitSha512(wc_Sha512* sha512)
static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL;
static int transform_check = 0;
static int intel_flags;
static int Transform_Sha512_is_vectorized = 0;
#if 0
#define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512)
#define Transform_Sha512_Len(sha512, len) \
(*Transform_Sha512_Len_p)(sha512, len)
#endif
#define Transform_Sha512(sha512) ({ \
int _ret; \
if (Transform_Sha512_is_vectorized) \
SAVE_VECTOR_REGISTERS(); \
_ret = (*Transform_Sha512_p)(sha512); \
if (Transform_Sha512_is_vectorized) \
RESTORE_VECTOR_REGISTERS(); \
_ret; \
})
#define Transform_Sha512_Len(sha512, len) ({ \
int _ret; \
if (Transform_Sha512_is_vectorized) \
SAVE_VECTOR_REGISTERS(); \
_ret = (*Transform_Sha512_Len_p)(sha512, len); \
if (Transform_Sha512_is_vectorized) \
RESTORE_VECTOR_REGISTERS(); \
_ret; \
})
static void Sha512_SetTransform(void)
{
@@ -354,17 +376,20 @@ static int InitSha512(wc_Sha512* sha512)
if (IS_INTEL_BMI2(intel_flags)) {
Transform_Sha512_p = Transform_Sha512_AVX2_RORX;
Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len;
Transform_Sha512_is_vectorized = 1;
}
else
#endif
if (1) {
Transform_Sha512_p = Transform_Sha512_AVX2;
Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len;
Transform_Sha512_is_vectorized = 1;
}
#ifdef HAVE_INTEL_RORX
else {
Transform_Sha512_p = Transform_Sha512_AVX1_RORX;
Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len;
Transform_Sha512_is_vectorized = 1;
}
#endif
}
@@ -374,10 +399,14 @@ static int InitSha512(wc_Sha512* sha512)
if (IS_INTEL_AVX1(intel_flags)) {
Transform_Sha512_p = Transform_Sha512_AVX1;
Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len;
Transform_Sha512_is_vectorized = 1;
}
else
#endif
{
Transform_Sha512_p = _Transform_Sha512;
            Transform_Sha512_is_vectorized = 0;
}
transform_check = 1;
}

View File

@@ -88,12 +88,21 @@
#endif
#include <linux/net.h>
#include <linux/slab.h>
#if defined(WOLFSSL_AESNI) || defined(USE_INTEL_SPEEDUP) || defined(WOLFSSL_ARMASM)
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
#include <asm/i387.h>
#else
#include <asm/simd.h>
#endif
#if defined(WOLFSSL_AESNI) || defined(USE_INTEL_SPEEDUP)
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
#include <asm/i387.h>
#else
#include <asm/simd.h>
#endif
#define SAVE_VECTOR_REGISTERS() kernel_fpu_begin()
#define RESTORE_VECTOR_REGISTERS() kernel_fpu_end()
#elif defined(WOLFSSL_ARMASM)
#include <asm/fpsimd.h>
#define SAVE_VECTOR_REGISTERS() ({ preempt_disable(); fpsimd_preserve_current_state(); })
        #define RESTORE_VECTOR_REGISTERS() ({ fpsimd_restore_current_state(); preempt_enable(); })
#else
#define SAVE_VECTOR_REGISTERS() ({})
#define RESTORE_VECTOR_REGISTERS() ({})
#endif
_Pragma("GCC diagnostic pop");
@@ -122,13 +131,10 @@
/* the rigmarole around kstrtol() here is to accommodate its warn-unused-result attribute. */
#define XATOI(s) ({ long _xatoi_res = 0; int _xatoi_ret = kstrtol(s, 10, &_xatoi_res); if (_xatoi_ret != 0) { _xatoi_res = 0; } (int)_xatoi_res; })
#define SAVE_VECTOR_REGISTERS() kernel_fpu_begin()
#define RESTORE_VECTOR_REGISTERS() kernel_fpu_end()
#else /* ! WOLFSSL_LINUXKM */
#define SAVE_VECTOR_REGISTERS()
#define RESTORE_VECTOR_REGISTERS()
#define SAVE_VECTOR_REGISTERS() ({})
#define RESTORE_VECTOR_REGISTERS() ({})
#endif /* WOLFSSL_LINUXKM */