Merge pull request #2453 from SparkiDev/armv8_x18

ARM64 assembly - x18 cannot be used

x18 is the platform register in the AAPCS64 ABI: it is reserved on Windows and on Apple platforms, and on any target built with -ffixed-x18 (Android's ShadowCallStack, for example). The generated SHA-512 assembly therefore moves from x18-x26 to x19-x27 and repacks the callee-saved register save area; illustrative sketches of the two register-handling patterns involved follow each file diff below.
Authored by toddouska on 2019-09-06 15:45:02 -07:00; committed by GitHub.
5 changed files with 6630 additions and 6626 deletions

File diff suppressed because it is too large.

File diff suppressed because it is too large.

wolfcrypt/src/port/arm/armv8-sha512-asm.S

@@ -23,8 +23,6 @@
  * cd ../scripts
  * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S
  */
-#ifdef WOLFSSL_ARMASM
 #ifdef __aarch64__
     .text
     .section .rodata
@@ -127,16 +125,16 @@ Transform_Sha512_Len:
     stp x29, x30, [sp, #-128]!
     add x29, sp, #0
     str x17, [x29, #16]
-    stp x18, x19, [x29, #24]
-    stp x20, x21, [x29, #40]
-    stp x22, x23, [x29, #56]
-    stp x24, x25, [x29, #72]
-    str x26, [x29, #88]
+    str x19, [x29, #24]
+    stp x20, x21, [x29, #32]
+    stp x22, x23, [x29, #48]
+    stp x24, x25, [x29, #64]
+    stp x26, x27, [x29, #80]
     stp d8, d9, [x29, #96]
     stp d10, d11, [x29, #112]
     adr x3, L_SHA512_transform_neon_len_k
-    adr x26, L_SHA512_transform_neon_len_ror8
-    ld1 {v11.16b}, [x26]
+    adr x27, L_SHA512_transform_neon_len_ror8
+    ld1 {v11.16b}, [x27]
     # Load digest into working vars
     ldp x4, x5, [x0]
     ldp x6, x7, [x0, #16]
@@ -147,26 +145,26 @@ L_sha512_len_neon_begin:
     # Load W
     # Copy digest to add in at end
     ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #0x40
-    mov x18, x4
+    mov x19, x4
     ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #0x40
-    mov x19, x5
+    mov x20, x5
     rev64 v0.16b, v0.16b
-    mov x20, x6
+    mov x21, x6
     rev64 v1.16b, v1.16b
-    mov x21, x7
+    mov x22, x7
     rev64 v2.16b, v2.16b
-    mov x22, x8
+    mov x23, x8
     rev64 v3.16b, v3.16b
-    mov x23, x9
+    mov x24, x9
     rev64 v4.16b, v4.16b
-    mov x24, x10
+    mov x25, x10
     rev64 v5.16b, v5.16b
-    mov x25, x11
+    mov x26, x11
     rev64 v6.16b, v6.16b
     rev64 v7.16b, v7.16b
     # Pre-calc: b ^ c
     eor x16, x5, x6
-    mov x26, #4
+    mov x27, #4
     # Start of 16 rounds
 L_sha512_len_neon_start:
     # Round 0
@@ -665,7 +663,7 @@ L_sha512_len_neon_start:
     add v7.2d, v7.2d, v9.2d
     add x8, x8, x4
     add x4, x4, x14
-    subs x26, x26, #1
+    subs x27, x27, #1
     bne L_sha512_len_neon_start
     # Round 0
     mov x13, v0.d[0]
@@ -1019,14 +1017,14 @@ L_sha512_len_neon_start:
     add x14, x14, x17
     add x8, x8, x4
     add x4, x4, x14
-    add x11, x11, x25
-    add x10, x10, x24
-    add x9, x9, x23
-    add x8, x8, x22
-    add x7, x7, x21
-    add x6, x6, x20
-    add x5, x5, x19
-    add x4, x4, x18
+    add x11, x11, x26
+    add x10, x10, x25
+    add x9, x9, x24
+    add x8, x8, x23
+    add x7, x7, x22
+    add x6, x6, x21
+    add x5, x5, x20
+    add x4, x4, x19
     adr x3, L_SHA512_transform_neon_len_k
     subs w2, w2, #0x80
     bne L_sha512_len_neon_begin
@@ -1035,15 +1033,14 @@ L_sha512_len_neon_start:
     stp x8, x9, [x0, #32]
     stp x10, x11, [x0, #48]
     ldr x17, [x29, #16]
-    ldp x18, x19, [x29, #24]
-    ldp x20, x21, [x29, #40]
-    ldp x22, x23, [x29, #56]
-    ldp x24, x25, [x29, #72]
-    ldr x26, [x29, #88]
+    ldr x19, [x29, #24]
+    ldp x20, x21, [x29, #32]
+    ldp x22, x23, [x29, #48]
+    ldp x24, x25, [x29, #64]
+    ldp x26, x27, [x29, #80]
     ldp d8, d9, [x29, #96]
     ldp d10, d11, [x29, #112]
     ldp x29, x30, [sp], #0x80
     ret
     .size Transform_Sha512_Len,.-Transform_Sha512_Len
 #endif /* __aarch64__ */
-#endif /* WOLFSSL_ARMASM */
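
The .S change above is one pattern applied throughout: with x18 off-limits, the nine remaining general-purpose callee-saved registers (x19-x27) are an odd count, so one lone str/ldr joins four stp/ldp pairs at repacked frame offsets. Below is a minimal standalone sketch of that save/restore pattern in C with inline assembly; the function name sum9 and the frame offsets are illustrative assumptions, not wolfSSL code.

#include <stdint.h>

/* Sketch: spill x19-x27 manually inside the asm block, mirroring the
 * one-str-plus-four-stp layout the diff introduces. The 96-byte frame
 * keeps sp 16-byte aligned as AAPCS64 requires. */
uint64_t sum9(const uint64_t v[9])
{
    uint64_t r;
    __asm__ __volatile__ (
        "mov x9, %[v]\n\t"              /* x9/x10: caller-saved scratch */
        "stp x29, x30, [sp, #-96]!\n\t" /* frame: fp/lr + 9 save slots */
        "add x29, sp, #0\n\t"
        "str x19, [x29, #16]\n\t"       /* lone register: str, not stp */
        "stp x20, x21, [x29, #24]\n\t"
        "stp x22, x23, [x29, #40]\n\t"
        "stp x24, x25, [x29, #56]\n\t"
        "stp x26, x27, [x29, #72]\n\t"
        "ldp x19, x20, [x9]\n\t"        /* now all nine are scratch */
        "ldp x21, x22, [x9, #16]\n\t"
        "ldp x23, x24, [x9, #32]\n\t"
        "ldp x25, x26, [x9, #48]\n\t"
        "ldr x27, [x9, #64]\n\t"
        "add x19, x19, x20\n\t"
        "add x21, x21, x22\n\t"
        "add x23, x23, x24\n\t"
        "add x25, x25, x26\n\t"
        "add x19, x19, x21\n\t"
        "add x23, x23, x25\n\t"
        "add x10, x19, x23\n\t"
        "add x10, x10, x27\n\t"         /* x10 = v[0] + ... + v[8] */
        "ldr x19, [x29, #16]\n\t"       /* restore in the same layout */
        "ldp x20, x21, [x29, #24]\n\t"
        "ldp x22, x23, [x29, #40]\n\t"
        "ldp x24, x25, [x29, #56]\n\t"
        "ldp x26, x27, [x29, #72]\n\t"
        "ldp x29, x30, [sp], #96\n\t"
        "mov %[r], x10\n\t"             /* write the output after restoring */
        : [r] "=r" (r)
        : [v] "r" (v)
        : "memory", "x9", "x10"
    );
    return r;
}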

wolfcrypt/src/port/arm/armv8-sha512-asm.c

@@ -24,16 +24,7 @@
  * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c
  */
 #ifdef __aarch64__
 #include <stdint.h>
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-#ifdef WOLFSSL_ARMASM
 #include <wolfssl/wolfcrypt/sha512.h>
 static const uint64_t L_SHA512_transform_neon_len_k[] = {
@@ -130,8 +121,8 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
         "stp x29, x30, [sp, #-16]!\n\t"
         "add x29, sp, #0\n\t"
         "adr x3, %[L_SHA512_transform_neon_len_k]\n\t"
-        "adr x26, %[L_SHA512_transform_neon_len_ror8]\n\t"
-        "ld1 {v11.16b}, [x26]\n\t"
+        "adr x27, %[L_SHA512_transform_neon_len_ror8]\n\t"
+        "ld1 {v11.16b}, [x27]\n\t"
         /* Load digest into working vars */
         "ldp x4, x5, [%x[sha512]]\n\t"
         "ldp x6, x7, [%x[sha512], #16]\n\t"
@@ -143,26 +134,26 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
         /* Load W */
         /* Copy digest to add in at end */
         "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t"
-        "mov x18, x4\n\t"
+        "mov x19, x4\n\t"
         "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t"
-        "mov x19, x5\n\t"
+        "mov x20, x5\n\t"
         "rev64 v0.16b, v0.16b\n\t"
-        "mov x20, x6\n\t"
+        "mov x21, x6\n\t"
         "rev64 v1.16b, v1.16b\n\t"
-        "mov x21, x7\n\t"
+        "mov x22, x7\n\t"
         "rev64 v2.16b, v2.16b\n\t"
-        "mov x22, x8\n\t"
+        "mov x23, x8\n\t"
         "rev64 v3.16b, v3.16b\n\t"
-        "mov x23, x9\n\t"
+        "mov x24, x9\n\t"
         "rev64 v4.16b, v4.16b\n\t"
-        "mov x24, x10\n\t"
+        "mov x25, x10\n\t"
         "rev64 v5.16b, v5.16b\n\t"
-        "mov x25, x11\n\t"
+        "mov x26, x11\n\t"
         "rev64 v6.16b, v6.16b\n\t"
         "rev64 v7.16b, v7.16b\n\t"
         /* Pre-calc: b ^ c */
         "eor x16, x5, x6\n\t"
-        "mov x26, #4\n\t"
+        "mov x27, #4\n\t"
         /* Start of 16 rounds */
         "\n"
     "L_sha512_len_neon_start_%=: \n\t"
@@ -662,7 +653,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
         "add v7.2d, v7.2d, v9.2d\n\t"
         "add x8, x8, x4\n\t"
         "add x4, x4, x14\n\t"
-        "subs x26, x26, #1\n\t"
+        "subs x27, x27, #1\n\t"
         "bne L_sha512_len_neon_start_%=\n\t"
         /* Round 0 */
         "mov x13, v0.d[0]\n\t"
@@ -1016,14 +1007,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
         "add x14, x14, x17\n\t"
         "add x8, x8, x4\n\t"
         "add x4, x4, x14\n\t"
-        "add x11, x11, x25\n\t"
-        "add x10, x10, x24\n\t"
-        "add x9, x9, x23\n\t"
-        "add x8, x8, x22\n\t"
-        "add x7, x7, x21\n\t"
-        "add x6, x6, x20\n\t"
-        "add x5, x5, x19\n\t"
-        "add x4, x4, x18\n\t"
+        "add x11, x11, x26\n\t"
+        "add x10, x10, x25\n\t"
+        "add x9, x9, x24\n\t"
+        "add x8, x8, x23\n\t"
+        "add x7, x7, x22\n\t"
+        "add x6, x6, x21\n\t"
+        "add x5, x5, x20\n\t"
+        "add x4, x4, x19\n\t"
         "adr x3, %[L_SHA512_transform_neon_len_k]\n\t"
         "subs %w[len], %w[len], #0x80\n\t"
         "bne L_sha512_len_neon_begin_%=\n\t"
@@ -1034,9 +1025,8 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
         "ldp x29, x30, [sp], #16\n\t"
         : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
         : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8)
-        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
     );
 }
-#endif /* WOLFSSL_ARMASM */
 #endif /* __aarch64__ */
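
The C version needs no frame-layout bookkeeping of its own: every callee-saved register the inline assembly touches is named in the clobber list (the last hunk swaps x18 out and x27 in), and the compiler spills and reloads those registers around the block. A minimal sketch of that mechanism, assuming a hypothetical function rotate_add; again not wolfSSL code.

#include <stdint.h>

/* Sketch: the "x19" clobber tells the compiler the asm modifies x19, so
 * the compiler preserves it across the block. x18 should never appear in
 * a clobber list: AAPCS64 reserves it as the platform register, and some
 * compilers reject reserved registers named as clobbers. */
uint64_t rotate_add(uint64_t a, uint64_t b)
{
    uint64_t r;
    __asm__ __volatile__ (
        "mov x19, %[a]\n\t"       /* scratch in x19, where x18 is off-limits */
        "ror x19, x19, #8\n\t"
        "add %[r], x19, %[b]\n\t"
        : [r] "=r" (r)
        : [a] "r" (a), [b] "r" (b)
        : "x19"
    );
    return r;
}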

File diff suppressed because it is too large.