diff --git a/src/include.am b/src/include.am index a8b5cecb6..4861f00ce 100644 --- a/src/include.am +++ b/src/include.am @@ -233,12 +233,17 @@ endif if !BUILD_FIPS_V2 if BUILD_SHA512 +if BUILD_ARMASM +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S +else src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c if BUILD_INTELASM src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512_asm.S endif endif endif +endif if !BUILD_FIPS_V2 if BUILD_SHA3 diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am index 4274916ba..3e7b3a377 100644 --- a/wolfcrypt/src/include.am +++ b/wolfcrypt/src/include.am @@ -48,12 +48,9 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \ wolfcrypt/src/port/ti/ti-ccm.c \ wolfcrypt/src/port/pic32/pic32mz-crypt.c \ wolfcrypt/src/port/nrf51.c \ - wolfcrypt/src/port/arm/armv8-aes.c \ - wolfcrypt/src/port/arm/armv8-sha256.c \ wolfcrypt/src/port/arm/armv8-curve25519.c \ - wolfcrypt/src/port/arm/armv8-curve25519.S \ wolfcrypt/src/port/arm/armv7-curve25519.c \ - wolfcrypt/src/port/arm/armv7-curve25519.S \ + wolfcrypt/src/port/arm/armv8-sha512-asm.c \ wolfcrypt/src/port/nxp/ksdk_port.c \ wolfcrypt/src/port/atmel/README.md \ wolfcrypt/src/port/xilinx/xil-sha3.c \ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index 9426d9987..aa8b25198 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -19,18 +19,22 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S + */ #ifdef __aarch64__ -.text -.globl fe_init -.type fe_init,@function -.align 4 + .text + .align 2 + .globl fe_init + .type fe_init, %function fe_init: ret -.size fe_init,.-fe_init -.text -.globl fe_frombytes -.type fe_frombytes,@function -.align 4 + .size fe_init,.-fe_init + .text + .align 2 + .globl fe_frombytes + .type fe_frombytes, %function fe_frombytes: ldp x2, x3, [x1] ldp x4, x5, [x1, #16] @@ -38,11 +42,11 @@ fe_frombytes: stp x2, x3, [x0] stp x4, x5, [x0, #16] ret -.size fe_frombytes,.-fe_frombytes -.text -.globl fe_tobytes -.type fe_tobytes,@function -.align 4 + .size fe_frombytes,.-fe_frombytes + .text + .align 2 + .globl fe_tobytes + .type fe_tobytes, %function fe_tobytes: mov x7, #19 ldp x2, x3, [x1] @@ -51,8 +55,7 @@ fe_tobytes: adcs x6, x3, xzr adcs x6, x4, xzr adc x6, x5, xzr - asr x6, x6, #63 - and x6, x6, x7 + and x6, x7, x6, asr 63 adds x2, x2, x6 adcs x3, x3, xzr adcs x4, x4, xzr @@ -61,32 +64,32 @@ fe_tobytes: stp x2, x3, [x0] stp x4, x5, [x0, #16] ret -.size fe_tobytes,.-fe_tobytes -.text -.globl fe_1 -.type fe_1,@function -.align 4 + .size fe_tobytes,.-fe_tobytes + .text + .align 2 + .globl fe_1 + .type fe_1, %function fe_1: # Set one mov x1, #1 stp x1, xzr, [x0] stp xzr, xzr, [x0, #16] ret -.size fe_1,.-fe_1 -.text -.globl fe_0 -.type fe_0,@function -.align 4 + .size fe_1,.-fe_1 + .text + .align 2 + .globl fe_0 + .type fe_0, %function fe_0: # Set zero stp xzr, xzr, [x0] stp xzr, xzr, [x0, #16] ret -.size fe_0,.-fe_0 -.text -.globl fe_copy -.type fe_copy,@function -.align 4 + .size fe_0,.-fe_0 + .text + .align 2 + .globl fe_copy + .type fe_copy, %function fe_copy: # Copy ldp x2, x3, [x1] @@ -94,36 +97,11 @@ fe_copy: stp x2, x3, [x0] stp x4, x5, [x0, #16] ret -.size fe_copy,.-fe_copy -.text -.globl fe_cswap -.type fe_cswap,@function -.align 4 
-fe_cswap: - # Conditional Swap - cmp x2, #1 - ldp x3, x4, [x0] - ldp x5, x6, [x0, #16] - ldp x7, x8, [x1] - ldp x9, x10, [x1, #16] - csel x11, x3, x7, eq - csel x3, x7, x3, eq - csel x12, x4, x8, eq - csel x4, x8, x4, eq - csel x13, x5, x9, eq - csel x5, x9, x5, eq - csel x14, x6, x10, eq - csel x6, x10, x6, eq - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - stp x11, x12, [x1] - stp x13, x14, [x1, #16] - ret -.size fe_cswap,.-fe_cswap -.text -.globl fe_sub -.type fe_sub,@function -.align 4 + .size fe_copy,.-fe_copy + .text + .align 2 + .globl fe_sub + .type fe_sub, %function fe_sub: # Sub ldp x3, x4, [x1] @@ -147,11 +125,11 @@ fe_sub: stp x3, x4, [x0] stp x5, x6, [x0, #16] ret -.size fe_sub,.-fe_sub -.text -.globl fe_add -.type fe_add,@function -.align 4 + .size fe_sub,.-fe_sub + .text + .align 2 + .globl fe_add + .type fe_add, %function fe_add: # Add ldp x3, x4, [x1] @@ -175,11 +153,11 @@ fe_add: stp x3, x4, [x0] stp x5, x6, [x0, #16] ret -.size fe_add,.-fe_add -.text -.globl fe_neg -.type fe_neg,@function -.align 4 + .size fe_add,.-fe_add + .text + .align 2 + .globl fe_neg + .type fe_neg, %function fe_neg: ldp x2, x3, [x1] ldp x4, x5, [x1, #16] @@ -194,29 +172,11 @@ fe_neg: stp x6, x7, [x0] stp x8, x9, [x0, #16] ret -.size fe_neg,.-fe_neg -.text -.globl fe_cmov -.type fe_cmov,@function -.align 4 -fe_cmov: - ldp x4, x5, [x0] - ldp x6, x7, [x0, #16] - ldp x8, x9, [x1] - ldp x10, x11, [x1, #16] - cmp x2, #1 - csel x4, x4, x8, eq - csel x5, x5, x9, eq - csel x6, x6, x10, eq - csel x7, x7, x11, eq - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ret -.size fe_cmov,.-fe_cmov -.text -.globl fe_isnonzero -.type fe_isnonzero,@function -.align 4 + .size fe_neg,.-fe_neg + .text + .align 2 + .globl fe_isnonzero + .type fe_isnonzero, %function fe_isnonzero: mov x6, #19 ldp x1, x2, [x0] @@ -225,8 +185,7 @@ fe_isnonzero: adcs x5, x2, xzr adcs x5, x3, xzr adc x5, x4, xzr - asr x5, x5, #63 - and x5, x5, x6 + and x5, x6, x5, asr 63 adds x1, x1, x5 adcs x2, x2, xzr adcs x3, x3, xzr @@ -236,11 +195,11 @@ fe_isnonzero: orr x3, x3, x4 orr x0, x0, x3 ret -.size fe_isnonzero,.-fe_isnonzero -.text -.globl fe_isnegative -.type fe_isnegative,@function -.align 4 + .size fe_isnonzero,.-fe_isnonzero + .text + .align 2 + .globl fe_isnegative + .type fe_isnegative, %function fe_isnegative: mov x6, #19 ldp x1, x2, [x0] @@ -250,28 +209,22 @@ fe_isnegative: adcs x5, x3, xzr adc x5, x4, xzr and x0, x1, #1 - lsr x5, x5, #63 - eor x0, x0, x5 + eor x0, x0, x5, lsr 63 ret -.size fe_isnegative,.-fe_isnegative -.text -.globl fe_cmov_table -.type fe_cmov_table,@function -.align 4 + .size fe_isnegative,.-fe_isnegative + .text + .align 2 + .globl fe_cmov_table + .type fe_cmov_table, %function fe_cmov_table: stp x29, x30, [sp, #-112]! 
add x29, sp, #0 str x17, [x29, #16] - str x18, [x29, #24] - str x19, [x29, #32] - str x20, [x29, #40] - str x21, [x29, #48] - str x22, [x29, #56] - str x23, [x29, #64] - str x24, [x29, #72] - str x25, [x29, #80] - str x26, [x29, #88] - str x27, [x29, #96] + stp x18, x19, [x29, #24] + stp x20, x21, [x29, #40] + stp x22, x23, [x29, #56] + stp x24, x25, [x29, #72] + stp x26, x27, [x29, #88] str x28, [x29, #104] sxtb x2, w2 sbfx x15, x2, #7, #1 @@ -474,32 +427,25 @@ fe_cmov_table: stp x11, x12, [x0, #64] stp x13, x14, [x0, #80] ldr x17, [x29, #16] - ldr x18, [x29, #24] - ldr x19, [x29, #32] - ldr x20, [x29, #40] - ldr x21, [x29, #48] - ldr x22, [x29, #56] - ldr x23, [x29, #64] - ldr x24, [x29, #72] - ldr x25, [x29, #80] - ldr x26, [x29, #88] - ldr x27, [x29, #96] + ldp x18, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldp x22, x23, [x29, #56] + ldp x24, x25, [x29, #72] + ldp x26, x27, [x29, #88] ldr x28, [x29, #104] ldp x29, x30, [sp], #0x70 ret -.size fe_cmov_table,.-fe_cmov_table -.text -.globl fe_mul -.type fe_mul,@function -.align 4 + .size fe_cmov_table,.-fe_cmov_table + .text + .align 2 + .globl fe_mul + .type fe_mul, %function fe_mul: stp x29, x30, [sp, #-64]! add x29, sp, #0 str x17, [x29, #24] - str x18, [x29, #32] - str x19, [x29, #40] - str x20, [x29, #48] - str x21, [x29, #56] + stp x18, x19, [x29, #32] + stp x20, x21, [x29, #48] # Multiply ldp x14, x15, [x1] ldp x16, x17, [x1, #16] @@ -631,8 +577,7 @@ fe_mul: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -642,17 +587,15 @@ fe_mul: stp x6, x7, [x0] stp x8, x9, [x0, #16] ldr x17, [x29, #24] - ldr x18, [x29, #32] - ldr x19, [x29, #40] - ldr x20, [x29, #48] - ldr x21, [x29, #56] + ldp x18, x19, [x29, #32] + ldp x20, x21, [x29, #48] ldp x29, x30, [sp], #0x40 ret -.size fe_mul,.-fe_mul -.text -.globl fe_sq -.type fe_sq,@function -.align 4 + .size fe_mul,.-fe_mul + .text + .align 2 + .globl fe_sq + .type fe_sq, %function fe_sq: # Square ldp x13, x14, [x1] @@ -751,8 +694,7 @@ fe_sq: adcs x7, x7, xzr adc x8, x8, xzr # Reduce if top bit set - asr x4, x8, #63 - and x4, x4, x2 + and x4, x2, x8, asr 63 and x8, x8, #0x7fffffffffffffff adds x5, x5, x4 adcs x6, x6, xzr @@ -762,177 +704,11 @@ fe_sq: stp x5, x6, [x0] stp x7, x8, [x0, #16] ret -.size fe_sq,.-fe_sq -.text -.globl fe_mul121666 -.type fe_mul121666,@function -.align 4 -fe_mul121666: - # Multiply by 121666 - ldp x5, x6, [x1] - ldp x7, x8, [x1, #16] - mov x4, #0xdb42 - movk x4, #1, lsl 16 - mul x9, x5, x4 - umulh x10, x5, x4 - mul x2, x6, x4 - umulh x3, x6, x4 - adds x10, x10, x2 - adc x11, xzr, x3 - mul x2, x7, x4 - umulh x3, x7, x4 - adds x11, x11, x2 - adc x12, xzr, x3 - mul x2, x8, x4 - umulh x3, x8, x4 - adds x12, x12, x2 - adc x3, xzr, x3 - mov x4, #19 - extr x3, x3, x12, #63 - mul x3, x3, x4 - and x12, x12, #0x7fffffffffffffff - adds x9, x9, x3 - adcs x10, x10, xzr - adcs x11, x11, xzr - adc x12, x12, xzr - stp x9, x10, [x0] - stp x11, x12, [x0, #16] - ret -.size fe_mul121666,.-fe_mul121666 -.text -.globl fe_sq2 -.type fe_sq2,@function -.align 4 -fe_sq2: - stp x29, x30, [sp, #-32]! 
- add x29, sp, #0 - str x17, [x29, #24] - # Square * 2 - ldp x5, x6, [x1] - ldp x7, x8, [x1, #16] - # A[0] * A[1] - mul x10, x5, x6 - umulh x11, x5, x6 - # A[0] * A[2] - mul x2, x5, x7 - umulh x12, x5, x7 - adds x11, x11, x2 - adc x12, x12, xzr - # A[0] * A[3] - mul x2, x5, x8 - umulh x13, x5, x8 - adds x12, x12, x2 - adc x13, x13, xzr - # A[1] * A[2] - mul x2, x6, x7 - umulh x3, x6, x7 - adds x12, x12, x2 - adcs x13, x13, x3 - adc x14, xzr, xzr - # A[1] * A[3] - mul x2, x6, x8 - umulh x3, x6, x8 - adds x13, x13, x2 - adc x14, x14, x3 - # A[2] * A[3] - mul x2, x7, x8 - umulh x15, x7, x8 - adds x14, x14, x2 - adc x15, x15, xzr - # Double - adds x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adcs x13, x13, x13 - adcs x14, x14, x14 - adcs x15, x15, x15 - adc x16, xzr, xzr - # A[0] * A[0] - mul x9, x5, x5 - umulh x17, x5, x5 - # A[1] * A[1] - mul x2, x6, x6 - umulh x3, x6, x6 - adds x10, x10, x17 - adcs x11, x11, x2 - adc x17, x3, xzr - # A[2] * A[2] - mul x2, x7, x7 - umulh x3, x7, x7 - adds x12, x12, x17 - adcs x13, x13, x2 - adc x17, x3, xzr - # A[3] * A[3] - mul x2, x8, x8 - umulh x3, x8, x8 - adds x14, x14, x17 - adcs x15, x15, x2 - adc x16, x16, x3 - # Double and Reduce - mov x2, #0x169 - # Move top half into t4-t7 and remove top bit from t3 - lsr x17, x16, #61 - extr x16, x16, x15, #62 - extr x15, x15, x14, #62 - extr x14, x14, x13, #62 - extr x13, x13, x12, #62 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - lsl x9, x9, #1 - and x12, x12, #0x7fffffffffffffff - # Two left, only one right - and x16, x16, #0x7fffffffffffffff - # Multiply top bits by 19*19 - mul x17, x17, x2 - # Multiply top half by 19 - mov x2, #19 - mul x3, x2, x13 - umulh x13, x2, x13 - adds x9, x9, x3 - mul x3, x2, x14 - umulh x14, x2, x14 - adcs x10, x10, x3 - mul x3, x2, x15 - umulh x15, x2, x15 - adcs x11, x11, x3 - mul x3, x2, x16 - umulh x4, x2, x16 - adcs x12, x12, x3 - adc x4, x4, xzr - # Add remaining product results in - adds x9, x9, x17 - adcs x10, x10, x13 - adcs x11, x11, x14 - adcs x12, x12, x15 - adc x4, x4, xzr - # Overflow - extr x4, x4, x12, #63 - mul x4, x4, x2 - and x12, x12, #0x7fffffffffffffff - adds x9, x9, x4 - adcs x10, x10, xzr - adcs x11, x11, xzr - adc x12, x12, xzr - # Reduce if top bit set - asr x4, x12, #63 - and x4, x4, x2 - and x12, x12, #0x7fffffffffffffff - adds x9, x9, x4 - adcs x10, x10, xzr - adcs x11, x11, xzr - adc x12, x12, xzr - # Store - stp x9, x10, [x0] - stp x11, x12, [x0, #16] - ldr x17, [x29, #24] - ldp x29, x30, [sp], #32 - ret -.size fe_sq2,.-fe_sq2 -.text -.globl fe_invert -.type fe_invert,@function -.align 4 + .size fe_sq,.-fe_sq + .text + .align 2 + .globl fe_invert + .type fe_invert, %function fe_invert: stp x29, x30, [sp, #-176]! 
add x29, sp, #0 @@ -954,16 +730,16 @@ fe_invert: add x1, x29, #16 add x2, x29, #48 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq add x0, x29, #48 add x1, x29, #48 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq mov x20, #4 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert1: bl fe_sq sub x20, x20, #1 @@ -972,11 +748,11 @@ L_fe_invert1: add x0, x29, #48 add x2, x29, #48 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 add x1, x29, #48 bl fe_sq mov x20, #9 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert2: bl fe_sq sub x20, x20, #1 @@ -984,20 +760,20 @@ L_fe_invert2: bne L_fe_invert2 add x2, x29, #48 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq mov x20, #19 - add x1, x29, #112 + add x1, x29, #0x70 L_fe_invert3: bl fe_sq sub x20, x20, #1 cmp x20, #0 bne L_fe_invert3 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul mov x20, #10 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert4: bl fe_sq sub x20, x20, #1 @@ -1006,11 +782,11 @@ L_fe_invert4: add x0, x29, #48 add x2, x29, #48 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 add x1, x29, #48 bl fe_sq mov x20, #49 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert5: bl fe_sq sub x20, x20, #1 @@ -1018,20 +794,20 @@ L_fe_invert5: bne L_fe_invert5 add x2, x29, #48 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq mov x20, #0x63 - add x1, x29, #112 + add x1, x29, #0x70 L_fe_invert6: bl fe_sq sub x20, x20, #1 cmp x20, #0 bne L_fe_invert6 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul mov x20, #50 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert7: bl fe_sq sub x20, x20, #1 @@ -1053,25 +829,20 @@ L_fe_invert8: ldr x20, [x29, #168] ldp x29, x30, [sp], #0xb0 ret -.size fe_invert,.-fe_invert -.text -.globl curve25519 -.type curve25519,@function -.align 4 + .size fe_invert,.-fe_invert + .text + .align 2 + .globl curve25519 + .type curve25519, %function curve25519: stp x29, x30, [sp, #-288]! 
add x29, sp, #0 str x17, [x29, #192] - str x18, [x29, #200] - str x19, [x29, #208] - str x20, [x29, #216] - str x21, [x29, #224] - str x22, [x29, #232] - str x23, [x29, #240] - str x24, [x29, #248] - str x25, [x29, #256] - str x26, [x29, #264] - str x27, [x29, #272] + stp x18, x19, [x29, #200] + stp x20, x21, [x29, #216] + stp x22, x23, [x29, #232] + stp x24, x25, [x29, #248] + stp x26, x27, [x29, #264] str x28, [x29, #280] mov x22, xzr str x0, [x29, #176] @@ -1317,8 +1088,7 @@ L_curve25519_bits: adcs x20, x20, xzr adc x21, x21, xzr # Reduce if top bit set - asr x5, x21, #63 - and x5, x5, x3 + and x5, x3, x21, asr 63 and x21, x21, #0x7fffffffffffffff adds x18, x18, x5 adcs x19, x19, xzr @@ -1456,8 +1226,7 @@ L_curve25519_bits: adcs x20, x20, xzr adc x21, x21, xzr # Reduce if top bit set - asr x5, x21, #63 - and x5, x5, x3 + and x5, x3, x21, asr 63 and x21, x21, #0x7fffffffffffffff adds x18, x18, x5 adcs x19, x19, xzr @@ -1559,8 +1328,7 @@ L_curve25519_bits: adcs x12, x12, xzr adc x13, x13, xzr # Reduce if top bit set - asr x5, x13, #63 - and x5, x5, x3 + and x5, x3, x13, asr 63 and x13, x13, #0x7fffffffffffffff adds x10, x10, x5 adcs x11, x11, xzr @@ -1662,8 +1430,7 @@ L_curve25519_bits: adcs x16, x16, xzr adc x17, x17, xzr # Reduce if top bit set - asr x5, x17, #63 - and x5, x5, x3 + and x5, x3, x17, asr 63 and x17, x17, #0x7fffffffffffffff adds x14, x14, x5 adcs x15, x15, xzr @@ -1797,8 +1564,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -1989,8 +1755,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -2126,8 +1891,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -2231,8 +1995,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -2368,8 +2131,7 @@ L_curve25519_bits: adcs x12, x12, xzr adc x13, x13, xzr # Reduce if top bit set - asr x5, x13, #63 - and x5, x5, x3 + and x5, x3, x13, asr 63 and x13, x13, #0x7fffffffffffffff adds x10, x10, x5 adcs x11, x11, xzr @@ -2389,106 +2151,106 @@ L_curve25519_bits: add x0, x29, #48 add x1, x29, #16 bl fe_sq - add x0, x29, #80 + add x0, x29, #0x50 add x1, x29, #48 bl fe_sq - add x1, x29, #80 + add x1, x29, #0x50 bl fe_sq add x1, x29, #16 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul add x0, x29, #48 add x1, x29, #48 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq - add x0, x29, #80 - add x1, x29, #80 - add x2, x29, #112 + add x0, x29, #0x50 + add x1, x29, #0x50 + add x2, x29, #0x70 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq mov x24, #4 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_1: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_1 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #112 - add x1, x29, #80 + add x0, x29, #0x70 + add x1, x29, #0x50 bl fe_sq mov x24, #9 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_2: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne 
L_curve25519_inv_2 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #144 + add x0, x29, #0x90 bl fe_sq mov x24, #19 - add x1, x29, #144 + add x1, x29, #0x90 L_curve25519_inv_3: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_3 - add x0, x29, #112 - add x2, x29, #112 + add x0, x29, #0x70 + add x2, x29, #0x70 bl fe_mul mov x24, #10 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_4: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_4 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #112 - add x1, x29, #80 + add x0, x29, #0x70 + add x1, x29, #0x50 bl fe_sq mov x24, #49 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_5: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_5 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #144 + add x0, x29, #0x90 bl fe_sq mov x24, #0x63 - add x1, x29, #144 + add x1, x29, #0x90 L_curve25519_inv_6: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_6 - add x0, x29, #112 - add x2, x29, #112 + add x0, x29, #0x70 + add x2, x29, #0x70 bl fe_mul mov x24, #50 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_7: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_7 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul mov x24, #5 - add x1, x29, #80 + add x1, x29, #0x50 L_curve25519_inv_8: bl fe_sq sub x24, x24, #1 @@ -2629,8 +2391,7 @@ L_curve25519_inv_8: adcs x16, x16, xzr adc x17, x17, xzr # Reduce if top bit set - asr x5, x17, #63 - and x5, x5, x3 + and x5, x3, x17, asr 63 and x17, x17, #0x7fffffffffffffff adds x14, x14, x5 adcs x15, x15, xzr @@ -2641,24 +2402,19 @@ L_curve25519_inv_8: stp x16, x17, [x0, #16] mov x0, xzr ldr x17, [x29, #192] - ldr x18, [x29, #200] - ldr x19, [x29, #208] - ldr x20, [x29, #216] - ldr x21, [x29, #224] - ldr x22, [x29, #232] - ldr x23, [x29, #240] - ldr x24, [x29, #248] - ldr x25, [x29, #256] - ldr x26, [x29, #264] - ldr x27, [x29, #272] + ldp x18, x19, [x29, #200] + ldp x20, x21, [x29, #216] + ldp x22, x23, [x29, #232] + ldp x24, x25, [x29, #248] + ldp x26, x27, [x29, #264] ldr x28, [x29, #280] ldp x29, x30, [sp], #0x120 ret -.size curve25519,.-curve25519 -.text -.globl fe_pow22523 -.type fe_pow22523,@function -.align 4 + .size curve25519,.-curve25519 + .text + .align 2 + .globl fe_pow22523 + .type fe_pow22523, %function fe_pow22523: stp x29, x30, [sp, #-144]! add x29, sp, #0 @@ -2709,10 +2465,10 @@ L_fe_pow22523_2: bne L_fe_pow22523_2 add x2, x29, #16 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq mov x21, #19 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_pow22523_3: bl fe_sq sub x21, x21, #1 @@ -2743,10 +2499,10 @@ L_fe_pow22523_5: bne L_fe_pow22523_5 add x2, x29, #16 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq mov x21, #0x63 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_pow22523_6: bl fe_sq sub x21, x21, #1 @@ -2778,19 +2534,17 @@ L_fe_pow22523_8: ldr x21, [x29, #136] ldp x29, x30, [sp], #0x90 ret -.size fe_pow22523,.-fe_pow22523 -.text -.globl fe_ge_to_p2 -.type fe_ge_to_p2,@function -.align 4 + .size fe_pow22523,.-fe_pow22523 + .text + .align 2 + .globl fe_ge_to_p2 + .type fe_ge_to_p2, %function fe_ge_to_p2: stp x29, x30, [sp, #-112]! 
add x29, sp, #0 str x17, [x29, #72] - str x18, [x29, #80] - str x19, [x29, #88] - str x20, [x29, #96] - str x21, [x29, #104] + stp x18, x19, [x29, #80] + stp x20, x21, [x29, #96] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -2930,8 +2684,7 @@ fe_ge_to_p2: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x21, x6, #63 - and x21, x21, x19 + and x21, x19, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x21 adcs x4, x4, xzr @@ -3074,8 +2827,7 @@ fe_ge_to_p2: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x21, x6, #63 - and x21, x21, x19 + and x21, x19, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x21 adcs x4, x4, xzr @@ -3215,8 +2967,7 @@ fe_ge_to_p2: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x21, x6, #63 - and x21, x21, x19 + and x21, x19, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x21 adcs x4, x4, xzr @@ -3226,29 +2977,23 @@ fe_ge_to_p2: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x17, [x29, #72] - ldr x18, [x29, #80] - ldr x19, [x29, #88] - ldr x20, [x29, #96] - ldr x21, [x29, #104] + ldp x18, x19, [x29, #80] + ldp x20, x21, [x29, #96] ldp x29, x30, [sp], #0x70 ret -.size fe_ge_to_p2,.-fe_ge_to_p2 -.text -.globl fe_ge_to_p3 -.type fe_ge_to_p3,@function -.align 4 + .size fe_ge_to_p2,.-fe_ge_to_p2 + .text + .align 2 + .globl fe_ge_to_p3 + .type fe_ge_to_p3, %function fe_ge_to_p3: stp x29, x30, [sp, #-160]! add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -3389,8 +3134,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3530,8 +3274,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3671,8 +3414,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3809,8 +3551,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3820,35 +3561,26 @@ fe_ge_to_p3: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] ldp x29, x30, [sp], #0xa0 ret -.size fe_ge_to_p3,.-fe_ge_to_p3 -.text -.globl fe_ge_dbl -.type fe_ge_dbl,@function -.align 4 + .size fe_ge_to_p3,.-fe_ge_to_p3 + .text + .align 2 + .globl fe_ge_dbl + .type fe_ge_dbl, %function fe_ge_dbl: stp x29, x30, [sp, #-176]! 
add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -3954,8 +3686,7 @@ fe_ge_dbl: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4063,8 +3794,7 @@ fe_ge_dbl: adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - asr x26, x11, #63 - and x26, x26, x24 + and x26, x24, x11, asr 63 and x11, x11, #0x7fffffffffffffff adds x8, x8, x26 adcs x9, x9, xzr @@ -4185,8 +3915,7 @@ fe_ge_dbl: adcs x18, x18, xzr adc x19, x19, xzr # Reduce if top bit set - asr x26, x19, #63 - and x26, x26, x24 + and x26, x24, x19, asr 63 and x19, x19, #0x7fffffffffffffff adds x16, x16, x26 adcs x17, x17, xzr @@ -4359,8 +4088,7 @@ fe_ge_dbl: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4386,37 +4114,27 @@ fe_ge_dbl: stp x4, x5, [x0] stp x6, x7, [x0, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_dbl,.-fe_ge_dbl -.text -.globl fe_ge_madd -.type fe_ge_madd,@function -.align 4 + .size fe_ge_dbl,.-fe_ge_dbl + .text + .align 2 + .globl fe_ge_madd + .type fe_ge_madd, %function fe_ge_madd: stp x29, x30, [sp, #-176]! 
add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -4592,8 +4310,7 @@ fe_ge_madd: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -4731,8 +4448,7 @@ fe_ge_madd: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4909,8 +4625,7 @@ fe_ge_madd: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4972,37 +4687,27 @@ fe_ge_madd: stp x16, x17, [x1] stp x18, x19, [x1, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_madd,.-fe_ge_madd -.text -.globl fe_ge_msub -.type fe_ge_msub,@function -.align 4 + .size fe_ge_madd,.-fe_ge_madd + .text + .align 2 + .globl fe_ge_msub + .type fe_ge_msub, %function fe_ge_msub: stp x29, x30, [sp, #-176]! 
add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -5178,8 +4883,7 @@ fe_ge_msub: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -5317,8 +5021,7 @@ fe_ge_msub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -5495,8 +5198,7 @@ fe_ge_msub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -5558,37 +5260,27 @@ fe_ge_msub: stp x16, x17, [x0] stp x18, x19, [x0, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_msub,.-fe_ge_msub -.text -.globl fe_ge_add -.type fe_ge_add,@function -.align 4 + .size fe_ge_msub,.-fe_ge_msub + .text + .align 2 + .globl fe_ge_add + .type fe_ge_add, %function fe_ge_add: stp x29, x30, [sp, #-176]! 
add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -5764,8 +5456,7 @@ fe_ge_add: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -5903,8 +5594,7 @@ fe_ge_add: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6081,8 +5771,7 @@ fe_ge_add: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6239,8 +5928,7 @@ fe_ge_add: adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - asr x26, x11, #63 - and x26, x26, x24 + and x26, x24, x11, asr 63 and x11, x11, #0x7fffffffffffffff adds x8, x8, x26 adcs x9, x9, xzr @@ -6284,37 +5972,27 @@ fe_ge_add: stp x16, x17, [x1] stp x18, x19, [x1, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_add,.-fe_ge_add -.text -.globl fe_ge_sub -.type fe_ge_sub,@function -.align 4 + .size fe_ge_add,.-fe_ge_add + .text + .align 2 + .globl fe_ge_sub + .type fe_ge_sub, %function fe_ge_sub: stp x29, x30, [sp, #-176]! 
add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -6490,8 +6168,7 @@ fe_ge_sub: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -6629,8 +6306,7 @@ fe_ge_sub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6807,8 +6483,7 @@ fe_ge_sub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6965,8 +6640,7 @@ fe_ge_sub: adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - asr x26, x11, #63 - and x26, x26, x24 + and x26, x24, x11, asr 63 and x11, x11, #0x7fffffffffffffff adds x8, x8, x26 adcs x9, x9, xzr @@ -7010,17 +6684,12 @@ fe_ge_sub: stp x16, x17, [x1] stp x18, x19, [x1, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_sub,.-fe_ge_sub + .size fe_ge_sub,.-fe_ge_sub #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c index 2d0b0642c..d42daee4c 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519.c @@ -19,7 +19,12 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c + */ #ifdef __aarch64__ +#include #ifdef HAVE_CONFIG_H #include #endif @@ -46,11 +51,11 @@ void fe_frombytes(fe out, const unsigned char* in) __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" + "ldp x2, x3, [%x[in]]\n\t" + "ldp x4, x5, [%x[in], #16]\n\t" "and x5, x5, #0x7fffffffffffffff\n\t" - "stp x2, x3, [x0]\n\t" - "stp x4, x5, [x0, #16]\n\t" + "stp x2, x3, [%x[out]]\n\t" + "stp x4, x5, [%x[out], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [out] "+r" (out), [in] "+r" (in) : @@ -64,21 +69,20 @@ void fe_tobytes(unsigned char* out, const fe n) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "mov x7, #19\n\t" - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" + "ldp x2, x3, [%x[n]]\n\t" + "ldp x4, x5, [%x[n], #16]\n\t" "adds x6, x2, x7\n\t" "adcs x6, x3, xzr\n\t" "adcs x6, x4, xzr\n\t" "adc x6, x5, xzr\n\t" - "asr x6, x6, #63\n\t" - "and x6, x6, x7\n\t" + "and x6, x7, x6, asr 63\n\t" "adds x2, x2, x6\n\t" "adcs x3, x3, xzr\n\t" "adcs x4, x4, xzr\n\t" "adc x5, x5, xzr\n\t" "and x5, x5, #0x7fffffffffffffff\n\t" - "stp x2, x3, 
[x0]\n\t" - "stp x4, x5, [x0, #16]\n\t" + "stp x2, x3, [%x[out]]\n\t" + "stp x4, x5, [%x[out], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [out] "+r" (out), [n] "+r" (n) : @@ -93,8 +97,8 @@ void fe_1(fe n) "add x29, sp, #0\n\t" /* Set one */ "mov x1, #1\n\t" - "stp x1, xzr, [x0]\n\t" - "stp xzr, xzr, [x0, #16]\n\t" + "stp x1, xzr, [%x[n]]\n\t" + "stp xzr, xzr, [%x[n], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [n] "+r" (n) : @@ -108,8 +112,8 @@ void fe_0(fe n) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Set zero */ - "stp xzr, xzr, [x0]\n\t" - "stp xzr, xzr, [x0, #16]\n\t" + "stp xzr, xzr, [%x[n]]\n\t" + "stp xzr, xzr, [%x[n], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [n] "+r" (n) : @@ -123,10 +127,10 @@ void fe_copy(fe r, const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Copy */ - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" - "stp x2, x3, [x0]\n\t" - "stp x4, x5, [x0, #16]\n\t" + "ldp x2, x3, [%x[a]]\n\t" + "ldp x4, x5, [%x[a], #16]\n\t" + "stp x2, x3, [%x[r]]\n\t" + "stp x4, x5, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -134,46 +138,16 @@ void fe_copy(fe r, const fe a) ); } -void fe_cswap(fe a, fe b, int c) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - /* Conditional Swap */ - "cmp %[c], #1\n\t" - "ldp x3, x4, [x0]\n\t" - "ldp x5, x6, [x0, #16]\n\t" - "ldp x7, x8, [x1]\n\t" - "ldp x9, x10, [x1, #16]\n\t" - "csel x11, x3, x7, eq\n\t" - "csel x3, x7, x3, eq\n\t" - "csel x12, x4, x8, eq\n\t" - "csel x4, x8, x4, eq\n\t" - "csel x13, x5, x9, eq\n\t" - "csel x5, x9, x5, eq\n\t" - "csel x14, x6, x10, eq\n\t" - "csel x6, x10, x6, eq\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "stp x11, x12, [x1]\n\t" - "stp x13, x14, [x1, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) - : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14" - ); -} - void fe_sub(fe r, const fe a, const fe b) { __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Sub */ - "ldp x3, x4, [x1]\n\t" - "ldp x5, x6, [x1, #16]\n\t" - "ldp x7, x8, [x2]\n\t" - "ldp x9, x10, [x2, #16]\n\t" + "ldp x3, x4, [%x[a]]\n\t" + "ldp x5, x6, [%x[a], #16]\n\t" + "ldp x7, x8, [%x[b]]\n\t" + "ldp x9, x10, [%x[b], #16]\n\t" "subs x3, x3, x7\n\t" "sbcs x4, x4, x8\n\t" "sbcs x5, x5, x9\n\t" @@ -188,8 +162,8 @@ void fe_sub(fe r, const fe a, const fe b) "adcs x4, x4, x11\n\t" "adcs x5, x5, x11\n\t" "adc x6, x6, x13\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -203,10 +177,10 @@ void fe_add(fe r, const fe a, const fe b) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Add */ - "ldp x3, x4, [x1]\n\t" - "ldp x5, x6, [x1, #16]\n\t" - "ldp x7, x8, [x2]\n\t" - "ldp x9, x10, [x2, #16]\n\t" + "ldp x3, x4, [%x[a]]\n\t" + "ldp x5, x6, [%x[a], #16]\n\t" + "ldp x7, x8, [%x[b]]\n\t" + "ldp x9, x10, [%x[b], #16]\n\t" "adds x3, x3, x7\n\t" "adcs x4, x4, x8\n\t" "adcs x5, x5, x9\n\t" @@ -221,8 +195,8 @@ void fe_add(fe r, const fe a, const fe b) "sbcs x4, x4, x11\n\t" "sbcs x5, x5, x11\n\t" "sbc x6, x6, x13\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -235,8 +209,8 @@ void fe_neg(fe r, const fe a) __asm__ __volatile__ ( "stp 
x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" + "ldp x2, x3, [%x[a]]\n\t" + "ldp x4, x5, [%x[a], #16]\n\t" "mov x6, #-19\n\t" "mov x7, #-1\n\t" "mov x8, #-1\n\t" @@ -245,8 +219,8 @@ void fe_neg(fe r, const fe a) "sbcs x7, x7, x3\n\t" "sbcs x8, x8, x4\n\t" "sbc x9, x9, x5\n\t" - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -254,51 +228,27 @@ void fe_neg(fe r, const fe a) ); } -void fe_cmov(fe a, const fe b, int c) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - "ldp x4, x5, [x0]\n\t" - "ldp x6, x7, [x0, #16]\n\t" - "ldp x8, x9, [x1]\n\t" - "ldp x10, x11, [x1, #16]\n\t" - "cmp %[c], #1\n\t" - "csel x4, x4, x8, eq\n\t" - "csel x5, x5, x9, eq\n\t" - "csel x6, x6, x10, eq\n\t" - "csel x7, x7, x11, eq\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) - : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" - ); -} - int fe_isnonzero(const fe a) { __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "mov x6, #19\n\t" - "ldp x1, x2, [x0]\n\t" - "ldp x3, x4, [x0, #16]\n\t" + "ldp x1, x2, [%x[a]]\n\t" + "ldp x3, x4, [%x[a], #16]\n\t" "adds x5, x1, x6\n\t" "adcs x5, x2, xzr\n\t" "adcs x5, x3, xzr\n\t" "adc x5, x4, xzr\n\t" - "asr x5, x5, #63\n\t" - "and x5, x5, x6\n\t" + "and x5, x6, x5, asr 63\n\t" "adds x1, x1, x5\n\t" "adcs x2, x2, xzr\n\t" "adcs x3, x3, xzr\n\t" "adc x4, x4, xzr\n\t" "and x4, x4, #0x7fffffffffffffff\n\t" - "orr %[a], x1, x2\n\t" + "orr %x[a], x1, x2\n\t" "orr x3, x3, x4\n\t" - "orr %[a], %[a], x3\n\t" + "orr %x[a], %x[a], x3\n\t" "ldp x29, x30, [sp], #16\n\t" : [a] "+r" (a) : @@ -313,15 +263,14 @@ int fe_isnegative(const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "mov x6, #19\n\t" - "ldp x1, x2, [x0]\n\t" - "ldp x3, x4, [x0, #16]\n\t" + "ldp x1, x2, [%x[a]]\n\t" + "ldp x3, x4, [%x[a], #16]\n\t" "adds x5, x1, x6\n\t" "adcs x5, x2, xzr\n\t" "adcs x5, x3, xzr\n\t" "adc x5, x4, xzr\n\t" - "and %[a], x1, #1\n\t" - "lsr x5, x5, #63\n\t" - "eor %[a], %[a], x5\n\t" + "and %x[a], x1, #1\n\t" + "eor %x[a], %x[a], x5, lsr 63\n\t" "ldp x29, x30, [sp], #16\n\t" : [a] "+r" (a) : @@ -335,9 +284,9 @@ void fe_cmov_table(fe* r, fe* base, signed char b) __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" - "sxtb %[b], w2\n\t" - "sbfx x15, %[b], #7, #1\n\t" - "eor x16, %[b], x15\n\t" + "sxtb %x[b], %w[b]\n\t" + "sbfx x15, %x[b], #7, #1\n\t" + "eor x16, %x[b], x15\n\t" "sub x16, x16, x15\n\t" "mov x3, #1\n\t" "mov x4, xzr\n\t" @@ -352,12 +301,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "mov x13, xzr\n\t" "mov x14, xzr\n\t" "cmp x16, #1\n\t" - "ldp x17, x18, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x1, #32]\n\t" - "ldp x23, x24, [x1, #48]\n\t" - "ldp x25, x26, [x1, #64]\n\t" - "ldp x27, x28, [x1, #80]\n\t" + "ldp x17, x18, [%x[base]]\n\t" + "ldp x19, x20, [%x[base], #16]\n\t" + "ldp x21, x22, [%x[base], #32]\n\t" + "ldp x23, x24, [%x[base], #48]\n\t" + "ldp x25, x26, [%x[base], #64]\n\t" + "ldp x27, x28, [%x[base], #80]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -371,12 +320,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #2\n\t" - "ldp x17, x18, [x1, 
#96]\n\t" - "ldp x19, x20, [x1, #112]\n\t" - "ldp x21, x22, [x1, #128]\n\t" - "ldp x23, x24, [x1, #144]\n\t" - "ldp x25, x26, [x1, #160]\n\t" - "ldp x27, x28, [x1, #176]\n\t" + "ldp x17, x18, [%x[base], #96]\n\t" + "ldp x19, x20, [%x[base], #112]\n\t" + "ldp x21, x22, [%x[base], #128]\n\t" + "ldp x23, x24, [%x[base], #144]\n\t" + "ldp x25, x26, [%x[base], #160]\n\t" + "ldp x27, x28, [%x[base], #176]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -390,12 +339,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #3\n\t" - "ldp x17, x18, [x1, #192]\n\t" - "ldp x19, x20, [x1, #208]\n\t" - "ldp x21, x22, [x1, #224]\n\t" - "ldp x23, x24, [x1, #240]\n\t" - "ldp x25, x26, [x1, #256]\n\t" - "ldp x27, x28, [x1, #272]\n\t" + "ldp x17, x18, [%x[base], #192]\n\t" + "ldp x19, x20, [%x[base], #208]\n\t" + "ldp x21, x22, [%x[base], #224]\n\t" + "ldp x23, x24, [%x[base], #240]\n\t" + "ldp x25, x26, [%x[base], #256]\n\t" + "ldp x27, x28, [%x[base], #272]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -409,12 +358,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #4\n\t" - "ldp x17, x18, [x1, #288]\n\t" - "ldp x19, x20, [x1, #304]\n\t" - "ldp x21, x22, [x1, #320]\n\t" - "ldp x23, x24, [x1, #336]\n\t" - "ldp x25, x26, [x1, #352]\n\t" - "ldp x27, x28, [x1, #368]\n\t" + "ldp x17, x18, [%x[base], #288]\n\t" + "ldp x19, x20, [%x[base], #304]\n\t" + "ldp x21, x22, [%x[base], #320]\n\t" + "ldp x23, x24, [%x[base], #336]\n\t" + "ldp x25, x26, [%x[base], #352]\n\t" + "ldp x27, x28, [%x[base], #368]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -427,14 +376,14 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x12, x26, x12, eq\n\t" "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" - "add %[base], %[base], #0x180\n\t" + "add %x[base], %x[base], #0x180\n\t" "cmp x16, #5\n\t" - "ldp x17, x18, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x1, #32]\n\t" - "ldp x23, x24, [x1, #48]\n\t" - "ldp x25, x26, [x1, #64]\n\t" - "ldp x27, x28, [x1, #80]\n\t" + "ldp x17, x18, [%x[base]]\n\t" + "ldp x19, x20, [%x[base], #16]\n\t" + "ldp x21, x22, [%x[base], #32]\n\t" + "ldp x23, x24, [%x[base], #48]\n\t" + "ldp x25, x26, [%x[base], #64]\n\t" + "ldp x27, x28, [%x[base], #80]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -448,12 +397,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #6\n\t" - "ldp x17, x18, [x1, #96]\n\t" - "ldp x19, x20, [x1, #112]\n\t" - "ldp x21, x22, [x1, #128]\n\t" - "ldp x23, x24, [x1, #144]\n\t" - "ldp x25, x26, [x1, #160]\n\t" - "ldp x27, x28, [x1, #176]\n\t" + "ldp x17, x18, [%x[base], #96]\n\t" + "ldp x19, x20, [%x[base], #112]\n\t" + "ldp x21, x22, [%x[base], #128]\n\t" + "ldp x23, x24, [%x[base], #144]\n\t" + "ldp x25, x26, [%x[base], #160]\n\t" + "ldp x27, x28, [%x[base], #176]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -467,12 +416,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #7\n\t" - "ldp x17, x18, [x1, #192]\n\t" - "ldp x19, x20, [x1, #208]\n\t" - "ldp x21, x22, [x1, #224]\n\t" - "ldp x23, x24, [x1, #240]\n\t" - "ldp x25, x26, [x1, #256]\n\t" - "ldp x27, x28, [x1, 
#272]\n\t" + "ldp x17, x18, [%x[base], #192]\n\t" + "ldp x19, x20, [%x[base], #208]\n\t" + "ldp x21, x22, [%x[base], #224]\n\t" + "ldp x23, x24, [%x[base], #240]\n\t" + "ldp x25, x26, [%x[base], #256]\n\t" + "ldp x27, x28, [%x[base], #272]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -486,12 +435,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #8\n\t" - "ldp x17, x18, [x1, #288]\n\t" - "ldp x19, x20, [x1, #304]\n\t" - "ldp x21, x22, [x1, #320]\n\t" - "ldp x23, x24, [x1, #336]\n\t" - "ldp x25, x26, [x1, #352]\n\t" - "ldp x27, x28, [x1, #368]\n\t" + "ldp x17, x18, [%x[base], #288]\n\t" + "ldp x19, x20, [%x[base], #304]\n\t" + "ldp x21, x22, [%x[base], #320]\n\t" + "ldp x23, x24, [%x[base], #336]\n\t" + "ldp x25, x26, [%x[base], #352]\n\t" + "ldp x27, x28, [%x[base], #368]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -512,7 +461,7 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "sbcs x18, x18, x12\n\t" "sbcs x19, x19, x13\n\t" "sbc x20, x20, x14\n\t" - "cmp %[b], #0\n\t" + "cmp %x[b], #0\n\t" "mov x15, x3\n\t" "csel x3, x7, x3, lt\n\t" "csel x7, x15, x7, lt\n\t" @@ -529,12 +478,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x12, x18, x12, lt\n\t" "csel x13, x19, x13, lt\n\t" "csel x14, x20, x14, lt\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "stp x7, x8, [x0, #32]\n\t" - "stp x9, x10, [x0, #48]\n\t" - "stp x11, x12, [x0, #64]\n\t" - "stp x13, x14, [x0, #80]\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" + "stp x7, x8, [%x[r], #32]\n\t" + "stp x9, x10, [%x[r], #48]\n\t" + "stp x11, x12, [%x[r], #64]\n\t" + "stp x13, x14, [%x[r], #80]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) : @@ -548,10 +497,10 @@ void fe_mul(fe r, const fe a, const fe b) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Multiply */ - "ldp x14, x15, [x1]\n\t" - "ldp x16, x17, [x1, #16]\n\t" - "ldp x18, x19, [x2]\n\t" - "ldp x20, x21, [x2, #16]\n\t" + "ldp x14, x15, [%x[a]]\n\t" + "ldp x16, x17, [%x[a], #16]\n\t" + "ldp x18, x19, [%x[b]]\n\t" + "ldp x20, x21, [%x[b], #16]\n\t" /* A[0] * B[0] */ "mul x6, x14, x18\n\t" "umulh x7, x14, x18\n\t" @@ -678,16 +627,15 @@ void fe_mul(fe r, const fe a, const fe b) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Store */ - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -701,8 +649,8 @@ void fe_sq(fe r, const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Square */ - "ldp x13, x14, [x1]\n\t" - "ldp x15, x16, [x1, #16]\n\t" + "ldp x13, x14, [%x[a]]\n\t" + "ldp x15, x16, [%x[a], #16]\n\t" /* A[0] * A[1] */ "mul x6, x13, x14\n\t" "umulh x7, x13, x14\n\t" @@ -797,16 +745,15 @@ void fe_sq(fe r, const fe a) "adcs x7, x7, xzr\n\t" "adc x8, x8, xzr\n\t" /* Reduce if top bit set */ - "asr x4, x8, #63\n\t" - "and x4, x4, x2\n\t" + "and x4, x2, x8, asr 63\n\t" "and x8, x8, #0x7fffffffffffffff\n\t" "adds x5, x5, x4\n\t" "adcs x6, x6, xzr\n\t" "adcs x7, x7, xzr\n\t" "adc x8, x8, xzr\n\t" /* Store */ - "stp x5, x6, [x0]\n\t" - "stp x7, x8, [x0, 
#16]\n\t" + "stp x5, x6, [%x[r]]\n\t" + "stp x7, x8, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -814,185 +761,14 @@ void fe_sq(fe r, const fe a) ); } -void fe_mul121666(fe r, fe a) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - /* Multiply by 121666 */ - "ldp x5, x6, [x1]\n\t" - "ldp x7, x8, [x1, #16]\n\t" - "mov x4, #0xdb42\n\t" - "movk x4, #1, lsl 16\n\t" - "mul x9, x5, x4\n\t" - "umulh x10, x5, x4\n\t" - "mul x2, x6, x4\n\t" - "umulh x3, x6, x4\n\t" - "adds x10, x10, x2\n\t" - "adc x11, xzr, x3\n\t" - "mul x2, x7, x4\n\t" - "umulh x3, x7, x4\n\t" - "adds x11, x11, x2\n\t" - "adc x12, xzr, x3\n\t" - "mul x2, x8, x4\n\t" - "umulh x3, x8, x4\n\t" - "adds x12, x12, x2\n\t" - "adc x3, xzr, x3\n\t" - "mov x4, #19\n\t" - "extr x3, x3, x12, #63\n\t" - "mul x3, x3, x4\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, xzr\n\t" - "adcs x11, x11, xzr\n\t" - "adc x12, x12, xzr\n\t" - "stp x9, x10, [x0]\n\t" - "stp x11, x12, [x0, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12" - ); -} - -void fe_sq2(fe r, const fe a) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - /* Square * 2 */ - "ldp x5, x6, [x1]\n\t" - "ldp x7, x8, [x1, #16]\n\t" - /* A[0] * A[1] */ - "mul x10, x5, x6\n\t" - "umulh x11, x5, x6\n\t" - /* A[0] * A[2] */ - "mul x2, x5, x7\n\t" - "umulh x12, x5, x7\n\t" - "adds x11, x11, x2\n\t" - "adc x12, x12, xzr\n\t" - /* A[0] * A[3] */ - "mul x2, x5, x8\n\t" - "umulh x13, x5, x8\n\t" - "adds x12, x12, x2\n\t" - "adc x13, x13, xzr\n\t" - /* A[1] * A[2] */ - "mul x2, x6, x7\n\t" - "umulh x3, x6, x7\n\t" - "adds x12, x12, x2\n\t" - "adcs x13, x13, x3\n\t" - "adc x14, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x2, x6, x8\n\t" - "umulh x3, x6, x8\n\t" - "adds x13, x13, x2\n\t" - "adc x14, x14, x3\n\t" - /* A[2] * A[3] */ - "mul x2, x7, x8\n\t" - "umulh x15, x7, x8\n\t" - "adds x14, x14, x2\n\t" - "adc x15, x15, xzr\n\t" - /* Double */ - "adds x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adcs x13, x13, x13\n\t" - "adcs x14, x14, x14\n\t" - "adcs x15, x15, x15\n\t" - "adc x16, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x9, x5, x5\n\t" - "umulh x17, x5, x5\n\t" - /* A[1] * A[1] */ - "mul x2, x6, x6\n\t" - "umulh x3, x6, x6\n\t" - "adds x10, x10, x17\n\t" - "adcs x11, x11, x2\n\t" - "adc x17, x3, xzr\n\t" - /* A[2] * A[2] */ - "mul x2, x7, x7\n\t" - "umulh x3, x7, x7\n\t" - "adds x12, x12, x17\n\t" - "adcs x13, x13, x2\n\t" - "adc x17, x3, xzr\n\t" - /* A[3] * A[3] */ - "mul x2, x8, x8\n\t" - "umulh x3, x8, x8\n\t" - "adds x14, x14, x17\n\t" - "adcs x15, x15, x2\n\t" - "adc x16, x16, x3\n\t" - /* Double and Reduce */ - "mov x2, #0x169\n\t" - /* Move top half into t4-t7 and remove top bit from t3 */ - "lsr x17, x16, #61\n\t" - "extr x16, x16, x15, #62\n\t" - "extr x15, x15, x14, #62\n\t" - "extr x14, x14, x13, #62\n\t" - "extr x13, x13, x12, #62\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "lsl x9, x9, #1\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - /* Two left, only one right */ - "and x16, x16, #0x7fffffffffffffff\n\t" - /* Multiply top bits by 19*19 */ - "mul x17, x17, x2\n\t" - /* Multiply top half by 19 */ - "mov x2, #19\n\t" - "mul x3, x2, x13\n\t" - "umulh x13, x2, x13\n\t" - "adds x9, x9, x3\n\t" - "mul x3, x2, x14\n\t" - "umulh x14, x2, 
x14\n\t" - "adcs x10, x10, x3\n\t" - "mul x3, x2, x15\n\t" - "umulh x15, x2, x15\n\t" - "adcs x11, x11, x3\n\t" - "mul x3, x2, x16\n\t" - "umulh x4, x2, x16\n\t" - "adcs x12, x12, x3\n\t" - "adc x4, x4, xzr\n\t" - /* Add remaining product results in */ - "adds x9, x9, x17\n\t" - "adcs x10, x10, x13\n\t" - "adcs x11, x11, x14\n\t" - "adcs x12, x12, x15\n\t" - "adc x4, x4, xzr\n\t" - /* Overflow */ - "extr x4, x4, x12, #63\n\t" - "mul x4, x4, x2\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - "adds x9, x9, x4\n\t" - "adcs x10, x10, xzr\n\t" - "adcs x11, x11, xzr\n\t" - "adc x12, x12, xzr\n\t" - /* Reduce if top bit set */ - "asr x4, x12, #63\n\t" - "and x4, x4, x2\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - "adds x9, x9, x4\n\t" - "adcs x10, x10, xzr\n\t" - "adcs x11, x11, xzr\n\t" - "adc x12, x12, xzr\n\t" - /* Store */ - "stp x9, x10, [x0]\n\t" - "stp x11, x12, [x0, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" - ); -} - void fe_invert(fe r, const fe a) { __asm__ __volatile__ ( "stp x29, x30, [sp, #-160]!\n\t" "add x29, sp, #0\n\t" /* Invert */ - "str %[r], [x29, #144]\n\t" - "str %[a], [x29, #152]\n\t" + "str %x[r], [x29, #144]\n\t" + "str %x[a], [x29, #152]\n\t" "add x0, x29, #16\n\t" "bl fe_sq\n\t" "add x0, x29, #48\n\t" @@ -1007,107 +783,107 @@ void fe_invert(fe r, const fe a) "add x1, x29, #16\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "add x0, x29, #48\n\t" "add x1, x29, #48\n\t" - "add x2, x29, #80\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x20, #4\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert1:\n\t" + "L_fe_invert1_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert1\n\t" + "bne L_fe_invert1_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "add x1, x29, #48\n\t" "bl fe_sq\n\t" "mov x20, #9\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert2:\n\t" + "L_fe_invert2_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert2\n\t" + "bne L_fe_invert2_%=\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" "bl fe_sq\n\t" "mov x20, #19\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_fe_invert3:\n\t" + "L_fe_invert3_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert3\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_fe_invert3_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "mov x20, #10\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert4:\n\t" + "L_fe_invert4_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert4\n\t" + "bne L_fe_invert4_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "add x1, x29, #48\n\t" "bl fe_sq\n\t" "mov x20, #49\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert5:\n\t" + "L_fe_invert5_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert5\n\t" + "bne L_fe_invert5_%=\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" 
"bl fe_sq\n\t" "mov x20, #0x63\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_fe_invert6:\n\t" + "L_fe_invert6_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert6\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_fe_invert6_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "mov x20, #50\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert7:\n\t" + "L_fe_invert7_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert7\n\t" + "bne L_fe_invert7_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" "mov x20, #5\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_invert8:\n\t" + "L_fe_invert8_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert8\n\t" + "bne L_fe_invert8_%=\n\t" "ldr x0, [x29, #144]\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" @@ -1124,11 +900,11 @@ int curve25519(byte* r, byte* n, byte* a) "stp x29, x30, [sp, #-192]!\n\t" "add x29, sp, #0\n\t" "mov x22, xzr\n\t" - "str %[r], [x29, #176]\n\t" + "str %x[r], [x29, #176]\n\t" /* Set one */ "mov x23, #1\n\t" - "stp x23, xzr, [x0]\n\t" - "stp xzr, xzr, [x0, #16]\n\t" + "stp x23, xzr, [%x[r]]\n\t" + "stp xzr, xzr, [%x[r], #16]\n\t" /* Set zero */ "stp xzr, xzr, [x29, #16]\n\t" "stp xzr, xzr, [x29, #32]\n\t" @@ -1137,24 +913,24 @@ int curve25519(byte* r, byte* n, byte* a) "stp x23, xzr, [x29, #48]\n\t" "stp xzr, xzr, [x29, #64]\n\t" /* Copy */ - "ldp x6, x7, [x2]\n\t" - "ldp x8, x9, [x2, #16]\n\t" + "ldp x6, x7, [%x[a]]\n\t" + "ldp x8, x9, [%x[a], #16]\n\t" "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" "mov x25, #62\n\t" "mov x24, #24\n\t" "\n" - "L_curve25519_words:\n\t" + "L_curve25519_words_%=: \n\t" "\n" - "L_curve25519_bits:\n\t" - "ldr x23, [x1, x24]\n\t" + "L_curve25519_bits_%=: \n\t" + "ldr x23, [%x[n], x24]\n\t" "lsr x23, x23, x25\n\t" "and x23, x23, #1\n\t" "eor x22, x22, x23\n\t" /* Conditional Swap */ "cmp x22, #1\n\t" - "ldp x10, x11, [x0]\n\t" - "ldp x12, x13, [x0, #16]\n\t" + "ldp x10, x11, [%x[r]]\n\t" + "ldp x12, x13, [%x[r], #16]\n\t" "ldp x6, x7, [x29, #80]\n\t" "ldp x8, x9, [x29, #96]\n\t" "csel x14, x10, x6, eq\n\t" @@ -1369,8 +1145,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x20, x20, xzr\n\t" "adc x21, x21, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x21, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x21, asr 63\n\t" "and x21, x21, #0x7fffffffffffffff\n\t" "adds x18, x18, x5\n\t" "adcs x19, x19, xzr\n\t" @@ -1508,8 +1283,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x20, x20, xzr\n\t" "adc x21, x21, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x21, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x21, asr 63\n\t" "and x21, x21, #0x7fffffffffffffff\n\t" "adds x18, x18, x5\n\t" "adcs x19, x19, xzr\n\t" @@ -1611,8 +1385,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x12, x12, xzr\n\t" "adc x13, x13, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x13, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x13, asr 63\n\t" "and x13, x13, #0x7fffffffffffffff\n\t" "adds x10, x10, x5\n\t" "adcs x11, x11, xzr\n\t" @@ -1714,8 +1487,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x16, x16, xzr\n\t" "adc x17, x17, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x17, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x17, asr 63\n\t" "and x17, x17, #0x7fffffffffffffff\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, xzr\n\t" @@ -1849,16 +1621,15 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" 
"adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Store */ - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" /* Sub */ "subs x14, x14, x10\n\t" "sbcs x15, x15, x11\n\t" @@ -2041,8 +1812,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -2178,8 +1948,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -2283,8 +2052,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -2292,8 +2060,8 @@ int curve25519(byte* r, byte* n, byte* a) "adc x9, x9, xzr\n\t" /* Store */ /* Multiply */ - "ldp x14, x15, [x2]\n\t" - "ldp x16, x17, [x2, #16]\n\t" + "ldp x14, x15, [%x[a]]\n\t" + "ldp x16, x17, [%x[a], #16]\n\t" /* A[0] * B[0] */ "mul x10, x14, x6\n\t" "umulh x11, x14, x6\n\t" @@ -2420,8 +2188,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x12, x12, xzr\n\t" "adc x13, x13, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x13, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x13, asr 63\n\t" "and x13, x13, #0x7fffffffffffffff\n\t" "adds x10, x10, x5\n\t" "adcs x11, x11, xzr\n\t" @@ -2432,135 +2199,135 @@ int curve25519(byte* r, byte* n, byte* a) "stp x12, x13, [x29, #64]\n\t" "sub x25, x25, #1\n\t" "cmp x25, #0\n\t" - "bge L_curve25519_bits\n\t" + "bge L_curve25519_bits_%=\n\t" "mov x25, #63\n\t" "sub x24, x24, #8\n\t" "cmp x24, #0\n\t" - "bge L_curve25519_words\n\t" + "bge L_curve25519_words_%=\n\t" /* Invert */ "add x0, x29, #48\n\t" "add x1, x29, #16\n\t" "bl fe_sq\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "add x1, x29, #48\n\t" "bl fe_sq\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "bl fe_sq\n\t" "add x1, x29, #16\n\t" - "add x2, x29, #80\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "add x0, x29, #48\n\t" "add x1, x29, #48\n\t" - "add x2, x29, #80\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" "bl fe_sq\n\t" - "add x0, x29, #80\n\t" - "add x1, x29, #80\n\t" - "add x2, x29, #112\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #0x70\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" "bl fe_sq\n\t" "mov x24, #4\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_1:\n\t" + "L_curve25519_inv_1_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_1\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" - "add x1, x29, #80\n\t" + "add x0, x29, #0x70\n\t" + "add x1, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x24, #9\n\t" - "add x1, x29, 
#112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_2:\n\t" + "L_curve25519_inv_2_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_2\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #144\n\t" + "add x0, x29, #0x90\n\t" "bl fe_sq\n\t" "mov x24, #19\n\t" - "add x1, x29, #144\n\t" + "add x1, x29, #0x90\n\t" "\n" - "L_curve25519_inv_3:\n\t" + "L_curve25519_inv_3_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_3\n\t" - "add x0, x29, #112\n\t" - "add x2, x29, #112\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add x0, x29, #0x70\n\t" + "add x2, x29, #0x70\n\t" "bl fe_mul\n\t" "mov x24, #10\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_4:\n\t" + "L_curve25519_inv_4_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_4\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" - "add x1, x29, #80\n\t" + "add x0, x29, #0x70\n\t" + "add x1, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x24, #49\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_5:\n\t" + "L_curve25519_inv_5_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_5\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #144\n\t" + "add x0, x29, #0x90\n\t" "bl fe_sq\n\t" "mov x24, #0x63\n\t" - "add x1, x29, #144\n\t" + "add x1, x29, #0x90\n\t" "\n" - "L_curve25519_inv_6:\n\t" + "L_curve25519_inv_6_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_6\n\t" - "add x0, x29, #112\n\t" - "add x2, x29, #112\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add x0, x29, #0x70\n\t" + "add x2, x29, #0x70\n\t" "bl fe_mul\n\t" "mov x24, #50\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_7:\n\t" + "L_curve25519_inv_7_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_7\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_7_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "mov x24, #5\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_curve25519_inv_8:\n\t" + "L_curve25519_inv_8_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_8\n\t" + "bne L_curve25519_inv_8_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "ldr %[r], [x29, #176]\n\t" + "ldr %x[r], [x29, #176]\n\t" /* Multiply */ - "ldp x6, x7, [x0]\n\t" - "ldp x8, x9, [x0, #16]\n\t" + "ldp x6, x7, [%x[r]]\n\t" + "ldp x8, x9, [%x[r], #16]\n\t" "ldp x10, x11, [x29, #16]\n\t" "ldp x12, x13, [x29, #32]\n\t" /* A[0] * B[0] */ @@ -2689,16 +2456,15 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x16, x16, xzr\n\t" "adc x17, x17, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x17, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x17, asr 63\n\t" "and x17, x17, #0x7fffffffffffffff\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, xzr\n\t" "adcs x16, x16, xzr\n\t" "adc x17, x17, xzr\n\t" /* Store */ - "stp x14, x15, [x0]\n\t" - "stp x16, x17, [x0, #16]\n\t" + "stp x14, x15, [%x[r]]\n\t" + "stp x16, x17, [%x[r], #16]\n\t" "mov x0, xzr\n\t" "ldp x29, x30, [sp], #0xc0\n\t" : [r] "+r" (r), [n] "+r" 
(n), [a] "+r" (a) @@ -2714,8 +2480,8 @@ void fe_pow22523(fe r, const fe a) "stp x29, x30, [sp, #-128]!\n\t" "add x29, sp, #0\n\t" /* pow22523 */ - "str %[r], [x29, #112]\n\t" - "str %[a], [x29, #120]\n\t" + "str %x[r], [x29, #112]\n\t" + "str %x[a], [x29, #120]\n\t" "add x0, x29, #16\n\t" "bl fe_sq\n\t" "add x0, x29, #48\n\t" @@ -2740,11 +2506,11 @@ void fe_pow22523(fe r, const fe a) "mov x21, #4\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_1:\n\t" + "L_fe_pow22523_1_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_1\n\t" + "bne L_fe_pow22523_1_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" @@ -2754,34 +2520,34 @@ void fe_pow22523(fe r, const fe a) "mov x21, #9\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_2:\n\t" + "L_fe_pow22523_2_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_2\n\t" + "bne L_fe_pow22523_2_%=\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x21, #19\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_pow22523_3:\n\t" + "L_fe_pow22523_3_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_3\n\t" + "bne L_fe_pow22523_3_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" "mov x21, #10\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_4:\n\t" + "L_fe_pow22523_4_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_4\n\t" + "bne L_fe_pow22523_4_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" @@ -2791,45 +2557,45 @@ void fe_pow22523(fe r, const fe a) "mov x21, #49\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_5:\n\t" + "L_fe_pow22523_5_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_5\n\t" + "bne L_fe_pow22523_5_%=\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x21, #0x63\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_pow22523_6:\n\t" + "L_fe_pow22523_6_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_6\n\t" + "bne L_fe_pow22523_6_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" "mov x21, #50\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_7:\n\t" + "L_fe_pow22523_7_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_7\n\t" + "bne L_fe_pow22523_7_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" "mov x21, #2\n\t" "add x1, x29, #16\n\t" "\n" - "L_fe_pow22523_8:\n\t" + "L_fe_pow22523_8_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_8\n\t" + "bne L_fe_pow22523_8_%=\n\t" "ldr x0, [x29, #112]\n\t" "ldr x2, [x29, #120]\n\t" "bl fe_mul\n\t" @@ -2845,12 +2611,12 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con __asm__ __volatile__ ( "stp x29, x30, [sp, #-64]!\n\t" "add x29, sp, #0\n\t" - "str %[ry], [x29, #16]\n\t" - "str %[rz], [x29, #24]\n\t" - "str %[px], [x29, #32]\n\t" - "str %[py], [x29, #40]\n\t" - "str %[pz], [x29, #48]\n\t" - "str %[pt], [x29, #56]\n\t" + "str %x[ry], [x29, #16]\n\t" + "str %x[rz], [x29, #24]\n\t" + "str %x[px], [x29, #32]\n\t" + "str %x[py], [x29, #40]\n\t" + "str %x[pz], [x29, #48]\n\t" + "str %x[pt], [x29, #56]\n\t" "ldr x1, [x29, #32]\n\t" "ldr x2, [x29, #56]\n\t" /* Multiply */ @@ -2984,8 +2750,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, 
const fe px, const fe py, const fe pz, con "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x21, x6, #63\n\t" - "and x21, x21, x19\n\t" + "and x21, x19, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" @@ -3128,8 +2893,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x21, x6, #63\n\t" - "and x21, x21, x19\n\t" + "and x21, x19, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" @@ -3269,8 +3033,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x21, x6, #63\n\t" - "and x21, x21, x19\n\t" + "and x21, x19, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" @@ -3291,13 +3054,13 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[ry], [x29, #16]\n\t" - "str %[rz], [x29, #24]\n\t" - "str %[rt], [x29, #32]\n\t" - "str %[px], [x29, #40]\n\t" - "str %[py], [x29, #48]\n\t" - "str %[pz], [x29, #56]\n\t" - "str %[pt], [x29, #64]\n\t" + "str %x[ry], [x29, #16]\n\t" + "str %x[rz], [x29, #24]\n\t" + "str %x[rt], [x29, #32]\n\t" + "str %x[px], [x29, #40]\n\t" + "str %x[py], [x29, #48]\n\t" + "str %x[pz], [x29, #56]\n\t" + "str %x[pt], [x29, #64]\n\t" "ldr x1, [x29, #40]\n\t" "ldr x2, [x29, #64]\n\t" /* Multiply */ @@ -3431,8 +3194,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3572,8 +3334,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3713,8 +3474,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3851,8 +3611,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3873,13 +3632,13 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, 
#48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" "ldr x1, [x29, #48]\n\t" /* Square */ "ldp x12, x13, [x1]\n\t" @@ -3978,8 +3737,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4087,8 +3845,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x11, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" "adds x8, x8, x26\n\t" "adcs x9, x9, xzr\n\t" @@ -4209,8 +3966,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x18, x18, xzr\n\t" "adc x19, x19, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x19, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x19, asr 63\n\t" "and x19, x19, #0x7fffffffffffffff\n\t" "adds x16, x16, x26\n\t" "adcs x17, x17, xzr\n\t" @@ -4383,8 +4139,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4421,14 +4176,14 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -4596,8 +4351,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -4735,8 +4489,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4913,8 +4666,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4990,14 +4742,14 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str 
%[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -5165,8 +4917,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -5304,8 +5055,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -5482,8 +5232,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -5559,14 +5308,14 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -5734,8 +5483,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -5873,8 +5621,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6051,8 +5798,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6209,8 +5955,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x11, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x11, 
asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" "adds x8, x8, x26\n\t" "adcs x9, x9, xzr\n\t" @@ -6269,14 +6014,14 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -6444,8 +6189,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -6583,8 +6327,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6761,8 +6504,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6919,8 +6661,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x11, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" "adds x8, x8, x26\n\t" "adcs x9, x9, xzr\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S new file mode 100644 index 000000000..a8cf8e742 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -0,0 +1,1049 @@ +/* armv8-sha512-asm + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S + */ +#ifdef __aarch64__ + .text + .section .rodata + .type L_SHA512_transform_neon_len_k, %object + .size L_SHA512_transform_neon_len_k, 640 + .align 3 +L_SHA512_transform_neon_len_k: + .xword 0x428a2f98d728ae22 + .xword 0x7137449123ef65cd + .xword 0xb5c0fbcfec4d3b2f + .xword 0xe9b5dba58189dbbc + .xword 0x3956c25bf348b538 + .xword 0x59f111f1b605d019 + .xword 0x923f82a4af194f9b + .xword 0xab1c5ed5da6d8118 + .xword 0xd807aa98a3030242 + .xword 0x12835b0145706fbe + .xword 0x243185be4ee4b28c + .xword 0x550c7dc3d5ffb4e2 + .xword 0x72be5d74f27b896f + .xword 0x80deb1fe3b1696b1 + .xword 0x9bdc06a725c71235 + .xword 0xc19bf174cf692694 + .xword 0xe49b69c19ef14ad2 + .xword 0xefbe4786384f25e3 + .xword 0xfc19dc68b8cd5b5 + .xword 0x240ca1cc77ac9c65 + .xword 0x2de92c6f592b0275 + .xword 0x4a7484aa6ea6e483 + .xword 0x5cb0a9dcbd41fbd4 + .xword 0x76f988da831153b5 + .xword 0x983e5152ee66dfab + .xword 0xa831c66d2db43210 + .xword 0xb00327c898fb213f + .xword 0xbf597fc7beef0ee4 + .xword 0xc6e00bf33da88fc2 + .xword 0xd5a79147930aa725 + .xword 0x6ca6351e003826f + .xword 0x142929670a0e6e70 + .xword 0x27b70a8546d22ffc + .xword 0x2e1b21385c26c926 + .xword 0x4d2c6dfc5ac42aed + .xword 0x53380d139d95b3df + .xword 0x650a73548baf63de + .xword 0x766a0abb3c77b2a8 + .xword 0x81c2c92e47edaee6 + .xword 0x92722c851482353b + .xword 0xa2bfe8a14cf10364 + .xword 0xa81a664bbc423001 + .xword 0xc24b8b70d0f89791 + .xword 0xc76c51a30654be30 + .xword 0xd192e819d6ef5218 + .xword 0xd69906245565a910 + .xword 0xf40e35855771202a + .xword 0x106aa07032bbd1b8 + .xword 0x19a4c116b8d2d0c8 + .xword 0x1e376c085141ab53 + .xword 0x2748774cdf8eeb99 + .xword 0x34b0bcb5e19b48a8 + .xword 0x391c0cb3c5c95a63 + .xword 0x4ed8aa4ae3418acb + .xword 0x5b9cca4f7763e373 + .xword 0x682e6ff3d6b2b8a3 + .xword 0x748f82ee5defb2fc + .xword 0x78a5636f43172f60 + .xword 0x84c87814a1f0ab72 + .xword 0x8cc702081a6439ec + .xword 0x90befffa23631e28 + .xword 0xa4506cebde82bde9 + .xword 0xbef9a3f7b2c67915 + .xword 0xc67178f2e372532b + .xword 0xca273eceea26619c + .xword 0xd186b8c721c0c207 + .xword 0xeada7dd6cde0eb1e + .xword 0xf57d4f7fee6ed178 + .xword 0x6f067aa72176fba + .xword 0xa637dc5a2c898a6 + .xword 0x113f9804bef90dae + .xword 0x1b710b35131c471b + .xword 0x28db77f523047d84 + .xword 0x32caab7b40c72493 + .xword 0x3c9ebe0a15c9bebc + .xword 0x431d67c49c100d4c + .xword 0x4cc5d4becb3e42b6 + .xword 0x597f299cfc657e2a + .xword 0x5fcb6fab3ad6faec + .xword 0x6c44198c4a475817 + .text + .section .rodata + .type L_SHA512_transform_neon_len_ror8, %object + .size L_SHA512_transform_neon_len_ror8, 16 + .align 4 +L_SHA512_transform_neon_len_ror8: + .xword 0x7060504030201, 0x80f0e0d0c0b0a09 + .text + .align 2 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + stp x29, x30, [sp, #-144]! 
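Editor's note: the L_SHA512_transform_neon_len_ror8 table above is a tbl permutation index that rotates each 64-bit lane right by 8 bits byte-wise; together with the shl/sri pairs (rotates by 1, 19 and 61) and the ushr shifts (by 7 and 6) in the rounds below, the vector code evaluates the SHA-512 message-schedule functions. A scalar C sketch of what those sequences compute, two words of which the NEON code produces per update (helper names are illustrative):

    #include <stdint.h>

    static uint64_t ror64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

    /* SHA-512 message schedule functions as computed by the NEON code:
     * sigma0: ror 1 (shl 63 + sri 1), ror 8 (tbl with the ror8 table), shr 7 (ushr)
     * sigma1: ror 19 (shl 45 + sri 19), ror 61 (shl 3 + sri 61), shr 6 (ushr)     */
    static uint64_t sigma0(uint64_t x) { return ror64(x,  1) ^ ror64(x,  8) ^ (x >> 7); }
    static uint64_t sigma1(uint64_t x) { return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); }

    /* One schedule update over a 16-entry circular buffer:
     * W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] */
    static uint64_t next_w(const uint64_t w[16], unsigned t)
    {
        return sigma1(w[(t -  2) & 15]) + w[(t -  7) & 15]
             + sigma0(w[(t - 15) & 15]) + w[(t - 16) & 15];
    }

The schedule lives in v0-v7 (two 64-bit words per vector), which is why the ext instructions pair adjacent words before applying sigma0.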
+ add x29, sp, #0 + str x17, [x29, #16] + stp x18, x19, [x29, #24] + stp x20, x21, [x29, #40] + stp x22, x23, [x29, #56] + stp x24, x25, [x29, #72] + stp x26, x27, [x29, #88] + str x28, [x29, #104] + stp d8, d9, [x29, #112] + stp d10, d11, [x29, #128] + adr x3, L_SHA512_transform_neon_len_k + adr x28, L_SHA512_transform_neon_len_ror8 + ld1 {v11.16b}, [x28] + # Load digest into working vars + ldp x4, x5, [x0] + ldp x6, x7, [x0, #16] + ldp x8, x9, [x0, #32] + ldp x10, x11, [x0, #48] + # Start of loop processing a block +L_sha512_len_neon_begin: + # Load W + # Copy digest to add in at end + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #0x40 + mov x20, x4 + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #0x40 + mov x21, x5 + rev64 v0.16b, v0.16b + mov x22, x6 + rev64 v1.16b, v1.16b + mov x23, x7 + rev64 v2.16b, v2.16b + mov x24, x8 + rev64 v3.16b, v3.16b + mov x25, x9 + rev64 v4.16b, v4.16b + mov x26, x10 + rev64 v5.16b, v5.16b + mov x27, x11 + rev64 v6.16b, v6.16b + rev64 v7.16b, v7.16b + # Pre-calc: b ^ c + eor x16, x5, x6 + mov x28, #4 + # Start of 16 rounds +L_sha512_len_neon_start: + # Round 0 + mov x18, v0.d[0] + ldr x19, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x18 + eor x12, x12, x10 + add x11, x11, x19 + eor x16, x16, x5 + add x11, x11, x12 + add x15, x15, x16 + add x7, x7, x11 + add x11, x11, x15 + # Round 1 + mov x18, v0.d[1] + ldr x19, [x3], #8 + ext v10.16b, v0.16b, v1.16b, #8 + ror x12, x7, #14 + shl v8.2d, v7.2d, #45 + ror x14, x11, #28 + sri v8.2d, v7.2d, #19 + eor x12, x12, x7, ror 18 + shl v9.2d, v7.2d, #3 + eor x14, x14, x11, ror 34 + sri v9.2d, v7.2d, #61 + eor x12, x12, x7, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x11, ror 39 + ushr v8.2d, v7.2d, #6 + add x10, x10, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x11, x4 + add v0.2d, v0.2d, v9.2d + eor x12, x8, x9 + ext v9.16b, v4.16b, v5.16b, #8 + and x17, x16, x17 + add v0.2d, v0.2d, v9.2d + and x12, x12, x7 + shl v8.2d, v10.2d, #63 + add x10, x10, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x9 + tbl v9.16b, {v10.16b}, v11.16b + add x10, x10, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x4 + ushr v10.2d, v10.2d, #7 + add x10, x10, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, x17 + add v0.2d, v0.2d, v9.2d + add x6, x6, x10 + add x10, x10, x15 + # Round 2 + mov x18, v1.d[0] + ldr x19, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x18 + eor x12, x12, x8 + add x9, x9, x19 + eor x16, x16, x11 + add x9, x9, x12 + add x15, x15, x16 + add x5, x5, x9 + add x9, x9, x15 + # Round 3 + mov x18, v1.d[1] + ldr x19, [x3], #8 + ext v10.16b, v1.16b, v2.16b, #8 + ror x12, x5, #14 + shl v8.2d, v0.2d, #45 + ror x14, x9, #28 + sri v8.2d, v0.2d, #19 + eor x12, x12, x5, ror 18 + shl v9.2d, v0.2d, #3 + eor x14, x14, x9, ror 34 + sri v9.2d, v0.2d, #61 + eor x12, x12, x5, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x9, ror 39 + ushr v8.2d, v0.2d, #6 + add x8, x8, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x9, x10 + add v1.2d, v1.2d, v9.2d + eor x12, x6, x7 + ext v9.16b, v5.16b, v6.16b, #8 + and x17, x16, x17 + add v1.2d, v1.2d, v9.2d + and x12, x12, x5 + shl v8.2d, v10.2d, #63 + add x8, x8, x18 + 
sri v8.2d, v10.2d, #1 + eor x12, x12, x7 + tbl v9.16b, {v10.16b}, v11.16b + add x8, x8, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x10 + ushr v10.2d, v10.2d, #7 + add x8, x8, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, x17 + add v1.2d, v1.2d, v9.2d + add x4, x4, x8 + add x8, x8, x15 + # Round 4 + mov x18, v2.d[0] + ldr x19, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and x12, x12, x4 + add x7, x7, x18 + eor x12, x12, x6 + add x7, x7, x19 + eor x16, x16, x9 + add x7, x7, x12 + add x15, x15, x16 + add x11, x11, x7 + add x7, x7, x15 + # Round 5 + mov x18, v2.d[1] + ldr x19, [x3], #8 + ext v10.16b, v2.16b, v3.16b, #8 + ror x12, x11, #14 + shl v8.2d, v1.2d, #45 + ror x14, x7, #28 + sri v8.2d, v1.2d, #19 + eor x12, x12, x11, ror 18 + shl v9.2d, v1.2d, #3 + eor x14, x14, x7, ror 34 + sri v9.2d, v1.2d, #61 + eor x12, x12, x11, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x7, ror 39 + ushr v8.2d, v1.2d, #6 + add x6, x6, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x7, x8 + add v2.2d, v2.2d, v9.2d + eor x12, x4, x5 + ext v9.16b, v6.16b, v7.16b, #8 + and x17, x16, x17 + add v2.2d, v2.2d, v9.2d + and x12, x12, x11 + shl v8.2d, v10.2d, #63 + add x6, x6, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x5 + tbl v9.16b, {v10.16b}, v11.16b + add x6, x6, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x8 + ushr v10.2d, v10.2d, #7 + add x6, x6, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, x17 + add v2.2d, v2.2d, v9.2d + add x10, x10, x6 + add x6, x6, x15 + # Round 6 + mov x18, v3.d[0] + ldr x19, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x18 + eor x12, x12, x4 + add x5, x5, x19 + eor x16, x16, x7 + add x5, x5, x12 + add x15, x15, x16 + add x9, x9, x5 + add x5, x5, x15 + # Round 7 + mov x18, v3.d[1] + ldr x19, [x3], #8 + ext v10.16b, v3.16b, v4.16b, #8 + ror x12, x9, #14 + shl v8.2d, v2.2d, #45 + ror x14, x5, #28 + sri v8.2d, v2.2d, #19 + eor x12, x12, x9, ror 18 + shl v9.2d, v2.2d, #3 + eor x14, x14, x5, ror 34 + sri v9.2d, v2.2d, #61 + eor x12, x12, x9, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x5, ror 39 + ushr v8.2d, v2.2d, #6 + add x4, x4, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x5, x6 + add v3.2d, v3.2d, v9.2d + eor x12, x10, x11 + ext v9.16b, v7.16b, v0.16b, #8 + and x17, x16, x17 + add v3.2d, v3.2d, v9.2d + and x12, x12, x9 + shl v8.2d, v10.2d, #63 + add x4, x4, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x11 + tbl v9.16b, {v10.16b}, v11.16b + add x4, x4, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x6 + ushr v10.2d, v10.2d, #7 + add x4, x4, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, x17 + add v3.2d, v3.2d, v9.2d + add x8, x8, x4 + add x4, x4, x15 + # Round 8 + mov x18, v4.d[0] + ldr x19, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x18 + eor x12, x12, x10 + add x11, x11, x19 + eor x16, x16, x5 + add x11, x11, x12 + add x15, x15, x16 + add x7, x7, x11 + add x11, x11, x15 + # Round 9 + mov x18, v4.d[1] + ldr x19, [x3], #8 + ext v10.16b, 
v4.16b, v5.16b, #8 + ror x12, x7, #14 + shl v8.2d, v3.2d, #45 + ror x14, x11, #28 + sri v8.2d, v3.2d, #19 + eor x12, x12, x7, ror 18 + shl v9.2d, v3.2d, #3 + eor x14, x14, x11, ror 34 + sri v9.2d, v3.2d, #61 + eor x12, x12, x7, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x11, ror 39 + ushr v8.2d, v3.2d, #6 + add x10, x10, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x11, x4 + add v4.2d, v4.2d, v9.2d + eor x12, x8, x9 + ext v9.16b, v0.16b, v1.16b, #8 + and x17, x16, x17 + add v4.2d, v4.2d, v9.2d + and x12, x12, x7 + shl v8.2d, v10.2d, #63 + add x10, x10, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x9 + tbl v9.16b, {v10.16b}, v11.16b + add x10, x10, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x4 + ushr v10.2d, v10.2d, #7 + add x10, x10, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, x17 + add v4.2d, v4.2d, v9.2d + add x6, x6, x10 + add x10, x10, x15 + # Round 10 + mov x18, v5.d[0] + ldr x19, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x18 + eor x12, x12, x8 + add x9, x9, x19 + eor x16, x16, x11 + add x9, x9, x12 + add x15, x15, x16 + add x5, x5, x9 + add x9, x9, x15 + # Round 11 + mov x18, v5.d[1] + ldr x19, [x3], #8 + ext v10.16b, v5.16b, v6.16b, #8 + ror x12, x5, #14 + shl v8.2d, v4.2d, #45 + ror x14, x9, #28 + sri v8.2d, v4.2d, #19 + eor x12, x12, x5, ror 18 + shl v9.2d, v4.2d, #3 + eor x14, x14, x9, ror 34 + sri v9.2d, v4.2d, #61 + eor x12, x12, x5, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x9, ror 39 + ushr v8.2d, v4.2d, #6 + add x8, x8, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x9, x10 + add v5.2d, v5.2d, v9.2d + eor x12, x6, x7 + ext v9.16b, v1.16b, v2.16b, #8 + and x17, x16, x17 + add v5.2d, v5.2d, v9.2d + and x12, x12, x5 + shl v8.2d, v10.2d, #63 + add x8, x8, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x7 + tbl v9.16b, {v10.16b}, v11.16b + add x8, x8, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x10 + ushr v10.2d, v10.2d, #7 + add x8, x8, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, x17 + add v5.2d, v5.2d, v9.2d + add x4, x4, x8 + add x8, x8, x15 + # Round 12 + mov x18, v6.d[0] + ldr x19, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and x12, x12, x4 + add x7, x7, x18 + eor x12, x12, x6 + add x7, x7, x19 + eor x16, x16, x9 + add x7, x7, x12 + add x15, x15, x16 + add x11, x11, x7 + add x7, x7, x15 + # Round 13 + mov x18, v6.d[1] + ldr x19, [x3], #8 + ext v10.16b, v6.16b, v7.16b, #8 + ror x12, x11, #14 + shl v8.2d, v5.2d, #45 + ror x14, x7, #28 + sri v8.2d, v5.2d, #19 + eor x12, x12, x11, ror 18 + shl v9.2d, v5.2d, #3 + eor x14, x14, x7, ror 34 + sri v9.2d, v5.2d, #61 + eor x12, x12, x11, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x7, ror 39 + ushr v8.2d, v5.2d, #6 + add x6, x6, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x7, x8 + add v6.2d, v6.2d, v9.2d + eor x12, x4, x5 + ext v9.16b, v2.16b, v3.16b, #8 + and x17, x16, x17 + add v6.2d, v6.2d, v9.2d + and x12, x12, x11 + shl v8.2d, v10.2d, #63 + add x6, x6, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x5 + tbl v9.16b, {v10.16b}, v11.16b + add x6, x6, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x8 + ushr v10.2d, v10.2d, #7 + add x6, x6, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, 
x17 + add v6.2d, v6.2d, v9.2d + add x10, x10, x6 + add x6, x6, x15 + # Round 14 + mov x18, v7.d[0] + ldr x19, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x18 + eor x12, x12, x4 + add x5, x5, x19 + eor x16, x16, x7 + add x5, x5, x12 + add x15, x15, x16 + add x9, x9, x5 + add x5, x5, x15 + # Round 15 + mov x18, v7.d[1] + ldr x19, [x3], #8 + ext v10.16b, v7.16b, v0.16b, #8 + ror x12, x9, #14 + shl v8.2d, v6.2d, #45 + ror x14, x5, #28 + sri v8.2d, v6.2d, #19 + eor x12, x12, x9, ror 18 + shl v9.2d, v6.2d, #3 + eor x14, x14, x5, ror 34 + sri v9.2d, v6.2d, #61 + eor x12, x12, x9, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x15, x14, x5, ror 39 + ushr v8.2d, v6.2d, #6 + add x4, x4, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x5, x6 + add v7.2d, v7.2d, v9.2d + eor x12, x10, x11 + ext v9.16b, v3.16b, v4.16b, #8 + and x17, x16, x17 + add v7.2d, v7.2d, v9.2d + and x12, x12, x9 + shl v8.2d, v10.2d, #63 + add x4, x4, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x11 + tbl v9.16b, {v10.16b}, v11.16b + add x4, x4, x19 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x6 + ushr v10.2d, v10.2d, #7 + add x4, x4, x12 + eor v9.16b, v9.16b, v10.16b + add x15, x15, x17 + add v7.2d, v7.2d, v9.2d + add x8, x8, x4 + add x4, x4, x15 + subs x28, x28, #1 + bne L_sha512_len_neon_start + # Round 0 + mov x18, v0.d[0] + ldr x19, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x18 + eor x12, x12, x10 + add x11, x11, x19 + eor x16, x16, x5 + add x11, x11, x12 + add x15, x15, x16 + add x7, x7, x11 + add x11, x11, x15 + # Round 1 + mov x18, v0.d[1] + ldr x19, [x3], #8 + ror x12, x7, #14 + ror x14, x11, #28 + eor x12, x12, x7, ror 18 + eor x14, x14, x11, ror 34 + eor x12, x12, x7, ror 41 + eor x15, x14, x11, ror 39 + add x10, x10, x12 + eor x16, x11, x4 + eor x12, x8, x9 + and x17, x16, x17 + and x12, x12, x7 + add x10, x10, x18 + eor x12, x12, x9 + add x10, x10, x19 + eor x17, x17, x4 + add x10, x10, x12 + add x15, x15, x17 + add x6, x6, x10 + add x10, x10, x15 + # Round 2 + mov x18, v1.d[0] + ldr x19, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x18 + eor x12, x12, x8 + add x9, x9, x19 + eor x16, x16, x11 + add x9, x9, x12 + add x15, x15, x16 + add x5, x5, x9 + add x9, x9, x15 + # Round 3 + mov x18, v1.d[1] + ldr x19, [x3], #8 + ror x12, x5, #14 + ror x14, x9, #28 + eor x12, x12, x5, ror 18 + eor x14, x14, x9, ror 34 + eor x12, x12, x5, ror 41 + eor x15, x14, x9, ror 39 + add x8, x8, x12 + eor x16, x9, x10 + eor x12, x6, x7 + and x17, x16, x17 + and x12, x12, x5 + add x8, x8, x18 + eor x12, x12, x7 + add x8, x8, x19 + eor x17, x17, x10 + add x8, x8, x12 + add x15, x15, x17 + add x4, x4, x8 + add x8, x8, x15 + # Round 4 + mov x18, v2.d[0] + ldr x19, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and 
x12, x12, x4 + add x7, x7, x18 + eor x12, x12, x6 + add x7, x7, x19 + eor x16, x16, x9 + add x7, x7, x12 + add x15, x15, x16 + add x11, x11, x7 + add x7, x7, x15 + # Round 5 + mov x18, v2.d[1] + ldr x19, [x3], #8 + ror x12, x11, #14 + ror x14, x7, #28 + eor x12, x12, x11, ror 18 + eor x14, x14, x7, ror 34 + eor x12, x12, x11, ror 41 + eor x15, x14, x7, ror 39 + add x6, x6, x12 + eor x16, x7, x8 + eor x12, x4, x5 + and x17, x16, x17 + and x12, x12, x11 + add x6, x6, x18 + eor x12, x12, x5 + add x6, x6, x19 + eor x17, x17, x8 + add x6, x6, x12 + add x15, x15, x17 + add x10, x10, x6 + add x6, x6, x15 + # Round 6 + mov x18, v3.d[0] + ldr x19, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x18 + eor x12, x12, x4 + add x5, x5, x19 + eor x16, x16, x7 + add x5, x5, x12 + add x15, x15, x16 + add x9, x9, x5 + add x5, x5, x15 + # Round 7 + mov x18, v3.d[1] + ldr x19, [x3], #8 + ror x12, x9, #14 + ror x14, x5, #28 + eor x12, x12, x9, ror 18 + eor x14, x14, x5, ror 34 + eor x12, x12, x9, ror 41 + eor x15, x14, x5, ror 39 + add x4, x4, x12 + eor x16, x5, x6 + eor x12, x10, x11 + and x17, x16, x17 + and x12, x12, x9 + add x4, x4, x18 + eor x12, x12, x11 + add x4, x4, x19 + eor x17, x17, x6 + add x4, x4, x12 + add x15, x15, x17 + add x8, x8, x4 + add x4, x4, x15 + # Round 8 + mov x18, v4.d[0] + ldr x19, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x18 + eor x12, x12, x10 + add x11, x11, x19 + eor x16, x16, x5 + add x11, x11, x12 + add x15, x15, x16 + add x7, x7, x11 + add x11, x11, x15 + # Round 9 + mov x18, v4.d[1] + ldr x19, [x3], #8 + ror x12, x7, #14 + ror x14, x11, #28 + eor x12, x12, x7, ror 18 + eor x14, x14, x11, ror 34 + eor x12, x12, x7, ror 41 + eor x15, x14, x11, ror 39 + add x10, x10, x12 + eor x16, x11, x4 + eor x12, x8, x9 + and x17, x16, x17 + and x12, x12, x7 + add x10, x10, x18 + eor x12, x12, x9 + add x10, x10, x19 + eor x17, x17, x4 + add x10, x10, x12 + add x15, x15, x17 + add x6, x6, x10 + add x10, x10, x15 + # Round 10 + mov x18, v5.d[0] + ldr x19, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x18 + eor x12, x12, x8 + add x9, x9, x19 + eor x16, x16, x11 + add x9, x9, x12 + add x15, x15, x16 + add x5, x5, x9 + add x9, x9, x15 + # Round 11 + mov x18, v5.d[1] + ldr x19, [x3], #8 + ror x12, x5, #14 + ror x14, x9, #28 + eor x12, x12, x5, ror 18 + eor x14, x14, x9, ror 34 + eor x12, x12, x5, ror 41 + eor x15, x14, x9, ror 39 + add x8, x8, x12 + eor x16, x9, x10 + eor x12, x6, x7 + and x17, x16, x17 + and x12, x12, x5 + add x8, x8, x18 + eor x12, x12, x7 + add x8, x8, x19 + eor x17, x17, x10 + add x8, x8, x12 + add x15, x15, x17 + add x4, x4, x8 + add x8, x8, x15 + # Round 12 + mov x18, v6.d[0] + ldr x19, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and 
x12, x12, x4 + add x7, x7, x18 + eor x12, x12, x6 + add x7, x7, x19 + eor x16, x16, x9 + add x7, x7, x12 + add x15, x15, x16 + add x11, x11, x7 + add x7, x7, x15 + # Round 13 + mov x18, v6.d[1] + ldr x19, [x3], #8 + ror x12, x11, #14 + ror x14, x7, #28 + eor x12, x12, x11, ror 18 + eor x14, x14, x7, ror 34 + eor x12, x12, x11, ror 41 + eor x15, x14, x7, ror 39 + add x6, x6, x12 + eor x16, x7, x8 + eor x12, x4, x5 + and x17, x16, x17 + and x12, x12, x11 + add x6, x6, x18 + eor x12, x12, x5 + add x6, x6, x19 + eor x17, x17, x8 + add x6, x6, x12 + add x15, x15, x17 + add x10, x10, x6 + add x6, x6, x15 + # Round 14 + mov x18, v7.d[0] + ldr x19, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x18 + eor x12, x12, x4 + add x5, x5, x19 + eor x16, x16, x7 + add x5, x5, x12 + add x15, x15, x16 + add x9, x9, x5 + add x5, x5, x15 + # Round 15 + mov x18, v7.d[1] + ldr x19, [x3], #8 + ror x12, x9, #14 + ror x14, x5, #28 + eor x12, x12, x9, ror 18 + eor x14, x14, x5, ror 34 + eor x12, x12, x9, ror 41 + eor x15, x14, x5, ror 39 + add x4, x4, x12 + eor x16, x5, x6 + eor x12, x10, x11 + and x17, x16, x17 + and x12, x12, x9 + add x4, x4, x18 + eor x12, x12, x11 + add x4, x4, x19 + eor x17, x17, x6 + add x4, x4, x12 + add x15, x15, x17 + add x8, x8, x4 + add x4, x4, x15 + add x11, x11, x27 + add x10, x10, x26 + add x9, x9, x25 + add x8, x8, x24 + add x7, x7, x23 + add x6, x6, x22 + add x5, x5, x21 + add x4, x4, x20 + adr x3, L_SHA512_transform_neon_len_k + subs w2, w2, #0x80 + bne L_sha512_len_neon_begin + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x8, x9, [x0, #32] + stp x10, x11, [x0, #48] + eor x0, x0, x0 + ldr x17, [x29, #16] + ldp x18, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldp x22, x23, [x29, #56] + ldp x24, x25, [x29, #72] + ldp x26, x27, [x29, #88] + ldr x28, [x29, #104] + ldp d8, d9, [x29, #112] + ldp d10, d11, [x29, #128] + ldp x29, x30, [sp], #0x90 + ret + .size Transform_Sha512_Len,.-Transform_Sha512_Len +#endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-sha512-asm.c new file mode 100644 index 000000000..dbc5a7dee --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.c @@ -0,0 +1,1034 @@ +/* armv8-sha512-asm + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c + */ +#ifdef __aarch64__ +#include +#include + +static const uint64_t L_SHA512_transform_neon_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, + 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +static const uint64_t L_SHA512_transform_neon_len_ror8[] = { + 0x7060504030201UL, + 0x80f0e0d0c0b0a09UL, +}; + +int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" + "adr x28, %[L_SHA512_transform_neon_len_ror8]\n\t" + "ld1 {v11.16b}, [x28]\n\t" + /* Load digest into working vars */ + "ldp x4, x5, [%x[sha512]]\n\t" + "ldp x6, x7, [%x[sha512], #16]\n\t" + "ldp x8, x9, [%x[sha512], #32]\n\t" + "ldp x10, x11, [%x[sha512], #48]\n\t" + /* Start of loop processing a block */ + "\n" + "L_sha512_len_neon_begin_%=: \n\t" + /* Load W */ + /* Copy digest to add in at end */ + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t" + "mov x20, x4\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t" + "mov x21, x5\n\t" + "rev64 v0.16b, v0.16b\n\t" + "mov x22, x6\n\t" + "rev64 v1.16b, v1.16b\n\t" + "mov x23, x7\n\t" + "rev64 v2.16b, v2.16b\n\t" + "mov x24, x8\n\t" + "rev64 v3.16b, v3.16b\n\t" + "mov x25, x9\n\t" + "rev64 v4.16b, v4.16b\n\t" + "mov x26, x10\n\t" + "rev64 v5.16b, 
v5.16b\n\t" + "mov x27, x11\n\t" + "rev64 v6.16b, v6.16b\n\t" + "rev64 v7.16b, v7.16b\n\t" + /* Pre-calc: b ^ c */ + "eor x16, x5, x6\n\t" + "mov x28, #4\n\t" + /* Start of 16 rounds */ + "\n" + "L_sha512_len_neon_start_%=: \n\t" + /* Round 0 */ + "mov x18, v0.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x18\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x19\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x15, x15, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x15\n\t" + /* Round 1 */ + "mov x18, v0.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v0.16b, v1.16b, #8\n\t" + "ror x12, x7, #14\n\t" + "shl v8.2d, v7.2d, #45\n\t" + "ror x14, x11, #28\n\t" + "sri v8.2d, v7.2d, #19\n\t" + "eor x12, x12, x7, ror 18\n\t" + "shl v9.2d, v7.2d, #3\n\t" + "eor x14, x14, x11, ror 34\n\t" + "sri v9.2d, v7.2d, #61\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x11, ror 39\n\t" + "ushr v8.2d, v7.2d, #6\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x11, x4\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" + "eor x12, x8, x9\n\t" + "ext v9.16b, v4.16b, v5.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" + "and x12, x12, x7\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x10, x10, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x9\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x10, x10, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x4\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x15, x15, x17\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x15\n\t" + /* Round 2 */ + "mov x18, v1.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x18\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x19\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x15, x15, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x15\n\t" + /* Round 3 */ + "mov x18, v1.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v1.16b, v2.16b, #8\n\t" + "ror x12, x5, #14\n\t" + "shl v8.2d, v0.2d, #45\n\t" + "ror x14, x9, #28\n\t" + "sri v8.2d, v0.2d, #19\n\t" + "eor x12, x12, x5, ror 18\n\t" + "shl v9.2d, v0.2d, #3\n\t" + "eor x14, x14, x9, ror 34\n\t" + "sri v9.2d, v0.2d, #61\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x9, ror 39\n\t" + "ushr v8.2d, v0.2d, #6\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x9, x10\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" + "eor x12, x6, x7\n\t" + "ext v9.16b, v5.16b, v6.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" + "and x12, x12, x5\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x8, x8, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x7\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x8, x8, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x10\n\t" + "ushr v10.2d, v10.2d, 
#7\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x15, x15, x17\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" + "add x4, x4, x8\n\t" + "add x8, x8, x15\n\t" + /* Round 4 */ + "mov x18, v2.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x18\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x19\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x15, x15, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x15\n\t" + /* Round 5 */ + "mov x18, v2.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v2.16b, v3.16b, #8\n\t" + "ror x12, x11, #14\n\t" + "shl v8.2d, v1.2d, #45\n\t" + "ror x14, x7, #28\n\t" + "sri v8.2d, v1.2d, #19\n\t" + "eor x12, x12, x11, ror 18\n\t" + "shl v9.2d, v1.2d, #3\n\t" + "eor x14, x14, x7, ror 34\n\t" + "sri v9.2d, v1.2d, #61\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x7, ror 39\n\t" + "ushr v8.2d, v1.2d, #6\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x7, x8\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" + "eor x12, x4, x5\n\t" + "ext v9.16b, v6.16b, v7.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" + "and x12, x12, x11\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x6, x6, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x5\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x6, x6, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x8\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x15, x15, x17\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x15\n\t" + /* Round 6 */ + "mov x18, v3.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x18\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x19\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x15, x15, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x15\n\t" + /* Round 7 */ + "mov x18, v3.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v3.16b, v4.16b, #8\n\t" + "ror x12, x9, #14\n\t" + "shl v8.2d, v2.2d, #45\n\t" + "ror x14, x5, #28\n\t" + "sri v8.2d, v2.2d, #19\n\t" + "eor x12, x12, x9, ror 18\n\t" + "shl v9.2d, v2.2d, #3\n\t" + "eor x14, x14, x5, ror 34\n\t" + "sri v9.2d, v2.2d, #61\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x5, ror 39\n\t" + "ushr v8.2d, v2.2d, #6\n\t" + "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x5, x6\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" + "eor x12, x10, x11\n\t" + "ext v9.16b, v7.16b, v0.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" + "and x12, x12, x9\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x4, x4, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x11\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x4, x4, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x6\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add 
x15, x15, x17\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x15\n\t" + /* Round 8 */ + "mov x18, v4.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x18\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x19\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x15, x15, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x15\n\t" + /* Round 9 */ + "mov x18, v4.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v4.16b, v5.16b, #8\n\t" + "ror x12, x7, #14\n\t" + "shl v8.2d, v3.2d, #45\n\t" + "ror x14, x11, #28\n\t" + "sri v8.2d, v3.2d, #19\n\t" + "eor x12, x12, x7, ror 18\n\t" + "shl v9.2d, v3.2d, #3\n\t" + "eor x14, x14, x11, ror 34\n\t" + "sri v9.2d, v3.2d, #61\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x11, ror 39\n\t" + "ushr v8.2d, v3.2d, #6\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x11, x4\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" + "eor x12, x8, x9\n\t" + "ext v9.16b, v0.16b, v1.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" + "and x12, x12, x7\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x10, x10, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x9\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x10, x10, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x4\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x15, x15, x17\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x15\n\t" + /* Round 10 */ + "mov x18, v5.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x18\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x19\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x15, x15, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x15\n\t" + /* Round 11 */ + "mov x18, v5.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v5.16b, v6.16b, #8\n\t" + "ror x12, x5, #14\n\t" + "shl v8.2d, v4.2d, #45\n\t" + "ror x14, x9, #28\n\t" + "sri v8.2d, v4.2d, #19\n\t" + "eor x12, x12, x5, ror 18\n\t" + "shl v9.2d, v4.2d, #3\n\t" + "eor x14, x14, x9, ror 34\n\t" + "sri v9.2d, v4.2d, #61\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x9, ror 39\n\t" + "ushr v8.2d, v4.2d, #6\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x9, x10\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" + "eor x12, x6, x7\n\t" + "ext v9.16b, v1.16b, v2.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" + "and x12, x12, x5\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x8, x8, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x7\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x8, x8, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x10\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x15, x15, x17\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" + 
"add x4, x4, x8\n\t" + "add x8, x8, x15\n\t" + /* Round 12 */ + "mov x18, v6.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x18\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x19\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x15, x15, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x15\n\t" + /* Round 13 */ + "mov x18, v6.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v6.16b, v7.16b, #8\n\t" + "ror x12, x11, #14\n\t" + "shl v8.2d, v5.2d, #45\n\t" + "ror x14, x7, #28\n\t" + "sri v8.2d, v5.2d, #19\n\t" + "eor x12, x12, x11, ror 18\n\t" + "shl v9.2d, v5.2d, #3\n\t" + "eor x14, x14, x7, ror 34\n\t" + "sri v9.2d, v5.2d, #61\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x7, ror 39\n\t" + "ushr v8.2d, v5.2d, #6\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x7, x8\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" + "eor x12, x4, x5\n\t" + "ext v9.16b, v2.16b, v3.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" + "and x12, x12, x11\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x6, x6, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x5\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x6, x6, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x8\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x15, x15, x17\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x15\n\t" + /* Round 14 */ + "mov x18, v7.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x18\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x19\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x15, x15, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x15\n\t" + /* Round 15 */ + "mov x18, v7.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ext v10.16b, v7.16b, v0.16b, #8\n\t" + "ror x12, x9, #14\n\t" + "shl v8.2d, v6.2d, #45\n\t" + "ror x14, x5, #28\n\t" + "sri v8.2d, v6.2d, #19\n\t" + "eor x12, x12, x9, ror 18\n\t" + "shl v9.2d, v6.2d, #3\n\t" + "eor x14, x14, x5, ror 34\n\t" + "sri v9.2d, v6.2d, #61\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x15, x14, x5, ror 39\n\t" + "ushr v8.2d, v6.2d, #6\n\t" + "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x5, x6\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" + "eor x12, x10, x11\n\t" + "ext v9.16b, v3.16b, v4.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" + "and x12, x12, x9\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x4, x4, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x11\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x4, x4, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x6\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x15, x15, x17\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x15\n\t" + "subs x28, x28, #1\n\t" + 
"bne L_sha512_len_neon_start_%=\n\t" + /* Round 0 */ + "mov x18, v0.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x18\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x19\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x15, x15, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x15\n\t" + /* Round 1 */ + "mov x18, v0.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x7, #14\n\t" + "ror x14, x11, #28\n\t" + "eor x12, x12, x7, ror 18\n\t" + "eor x14, x14, x11, ror 34\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor x15, x14, x11, ror 39\n\t" + "add x10, x10, x12\n\t" + "eor x16, x11, x4\n\t" + "eor x12, x8, x9\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x7\n\t" + "add x10, x10, x18\n\t" + "eor x12, x12, x9\n\t" + "add x10, x10, x19\n\t" + "eor x17, x17, x4\n\t" + "add x10, x10, x12\n\t" + "add x15, x15, x17\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x15\n\t" + /* Round 2 */ + "mov x18, v1.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x18\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x19\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x15, x15, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x15\n\t" + /* Round 3 */ + "mov x18, v1.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x5, #14\n\t" + "ror x14, x9, #28\n\t" + "eor x12, x12, x5, ror 18\n\t" + "eor x14, x14, x9, ror 34\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor x15, x14, x9, ror 39\n\t" + "add x8, x8, x12\n\t" + "eor x16, x9, x10\n\t" + "eor x12, x6, x7\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x5\n\t" + "add x8, x8, x18\n\t" + "eor x12, x12, x7\n\t" + "add x8, x8, x19\n\t" + "eor x17, x17, x10\n\t" + "add x8, x8, x12\n\t" + "add x15, x15, x17\n\t" + "add x4, x4, x8\n\t" + "add x8, x8, x15\n\t" + /* Round 4 */ + "mov x18, v2.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x18\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x19\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x15, x15, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x15\n\t" + /* Round 5 */ + "mov x18, v2.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x11, #14\n\t" + "ror x14, x7, #28\n\t" + "eor x12, x12, x11, ror 18\n\t" + "eor x14, x14, x7, ror 34\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor x15, x14, x7, ror 39\n\t" + "add x6, x6, x12\n\t" + "eor x16, x7, x8\n\t" + "eor x12, x4, x5\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x11\n\t" + "add x6, x6, x18\n\t" + "eor x12, x12, x5\n\t" + "add x6, x6, x19\n\t" + "eor x17, x17, x8\n\t" + "add x6, x6, x12\n\t" + "add x15, x15, x17\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x15\n\t" + /* Round 6 */ + "mov x18, v3.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + 
"ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x18\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x19\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x15, x15, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x15\n\t" + /* Round 7 */ + "mov x18, v3.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x9, #14\n\t" + "ror x14, x5, #28\n\t" + "eor x12, x12, x9, ror 18\n\t" + "eor x14, x14, x5, ror 34\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor x15, x14, x5, ror 39\n\t" + "add x4, x4, x12\n\t" + "eor x16, x5, x6\n\t" + "eor x12, x10, x11\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x9\n\t" + "add x4, x4, x18\n\t" + "eor x12, x12, x11\n\t" + "add x4, x4, x19\n\t" + "eor x17, x17, x6\n\t" + "add x4, x4, x12\n\t" + "add x15, x15, x17\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x15\n\t" + /* Round 8 */ + "mov x18, v4.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x18\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x19\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x15, x15, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x15\n\t" + /* Round 9 */ + "mov x18, v4.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x7, #14\n\t" + "ror x14, x11, #28\n\t" + "eor x12, x12, x7, ror 18\n\t" + "eor x14, x14, x11, ror 34\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor x15, x14, x11, ror 39\n\t" + "add x10, x10, x12\n\t" + "eor x16, x11, x4\n\t" + "eor x12, x8, x9\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x7\n\t" + "add x10, x10, x18\n\t" + "eor x12, x12, x9\n\t" + "add x10, x10, x19\n\t" + "eor x17, x17, x4\n\t" + "add x10, x10, x12\n\t" + "add x15, x15, x17\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x15\n\t" + /* Round 10 */ + "mov x18, v5.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x18\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x19\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x15, x15, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x15\n\t" + /* Round 11 */ + "mov x18, v5.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x5, #14\n\t" + "ror x14, x9, #28\n\t" + "eor x12, x12, x5, ror 18\n\t" + "eor x14, x14, x9, ror 34\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor x15, x14, x9, ror 39\n\t" + "add x8, x8, x12\n\t" + "eor x16, x9, x10\n\t" + "eor x12, x6, x7\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x5\n\t" + "add x8, x8, x18\n\t" + "eor x12, x12, x7\n\t" + "add x8, x8, x19\n\t" + "eor x17, x17, x10\n\t" + "add x8, x8, x12\n\t" + "add x15, x15, x17\n\t" + "add x4, x4, x8\n\t" + "add x8, x8, x15\n\t" + /* Round 12 */ + "mov x18, v6.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 
34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x18\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x19\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x15, x15, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x15\n\t" + /* Round 13 */ + "mov x18, v6.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x11, #14\n\t" + "ror x14, x7, #28\n\t" + "eor x12, x12, x11, ror 18\n\t" + "eor x14, x14, x7, ror 34\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor x15, x14, x7, ror 39\n\t" + "add x6, x6, x12\n\t" + "eor x16, x7, x8\n\t" + "eor x12, x4, x5\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x11\n\t" + "add x6, x6, x18\n\t" + "eor x12, x12, x5\n\t" + "add x6, x6, x19\n\t" + "eor x17, x17, x8\n\t" + "add x6, x6, x12\n\t" + "add x15, x15, x17\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x15\n\t" + /* Round 14 */ + "mov x18, v7.d[0]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x18\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x19\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x15, x15, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x15\n\t" + /* Round 15 */ + "mov x18, v7.d[1]\n\t" + "ldr x19, [x3], #8\n\t" + "ror x12, x9, #14\n\t" + "ror x14, x5, #28\n\t" + "eor x12, x12, x9, ror 18\n\t" + "eor x14, x14, x5, ror 34\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor x15, x14, x5, ror 39\n\t" + "add x4, x4, x12\n\t" + "eor x16, x5, x6\n\t" + "eor x12, x10, x11\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x9\n\t" + "add x4, x4, x18\n\t" + "eor x12, x12, x11\n\t" + "add x4, x4, x19\n\t" + "eor x17, x17, x6\n\t" + "add x4, x4, x12\n\t" + "add x15, x15, x17\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x15\n\t" + "add x11, x11, x27\n\t" + "add x10, x10, x26\n\t" + "add x9, x9, x25\n\t" + "add x8, x8, x24\n\t" + "add x7, x7, x23\n\t" + "add x6, x6, x22\n\t" + "add x5, x5, x21\n\t" + "add x4, x4, x20\n\t" + "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" + "subs %w[len], %w[len], #0x80\n\t" + "bne L_sha512_len_neon_begin_%=\n\t" + "stp x4, x5, [%x[sha512]]\n\t" + "stp x6, x7, [%x[sha512], #16]\n\t" + "stp x8, x9, [%x[sha512], #32]\n\t" + "stp x10, x11, [%x[sha512], #48]\n\t" + "eor x0, x0, x0\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); + return (uint32_t)(size_t)sha512; +} + +#endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512.c b/wolfcrypt/src/port/arm/armv8-sha512.c new file mode 100644 index 000000000..7f33a57ad --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-sha512.c @@ -0,0 +1,716 @@ +/* sha512.c + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. 
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
+
+#include <wolfssl/wolfcrypt/sha512.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/cpuid.h>
+#include <wolfssl/wolfcrypt/hash.h>
+
+#include <wolfssl/wolfcrypt/logging.h>
+
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #define WOLFSSL_MISC_INCLUDED
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef WOLFSSL_SHA512
+
+static int InitSha512(wc_Sha512* sha512)
+{
+    if (sha512 == NULL)
+        return BAD_FUNC_ARG;
+
+    sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
+    sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
+    sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
+    sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
+    sha512->digest[4] = W64LIT(0x510e527fade682d1);
+    sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
+    sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
+    sha512->digest[7] = W64LIT(0x5be0cd19137e2179);
+
+    sha512->buffLen = 0;
+    sha512->loLen = 0;
+    sha512->hiLen = 0;
+
+    return 0;
+}
+
+#endif /* WOLFSSL_SHA512 */
+
+#ifdef WOLFSSL_SHA512
+
+int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
+{
+    int ret = 0;
+
+    if (sha512 == NULL)
+        return BAD_FUNC_ARG;
+
+    sha512->heap = heap;
+
+    ret = InitSha512(sha512);
+    if (ret != 0)
+        return ret;
+
+#ifdef WOLFSSL_SMALL_STACK_CACHE
+    sha512->W = NULL;
+#endif
+
+    (void)devId;
+
+    return ret;
+}
+
+#endif /* WOLFSSL_SHA512 */
+
+#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
+static const word64 K512[80] = {
+    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
+    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
+    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
+    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
+    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
+    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
+    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
+    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
+    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
+    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
+    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
+    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
+    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
+    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
+    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
+    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
+    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
+    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
+    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
+    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
+    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
+    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
+    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
+    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
+    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
+
W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), + W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), + W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), + W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), + W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), + W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), + W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), + W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), + W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), + W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), + W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), + W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), + W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), + W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), + W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) +}; + +#ifdef LITTLE_ENDIAN_ORDER +#define blk0(i) (W[i] = ByteReverseWord64(DATA[i])) +#else +#define blk0(i) (W[i] = DATA[i]) +#endif + +#define blk2(i) ( \ + W[ i ] += \ + s1(W[(i- 2) & 15])+ \ + W[(i- 7) & 15] + \ + s0(W[(i-15) & 15]) \ + ) + +#define Ch(x,y,z) (z ^ ((z ^ y) & x)) +#define Maj(x,y,z) (y ^ ((y ^ z) & (x ^ y))) + +#define a(i) T[(0-i) & 7] +#define b(i) T[(1-i) & 7] +#define c(i) T[(2-i) & 7] +#define d(i) T[(3-i) & 7] +#define e(i) T[(4-i) & 7] +#define f(i) T[(5-i) & 7] +#define g(i) T[(6-i) & 7] +#define h(i) T[(7-i) & 7] + +#define S0(x) (rotrFixed64(x,28) ^ rotrFixed64(x,34) ^ rotrFixed64(x,39)) +#define S1(x) (rotrFixed64(x,14) ^ rotrFixed64(x,18) ^ rotrFixed64(x,41)) +#define s0(x) (rotrFixed64(x, 1) ^ rotrFixed64(x, 8) ^ (x>>7)) +#define s1(x) (rotrFixed64(x,19) ^ rotrFixed64(x,61) ^ (x>>6)) + +#define R0(i) \ + h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + blk0(i); \ + d(i) += h(i); \ + h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) +#define R(i) \ + h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + blk2(i); \ + d(i) += h(i); \ + h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) + +#define DATA sha512->buffer +static int Transform_Sha512(wc_Sha512* sha512) +{ + const word64* K = K512; + word32 j; + word64 T[8]; + word64 W[16]; + + /* Copy digest to working vars */ + T[0] = sha512->digest[0]; + T[1] = sha512->digest[1]; + T[2] = sha512->digest[2]; + T[3] = sha512->digest[3]; + T[4] = sha512->digest[4]; + T[5] = sha512->digest[5]; + T[6] = sha512->digest[6]; + T[7] = sha512->digest[7]; + + /* 80 operations, partially loop unrolled */ + j = 0; + R0( 0); R0( 1); R0( 2); R0( 3); + R0( 4); R0( 5); R0( 6); R0( 7); + R0( 8); R0( 9); R0(10); R0(11); + R0(12); R0(13); R0(14); R0(15); + for (j = 16; j < 80; j += 16) { + R( 0); R( 1); R( 2); R( 3); + R( 4); R( 5); R( 6); R( 7); + R( 8); R( 9); R(10); R(11); + R(12); R(13); R(14); R(15); + } + + /* Add the working vars back into digest */ + sha512->digest[0] += T[0]; + sha512->digest[1] += T[1]; + sha512->digest[2] += T[2]; + sha512->digest[3] += T[3]; + sha512->digest[4] += T[4]; + sha512->digest[5] += T[5]; + sha512->digest[6] += T[6]; + sha512->digest[7] += T[7]; + + return 0; +} +#undef DATA + +#define DATA ((word64*)data) +static int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + const word64* K = K512; + word32 j; + word64 T[8]; + word64 TO[8]; + word64 W[16]; + + /* Copy digest to working vars */ + T[0] = sha512->digest[0]; + T[1] = sha512->digest[1]; + T[2] = sha512->digest[2]; + T[3] = sha512->digest[3]; + T[4] = sha512->digest[4]; + T[5] = sha512->digest[5]; + T[6] = sha512->digest[6]; + T[7] = sha512->digest[7]; + + do { + TO[0] = T[0]; + TO[1] = T[1]; 
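+        /* TO[] snapshots the working variables entering this 128-byte block
+         * so they can be added back after the 80 rounds (the feed-forward
+         * step).  Each R0()/R() below is one SHA-512 round:
+         *   T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t]
+         *   T2 = S0(a) + Maj(a,b,c)
+         *   d += T1;  h = T1 + T2
+         * with the a..h renaming handled by the (n-i) & 7 indexing in the
+         * a(i)..h(i) macros. */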
+ TO[2] = T[2]; + TO[3] = T[3]; + TO[4] = T[4]; + TO[5] = T[5]; + TO[6] = T[6]; + TO[7] = T[7]; + + /* 80 operations, partially loop unrolled */ + j = 0; + R0( 0); R0( 1); R0( 2); R0( 3); + R0( 4); R0( 5); R0( 6); R0( 7); + R0( 8); R0( 9); R0(10); R0(11); + R0(12); R0(13); R0(14); R0(15); + for (j = 16; j < 80; j += 16) { + R( 0); R( 1); R( 2); R( 3); + R( 4); R( 5); R( 6); R( 7); + R( 8); R( 9); R(10); R(11); + R(12); R(13); R(14); R(15); + } + + T[0] += TO[0]; + T[1] += TO[1]; + T[2] += TO[2]; + T[3] += TO[3]; + T[4] += TO[4]; + T[5] += TO[5]; + T[6] += TO[6]; + T[7] += TO[7]; + + data += 128; + len -= 128; + } + while (len > 0); + + /* Add the working vars back into digest */ + sha512->digest[0] = T[0]; + sha512->digest[1] = T[1]; + sha512->digest[2] = T[2]; + sha512->digest[3] = T[3]; + sha512->digest[4] = T[4]; + sha512->digest[5] = T[5]; + sha512->digest[6] = T[6]; + sha512->digest[7] = T[7]; + + return 0; +} +#undef DATA +#endif + + +static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len) +{ + word64 tmp = sha512->loLen; + if ( (sha512->loLen += len) < tmp) + sha512->hiLen++; /* carry low to high */ +} + +static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) +{ + int ret = 0; + /* do block size increments */ + byte* local = (byte*)sha512->buffer; + word32 blocksLen; + + /* check that internal buffLen is valid */ + if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE) + return BUFFER_E; + + AddLength(sha512, len); + + if (sha512->buffLen > 0) { + word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen); + if (add > 0) { + XMEMCPY(&local[sha512->buffLen], data, add); + + sha512->buffLen += add; + data += add; + len -= add; + } + + if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) { +#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) + ret = Transform_Sha512(sha512); +#else + ret = Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE); +#endif + if (ret == 0) + sha512->buffLen = 0; + else + len = 0; + } + } + + blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); + if (blocksLen > 0) { + /* Byte reversal performed in function if required. */ + Transform_Sha512_Len(sha512, data, blocksLen); + data += blocksLen; + len -= blocksLen; + } + + if (len > 0) { + XMEMCPY(local, data, len); + sha512->buffLen = len; + } + + return ret; +} + +#ifdef WOLFSSL_SHA512 + +int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) +{ + if (sha512 == NULL || (data == NULL && len > 0)) { + return BAD_FUNC_ARG; + } + + return Sha512Update(sha512, data, len); +} + +#endif /* WOLFSSL_SHA512 */ + +static WC_INLINE int Sha512Final(wc_Sha512* sha512) +{ + byte* local = (byte*)sha512->buffer; + int ret; + + if (sha512 == NULL) { + return BAD_FUNC_ARG; + } + + local[sha512->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha512->buffLen > WC_SHA512_PAD_SIZE) { + XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - + sha512->buffLen); + sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen; +#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) + ret = Transform_Sha512(sha512); +#else + ret = Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE); +#endif + if (ret != 0) + return ret; + + sha512->buffLen = 0; + } + XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen); + + /* put lengths in bits */ + sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) + + (sha512->hiLen << 3); + sha512->loLen = sha512->loLen << 3; + + /* store lengths */ + /* ! 
length ordering dependent on digest endian type ! */ + + sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen; + sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen; + + ByteReverseWords64( + &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), + &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), + WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE); +#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) + ret = Transform_Sha512(sha512); +#else + ret = Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE); +#endif + if (ret != 0) + return ret; + +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE); +#endif + + return 0; +} + +#ifdef WOLFSSL_SHA512 + +int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash) +{ +#ifdef LITTLE_ENDIAN_ORDER + word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)]; +#endif + + if (sha512 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords64((word64*)digest, (word64*)sha512->digest, + WC_SHA512_DIGEST_SIZE); + XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE); +#else + XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE); +#endif + + return 0; +} + +int wc_Sha512Final(wc_Sha512* sha512, byte* hash) +{ + int ret; + + if (sha512 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ret = Sha512Final(sha512); + if (ret != 0) + return ret; + + XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE); + + return InitSha512(sha512); /* reset state */ +} + +int wc_InitSha512(wc_Sha512* sha512) +{ + return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID); +} + +void wc_Sha512Free(wc_Sha512* sha512) +{ + if (sha512 == NULL) + return; + +#ifdef WOLFSSL_SMALL_STACK_CACHE + if (sha512->W != NULL) { + XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER); + sha512->W = NULL; + } +#endif +} + +#endif /* WOLFSSL_SHA512 */ + +/* -------------------------------------------------------------------------- */ +/* SHA384 */ +/* -------------------------------------------------------------------------- */ +#ifdef WOLFSSL_SHA384 + +static int InitSha384(wc_Sha384* sha384) +{ + if (sha384 == NULL) { + return BAD_FUNC_ARG; + } + + sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8); + sha384->digest[1] = W64LIT(0x629a292a367cd507); + sha384->digest[2] = W64LIT(0x9159015a3070dd17); + sha384->digest[3] = W64LIT(0x152fecd8f70e5939); + sha384->digest[4] = W64LIT(0x67332667ffc00b31); + sha384->digest[5] = W64LIT(0x8eb44a8768581511); + sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7); + sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4); + + sha384->buffLen = 0; + sha384->loLen = 0; + sha384->hiLen = 0; + + return 0; +} + +int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len) +{ + if (sha384 == NULL || (data == NULL && len > 0)) { + return BAD_FUNC_ARG; + } + + return Sha512Update((wc_Sha512*)sha384, data, len); +} + + +int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash) +{ +#ifdef LITTLE_ENDIAN_ORDER + word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)]; +#endif + + if (sha384 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords64((word64*)digest, (word64*)sha384->digest, + WC_SHA384_DIGEST_SIZE); + XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE); +#else + XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); +#endif + + return 0; +} + +int wc_Sha384Final(wc_Sha384* sha384, byte* hash) +{ + int ret; + + if (sha384 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ret = 
Sha512Final((wc_Sha512*)sha384); + if (ret != 0) + return ret; + + XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); + + return InitSha384(sha384); /* reset state */ +} + +int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId) +{ + int ret; + + if (sha384 == NULL) { + return BAD_FUNC_ARG; + } + + sha384->heap = heap; + ret = InitSha384(sha384); + if (ret != 0) + return ret; + +#ifdef WOLFSSL_SMALL_STACK_CACHE + sha384->W = NULL; +#endif + + (void)devId; + + return ret; +} + +int wc_InitSha384(wc_Sha384* sha384) +{ + return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID); +} + +void wc_Sha384Free(wc_Sha384* sha384) +{ + if (sha384 == NULL) + return; + +#ifdef WOLFSSL_SMALL_STACK_CACHE + if (sha384->W != NULL) { + XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER); + sha384->W = NULL; + } +#endif +} + +#endif /* WOLFSSL_SHA384 */ + +#ifdef WOLFSSL_SHA512 + +int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash) +{ + int ret; + wc_Sha512 tmpSha512; + + if (sha512 == NULL || hash == NULL) + return BAD_FUNC_ARG; + + ret = wc_Sha512Copy(sha512, &tmpSha512); + if (ret == 0) { + ret = wc_Sha512Final(&tmpSha512, hash); + wc_Sha512Free(&tmpSha512); + } + return ret; +} + +int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst) +{ + int ret = 0; + + if (src == NULL || dst == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(dst, src, sizeof(wc_Sha512)); +#ifdef WOLFSSL_SMALL_STACK_CACHE + dst->W = NULL; +#endif + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + dst->flags |= WC_HASH_FLAG_ISCOPY; +#endif + + return ret; +} + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) +int wc_Sha512SetFlags(wc_Sha512* sha512, word32 flags) +{ + if (sha512) { + sha512->flags = flags; + } + return 0; +} +int wc_Sha512GetFlags(wc_Sha512* sha512, word32* flags) +{ + if (sha512 && flags) { + *flags = sha512->flags; + } + return 0; +} +#endif + +#endif /* WOLFSSL_SHA512 */ + +#ifdef WOLFSSL_SHA384 + +int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash) +{ + int ret; + wc_Sha384 tmpSha384; + + if (sha384 == NULL || hash == NULL) + return BAD_FUNC_ARG; + ret = wc_Sha384Copy(sha384, &tmpSha384); + if (ret == 0) { + ret = wc_Sha384Final(&tmpSha384, hash); + wc_Sha384Free(&tmpSha384); + } + return ret; +} +int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst) +{ + int ret = 0; + + if (src == NULL || dst == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(dst, src, sizeof(wc_Sha384)); +#ifdef WOLFSSL_SMALL_STACK_CACHE + dst->W = NULL; +#endif + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + dst->flags |= WC_HASH_FLAG_ISCOPY; +#endif + + return ret; +} + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) +int wc_Sha384SetFlags(wc_Sha384* sha384, word32 flags) +{ + if (sha384) { + sha384->flags = flags; + } + return 0; +} +int wc_Sha384GetFlags(wc_Sha384* sha384, word32* flags) +{ + if (sha384 && flags) { + *flags = sha384->flags; + } + return 0; +} +#endif + +#endif /* WOLFSSL_SHA384 */ + +#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */ diff --git a/wolfssl/wolfcrypt/sha512.h b/wolfssl/wolfcrypt/sha512.h index 83a96b53a..958e7688b 100644 --- a/wolfssl/wolfcrypt/sha512.h +++ b/wolfssl/wolfcrypt/sha512.h @@ -141,6 +141,11 @@ typedef struct wc_Sha512 { #endif /* HAVE_FIPS */ +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) +WOLFSSL_LOCAL int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, + word32 len); +#endif + #ifdef WOLFSSL_SHA512 WOLFSSL_API int wc_InitSha512(wc_Sha512*);
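+/* Illustrative usage (sketch, not part of this patch): Transform_Sha512_Len()
+ * is internal and only reached through the public API, so the new ARMv8
+ * assembly is exercised by ordinary hashing code, e.g. with a caller-supplied
+ * buffer `data` of length `dataLen`:
+ *
+ *     wc_Sha512 sha;
+ *     byte hash[WC_SHA512_DIGEST_SIZE];
+ *     if (wc_InitSha512(&sha) == 0) {
+ *         wc_Sha512Update(&sha, data, dataLen);
+ *         wc_Sha512Final(&sha, hash);
+ *         wc_Sha512Free(&sha);
+ *     }
+ */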