diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index 9426d9987..aa8b25198 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -19,18 +19,22 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S + */ #ifdef __aarch64__ -.text -.globl fe_init -.type fe_init,@function -.align 4 + .text + .align 2 + .globl fe_init + .type fe_init, %function fe_init: ret -.size fe_init,.-fe_init -.text -.globl fe_frombytes -.type fe_frombytes,@function -.align 4 + .size fe_init,.-fe_init + .text + .align 2 + .globl fe_frombytes + .type fe_frombytes, %function fe_frombytes: ldp x2, x3, [x1] ldp x4, x5, [x1, #16] @@ -38,11 +42,11 @@ fe_frombytes: stp x2, x3, [x0] stp x4, x5, [x0, #16] ret -.size fe_frombytes,.-fe_frombytes -.text -.globl fe_tobytes -.type fe_tobytes,@function -.align 4 + .size fe_frombytes,.-fe_frombytes + .text + .align 2 + .globl fe_tobytes + .type fe_tobytes, %function fe_tobytes: mov x7, #19 ldp x2, x3, [x1] @@ -51,8 +55,7 @@ fe_tobytes: adcs x6, x3, xzr adcs x6, x4, xzr adc x6, x5, xzr - asr x6, x6, #63 - and x6, x6, x7 + and x6, x7, x6, asr 63 adds x2, x2, x6 adcs x3, x3, xzr adcs x4, x4, xzr @@ -61,32 +64,32 @@ fe_tobytes: stp x2, x3, [x0] stp x4, x5, [x0, #16] ret -.size fe_tobytes,.-fe_tobytes -.text -.globl fe_1 -.type fe_1,@function -.align 4 + .size fe_tobytes,.-fe_tobytes + .text + .align 2 + .globl fe_1 + .type fe_1, %function fe_1: # Set one mov x1, #1 stp x1, xzr, [x0] stp xzr, xzr, [x0, #16] ret -.size fe_1,.-fe_1 -.text -.globl fe_0 -.type fe_0,@function -.align 4 + .size fe_1,.-fe_1 + .text + .align 2 + .globl fe_0 + .type fe_0, %function fe_0: # Set zero stp xzr, xzr, [x0] stp xzr, xzr, [x0, #16] ret -.size fe_0,.-fe_0 -.text -.globl fe_copy -.type fe_copy,@function -.align 4 + .size fe_0,.-fe_0 + .text + .align 2 + .globl fe_copy + .type fe_copy, %function fe_copy: # Copy ldp x2, x3, [x1] @@ -94,36 +97,11 @@ fe_copy: stp x2, x3, [x0] stp x4, x5, [x0, #16] ret -.size fe_copy,.-fe_copy -.text -.globl fe_cswap -.type fe_cswap,@function -.align 4 -fe_cswap: - # Conditional Swap - cmp x2, #1 - ldp x3, x4, [x0] - ldp x5, x6, [x0, #16] - ldp x7, x8, [x1] - ldp x9, x10, [x1, #16] - csel x11, x3, x7, eq - csel x3, x7, x3, eq - csel x12, x4, x8, eq - csel x4, x8, x4, eq - csel x13, x5, x9, eq - csel x5, x9, x5, eq - csel x14, x6, x10, eq - csel x6, x10, x6, eq - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - stp x11, x12, [x1] - stp x13, x14, [x1, #16] - ret -.size fe_cswap,.-fe_cswap -.text -.globl fe_sub -.type fe_sub,@function -.align 4 + .size fe_copy,.-fe_copy + .text + .align 2 + .globl fe_sub + .type fe_sub, %function fe_sub: # Sub ldp x3, x4, [x1] @@ -147,11 +125,11 @@ fe_sub: stp x3, x4, [x0] stp x5, x6, [x0, #16] ret -.size fe_sub,.-fe_sub -.text -.globl fe_add -.type fe_add,@function -.align 4 + .size fe_sub,.-fe_sub + .text + .align 2 + .globl fe_add + .type fe_add, %function fe_add: # Add ldp x3, x4, [x1] @@ -175,11 +153,11 @@ fe_add: stp x3, x4, [x0] stp x5, x6, [x0, #16] ret -.size fe_add,.-fe_add -.text -.globl fe_neg -.type fe_neg,@function -.align 4 + .size fe_add,.-fe_add + .text + .align 2 + .globl fe_neg + .type fe_neg, %function fe_neg: ldp x2, x3, [x1] ldp x4, x5, [x1, #16] @@ -194,29 +172,11 @@ fe_neg: stp x6, x7, [x0] stp x8, x9, [x0, #16] ret -.size fe_neg,.-fe_neg -.text -.globl fe_cmov -.type fe_cmov,@function -.align 4 -fe_cmov: - ldp x4, x5, [x0] - ldp x6, x7, [x0, #16] - ldp x8, x9, [x1] - ldp x10, x11, [x1, #16] - cmp x2, #1 - csel x4, x4, x8, eq - csel x5, x5, x9, eq - csel x6, x6, x10, eq - csel x7, x7, x11, eq - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ret -.size fe_cmov,.-fe_cmov -.text -.globl fe_isnonzero -.type fe_isnonzero,@function -.align 4 + .size fe_neg,.-fe_neg + .text + .align 2 + .globl fe_isnonzero + .type fe_isnonzero, %function fe_isnonzero: mov x6, #19 ldp x1, x2, [x0] @@ -225,8 +185,7 @@ fe_isnonzero: adcs x5, x2, xzr adcs x5, x3, xzr adc x5, x4, xzr - asr x5, x5, #63 - and x5, x5, x6 + and x5, x6, x5, asr 63 adds x1, x1, x5 adcs x2, x2, xzr adcs x3, x3, xzr @@ -236,11 +195,11 @@ fe_isnonzero: orr x3, x3, x4 orr x0, x0, x3 ret -.size fe_isnonzero,.-fe_isnonzero -.text -.globl fe_isnegative -.type fe_isnegative,@function -.align 4 + .size fe_isnonzero,.-fe_isnonzero + .text + .align 2 + .globl fe_isnegative + .type fe_isnegative, %function fe_isnegative: mov x6, #19 ldp x1, x2, [x0] @@ -250,28 +209,22 @@ fe_isnegative: adcs x5, x3, xzr adc x5, x4, xzr and x0, x1, #1 - lsr x5, x5, #63 - eor x0, x0, x5 + eor x0, x0, x5, lsr 63 ret -.size fe_isnegative,.-fe_isnegative -.text -.globl fe_cmov_table -.type fe_cmov_table,@function -.align 4 + .size fe_isnegative,.-fe_isnegative + .text + .align 2 + .globl fe_cmov_table + .type fe_cmov_table, %function fe_cmov_table: stp x29, x30, [sp, #-112]! add x29, sp, #0 str x17, [x29, #16] - str x18, [x29, #24] - str x19, [x29, #32] - str x20, [x29, #40] - str x21, [x29, #48] - str x22, [x29, #56] - str x23, [x29, #64] - str x24, [x29, #72] - str x25, [x29, #80] - str x26, [x29, #88] - str x27, [x29, #96] + stp x18, x19, [x29, #24] + stp x20, x21, [x29, #40] + stp x22, x23, [x29, #56] + stp x24, x25, [x29, #72] + stp x26, x27, [x29, #88] str x28, [x29, #104] sxtb x2, w2 sbfx x15, x2, #7, #1 @@ -474,32 +427,25 @@ fe_cmov_table: stp x11, x12, [x0, #64] stp x13, x14, [x0, #80] ldr x17, [x29, #16] - ldr x18, [x29, #24] - ldr x19, [x29, #32] - ldr x20, [x29, #40] - ldr x21, [x29, #48] - ldr x22, [x29, #56] - ldr x23, [x29, #64] - ldr x24, [x29, #72] - ldr x25, [x29, #80] - ldr x26, [x29, #88] - ldr x27, [x29, #96] + ldp x18, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldp x22, x23, [x29, #56] + ldp x24, x25, [x29, #72] + ldp x26, x27, [x29, #88] ldr x28, [x29, #104] ldp x29, x30, [sp], #0x70 ret -.size fe_cmov_table,.-fe_cmov_table -.text -.globl fe_mul -.type fe_mul,@function -.align 4 + .size fe_cmov_table,.-fe_cmov_table + .text + .align 2 + .globl fe_mul + .type fe_mul, %function fe_mul: stp x29, x30, [sp, #-64]! add x29, sp, #0 str x17, [x29, #24] - str x18, [x29, #32] - str x19, [x29, #40] - str x20, [x29, #48] - str x21, [x29, #56] + stp x18, x19, [x29, #32] + stp x20, x21, [x29, #48] # Multiply ldp x14, x15, [x1] ldp x16, x17, [x1, #16] @@ -631,8 +577,7 @@ fe_mul: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -642,17 +587,15 @@ fe_mul: stp x6, x7, [x0] stp x8, x9, [x0, #16] ldr x17, [x29, #24] - ldr x18, [x29, #32] - ldr x19, [x29, #40] - ldr x20, [x29, #48] - ldr x21, [x29, #56] + ldp x18, x19, [x29, #32] + ldp x20, x21, [x29, #48] ldp x29, x30, [sp], #0x40 ret -.size fe_mul,.-fe_mul -.text -.globl fe_sq -.type fe_sq,@function -.align 4 + .size fe_mul,.-fe_mul + .text + .align 2 + .globl fe_sq + .type fe_sq, %function fe_sq: # Square ldp x13, x14, [x1] @@ -751,8 +694,7 @@ fe_sq: adcs x7, x7, xzr adc x8, x8, xzr # Reduce if top bit set - asr x4, x8, #63 - and x4, x4, x2 + and x4, x2, x8, asr 63 and x8, x8, #0x7fffffffffffffff adds x5, x5, x4 adcs x6, x6, xzr @@ -762,177 +704,11 @@ fe_sq: stp x5, x6, [x0] stp x7, x8, [x0, #16] ret -.size fe_sq,.-fe_sq -.text -.globl fe_mul121666 -.type fe_mul121666,@function -.align 4 -fe_mul121666: - # Multiply by 121666 - ldp x5, x6, [x1] - ldp x7, x8, [x1, #16] - mov x4, #0xdb42 - movk x4, #1, lsl 16 - mul x9, x5, x4 - umulh x10, x5, x4 - mul x2, x6, x4 - umulh x3, x6, x4 - adds x10, x10, x2 - adc x11, xzr, x3 - mul x2, x7, x4 - umulh x3, x7, x4 - adds x11, x11, x2 - adc x12, xzr, x3 - mul x2, x8, x4 - umulh x3, x8, x4 - adds x12, x12, x2 - adc x3, xzr, x3 - mov x4, #19 - extr x3, x3, x12, #63 - mul x3, x3, x4 - and x12, x12, #0x7fffffffffffffff - adds x9, x9, x3 - adcs x10, x10, xzr - adcs x11, x11, xzr - adc x12, x12, xzr - stp x9, x10, [x0] - stp x11, x12, [x0, #16] - ret -.size fe_mul121666,.-fe_mul121666 -.text -.globl fe_sq2 -.type fe_sq2,@function -.align 4 -fe_sq2: - stp x29, x30, [sp, #-32]! - add x29, sp, #0 - str x17, [x29, #24] - # Square * 2 - ldp x5, x6, [x1] - ldp x7, x8, [x1, #16] - # A[0] * A[1] - mul x10, x5, x6 - umulh x11, x5, x6 - # A[0] * A[2] - mul x2, x5, x7 - umulh x12, x5, x7 - adds x11, x11, x2 - adc x12, x12, xzr - # A[0] * A[3] - mul x2, x5, x8 - umulh x13, x5, x8 - adds x12, x12, x2 - adc x13, x13, xzr - # A[1] * A[2] - mul x2, x6, x7 - umulh x3, x6, x7 - adds x12, x12, x2 - adcs x13, x13, x3 - adc x14, xzr, xzr - # A[1] * A[3] - mul x2, x6, x8 - umulh x3, x6, x8 - adds x13, x13, x2 - adc x14, x14, x3 - # A[2] * A[3] - mul x2, x7, x8 - umulh x15, x7, x8 - adds x14, x14, x2 - adc x15, x15, xzr - # Double - adds x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adcs x13, x13, x13 - adcs x14, x14, x14 - adcs x15, x15, x15 - adc x16, xzr, xzr - # A[0] * A[0] - mul x9, x5, x5 - umulh x17, x5, x5 - # A[1] * A[1] - mul x2, x6, x6 - umulh x3, x6, x6 - adds x10, x10, x17 - adcs x11, x11, x2 - adc x17, x3, xzr - # A[2] * A[2] - mul x2, x7, x7 - umulh x3, x7, x7 - adds x12, x12, x17 - adcs x13, x13, x2 - adc x17, x3, xzr - # A[3] * A[3] - mul x2, x8, x8 - umulh x3, x8, x8 - adds x14, x14, x17 - adcs x15, x15, x2 - adc x16, x16, x3 - # Double and Reduce - mov x2, #0x169 - # Move top half into t4-t7 and remove top bit from t3 - lsr x17, x16, #61 - extr x16, x16, x15, #62 - extr x15, x15, x14, #62 - extr x14, x14, x13, #62 - extr x13, x13, x12, #62 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - lsl x9, x9, #1 - and x12, x12, #0x7fffffffffffffff - # Two left, only one right - and x16, x16, #0x7fffffffffffffff - # Multiply top bits by 19*19 - mul x17, x17, x2 - # Multiply top half by 19 - mov x2, #19 - mul x3, x2, x13 - umulh x13, x2, x13 - adds x9, x9, x3 - mul x3, x2, x14 - umulh x14, x2, x14 - adcs x10, x10, x3 - mul x3, x2, x15 - umulh x15, x2, x15 - adcs x11, x11, x3 - mul x3, x2, x16 - umulh x4, x2, x16 - adcs x12, x12, x3 - adc x4, x4, xzr - # Add remaining product results in - adds x9, x9, x17 - adcs x10, x10, x13 - adcs x11, x11, x14 - adcs x12, x12, x15 - adc x4, x4, xzr - # Overflow - extr x4, x4, x12, #63 - mul x4, x4, x2 - and x12, x12, #0x7fffffffffffffff - adds x9, x9, x4 - adcs x10, x10, xzr - adcs x11, x11, xzr - adc x12, x12, xzr - # Reduce if top bit set - asr x4, x12, #63 - and x4, x4, x2 - and x12, x12, #0x7fffffffffffffff - adds x9, x9, x4 - adcs x10, x10, xzr - adcs x11, x11, xzr - adc x12, x12, xzr - # Store - stp x9, x10, [x0] - stp x11, x12, [x0, #16] - ldr x17, [x29, #24] - ldp x29, x30, [sp], #32 - ret -.size fe_sq2,.-fe_sq2 -.text -.globl fe_invert -.type fe_invert,@function -.align 4 + .size fe_sq,.-fe_sq + .text + .align 2 + .globl fe_invert + .type fe_invert, %function fe_invert: stp x29, x30, [sp, #-176]! add x29, sp, #0 @@ -954,16 +730,16 @@ fe_invert: add x1, x29, #16 add x2, x29, #48 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq add x0, x29, #48 add x1, x29, #48 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq mov x20, #4 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert1: bl fe_sq sub x20, x20, #1 @@ -972,11 +748,11 @@ L_fe_invert1: add x0, x29, #48 add x2, x29, #48 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 add x1, x29, #48 bl fe_sq mov x20, #9 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert2: bl fe_sq sub x20, x20, #1 @@ -984,20 +760,20 @@ L_fe_invert2: bne L_fe_invert2 add x2, x29, #48 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq mov x20, #19 - add x1, x29, #112 + add x1, x29, #0x70 L_fe_invert3: bl fe_sq sub x20, x20, #1 cmp x20, #0 bne L_fe_invert3 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul mov x20, #10 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert4: bl fe_sq sub x20, x20, #1 @@ -1006,11 +782,11 @@ L_fe_invert4: add x0, x29, #48 add x2, x29, #48 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 add x1, x29, #48 bl fe_sq mov x20, #49 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert5: bl fe_sq sub x20, x20, #1 @@ -1018,20 +794,20 @@ L_fe_invert5: bne L_fe_invert5 add x2, x29, #48 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq mov x20, #0x63 - add x1, x29, #112 + add x1, x29, #0x70 L_fe_invert6: bl fe_sq sub x20, x20, #1 cmp x20, #0 bne L_fe_invert6 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul mov x20, #50 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_invert7: bl fe_sq sub x20, x20, #1 @@ -1053,25 +829,20 @@ L_fe_invert8: ldr x20, [x29, #168] ldp x29, x30, [sp], #0xb0 ret -.size fe_invert,.-fe_invert -.text -.globl curve25519 -.type curve25519,@function -.align 4 + .size fe_invert,.-fe_invert + .text + .align 2 + .globl curve25519 + .type curve25519, %function curve25519: stp x29, x30, [sp, #-288]! add x29, sp, #0 str x17, [x29, #192] - str x18, [x29, #200] - str x19, [x29, #208] - str x20, [x29, #216] - str x21, [x29, #224] - str x22, [x29, #232] - str x23, [x29, #240] - str x24, [x29, #248] - str x25, [x29, #256] - str x26, [x29, #264] - str x27, [x29, #272] + stp x18, x19, [x29, #200] + stp x20, x21, [x29, #216] + stp x22, x23, [x29, #232] + stp x24, x25, [x29, #248] + stp x26, x27, [x29, #264] str x28, [x29, #280] mov x22, xzr str x0, [x29, #176] @@ -1317,8 +1088,7 @@ L_curve25519_bits: adcs x20, x20, xzr adc x21, x21, xzr # Reduce if top bit set - asr x5, x21, #63 - and x5, x5, x3 + and x5, x3, x21, asr 63 and x21, x21, #0x7fffffffffffffff adds x18, x18, x5 adcs x19, x19, xzr @@ -1456,8 +1226,7 @@ L_curve25519_bits: adcs x20, x20, xzr adc x21, x21, xzr # Reduce if top bit set - asr x5, x21, #63 - and x5, x5, x3 + and x5, x3, x21, asr 63 and x21, x21, #0x7fffffffffffffff adds x18, x18, x5 adcs x19, x19, xzr @@ -1559,8 +1328,7 @@ L_curve25519_bits: adcs x12, x12, xzr adc x13, x13, xzr # Reduce if top bit set - asr x5, x13, #63 - and x5, x5, x3 + and x5, x3, x13, asr 63 and x13, x13, #0x7fffffffffffffff adds x10, x10, x5 adcs x11, x11, xzr @@ -1662,8 +1430,7 @@ L_curve25519_bits: adcs x16, x16, xzr adc x17, x17, xzr # Reduce if top bit set - asr x5, x17, #63 - and x5, x5, x3 + and x5, x3, x17, asr 63 and x17, x17, #0x7fffffffffffffff adds x14, x14, x5 adcs x15, x15, xzr @@ -1797,8 +1564,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -1989,8 +1755,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -2126,8 +1891,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -2231,8 +1995,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Reduce if top bit set - asr x5, x9, #63 - and x5, x5, x3 + and x5, x3, x9, asr 63 and x9, x9, #0x7fffffffffffffff adds x6, x6, x5 adcs x7, x7, xzr @@ -2368,8 +2131,7 @@ L_curve25519_bits: adcs x12, x12, xzr adc x13, x13, xzr # Reduce if top bit set - asr x5, x13, #63 - and x5, x5, x3 + and x5, x3, x13, asr 63 and x13, x13, #0x7fffffffffffffff adds x10, x10, x5 adcs x11, x11, xzr @@ -2389,106 +2151,106 @@ L_curve25519_bits: add x0, x29, #48 add x1, x29, #16 bl fe_sq - add x0, x29, #80 + add x0, x29, #0x50 add x1, x29, #48 bl fe_sq - add x1, x29, #80 + add x1, x29, #0x50 bl fe_sq add x1, x29, #16 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul add x0, x29, #48 add x1, x29, #48 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq - add x0, x29, #80 - add x1, x29, #80 - add x2, x29, #112 + add x0, x29, #0x50 + add x1, x29, #0x50 + add x2, x29, #0x70 bl fe_mul - add x0, x29, #112 + add x0, x29, #0x70 bl fe_sq mov x24, #4 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_1: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_1 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #112 - add x1, x29, #80 + add x0, x29, #0x70 + add x1, x29, #0x50 bl fe_sq mov x24, #9 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_2: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_2 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #144 + add x0, x29, #0x90 bl fe_sq mov x24, #19 - add x1, x29, #144 + add x1, x29, #0x90 L_curve25519_inv_3: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_3 - add x0, x29, #112 - add x2, x29, #112 + add x0, x29, #0x70 + add x2, x29, #0x70 bl fe_mul mov x24, #10 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_4: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_4 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #112 - add x1, x29, #80 + add x0, x29, #0x70 + add x1, x29, #0x50 bl fe_sq mov x24, #49 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_5: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_5 - add x2, x29, #80 + add x2, x29, #0x50 bl fe_mul - add x0, x29, #144 + add x0, x29, #0x90 bl fe_sq mov x24, #0x63 - add x1, x29, #144 + add x1, x29, #0x90 L_curve25519_inv_6: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_6 - add x0, x29, #112 - add x2, x29, #112 + add x0, x29, #0x70 + add x2, x29, #0x70 bl fe_mul mov x24, #50 - add x1, x29, #112 + add x1, x29, #0x70 L_curve25519_inv_7: bl fe_sq sub x24, x24, #1 cmp x24, #0 bne L_curve25519_inv_7 - add x0, x29, #80 - add x2, x29, #80 + add x0, x29, #0x50 + add x2, x29, #0x50 bl fe_mul mov x24, #5 - add x1, x29, #80 + add x1, x29, #0x50 L_curve25519_inv_8: bl fe_sq sub x24, x24, #1 @@ -2629,8 +2391,7 @@ L_curve25519_inv_8: adcs x16, x16, xzr adc x17, x17, xzr # Reduce if top bit set - asr x5, x17, #63 - and x5, x5, x3 + and x5, x3, x17, asr 63 and x17, x17, #0x7fffffffffffffff adds x14, x14, x5 adcs x15, x15, xzr @@ -2641,24 +2402,19 @@ L_curve25519_inv_8: stp x16, x17, [x0, #16] mov x0, xzr ldr x17, [x29, #192] - ldr x18, [x29, #200] - ldr x19, [x29, #208] - ldr x20, [x29, #216] - ldr x21, [x29, #224] - ldr x22, [x29, #232] - ldr x23, [x29, #240] - ldr x24, [x29, #248] - ldr x25, [x29, #256] - ldr x26, [x29, #264] - ldr x27, [x29, #272] + ldp x18, x19, [x29, #200] + ldp x20, x21, [x29, #216] + ldp x22, x23, [x29, #232] + ldp x24, x25, [x29, #248] + ldp x26, x27, [x29, #264] ldr x28, [x29, #280] ldp x29, x30, [sp], #0x120 ret -.size curve25519,.-curve25519 -.text -.globl fe_pow22523 -.type fe_pow22523,@function -.align 4 + .size curve25519,.-curve25519 + .text + .align 2 + .globl fe_pow22523 + .type fe_pow22523, %function fe_pow22523: stp x29, x30, [sp, #-144]! add x29, sp, #0 @@ -2709,10 +2465,10 @@ L_fe_pow22523_2: bne L_fe_pow22523_2 add x2, x29, #16 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq mov x21, #19 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_pow22523_3: bl fe_sq sub x21, x21, #1 @@ -2743,10 +2499,10 @@ L_fe_pow22523_5: bne L_fe_pow22523_5 add x2, x29, #16 bl fe_mul - add x0, x29, #80 + add x0, x29, #0x50 bl fe_sq mov x21, #0x63 - add x1, x29, #80 + add x1, x29, #0x50 L_fe_pow22523_6: bl fe_sq sub x21, x21, #1 @@ -2778,19 +2534,17 @@ L_fe_pow22523_8: ldr x21, [x29, #136] ldp x29, x30, [sp], #0x90 ret -.size fe_pow22523,.-fe_pow22523 -.text -.globl fe_ge_to_p2 -.type fe_ge_to_p2,@function -.align 4 + .size fe_pow22523,.-fe_pow22523 + .text + .align 2 + .globl fe_ge_to_p2 + .type fe_ge_to_p2, %function fe_ge_to_p2: stp x29, x30, [sp, #-112]! add x29, sp, #0 str x17, [x29, #72] - str x18, [x29, #80] - str x19, [x29, #88] - str x20, [x29, #96] - str x21, [x29, #104] + stp x18, x19, [x29, #80] + stp x20, x21, [x29, #96] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -2930,8 +2684,7 @@ fe_ge_to_p2: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x21, x6, #63 - and x21, x21, x19 + and x21, x19, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x21 adcs x4, x4, xzr @@ -3074,8 +2827,7 @@ fe_ge_to_p2: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x21, x6, #63 - and x21, x21, x19 + and x21, x19, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x21 adcs x4, x4, xzr @@ -3215,8 +2967,7 @@ fe_ge_to_p2: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x21, x6, #63 - and x21, x21, x19 + and x21, x19, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x21 adcs x4, x4, xzr @@ -3226,29 +2977,23 @@ fe_ge_to_p2: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x17, [x29, #72] - ldr x18, [x29, #80] - ldr x19, [x29, #88] - ldr x20, [x29, #96] - ldr x21, [x29, #104] + ldp x18, x19, [x29, #80] + ldp x20, x21, [x29, #96] ldp x29, x30, [sp], #0x70 ret -.size fe_ge_to_p2,.-fe_ge_to_p2 -.text -.globl fe_ge_to_p3 -.type fe_ge_to_p3,@function -.align 4 + .size fe_ge_to_p2,.-fe_ge_to_p2 + .text + .align 2 + .globl fe_ge_to_p3 + .type fe_ge_to_p3, %function fe_ge_to_p3: stp x29, x30, [sp, #-160]! add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -3389,8 +3134,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3530,8 +3274,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3671,8 +3414,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3809,8 +3551,7 @@ fe_ge_to_p3: adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - asr x25, x6, #63 - and x25, x25, x23 + and x25, x23, x6, asr 63 and x6, x6, #0x7fffffffffffffff adds x3, x3, x25 adcs x4, x4, xzr @@ -3820,35 +3561,26 @@ fe_ge_to_p3: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] ldp x29, x30, [sp], #0xa0 ret -.size fe_ge_to_p3,.-fe_ge_to_p3 -.text -.globl fe_ge_dbl -.type fe_ge_dbl,@function -.align 4 + .size fe_ge_to_p3,.-fe_ge_to_p3 + .text + .align 2 + .globl fe_ge_dbl + .type fe_ge_dbl, %function fe_ge_dbl: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -3954,8 +3686,7 @@ fe_ge_dbl: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4063,8 +3794,7 @@ fe_ge_dbl: adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - asr x26, x11, #63 - and x26, x26, x24 + and x26, x24, x11, asr 63 and x11, x11, #0x7fffffffffffffff adds x8, x8, x26 adcs x9, x9, xzr @@ -4185,8 +3915,7 @@ fe_ge_dbl: adcs x18, x18, xzr adc x19, x19, xzr # Reduce if top bit set - asr x26, x19, #63 - and x26, x26, x24 + and x26, x24, x19, asr 63 and x19, x19, #0x7fffffffffffffff adds x16, x16, x26 adcs x17, x17, xzr @@ -4359,8 +4088,7 @@ fe_ge_dbl: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4386,37 +4114,27 @@ fe_ge_dbl: stp x4, x5, [x0] stp x6, x7, [x0, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_dbl,.-fe_ge_dbl -.text -.globl fe_ge_madd -.type fe_ge_madd,@function -.align 4 + .size fe_ge_dbl,.-fe_ge_dbl + .text + .align 2 + .globl fe_ge_madd + .type fe_ge_madd, %function fe_ge_madd: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -4592,8 +4310,7 @@ fe_ge_madd: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -4731,8 +4448,7 @@ fe_ge_madd: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4909,8 +4625,7 @@ fe_ge_madd: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -4972,37 +4687,27 @@ fe_ge_madd: stp x16, x17, [x1] stp x18, x19, [x1, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_madd,.-fe_ge_madd -.text -.globl fe_ge_msub -.type fe_ge_msub,@function -.align 4 + .size fe_ge_madd,.-fe_ge_madd + .text + .align 2 + .globl fe_ge_msub + .type fe_ge_msub, %function fe_ge_msub: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -5178,8 +4883,7 @@ fe_ge_msub: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -5317,8 +5021,7 @@ fe_ge_msub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -5495,8 +5198,7 @@ fe_ge_msub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -5558,37 +5260,27 @@ fe_ge_msub: stp x16, x17, [x0] stp x18, x19, [x0, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_msub,.-fe_ge_msub -.text -.globl fe_ge_add -.type fe_ge_add,@function -.align 4 + .size fe_ge_msub,.-fe_ge_msub + .text + .align 2 + .globl fe_ge_add + .type fe_ge_add, %function fe_ge_add: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -5764,8 +5456,7 @@ fe_ge_add: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -5903,8 +5594,7 @@ fe_ge_add: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6081,8 +5771,7 @@ fe_ge_add: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6239,8 +5928,7 @@ fe_ge_add: adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - asr x26, x11, #63 - and x26, x26, x24 + and x26, x24, x11, asr 63 and x11, x11, #0x7fffffffffffffff adds x8, x8, x26 adcs x9, x9, xzr @@ -6284,37 +5972,27 @@ fe_ge_add: stp x16, x17, [x1] stp x18, x19, [x1, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_add,.-fe_ge_add -.text -.globl fe_ge_sub -.type fe_ge_sub,@function -.align 4 + .size fe_ge_add,.-fe_ge_add + .text + .align 2 + .globl fe_ge_sub + .type fe_ge_sub, %function fe_ge_sub: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - str x18, [x29, #96] - str x19, [x29, #104] - str x20, [x29, #112] - str x21, [x29, #120] - str x22, [x29, #128] - str x23, [x29, #136] - str x24, [x29, #144] - str x25, [x29, #152] - str x26, [x29, #160] - str x27, [x29, #168] + stp x18, x19, [x29, #96] + stp x20, x21, [x29, #112] + stp x22, x23, [x29, #128] + stp x24, x25, [x29, #144] + stp x26, x27, [x29, #160] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -6490,8 +6168,7 @@ fe_ge_sub: adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - asr x26, x15, #63 - and x26, x26, x24 + and x26, x24, x15, asr 63 and x15, x15, #0x7fffffffffffffff adds x12, x12, x26 adcs x13, x13, xzr @@ -6629,8 +6306,7 @@ fe_ge_sub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6807,8 +6483,7 @@ fe_ge_sub: adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - asr x26, x7, #63 - and x26, x26, x24 + and x26, x24, x7, asr 63 and x7, x7, #0x7fffffffffffffff adds x4, x4, x26 adcs x5, x5, xzr @@ -6965,8 +6640,7 @@ fe_ge_sub: adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - asr x26, x11, #63 - and x26, x26, x24 + and x26, x24, x11, asr 63 and x11, x11, #0x7fffffffffffffff adds x8, x8, x26 adcs x9, x9, xzr @@ -7010,17 +6684,12 @@ fe_ge_sub: stp x16, x17, [x1] stp x18, x19, [x1, #16] ldr x17, [x29, #88] - ldr x18, [x29, #96] - ldr x19, [x29, #104] - ldr x20, [x29, #112] - ldr x21, [x29, #120] - ldr x22, [x29, #128] - ldr x23, [x29, #136] - ldr x24, [x29, #144] - ldr x25, [x29, #152] - ldr x26, [x29, #160] - ldr x27, [x29, #168] + ldp x18, x19, [x29, #96] + ldp x20, x21, [x29, #112] + ldp x22, x23, [x29, #128] + ldp x24, x25, [x29, #144] + ldp x26, x27, [x29, #160] ldp x29, x30, [sp], #0xb0 ret -.size fe_ge_sub,.-fe_ge_sub + .size fe_ge_sub,.-fe_ge_sub #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c index 2d0b0642c..d42daee4c 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519.c @@ -19,7 +19,12 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c + */ #ifdef __aarch64__ +#include #ifdef HAVE_CONFIG_H #include #endif @@ -46,11 +51,11 @@ void fe_frombytes(fe out, const unsigned char* in) __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" + "ldp x2, x3, [%x[in]]\n\t" + "ldp x4, x5, [%x[in], #16]\n\t" "and x5, x5, #0x7fffffffffffffff\n\t" - "stp x2, x3, [x0]\n\t" - "stp x4, x5, [x0, #16]\n\t" + "stp x2, x3, [%x[out]]\n\t" + "stp x4, x5, [%x[out], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [out] "+r" (out), [in] "+r" (in) : @@ -64,21 +69,20 @@ void fe_tobytes(unsigned char* out, const fe n) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "mov x7, #19\n\t" - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" + "ldp x2, x3, [%x[n]]\n\t" + "ldp x4, x5, [%x[n], #16]\n\t" "adds x6, x2, x7\n\t" "adcs x6, x3, xzr\n\t" "adcs x6, x4, xzr\n\t" "adc x6, x5, xzr\n\t" - "asr x6, x6, #63\n\t" - "and x6, x6, x7\n\t" + "and x6, x7, x6, asr 63\n\t" "adds x2, x2, x6\n\t" "adcs x3, x3, xzr\n\t" "adcs x4, x4, xzr\n\t" "adc x5, x5, xzr\n\t" "and x5, x5, #0x7fffffffffffffff\n\t" - "stp x2, x3, [x0]\n\t" - "stp x4, x5, [x0, #16]\n\t" + "stp x2, x3, [%x[out]]\n\t" + "stp x4, x5, [%x[out], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [out] "+r" (out), [n] "+r" (n) : @@ -93,8 +97,8 @@ void fe_1(fe n) "add x29, sp, #0\n\t" /* Set one */ "mov x1, #1\n\t" - "stp x1, xzr, [x0]\n\t" - "stp xzr, xzr, [x0, #16]\n\t" + "stp x1, xzr, [%x[n]]\n\t" + "stp xzr, xzr, [%x[n], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [n] "+r" (n) : @@ -108,8 +112,8 @@ void fe_0(fe n) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Set zero */ - "stp xzr, xzr, [x0]\n\t" - "stp xzr, xzr, [x0, #16]\n\t" + "stp xzr, xzr, [%x[n]]\n\t" + "stp xzr, xzr, [%x[n], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [n] "+r" (n) : @@ -123,10 +127,10 @@ void fe_copy(fe r, const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Copy */ - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" - "stp x2, x3, [x0]\n\t" - "stp x4, x5, [x0, #16]\n\t" + "ldp x2, x3, [%x[a]]\n\t" + "ldp x4, x5, [%x[a], #16]\n\t" + "stp x2, x3, [%x[r]]\n\t" + "stp x4, x5, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -134,46 +138,16 @@ void fe_copy(fe r, const fe a) ); } -void fe_cswap(fe a, fe b, int c) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - /* Conditional Swap */ - "cmp %[c], #1\n\t" - "ldp x3, x4, [x0]\n\t" - "ldp x5, x6, [x0, #16]\n\t" - "ldp x7, x8, [x1]\n\t" - "ldp x9, x10, [x1, #16]\n\t" - "csel x11, x3, x7, eq\n\t" - "csel x3, x7, x3, eq\n\t" - "csel x12, x4, x8, eq\n\t" - "csel x4, x8, x4, eq\n\t" - "csel x13, x5, x9, eq\n\t" - "csel x5, x9, x5, eq\n\t" - "csel x14, x6, x10, eq\n\t" - "csel x6, x10, x6, eq\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "stp x11, x12, [x1]\n\t" - "stp x13, x14, [x1, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) - : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14" - ); -} - void fe_sub(fe r, const fe a, const fe b) { __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Sub */ - "ldp x3, x4, [x1]\n\t" - "ldp x5, x6, [x1, #16]\n\t" - "ldp x7, x8, [x2]\n\t" - "ldp x9, x10, [x2, #16]\n\t" + "ldp x3, x4, [%x[a]]\n\t" + "ldp x5, x6, [%x[a], #16]\n\t" + "ldp x7, x8, [%x[b]]\n\t" + "ldp x9, x10, [%x[b], #16]\n\t" "subs x3, x3, x7\n\t" "sbcs x4, x4, x8\n\t" "sbcs x5, x5, x9\n\t" @@ -188,8 +162,8 @@ void fe_sub(fe r, const fe a, const fe b) "adcs x4, x4, x11\n\t" "adcs x5, x5, x11\n\t" "adc x6, x6, x13\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -203,10 +177,10 @@ void fe_add(fe r, const fe a, const fe b) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Add */ - "ldp x3, x4, [x1]\n\t" - "ldp x5, x6, [x1, #16]\n\t" - "ldp x7, x8, [x2]\n\t" - "ldp x9, x10, [x2, #16]\n\t" + "ldp x3, x4, [%x[a]]\n\t" + "ldp x5, x6, [%x[a], #16]\n\t" + "ldp x7, x8, [%x[b]]\n\t" + "ldp x9, x10, [%x[b], #16]\n\t" "adds x3, x3, x7\n\t" "adcs x4, x4, x8\n\t" "adcs x5, x5, x9\n\t" @@ -221,8 +195,8 @@ void fe_add(fe r, const fe a, const fe b) "sbcs x4, x4, x11\n\t" "sbcs x5, x5, x11\n\t" "sbc x6, x6, x13\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -235,8 +209,8 @@ void fe_neg(fe r, const fe a) __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" - "ldp x2, x3, [x1]\n\t" - "ldp x4, x5, [x1, #16]\n\t" + "ldp x2, x3, [%x[a]]\n\t" + "ldp x4, x5, [%x[a], #16]\n\t" "mov x6, #-19\n\t" "mov x7, #-1\n\t" "mov x8, #-1\n\t" @@ -245,8 +219,8 @@ void fe_neg(fe r, const fe a) "sbcs x7, x7, x3\n\t" "sbcs x8, x8, x4\n\t" "sbc x9, x9, x5\n\t" - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -254,51 +228,27 @@ void fe_neg(fe r, const fe a) ); } -void fe_cmov(fe a, const fe b, int c) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - "ldp x4, x5, [x0]\n\t" - "ldp x6, x7, [x0, #16]\n\t" - "ldp x8, x9, [x1]\n\t" - "ldp x10, x11, [x1, #16]\n\t" - "cmp %[c], #1\n\t" - "csel x4, x4, x8, eq\n\t" - "csel x5, x5, x9, eq\n\t" - "csel x6, x6, x10, eq\n\t" - "csel x7, x7, x11, eq\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) - : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" - ); -} - int fe_isnonzero(const fe a) { __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "mov x6, #19\n\t" - "ldp x1, x2, [x0]\n\t" - "ldp x3, x4, [x0, #16]\n\t" + "ldp x1, x2, [%x[a]]\n\t" + "ldp x3, x4, [%x[a], #16]\n\t" "adds x5, x1, x6\n\t" "adcs x5, x2, xzr\n\t" "adcs x5, x3, xzr\n\t" "adc x5, x4, xzr\n\t" - "asr x5, x5, #63\n\t" - "and x5, x5, x6\n\t" + "and x5, x6, x5, asr 63\n\t" "adds x1, x1, x5\n\t" "adcs x2, x2, xzr\n\t" "adcs x3, x3, xzr\n\t" "adc x4, x4, xzr\n\t" "and x4, x4, #0x7fffffffffffffff\n\t" - "orr %[a], x1, x2\n\t" + "orr %x[a], x1, x2\n\t" "orr x3, x3, x4\n\t" - "orr %[a], %[a], x3\n\t" + "orr %x[a], %x[a], x3\n\t" "ldp x29, x30, [sp], #16\n\t" : [a] "+r" (a) : @@ -313,15 +263,14 @@ int fe_isnegative(const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "mov x6, #19\n\t" - "ldp x1, x2, [x0]\n\t" - "ldp x3, x4, [x0, #16]\n\t" + "ldp x1, x2, [%x[a]]\n\t" + "ldp x3, x4, [%x[a], #16]\n\t" "adds x5, x1, x6\n\t" "adcs x5, x2, xzr\n\t" "adcs x5, x3, xzr\n\t" "adc x5, x4, xzr\n\t" - "and %[a], x1, #1\n\t" - "lsr x5, x5, #63\n\t" - "eor %[a], %[a], x5\n\t" + "and %x[a], x1, #1\n\t" + "eor %x[a], %x[a], x5, lsr 63\n\t" "ldp x29, x30, [sp], #16\n\t" : [a] "+r" (a) : @@ -335,9 +284,9 @@ void fe_cmov_table(fe* r, fe* base, signed char b) __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" - "sxtb %[b], w2\n\t" - "sbfx x15, %[b], #7, #1\n\t" - "eor x16, %[b], x15\n\t" + "sxtb %x[b], %w[b]\n\t" + "sbfx x15, %x[b], #7, #1\n\t" + "eor x16, %x[b], x15\n\t" "sub x16, x16, x15\n\t" "mov x3, #1\n\t" "mov x4, xzr\n\t" @@ -352,12 +301,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "mov x13, xzr\n\t" "mov x14, xzr\n\t" "cmp x16, #1\n\t" - "ldp x17, x18, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x1, #32]\n\t" - "ldp x23, x24, [x1, #48]\n\t" - "ldp x25, x26, [x1, #64]\n\t" - "ldp x27, x28, [x1, #80]\n\t" + "ldp x17, x18, [%x[base]]\n\t" + "ldp x19, x20, [%x[base], #16]\n\t" + "ldp x21, x22, [%x[base], #32]\n\t" + "ldp x23, x24, [%x[base], #48]\n\t" + "ldp x25, x26, [%x[base], #64]\n\t" + "ldp x27, x28, [%x[base], #80]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -371,12 +320,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #2\n\t" - "ldp x17, x18, [x1, #96]\n\t" - "ldp x19, x20, [x1, #112]\n\t" - "ldp x21, x22, [x1, #128]\n\t" - "ldp x23, x24, [x1, #144]\n\t" - "ldp x25, x26, [x1, #160]\n\t" - "ldp x27, x28, [x1, #176]\n\t" + "ldp x17, x18, [%x[base], #96]\n\t" + "ldp x19, x20, [%x[base], #112]\n\t" + "ldp x21, x22, [%x[base], #128]\n\t" + "ldp x23, x24, [%x[base], #144]\n\t" + "ldp x25, x26, [%x[base], #160]\n\t" + "ldp x27, x28, [%x[base], #176]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -390,12 +339,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #3\n\t" - "ldp x17, x18, [x1, #192]\n\t" - "ldp x19, x20, [x1, #208]\n\t" - "ldp x21, x22, [x1, #224]\n\t" - "ldp x23, x24, [x1, #240]\n\t" - "ldp x25, x26, [x1, #256]\n\t" - "ldp x27, x28, [x1, #272]\n\t" + "ldp x17, x18, [%x[base], #192]\n\t" + "ldp x19, x20, [%x[base], #208]\n\t" + "ldp x21, x22, [%x[base], #224]\n\t" + "ldp x23, x24, [%x[base], #240]\n\t" + "ldp x25, x26, [%x[base], #256]\n\t" + "ldp x27, x28, [%x[base], #272]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -409,12 +358,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #4\n\t" - "ldp x17, x18, [x1, #288]\n\t" - "ldp x19, x20, [x1, #304]\n\t" - "ldp x21, x22, [x1, #320]\n\t" - "ldp x23, x24, [x1, #336]\n\t" - "ldp x25, x26, [x1, #352]\n\t" - "ldp x27, x28, [x1, #368]\n\t" + "ldp x17, x18, [%x[base], #288]\n\t" + "ldp x19, x20, [%x[base], #304]\n\t" + "ldp x21, x22, [%x[base], #320]\n\t" + "ldp x23, x24, [%x[base], #336]\n\t" + "ldp x25, x26, [%x[base], #352]\n\t" + "ldp x27, x28, [%x[base], #368]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -427,14 +376,14 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x12, x26, x12, eq\n\t" "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" - "add %[base], %[base], #0x180\n\t" + "add %x[base], %x[base], #0x180\n\t" "cmp x16, #5\n\t" - "ldp x17, x18, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x1, #32]\n\t" - "ldp x23, x24, [x1, #48]\n\t" - "ldp x25, x26, [x1, #64]\n\t" - "ldp x27, x28, [x1, #80]\n\t" + "ldp x17, x18, [%x[base]]\n\t" + "ldp x19, x20, [%x[base], #16]\n\t" + "ldp x21, x22, [%x[base], #32]\n\t" + "ldp x23, x24, [%x[base], #48]\n\t" + "ldp x25, x26, [%x[base], #64]\n\t" + "ldp x27, x28, [%x[base], #80]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -448,12 +397,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #6\n\t" - "ldp x17, x18, [x1, #96]\n\t" - "ldp x19, x20, [x1, #112]\n\t" - "ldp x21, x22, [x1, #128]\n\t" - "ldp x23, x24, [x1, #144]\n\t" - "ldp x25, x26, [x1, #160]\n\t" - "ldp x27, x28, [x1, #176]\n\t" + "ldp x17, x18, [%x[base], #96]\n\t" + "ldp x19, x20, [%x[base], #112]\n\t" + "ldp x21, x22, [%x[base], #128]\n\t" + "ldp x23, x24, [%x[base], #144]\n\t" + "ldp x25, x26, [%x[base], #160]\n\t" + "ldp x27, x28, [%x[base], #176]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -467,12 +416,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #7\n\t" - "ldp x17, x18, [x1, #192]\n\t" - "ldp x19, x20, [x1, #208]\n\t" - "ldp x21, x22, [x1, #224]\n\t" - "ldp x23, x24, [x1, #240]\n\t" - "ldp x25, x26, [x1, #256]\n\t" - "ldp x27, x28, [x1, #272]\n\t" + "ldp x17, x18, [%x[base], #192]\n\t" + "ldp x19, x20, [%x[base], #208]\n\t" + "ldp x21, x22, [%x[base], #224]\n\t" + "ldp x23, x24, [%x[base], #240]\n\t" + "ldp x25, x26, [%x[base], #256]\n\t" + "ldp x27, x28, [%x[base], #272]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -486,12 +435,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x13, x27, x13, eq\n\t" "csel x14, x28, x14, eq\n\t" "cmp x16, #8\n\t" - "ldp x17, x18, [x1, #288]\n\t" - "ldp x19, x20, [x1, #304]\n\t" - "ldp x21, x22, [x1, #320]\n\t" - "ldp x23, x24, [x1, #336]\n\t" - "ldp x25, x26, [x1, #352]\n\t" - "ldp x27, x28, [x1, #368]\n\t" + "ldp x17, x18, [%x[base], #288]\n\t" + "ldp x19, x20, [%x[base], #304]\n\t" + "ldp x21, x22, [%x[base], #320]\n\t" + "ldp x23, x24, [%x[base], #336]\n\t" + "ldp x25, x26, [%x[base], #352]\n\t" + "ldp x27, x28, [%x[base], #368]\n\t" "csel x3, x17, x3, eq\n\t" "csel x4, x18, x4, eq\n\t" "csel x5, x19, x5, eq\n\t" @@ -512,7 +461,7 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "sbcs x18, x18, x12\n\t" "sbcs x19, x19, x13\n\t" "sbc x20, x20, x14\n\t" - "cmp %[b], #0\n\t" + "cmp %x[b], #0\n\t" "mov x15, x3\n\t" "csel x3, x7, x3, lt\n\t" "csel x7, x15, x7, lt\n\t" @@ -529,12 +478,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) "csel x12, x18, x12, lt\n\t" "csel x13, x19, x13, lt\n\t" "csel x14, x20, x14, lt\n\t" - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "stp x7, x8, [x0, #32]\n\t" - "stp x9, x10, [x0, #48]\n\t" - "stp x11, x12, [x0, #64]\n\t" - "stp x13, x14, [x0, #80]\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" + "stp x7, x8, [%x[r], #32]\n\t" + "stp x9, x10, [%x[r], #48]\n\t" + "stp x11, x12, [%x[r], #64]\n\t" + "stp x13, x14, [%x[r], #80]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) : @@ -548,10 +497,10 @@ void fe_mul(fe r, const fe a, const fe b) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Multiply */ - "ldp x14, x15, [x1]\n\t" - "ldp x16, x17, [x1, #16]\n\t" - "ldp x18, x19, [x2]\n\t" - "ldp x20, x21, [x2, #16]\n\t" + "ldp x14, x15, [%x[a]]\n\t" + "ldp x16, x17, [%x[a], #16]\n\t" + "ldp x18, x19, [%x[b]]\n\t" + "ldp x20, x21, [%x[b], #16]\n\t" /* A[0] * B[0] */ "mul x6, x14, x18\n\t" "umulh x7, x14, x18\n\t" @@ -678,16 +627,15 @@ void fe_mul(fe r, const fe a, const fe b) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Store */ - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -701,8 +649,8 @@ void fe_sq(fe r, const fe a) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" /* Square */ - "ldp x13, x14, [x1]\n\t" - "ldp x15, x16, [x1, #16]\n\t" + "ldp x13, x14, [%x[a]]\n\t" + "ldp x15, x16, [%x[a], #16]\n\t" /* A[0] * A[1] */ "mul x6, x13, x14\n\t" "umulh x7, x13, x14\n\t" @@ -797,16 +745,15 @@ void fe_sq(fe r, const fe a) "adcs x7, x7, xzr\n\t" "adc x8, x8, xzr\n\t" /* Reduce if top bit set */ - "asr x4, x8, #63\n\t" - "and x4, x4, x2\n\t" + "and x4, x2, x8, asr 63\n\t" "and x8, x8, #0x7fffffffffffffff\n\t" "adds x5, x5, x4\n\t" "adcs x6, x6, xzr\n\t" "adcs x7, x7, xzr\n\t" "adc x8, x8, xzr\n\t" /* Store */ - "stp x5, x6, [x0]\n\t" - "stp x7, x8, [x0, #16]\n\t" + "stp x5, x6, [%x[r]]\n\t" + "stp x7, x8, [%x[r], #16]\n\t" "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -814,185 +761,14 @@ void fe_sq(fe r, const fe a) ); } -void fe_mul121666(fe r, fe a) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - /* Multiply by 121666 */ - "ldp x5, x6, [x1]\n\t" - "ldp x7, x8, [x1, #16]\n\t" - "mov x4, #0xdb42\n\t" - "movk x4, #1, lsl 16\n\t" - "mul x9, x5, x4\n\t" - "umulh x10, x5, x4\n\t" - "mul x2, x6, x4\n\t" - "umulh x3, x6, x4\n\t" - "adds x10, x10, x2\n\t" - "adc x11, xzr, x3\n\t" - "mul x2, x7, x4\n\t" - "umulh x3, x7, x4\n\t" - "adds x11, x11, x2\n\t" - "adc x12, xzr, x3\n\t" - "mul x2, x8, x4\n\t" - "umulh x3, x8, x4\n\t" - "adds x12, x12, x2\n\t" - "adc x3, xzr, x3\n\t" - "mov x4, #19\n\t" - "extr x3, x3, x12, #63\n\t" - "mul x3, x3, x4\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, xzr\n\t" - "adcs x11, x11, xzr\n\t" - "adc x12, x12, xzr\n\t" - "stp x9, x10, [x0]\n\t" - "stp x11, x12, [x0, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12" - ); -} - -void fe_sq2(fe r, const fe a) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" - "add x29, sp, #0\n\t" - /* Square * 2 */ - "ldp x5, x6, [x1]\n\t" - "ldp x7, x8, [x1, #16]\n\t" - /* A[0] * A[1] */ - "mul x10, x5, x6\n\t" - "umulh x11, x5, x6\n\t" - /* A[0] * A[2] */ - "mul x2, x5, x7\n\t" - "umulh x12, x5, x7\n\t" - "adds x11, x11, x2\n\t" - "adc x12, x12, xzr\n\t" - /* A[0] * A[3] */ - "mul x2, x5, x8\n\t" - "umulh x13, x5, x8\n\t" - "adds x12, x12, x2\n\t" - "adc x13, x13, xzr\n\t" - /* A[1] * A[2] */ - "mul x2, x6, x7\n\t" - "umulh x3, x6, x7\n\t" - "adds x12, x12, x2\n\t" - "adcs x13, x13, x3\n\t" - "adc x14, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x2, x6, x8\n\t" - "umulh x3, x6, x8\n\t" - "adds x13, x13, x2\n\t" - "adc x14, x14, x3\n\t" - /* A[2] * A[3] */ - "mul x2, x7, x8\n\t" - "umulh x15, x7, x8\n\t" - "adds x14, x14, x2\n\t" - "adc x15, x15, xzr\n\t" - /* Double */ - "adds x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adcs x13, x13, x13\n\t" - "adcs x14, x14, x14\n\t" - "adcs x15, x15, x15\n\t" - "adc x16, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x9, x5, x5\n\t" - "umulh x17, x5, x5\n\t" - /* A[1] * A[1] */ - "mul x2, x6, x6\n\t" - "umulh x3, x6, x6\n\t" - "adds x10, x10, x17\n\t" - "adcs x11, x11, x2\n\t" - "adc x17, x3, xzr\n\t" - /* A[2] * A[2] */ - "mul x2, x7, x7\n\t" - "umulh x3, x7, x7\n\t" - "adds x12, x12, x17\n\t" - "adcs x13, x13, x2\n\t" - "adc x17, x3, xzr\n\t" - /* A[3] * A[3] */ - "mul x2, x8, x8\n\t" - "umulh x3, x8, x8\n\t" - "adds x14, x14, x17\n\t" - "adcs x15, x15, x2\n\t" - "adc x16, x16, x3\n\t" - /* Double and Reduce */ - "mov x2, #0x169\n\t" - /* Move top half into t4-t7 and remove top bit from t3 */ - "lsr x17, x16, #61\n\t" - "extr x16, x16, x15, #62\n\t" - "extr x15, x15, x14, #62\n\t" - "extr x14, x14, x13, #62\n\t" - "extr x13, x13, x12, #62\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "lsl x9, x9, #1\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - /* Two left, only one right */ - "and x16, x16, #0x7fffffffffffffff\n\t" - /* Multiply top bits by 19*19 */ - "mul x17, x17, x2\n\t" - /* Multiply top half by 19 */ - "mov x2, #19\n\t" - "mul x3, x2, x13\n\t" - "umulh x13, x2, x13\n\t" - "adds x9, x9, x3\n\t" - "mul x3, x2, x14\n\t" - "umulh x14, x2, x14\n\t" - "adcs x10, x10, x3\n\t" - "mul x3, x2, x15\n\t" - "umulh x15, x2, x15\n\t" - "adcs x11, x11, x3\n\t" - "mul x3, x2, x16\n\t" - "umulh x4, x2, x16\n\t" - "adcs x12, x12, x3\n\t" - "adc x4, x4, xzr\n\t" - /* Add remaining product results in */ - "adds x9, x9, x17\n\t" - "adcs x10, x10, x13\n\t" - "adcs x11, x11, x14\n\t" - "adcs x12, x12, x15\n\t" - "adc x4, x4, xzr\n\t" - /* Overflow */ - "extr x4, x4, x12, #63\n\t" - "mul x4, x4, x2\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - "adds x9, x9, x4\n\t" - "adcs x10, x10, xzr\n\t" - "adcs x11, x11, xzr\n\t" - "adc x12, x12, xzr\n\t" - /* Reduce if top bit set */ - "asr x4, x12, #63\n\t" - "and x4, x4, x2\n\t" - "and x12, x12, #0x7fffffffffffffff\n\t" - "adds x9, x9, x4\n\t" - "adcs x10, x10, xzr\n\t" - "adcs x11, x11, xzr\n\t" - "adc x12, x12, xzr\n\t" - /* Store */ - "stp x9, x10, [x0]\n\t" - "stp x11, x12, [x0, #16]\n\t" - "ldp x29, x30, [sp], #16\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" - ); -} - void fe_invert(fe r, const fe a) { __asm__ __volatile__ ( "stp x29, x30, [sp, #-160]!\n\t" "add x29, sp, #0\n\t" /* Invert */ - "str %[r], [x29, #144]\n\t" - "str %[a], [x29, #152]\n\t" + "str %x[r], [x29, #144]\n\t" + "str %x[a], [x29, #152]\n\t" "add x0, x29, #16\n\t" "bl fe_sq\n\t" "add x0, x29, #48\n\t" @@ -1007,107 +783,107 @@ void fe_invert(fe r, const fe a) "add x1, x29, #16\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "add x0, x29, #48\n\t" "add x1, x29, #48\n\t" - "add x2, x29, #80\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x20, #4\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert1:\n\t" + "L_fe_invert1_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert1\n\t" + "bne L_fe_invert1_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "add x1, x29, #48\n\t" "bl fe_sq\n\t" "mov x20, #9\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert2:\n\t" + "L_fe_invert2_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert2\n\t" + "bne L_fe_invert2_%=\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" "bl fe_sq\n\t" "mov x20, #19\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_fe_invert3:\n\t" + "L_fe_invert3_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert3\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_fe_invert3_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "mov x20, #10\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert4:\n\t" + "L_fe_invert4_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert4\n\t" + "bne L_fe_invert4_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "add x1, x29, #48\n\t" "bl fe_sq\n\t" "mov x20, #49\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert5:\n\t" + "L_fe_invert5_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert5\n\t" + "bne L_fe_invert5_%=\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" "bl fe_sq\n\t" "mov x20, #0x63\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_fe_invert6:\n\t" + "L_fe_invert6_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert6\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_fe_invert6_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "mov x20, #50\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_invert7:\n\t" + "L_fe_invert7_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert7\n\t" + "bne L_fe_invert7_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" "mov x20, #5\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_invert8:\n\t" + "L_fe_invert8_%=: \n\t" "bl fe_sq\n\t" "sub x20, x20, #1\n\t" "cmp x20, #0\n\t" - "bne L_fe_invert8\n\t" + "bne L_fe_invert8_%=\n\t" "ldr x0, [x29, #144]\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" @@ -1124,11 +900,11 @@ int curve25519(byte* r, byte* n, byte* a) "stp x29, x30, [sp, #-192]!\n\t" "add x29, sp, #0\n\t" "mov x22, xzr\n\t" - "str %[r], [x29, #176]\n\t" + "str %x[r], [x29, #176]\n\t" /* Set one */ "mov x23, #1\n\t" - "stp x23, xzr, [x0]\n\t" - "stp xzr, xzr, [x0, #16]\n\t" + "stp x23, xzr, [%x[r]]\n\t" + "stp xzr, xzr, [%x[r], #16]\n\t" /* Set zero */ "stp xzr, xzr, [x29, #16]\n\t" "stp xzr, xzr, [x29, #32]\n\t" @@ -1137,24 +913,24 @@ int curve25519(byte* r, byte* n, byte* a) "stp x23, xzr, [x29, #48]\n\t" "stp xzr, xzr, [x29, #64]\n\t" /* Copy */ - "ldp x6, x7, [x2]\n\t" - "ldp x8, x9, [x2, #16]\n\t" + "ldp x6, x7, [%x[a]]\n\t" + "ldp x8, x9, [%x[a], #16]\n\t" "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" "mov x25, #62\n\t" "mov x24, #24\n\t" "\n" - "L_curve25519_words:\n\t" + "L_curve25519_words_%=: \n\t" "\n" - "L_curve25519_bits:\n\t" - "ldr x23, [x1, x24]\n\t" + "L_curve25519_bits_%=: \n\t" + "ldr x23, [%x[n], x24]\n\t" "lsr x23, x23, x25\n\t" "and x23, x23, #1\n\t" "eor x22, x22, x23\n\t" /* Conditional Swap */ "cmp x22, #1\n\t" - "ldp x10, x11, [x0]\n\t" - "ldp x12, x13, [x0, #16]\n\t" + "ldp x10, x11, [%x[r]]\n\t" + "ldp x12, x13, [%x[r], #16]\n\t" "ldp x6, x7, [x29, #80]\n\t" "ldp x8, x9, [x29, #96]\n\t" "csel x14, x10, x6, eq\n\t" @@ -1369,8 +1145,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x20, x20, xzr\n\t" "adc x21, x21, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x21, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x21, asr 63\n\t" "and x21, x21, #0x7fffffffffffffff\n\t" "adds x18, x18, x5\n\t" "adcs x19, x19, xzr\n\t" @@ -1508,8 +1283,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x20, x20, xzr\n\t" "adc x21, x21, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x21, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x21, asr 63\n\t" "and x21, x21, #0x7fffffffffffffff\n\t" "adds x18, x18, x5\n\t" "adcs x19, x19, xzr\n\t" @@ -1611,8 +1385,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x12, x12, xzr\n\t" "adc x13, x13, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x13, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x13, asr 63\n\t" "and x13, x13, #0x7fffffffffffffff\n\t" "adds x10, x10, x5\n\t" "adcs x11, x11, xzr\n\t" @@ -1714,8 +1487,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x16, x16, xzr\n\t" "adc x17, x17, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x17, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x17, asr 63\n\t" "and x17, x17, #0x7fffffffffffffff\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, xzr\n\t" @@ -1849,16 +1621,15 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Store */ - "stp x6, x7, [x0]\n\t" - "stp x8, x9, [x0, #16]\n\t" + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" /* Sub */ "subs x14, x14, x10\n\t" "sbcs x15, x15, x11\n\t" @@ -2041,8 +1812,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -2178,8 +1948,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -2283,8 +2052,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x9, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x9, asr 63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x5\n\t" "adcs x7, x7, xzr\n\t" @@ -2292,8 +2060,8 @@ int curve25519(byte* r, byte* n, byte* a) "adc x9, x9, xzr\n\t" /* Store */ /* Multiply */ - "ldp x14, x15, [x2]\n\t" - "ldp x16, x17, [x2, #16]\n\t" + "ldp x14, x15, [%x[a]]\n\t" + "ldp x16, x17, [%x[a], #16]\n\t" /* A[0] * B[0] */ "mul x10, x14, x6\n\t" "umulh x11, x14, x6\n\t" @@ -2420,8 +2188,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x12, x12, xzr\n\t" "adc x13, x13, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x13, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x13, asr 63\n\t" "and x13, x13, #0x7fffffffffffffff\n\t" "adds x10, x10, x5\n\t" "adcs x11, x11, xzr\n\t" @@ -2432,135 +2199,135 @@ int curve25519(byte* r, byte* n, byte* a) "stp x12, x13, [x29, #64]\n\t" "sub x25, x25, #1\n\t" "cmp x25, #0\n\t" - "bge L_curve25519_bits\n\t" + "bge L_curve25519_bits_%=\n\t" "mov x25, #63\n\t" "sub x24, x24, #8\n\t" "cmp x24, #0\n\t" - "bge L_curve25519_words\n\t" + "bge L_curve25519_words_%=\n\t" /* Invert */ "add x0, x29, #48\n\t" "add x1, x29, #16\n\t" "bl fe_sq\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "add x1, x29, #48\n\t" "bl fe_sq\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "bl fe_sq\n\t" "add x1, x29, #16\n\t" - "add x2, x29, #80\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "add x0, x29, #48\n\t" "add x1, x29, #48\n\t" - "add x2, x29, #80\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" "bl fe_sq\n\t" - "add x0, x29, #80\n\t" - "add x1, x29, #80\n\t" - "add x2, x29, #112\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #0x70\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" + "add x0, x29, #0x70\n\t" "bl fe_sq\n\t" "mov x24, #4\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_1:\n\t" + "L_curve25519_inv_1_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_1\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" - "add x1, x29, #80\n\t" + "add x0, x29, #0x70\n\t" + "add x1, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x24, #9\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_2:\n\t" + "L_curve25519_inv_2_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_2\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #144\n\t" + "add x0, x29, #0x90\n\t" "bl fe_sq\n\t" "mov x24, #19\n\t" - "add x1, x29, #144\n\t" + "add x1, x29, #0x90\n\t" "\n" - "L_curve25519_inv_3:\n\t" + "L_curve25519_inv_3_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_3\n\t" - "add x0, x29, #112\n\t" - "add x2, x29, #112\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add x0, x29, #0x70\n\t" + "add x2, x29, #0x70\n\t" "bl fe_mul\n\t" "mov x24, #10\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_4:\n\t" + "L_curve25519_inv_4_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_4\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #112\n\t" - "add x1, x29, #80\n\t" + "add x0, x29, #0x70\n\t" + "add x1, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x24, #49\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_5:\n\t" + "L_curve25519_inv_5_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_5\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" - "add x0, x29, #144\n\t" + "add x0, x29, #0x90\n\t" "bl fe_sq\n\t" "mov x24, #0x63\n\t" - "add x1, x29, #144\n\t" + "add x1, x29, #0x90\n\t" "\n" - "L_curve25519_inv_6:\n\t" + "L_curve25519_inv_6_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_6\n\t" - "add x0, x29, #112\n\t" - "add x2, x29, #112\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add x0, x29, #0x70\n\t" + "add x2, x29, #0x70\n\t" "bl fe_mul\n\t" "mov x24, #50\n\t" - "add x1, x29, #112\n\t" + "add x1, x29, #0x70\n\t" "\n" - "L_curve25519_inv_7:\n\t" + "L_curve25519_inv_7_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_7\n\t" - "add x0, x29, #80\n\t" - "add x2, x29, #80\n\t" + "bne L_curve25519_inv_7_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" "bl fe_mul\n\t" "mov x24, #5\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_curve25519_inv_8:\n\t" + "L_curve25519_inv_8_%=: \n\t" "bl fe_sq\n\t" "sub x24, x24, #1\n\t" "cmp x24, #0\n\t" - "bne L_curve25519_inv_8\n\t" + "bne L_curve25519_inv_8_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" - "ldr %[r], [x29, #176]\n\t" + "ldr %x[r], [x29, #176]\n\t" /* Multiply */ - "ldp x6, x7, [x0]\n\t" - "ldp x8, x9, [x0, #16]\n\t" + "ldp x6, x7, [%x[r]]\n\t" + "ldp x8, x9, [%x[r], #16]\n\t" "ldp x10, x11, [x29, #16]\n\t" "ldp x12, x13, [x29, #32]\n\t" /* A[0] * B[0] */ @@ -2689,16 +2456,15 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x16, x16, xzr\n\t" "adc x17, x17, xzr\n\t" /* Reduce if top bit set */ - "asr x5, x17, #63\n\t" - "and x5, x5, x3\n\t" + "and x5, x3, x17, asr 63\n\t" "and x17, x17, #0x7fffffffffffffff\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, xzr\n\t" "adcs x16, x16, xzr\n\t" "adc x17, x17, xzr\n\t" /* Store */ - "stp x14, x15, [x0]\n\t" - "stp x16, x17, [x0, #16]\n\t" + "stp x14, x15, [%x[r]]\n\t" + "stp x16, x17, [%x[r], #16]\n\t" "mov x0, xzr\n\t" "ldp x29, x30, [sp], #0xc0\n\t" : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) @@ -2714,8 +2480,8 @@ void fe_pow22523(fe r, const fe a) "stp x29, x30, [sp, #-128]!\n\t" "add x29, sp, #0\n\t" /* pow22523 */ - "str %[r], [x29, #112]\n\t" - "str %[a], [x29, #120]\n\t" + "str %x[r], [x29, #112]\n\t" + "str %x[a], [x29, #120]\n\t" "add x0, x29, #16\n\t" "bl fe_sq\n\t" "add x0, x29, #48\n\t" @@ -2740,11 +2506,11 @@ void fe_pow22523(fe r, const fe a) "mov x21, #4\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_1:\n\t" + "L_fe_pow22523_1_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_1\n\t" + "bne L_fe_pow22523_1_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" @@ -2754,34 +2520,34 @@ void fe_pow22523(fe r, const fe a) "mov x21, #9\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_2:\n\t" + "L_fe_pow22523_2_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_2\n\t" + "bne L_fe_pow22523_2_%=\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x21, #19\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_pow22523_3:\n\t" + "L_fe_pow22523_3_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_3\n\t" + "bne L_fe_pow22523_3_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" "mov x21, #10\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_4:\n\t" + "L_fe_pow22523_4_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_4\n\t" + "bne L_fe_pow22523_4_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" @@ -2791,45 +2557,45 @@ void fe_pow22523(fe r, const fe a) "mov x21, #49\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_5:\n\t" + "L_fe_pow22523_5_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_5\n\t" + "bne L_fe_pow22523_5_%=\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" - "add x0, x29, #80\n\t" + "add x0, x29, #0x50\n\t" "bl fe_sq\n\t" "mov x21, #0x63\n\t" - "add x1, x29, #80\n\t" + "add x1, x29, #0x50\n\t" "\n" - "L_fe_pow22523_6:\n\t" + "L_fe_pow22523_6_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_6\n\t" + "bne L_fe_pow22523_6_%=\n\t" "add x0, x29, #48\n\t" "add x2, x29, #48\n\t" "bl fe_mul\n\t" "mov x21, #50\n\t" "add x1, x29, #48\n\t" "\n" - "L_fe_pow22523_7:\n\t" + "L_fe_pow22523_7_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_7\n\t" + "bne L_fe_pow22523_7_%=\n\t" "add x0, x29, #16\n\t" "add x2, x29, #16\n\t" "bl fe_mul\n\t" "mov x21, #2\n\t" "add x1, x29, #16\n\t" "\n" - "L_fe_pow22523_8:\n\t" + "L_fe_pow22523_8_%=: \n\t" "bl fe_sq\n\t" "sub x21, x21, #1\n\t" "cmp x21, #0\n\t" - "bne L_fe_pow22523_8\n\t" + "bne L_fe_pow22523_8_%=\n\t" "ldr x0, [x29, #112]\n\t" "ldr x2, [x29, #120]\n\t" "bl fe_mul\n\t" @@ -2845,12 +2611,12 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con __asm__ __volatile__ ( "stp x29, x30, [sp, #-64]!\n\t" "add x29, sp, #0\n\t" - "str %[ry], [x29, #16]\n\t" - "str %[rz], [x29, #24]\n\t" - "str %[px], [x29, #32]\n\t" - "str %[py], [x29, #40]\n\t" - "str %[pz], [x29, #48]\n\t" - "str %[pt], [x29, #56]\n\t" + "str %x[ry], [x29, #16]\n\t" + "str %x[rz], [x29, #24]\n\t" + "str %x[px], [x29, #32]\n\t" + "str %x[py], [x29, #40]\n\t" + "str %x[pz], [x29, #48]\n\t" + "str %x[pt], [x29, #56]\n\t" "ldr x1, [x29, #32]\n\t" "ldr x2, [x29, #56]\n\t" /* Multiply */ @@ -2984,8 +2750,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x21, x6, #63\n\t" - "and x21, x21, x19\n\t" + "and x21, x19, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" @@ -3128,8 +2893,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x21, x6, #63\n\t" - "and x21, x21, x19\n\t" + "and x21, x19, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" @@ -3269,8 +3033,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x21, x6, #63\n\t" - "and x21, x21, x19\n\t" + "and x21, x19, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x21\n\t" "adcs x4, x4, xzr\n\t" @@ -3291,13 +3054,13 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[ry], [x29, #16]\n\t" - "str %[rz], [x29, #24]\n\t" - "str %[rt], [x29, #32]\n\t" - "str %[px], [x29, #40]\n\t" - "str %[py], [x29, #48]\n\t" - "str %[pz], [x29, #56]\n\t" - "str %[pt], [x29, #64]\n\t" + "str %x[ry], [x29, #16]\n\t" + "str %x[rz], [x29, #24]\n\t" + "str %x[rt], [x29, #32]\n\t" + "str %x[px], [x29, #40]\n\t" + "str %x[py], [x29, #48]\n\t" + "str %x[pz], [x29, #56]\n\t" + "str %x[pt], [x29, #64]\n\t" "ldr x1, [x29, #40]\n\t" "ldr x2, [x29, #64]\n\t" /* Multiply */ @@ -3431,8 +3194,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3572,8 +3334,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3713,8 +3474,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3851,8 +3611,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "asr x25, x6, #63\n\t" - "and x25, x25, x23\n\t" + "and x25, x23, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" "adds x3, x3, x25\n\t" "adcs x4, x4, xzr\n\t" @@ -3873,13 +3632,13 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" "ldr x1, [x29, #48]\n\t" /* Square */ "ldp x12, x13, [x1]\n\t" @@ -3978,8 +3737,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4087,8 +3845,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x11, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" "adds x8, x8, x26\n\t" "adcs x9, x9, xzr\n\t" @@ -4209,8 +3966,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x18, x18, xzr\n\t" "adc x19, x19, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x19, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x19, asr 63\n\t" "and x19, x19, #0x7fffffffffffffff\n\t" "adds x16, x16, x26\n\t" "adcs x17, x17, xzr\n\t" @@ -4383,8 +4139,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4421,14 +4176,14 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -4596,8 +4351,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -4735,8 +4489,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4913,8 +4666,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -4990,14 +4742,14 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -5165,8 +4917,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -5304,8 +5055,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -5482,8 +5232,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -5559,14 +5308,14 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -5734,8 +5483,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -5873,8 +5621,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6051,8 +5798,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6209,8 +5955,7 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x11, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" "adds x8, x8, x26\n\t" "adcs x9, x9, xzr\n\t" @@ -6269,14 +6014,14 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz __asm__ __volatile__ ( "stp x29, x30, [sp, #-80]!\n\t" "add x29, sp, #0\n\t" - "str %[rx], [x29, #16]\n\t" - "str %[ry], [x29, #24]\n\t" - "str %[rz], [x29, #32]\n\t" - "str %[rt], [x29, #40]\n\t" - "str %[px], [x29, #48]\n\t" - "str %[py], [x29, #56]\n\t" - "str %[pz], [x29, #64]\n\t" - "str %[pt], [x29, #72]\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" "ldr x2, [x29, #56]\n\t" "ldr x3, [x29, #48]\n\t" /* Add */ @@ -6444,8 +6189,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x15, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" "adds x12, x12, x26\n\t" "adcs x13, x13, xzr\n\t" @@ -6583,8 +6327,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6761,8 +6504,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x7, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" @@ -6919,8 +6661,7 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "asr x26, x11, #63\n\t" - "and x26, x26, x24\n\t" + "and x26, x24, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" "adds x8, x8, x26\n\t" "adcs x9, x9, xzr\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index 471267509..a8cf8e742 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -19,6 +19,10 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S + */ #ifdef __aarch64__ .text .section .rodata @@ -109,8 +113,8 @@ L_SHA512_transform_neon_len_k: .text .section .rodata .type L_SHA512_transform_neon_len_ror8, %object - .align 4 .size L_SHA512_transform_neon_len_ror8, 16 + .align 4 L_SHA512_transform_neon_len_ror8: .xword 0x7060504030201, 0x80f0e0d0c0b0a09 .text @@ -169,14 +173,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x8, #14 ror x14, x4, #28 - ror x13, x8, #18 - ror x15, x4, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x8, #41 - ror x15, x4, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 @@ -195,43 +195,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v0.16b, v1.16b, #8 ror x12, x7, #14 - ext v9.16b, v4.16b, v5.16b, #8 - ror x14, x11, #28 - add v0.2d, v0.2d, v9.2d - ror x13, x7, #18 shl v8.2d, v7.2d, #45 - ror x15, x11, #34 + ror x14, x11, #28 sri v8.2d, v7.2d, #19 - eor x12, x13, x12 + eor x12, x12, x7, ror 18 shl v9.2d, v7.2d, #3 - eor x14, x15, x14 + eor x14, x14, x11, ror 34 sri v9.2d, v7.2d, #61 - ror x13, x7, #41 + eor x12, x12, x7, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x11, #39 + eor x15, x14, x11, ror 39 ushr v8.2d, v7.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v0.2d, v0.2d, v9.2d add x10, x10, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x11, x4 - sri v8.2d, v10.2d, #1 - eor x12, x8, x9 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x7 - ushr v10.2d, v10.2d, #7 - add x10, x10, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x9 add v0.2d, v0.2d, v9.2d + eor x12, x8, x9 + ext v9.16b, v4.16b, v5.16b, #8 + and x17, x16, x17 + add v0.2d, v0.2d, v9.2d + and x12, x12, x7 + shl v8.2d, v10.2d, #63 + add x10, x10, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x9 + tbl v9.16b, {v10.16b}, v11.16b add x10, x10, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x4 + ushr v10.2d, v10.2d, #7 add x10, x10, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v0.2d, v0.2d, v9.2d add x6, x6, x10 add x10, x10, x15 # Round 2 @@ -239,14 +235,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 - ror x13, x6, #18 - ror x15, x10, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x6, #41 - ror x15, x10, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 @@ -265,43 +257,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v1.16b, v2.16b, #8 ror x12, x5, #14 - ext v9.16b, v5.16b, v6.16b, #8 - ror x14, x9, #28 - add v1.2d, v1.2d, v9.2d - ror x13, x5, #18 shl v8.2d, v0.2d, #45 - ror x15, x9, #34 + ror x14, x9, #28 sri v8.2d, v0.2d, #19 - eor x12, x13, x12 + eor x12, x12, x5, ror 18 shl v9.2d, v0.2d, #3 - eor x14, x15, x14 + eor x14, x14, x9, ror 34 sri v9.2d, v0.2d, #61 - ror x13, x5, #41 + eor x12, x12, x5, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x9, #39 + eor x15, x14, x9, ror 39 ushr v8.2d, v0.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v1.2d, v1.2d, v9.2d add x8, x8, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x9, x10 - sri v8.2d, v10.2d, #1 - eor x12, x6, x7 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x5 - ushr v10.2d, v10.2d, #7 - add x8, x8, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x7 add v1.2d, v1.2d, v9.2d + eor x12, x6, x7 + ext v9.16b, v5.16b, v6.16b, #8 + and x17, x16, x17 + add v1.2d, v1.2d, v9.2d + and x12, x12, x5 + shl v8.2d, v10.2d, #63 + add x8, x8, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x7 + tbl v9.16b, {v10.16b}, v11.16b add x8, x8, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x10 + ushr v10.2d, v10.2d, #7 add x8, x8, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v1.2d, v1.2d, v9.2d add x4, x4, x8 add x8, x8, x15 # Round 4 @@ -309,14 +297,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 - ror x13, x4, #18 - ror x15, x8, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x4, #41 - ror x15, x8, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 @@ -335,43 +319,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v2.16b, v3.16b, #8 ror x12, x11, #14 - ext v9.16b, v6.16b, v7.16b, #8 - ror x14, x7, #28 - add v2.2d, v2.2d, v9.2d - ror x13, x11, #18 shl v8.2d, v1.2d, #45 - ror x15, x7, #34 + ror x14, x7, #28 sri v8.2d, v1.2d, #19 - eor x12, x13, x12 + eor x12, x12, x11, ror 18 shl v9.2d, v1.2d, #3 - eor x14, x15, x14 + eor x14, x14, x7, ror 34 sri v9.2d, v1.2d, #61 - ror x13, x11, #41 + eor x12, x12, x11, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x7, #39 + eor x15, x14, x7, ror 39 ushr v8.2d, v1.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v2.2d, v2.2d, v9.2d add x6, x6, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x7, x8 - sri v8.2d, v10.2d, #1 - eor x12, x4, x5 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x11 - ushr v10.2d, v10.2d, #7 - add x6, x6, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x5 add v2.2d, v2.2d, v9.2d + eor x12, x4, x5 + ext v9.16b, v6.16b, v7.16b, #8 + and x17, x16, x17 + add v2.2d, v2.2d, v9.2d + and x12, x12, x11 + shl v8.2d, v10.2d, #63 + add x6, x6, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x5 + tbl v9.16b, {v10.16b}, v11.16b add x6, x6, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x8 + ushr v10.2d, v10.2d, #7 add x6, x6, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v2.2d, v2.2d, v9.2d add x10, x10, x6 add x6, x6, x15 # Round 6 @@ -379,14 +359,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 - ror x13, x10, #18 - ror x15, x6, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x10, #41 - ror x15, x6, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 @@ -405,43 +381,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v3.16b, v4.16b, #8 ror x12, x9, #14 - ext v9.16b, v7.16b, v0.16b, #8 - ror x14, x5, #28 - add v3.2d, v3.2d, v9.2d - ror x13, x9, #18 shl v8.2d, v2.2d, #45 - ror x15, x5, #34 + ror x14, x5, #28 sri v8.2d, v2.2d, #19 - eor x12, x13, x12 + eor x12, x12, x9, ror 18 shl v9.2d, v2.2d, #3 - eor x14, x15, x14 + eor x14, x14, x5, ror 34 sri v9.2d, v2.2d, #61 - ror x13, x9, #41 + eor x12, x12, x9, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x5, #39 + eor x15, x14, x5, ror 39 ushr v8.2d, v2.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v3.2d, v3.2d, v9.2d add x4, x4, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x5, x6 - sri v8.2d, v10.2d, #1 - eor x12, x10, x11 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x9 - ushr v10.2d, v10.2d, #7 - add x4, x4, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x11 add v3.2d, v3.2d, v9.2d + eor x12, x10, x11 + ext v9.16b, v7.16b, v0.16b, #8 + and x17, x16, x17 + add v3.2d, v3.2d, v9.2d + and x12, x12, x9 + shl v8.2d, v10.2d, #63 + add x4, x4, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x11 + tbl v9.16b, {v10.16b}, v11.16b add x4, x4, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x6 + ushr v10.2d, v10.2d, #7 add x4, x4, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v3.2d, v3.2d, v9.2d add x8, x8, x4 add x4, x4, x15 # Round 8 @@ -449,14 +421,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x8, #14 ror x14, x4, #28 - ror x13, x8, #18 - ror x15, x4, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x8, #41 - ror x15, x4, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 @@ -475,43 +443,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v4.16b, v5.16b, #8 ror x12, x7, #14 - ext v9.16b, v0.16b, v1.16b, #8 - ror x14, x11, #28 - add v4.2d, v4.2d, v9.2d - ror x13, x7, #18 shl v8.2d, v3.2d, #45 - ror x15, x11, #34 + ror x14, x11, #28 sri v8.2d, v3.2d, #19 - eor x12, x13, x12 + eor x12, x12, x7, ror 18 shl v9.2d, v3.2d, #3 - eor x14, x15, x14 + eor x14, x14, x11, ror 34 sri v9.2d, v3.2d, #61 - ror x13, x7, #41 + eor x12, x12, x7, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x11, #39 + eor x15, x14, x11, ror 39 ushr v8.2d, v3.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v4.2d, v4.2d, v9.2d add x10, x10, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x11, x4 - sri v8.2d, v10.2d, #1 - eor x12, x8, x9 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x7 - ushr v10.2d, v10.2d, #7 - add x10, x10, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x9 add v4.2d, v4.2d, v9.2d + eor x12, x8, x9 + ext v9.16b, v0.16b, v1.16b, #8 + and x17, x16, x17 + add v4.2d, v4.2d, v9.2d + and x12, x12, x7 + shl v8.2d, v10.2d, #63 + add x10, x10, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x9 + tbl v9.16b, {v10.16b}, v11.16b add x10, x10, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x4 + ushr v10.2d, v10.2d, #7 add x10, x10, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v4.2d, v4.2d, v9.2d add x6, x6, x10 add x10, x10, x15 # Round 10 @@ -519,14 +483,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 - ror x13, x6, #18 - ror x15, x10, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x6, #41 - ror x15, x10, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 @@ -545,43 +505,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v5.16b, v6.16b, #8 ror x12, x5, #14 - ext v9.16b, v1.16b, v2.16b, #8 - ror x14, x9, #28 - add v5.2d, v5.2d, v9.2d - ror x13, x5, #18 shl v8.2d, v4.2d, #45 - ror x15, x9, #34 + ror x14, x9, #28 sri v8.2d, v4.2d, #19 - eor x12, x13, x12 + eor x12, x12, x5, ror 18 shl v9.2d, v4.2d, #3 - eor x14, x15, x14 + eor x14, x14, x9, ror 34 sri v9.2d, v4.2d, #61 - ror x13, x5, #41 + eor x12, x12, x5, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x9, #39 + eor x15, x14, x9, ror 39 ushr v8.2d, v4.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v5.2d, v5.2d, v9.2d add x8, x8, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x9, x10 - sri v8.2d, v10.2d, #1 - eor x12, x6, x7 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x5 - ushr v10.2d, v10.2d, #7 - add x8, x8, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x7 add v5.2d, v5.2d, v9.2d + eor x12, x6, x7 + ext v9.16b, v1.16b, v2.16b, #8 + and x17, x16, x17 + add v5.2d, v5.2d, v9.2d + and x12, x12, x5 + shl v8.2d, v10.2d, #63 + add x8, x8, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x7 + tbl v9.16b, {v10.16b}, v11.16b add x8, x8, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x10 + ushr v10.2d, v10.2d, #7 add x8, x8, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v5.2d, v5.2d, v9.2d add x4, x4, x8 add x8, x8, x15 # Round 12 @@ -589,14 +545,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 - ror x13, x4, #18 - ror x15, x8, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x4, #41 - ror x15, x8, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 @@ -615,43 +567,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v6.16b, v7.16b, #8 ror x12, x11, #14 - ext v9.16b, v2.16b, v3.16b, #8 - ror x14, x7, #28 - add v6.2d, v6.2d, v9.2d - ror x13, x11, #18 shl v8.2d, v5.2d, #45 - ror x15, x7, #34 + ror x14, x7, #28 sri v8.2d, v5.2d, #19 - eor x12, x13, x12 + eor x12, x12, x11, ror 18 shl v9.2d, v5.2d, #3 - eor x14, x15, x14 + eor x14, x14, x7, ror 34 sri v9.2d, v5.2d, #61 - ror x13, x11, #41 + eor x12, x12, x11, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x7, #39 + eor x15, x14, x7, ror 39 ushr v8.2d, v5.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v6.2d, v6.2d, v9.2d add x6, x6, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x7, x8 - sri v8.2d, v10.2d, #1 - eor x12, x4, x5 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x11 - ushr v10.2d, v10.2d, #7 - add x6, x6, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x5 add v6.2d, v6.2d, v9.2d + eor x12, x4, x5 + ext v9.16b, v2.16b, v3.16b, #8 + and x17, x16, x17 + add v6.2d, v6.2d, v9.2d + and x12, x12, x11 + shl v8.2d, v10.2d, #63 + add x6, x6, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x5 + tbl v9.16b, {v10.16b}, v11.16b add x6, x6, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x8 + ushr v10.2d, v10.2d, #7 add x6, x6, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v6.2d, v6.2d, v9.2d add x10, x10, x6 add x6, x6, x15 # Round 14 @@ -659,14 +607,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 - ror x13, x10, #18 - ror x15, x6, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x10, #41 - ror x15, x6, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 @@ -685,43 +629,39 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ext v10.16b, v7.16b, v0.16b, #8 ror x12, x9, #14 - ext v9.16b, v3.16b, v4.16b, #8 - ror x14, x5, #28 - add v7.2d, v7.2d, v9.2d - ror x13, x9, #18 shl v8.2d, v6.2d, #45 - ror x15, x5, #34 + ror x14, x5, #28 sri v8.2d, v6.2d, #19 - eor x12, x13, x12 + eor x12, x12, x9, ror 18 shl v9.2d, v6.2d, #3 - eor x14, x15, x14 + eor x14, x14, x5, ror 34 sri v9.2d, v6.2d, #61 - ror x13, x9, #41 + eor x12, x12, x9, ror 41 eor v9.16b, v9.16b, v8.16b - ror x15, x5, #39 + eor x15, x14, x5, ror 39 ushr v8.2d, v6.2d, #6 - eor x12, x13, x12 - eor v9.16b, v9.16b, v8.16b - eor x15, x15, x14 - add v7.2d, v7.2d, v9.2d add x4, x4, x12 - shl v8.2d, v10.2d, #63 + eor v9.16b, v9.16b, v8.16b eor x16, x5, x6 - sri v8.2d, v10.2d, #1 - eor x12, x10, x11 - tbl v9.16b, { v10.16b }, v11.16b - and x17, x16, x17 - eor v9.16b, v9.16b, v8.16b - and x12, x12, x9 - ushr v10.2d, v10.2d, #7 - add x4, x4, x18 - eor v9.16b, v9.16b, v10.16b - eor x12, x12, x11 add v7.2d, v7.2d, v9.2d + eor x12, x10, x11 + ext v9.16b, v3.16b, v4.16b, #8 + and x17, x16, x17 + add v7.2d, v7.2d, v9.2d + and x12, x12, x9 + shl v8.2d, v10.2d, #63 + add x4, x4, x18 + sri v8.2d, v10.2d, #1 + eor x12, x12, x11 + tbl v9.16b, {v10.16b}, v11.16b add x4, x4, x19 + eor v9.16b, v9.16b, v8.16b eor x17, x17, x6 + ushr v10.2d, v10.2d, #7 add x4, x4, x12 + eor v9.16b, v9.16b, v10.16b add x15, x15, x17 + add v7.2d, v7.2d, v9.2d add x8, x8, x4 add x4, x4, x15 subs x28, x28, #1 @@ -731,14 +671,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x8, #14 ror x14, x4, #28 - ror x13, x8, #18 - ror x15, x4, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x8, #41 - ror x15, x4, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 @@ -757,14 +693,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x7, #14 ror x14, x11, #28 - ror x13, x7, #18 - ror x15, x11, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x7, #41 - ror x15, x11, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x7, ror 18 + eor x14, x14, x11, ror 34 + eor x12, x12, x7, ror 41 + eor x15, x14, x11, ror 39 add x10, x10, x12 eor x16, x11, x4 eor x12, x8, x9 @@ -783,14 +715,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 - ror x13, x6, #18 - ror x15, x10, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x6, #41 - ror x15, x10, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 @@ -809,14 +737,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x5, #14 ror x14, x9, #28 - ror x13, x5, #18 - ror x15, x9, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x5, #41 - ror x15, x9, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x5, ror 18 + eor x14, x14, x9, ror 34 + eor x12, x12, x5, ror 41 + eor x15, x14, x9, ror 39 add x8, x8, x12 eor x16, x9, x10 eor x12, x6, x7 @@ -835,14 +759,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 - ror x13, x4, #18 - ror x15, x8, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x4, #41 - ror x15, x8, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 @@ -861,14 +781,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x11, #14 ror x14, x7, #28 - ror x13, x11, #18 - ror x15, x7, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x11, #41 - ror x15, x7, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x11, ror 18 + eor x14, x14, x7, ror 34 + eor x12, x12, x11, ror 41 + eor x15, x14, x7, ror 39 add x6, x6, x12 eor x16, x7, x8 eor x12, x4, x5 @@ -887,14 +803,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 - ror x13, x10, #18 - ror x15, x6, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x10, #41 - ror x15, x6, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 @@ -913,14 +825,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x9, #14 ror x14, x5, #28 - ror x13, x9, #18 - ror x15, x5, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x9, #41 - ror x15, x5, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x9, ror 18 + eor x14, x14, x5, ror 34 + eor x12, x12, x9, ror 41 + eor x15, x14, x5, ror 39 add x4, x4, x12 eor x16, x5, x6 eor x12, x10, x11 @@ -939,14 +847,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x8, #14 ror x14, x4, #28 - ror x13, x8, #18 - ror x15, x4, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x8, #41 - ror x15, x4, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x15, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 @@ -965,14 +869,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x7, #14 ror x14, x11, #28 - ror x13, x7, #18 - ror x15, x11, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x7, #41 - ror x15, x11, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x7, ror 18 + eor x14, x14, x11, ror 34 + eor x12, x12, x7, ror 41 + eor x15, x14, x11, ror 39 add x10, x10, x12 eor x16, x11, x4 eor x12, x8, x9 @@ -991,14 +891,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 - ror x13, x6, #18 - ror x15, x10, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x6, #41 - ror x15, x10, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x15, x14, x10, ror 39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 @@ -1017,14 +913,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x5, #14 ror x14, x9, #28 - ror x13, x5, #18 - ror x15, x9, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x5, #41 - ror x15, x9, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x5, ror 18 + eor x14, x14, x9, ror 34 + eor x12, x12, x5, ror 41 + eor x15, x14, x9, ror 39 add x8, x8, x12 eor x16, x9, x10 eor x12, x6, x7 @@ -1043,14 +935,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 - ror x13, x4, #18 - ror x15, x8, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x4, #41 - ror x15, x8, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x15, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 @@ -1069,14 +957,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x11, #14 ror x14, x7, #28 - ror x13, x11, #18 - ror x15, x7, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x11, #41 - ror x15, x7, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x11, ror 18 + eor x14, x14, x7, ror 34 + eor x12, x12, x11, ror 41 + eor x15, x14, x7, ror 39 add x6, x6, x12 eor x16, x7, x8 eor x12, x4, x5 @@ -1095,14 +979,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 - ror x13, x10, #18 - ror x15, x6, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x10, #41 - ror x15, x6, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x15, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 @@ -1121,14 +1001,10 @@ L_sha512_len_neon_start: ldr x19, [x3], #8 ror x12, x9, #14 ror x14, x5, #28 - ror x13, x9, #18 - ror x15, x5, #34 - eor x12, x13, x12 - eor x14, x15, x14 - ror x13, x9, #41 - ror x15, x5, #39 - eor x12, x13, x12 - eor x15, x15, x14 + eor x12, x12, x9, ror 18 + eor x14, x14, x5, ror 34 + eor x12, x12, x9, ror 41 + eor x15, x14, x5, ror 39 add x4, x4, x12 eor x16, x5, x6 eor x12, x10, x11 @@ -1165,8 +1041,8 @@ L_sha512_len_neon_start: ldp x24, x25, [x29, #72] ldp x26, x27, [x29, #88] ldr x28, [x29, #104] - ldp d8, d9, [sp, #112] - ldp d10, d11, [sp, #128] + ldp d8, d9, [x29, #112] + ldp d10, d11, [x29, #128] ldp x29, x30, [sp], #0x90 ret .size Transform_Sha512_Len,.-Transform_Sha512_Len diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-sha512-asm.c index a4350d7f8..dbc5a7dee 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.c +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.c @@ -19,6 +19,10 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c + */ #ifdef __aarch64__ #include #include @@ -120,18 +124,18 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "adr x28, %[L_SHA512_transform_neon_len_ror8]\n\t" "ld1 {v11.16b}, [x28]\n\t" /* Load digest into working vars */ - "ldp x4, x5, [%[sha512]]\n\t" - "ldp x6, x7, [%[sha512], #16]\n\t" - "ldp x8, x9, [%[sha512], #32]\n\t" - "ldp x10, x11, [%[sha512], #48]\n\t" + "ldp x4, x5, [%x[sha512]]\n\t" + "ldp x6, x7, [%x[sha512], #16]\n\t" + "ldp x8, x9, [%x[sha512], #32]\n\t" + "ldp x10, x11, [%x[sha512], #48]\n\t" /* Start of loop processing a block */ "\n" "L_sha512_len_neon_begin_%=: \n\t" /* Load W */ /* Copy digest to add in at end */ - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[data]], #0x40\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t" "mov x20, x4\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%[data]], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t" "mov x21, x5\n\t" "rev64 v0.16b, v0.16b\n\t" "mov x22, x6\n\t" @@ -158,14 +162,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" - "ror x13, x8, #18\n\t" - "ror x15, x4, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x8, #41\n\t" - "ror x15, x4, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor x17, x4, x5\n\t" "eor x12, x9, x10\n\t" @@ -184,43 +184,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v0.16b, v1.16b, #8\n\t" "ror x12, x7, #14\n\t" - "ext v9.16b, v4.16b, v5.16b, #8\n\t" - "ror x14, x11, #28\n\t" - "add v0.2d, v0.2d, v9.2d\n\t" - "ror x13, x7, #18\n\t" "shl v8.2d, v7.2d, #45\n\t" - "ror x15, x11, #34\n\t" + "ror x14, x11, #28\n\t" "sri v8.2d, v7.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x7, ror 18\n\t" "shl v9.2d, v7.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x11, ror 34\n\t" "sri v9.2d, v7.2d, #61\n\t" - "ror x13, x7, #41\n\t" + "eor x12, x12, x7, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x11, #39\n\t" + "eor x15, x14, x11, ror 39\n\t" "ushr v8.2d, v7.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v0.2d, v0.2d, v9.2d\n\t" "add x10, x10, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x11, x4\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x8, x9\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x7\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x10, x10, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x9\n\t" "add v0.2d, v0.2d, v9.2d\n\t" + "eor x12, x8, x9\n\t" + "ext v9.16b, v4.16b, v5.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" + "and x12, x12, x7\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x10, x10, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x9\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x10, x10, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x4\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" "add x6, x6, x10\n\t" "add x10, x10, x15\n\t" /* Round 2 */ @@ -228,14 +224,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" - "ror x13, x6, #18\n\t" - "ror x15, x10, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x6, #41\n\t" - "ror x15, x10, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" @@ -254,43 +246,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v1.16b, v2.16b, #8\n\t" "ror x12, x5, #14\n\t" - "ext v9.16b, v5.16b, v6.16b, #8\n\t" - "ror x14, x9, #28\n\t" - "add v1.2d, v1.2d, v9.2d\n\t" - "ror x13, x5, #18\n\t" "shl v8.2d, v0.2d, #45\n\t" - "ror x15, x9, #34\n\t" + "ror x14, x9, #28\n\t" "sri v8.2d, v0.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x5, ror 18\n\t" "shl v9.2d, v0.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x9, ror 34\n\t" "sri v9.2d, v0.2d, #61\n\t" - "ror x13, x5, #41\n\t" + "eor x12, x12, x5, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x9, #39\n\t" + "eor x15, x14, x9, ror 39\n\t" "ushr v8.2d, v0.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v1.2d, v1.2d, v9.2d\n\t" "add x8, x8, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x9, x10\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x6, x7\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x5\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x8, x8, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x7\n\t" "add v1.2d, v1.2d, v9.2d\n\t" + "eor x12, x6, x7\n\t" + "ext v9.16b, v5.16b, v6.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" + "and x12, x12, x5\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x8, x8, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x7\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x8, x8, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x10\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" "add x4, x4, x8\n\t" "add x8, x8, x15\n\t" /* Round 4 */ @@ -298,14 +286,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" - "ror x13, x4, #18\n\t" - "ror x15, x8, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x4, #41\n\t" - "ror x15, x8, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" @@ -324,43 +308,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v2.16b, v3.16b, #8\n\t" "ror x12, x11, #14\n\t" - "ext v9.16b, v6.16b, v7.16b, #8\n\t" - "ror x14, x7, #28\n\t" - "add v2.2d, v2.2d, v9.2d\n\t" - "ror x13, x11, #18\n\t" "shl v8.2d, v1.2d, #45\n\t" - "ror x15, x7, #34\n\t" + "ror x14, x7, #28\n\t" "sri v8.2d, v1.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x11, ror 18\n\t" "shl v9.2d, v1.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x7, ror 34\n\t" "sri v9.2d, v1.2d, #61\n\t" - "ror x13, x11, #41\n\t" + "eor x12, x12, x11, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x7, #39\n\t" + "eor x15, x14, x7, ror 39\n\t" "ushr v8.2d, v1.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v2.2d, v2.2d, v9.2d\n\t" "add x6, x6, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x7, x8\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x4, x5\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x11\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x6, x6, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x5\n\t" "add v2.2d, v2.2d, v9.2d\n\t" + "eor x12, x4, x5\n\t" + "ext v9.16b, v6.16b, v7.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" + "and x12, x12, x11\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x6, x6, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x5\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x6, x6, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x8\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" "add x10, x10, x6\n\t" "add x6, x6, x15\n\t" /* Round 6 */ @@ -368,14 +348,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" - "ror x13, x10, #18\n\t" - "ror x15, x6, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x10, #41\n\t" - "ror x15, x6, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" @@ -394,43 +370,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v3.16b, v4.16b, #8\n\t" "ror x12, x9, #14\n\t" - "ext v9.16b, v7.16b, v0.16b, #8\n\t" - "ror x14, x5, #28\n\t" - "add v3.2d, v3.2d, v9.2d\n\t" - "ror x13, x9, #18\n\t" "shl v8.2d, v2.2d, #45\n\t" - "ror x15, x5, #34\n\t" + "ror x14, x5, #28\n\t" "sri v8.2d, v2.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x9, ror 18\n\t" "shl v9.2d, v2.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x5, ror 34\n\t" "sri v9.2d, v2.2d, #61\n\t" - "ror x13, x9, #41\n\t" + "eor x12, x12, x9, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x5, #39\n\t" + "eor x15, x14, x5, ror 39\n\t" "ushr v8.2d, v2.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v3.2d, v3.2d, v9.2d\n\t" "add x4, x4, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x5, x6\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x10, x11\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x9\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x4, x4, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x11\n\t" "add v3.2d, v3.2d, v9.2d\n\t" + "eor x12, x10, x11\n\t" + "ext v9.16b, v7.16b, v0.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" + "and x12, x12, x9\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x4, x4, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x11\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x4, x4, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x6\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" "add x8, x8, x4\n\t" "add x4, x4, x15\n\t" /* Round 8 */ @@ -438,14 +410,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" - "ror x13, x8, #18\n\t" - "ror x15, x4, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x8, #41\n\t" - "ror x15, x4, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor x17, x4, x5\n\t" "eor x12, x9, x10\n\t" @@ -464,43 +432,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v4.16b, v5.16b, #8\n\t" "ror x12, x7, #14\n\t" - "ext v9.16b, v0.16b, v1.16b, #8\n\t" - "ror x14, x11, #28\n\t" - "add v4.2d, v4.2d, v9.2d\n\t" - "ror x13, x7, #18\n\t" "shl v8.2d, v3.2d, #45\n\t" - "ror x15, x11, #34\n\t" + "ror x14, x11, #28\n\t" "sri v8.2d, v3.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x7, ror 18\n\t" "shl v9.2d, v3.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x11, ror 34\n\t" "sri v9.2d, v3.2d, #61\n\t" - "ror x13, x7, #41\n\t" + "eor x12, x12, x7, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x11, #39\n\t" + "eor x15, x14, x11, ror 39\n\t" "ushr v8.2d, v3.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v4.2d, v4.2d, v9.2d\n\t" "add x10, x10, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x11, x4\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x8, x9\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x7\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x10, x10, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x9\n\t" "add v4.2d, v4.2d, v9.2d\n\t" + "eor x12, x8, x9\n\t" + "ext v9.16b, v0.16b, v1.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" + "and x12, x12, x7\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x10, x10, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x9\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x10, x10, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x4\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" "add x6, x6, x10\n\t" "add x10, x10, x15\n\t" /* Round 10 */ @@ -508,14 +472,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" - "ror x13, x6, #18\n\t" - "ror x15, x10, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x6, #41\n\t" - "ror x15, x10, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" @@ -534,43 +494,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v5.16b, v6.16b, #8\n\t" "ror x12, x5, #14\n\t" - "ext v9.16b, v1.16b, v2.16b, #8\n\t" - "ror x14, x9, #28\n\t" - "add v5.2d, v5.2d, v9.2d\n\t" - "ror x13, x5, #18\n\t" "shl v8.2d, v4.2d, #45\n\t" - "ror x15, x9, #34\n\t" + "ror x14, x9, #28\n\t" "sri v8.2d, v4.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x5, ror 18\n\t" "shl v9.2d, v4.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x9, ror 34\n\t" "sri v9.2d, v4.2d, #61\n\t" - "ror x13, x5, #41\n\t" + "eor x12, x12, x5, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x9, #39\n\t" + "eor x15, x14, x9, ror 39\n\t" "ushr v8.2d, v4.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v5.2d, v5.2d, v9.2d\n\t" "add x8, x8, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x9, x10\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x6, x7\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x5\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x8, x8, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x7\n\t" "add v5.2d, v5.2d, v9.2d\n\t" + "eor x12, x6, x7\n\t" + "ext v9.16b, v1.16b, v2.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" + "and x12, x12, x5\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x8, x8, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x7\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x8, x8, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x10\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" "add x4, x4, x8\n\t" "add x8, x8, x15\n\t" /* Round 12 */ @@ -578,14 +534,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" - "ror x13, x4, #18\n\t" - "ror x15, x8, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x4, #41\n\t" - "ror x15, x8, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" @@ -604,43 +556,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v6.16b, v7.16b, #8\n\t" "ror x12, x11, #14\n\t" - "ext v9.16b, v2.16b, v3.16b, #8\n\t" - "ror x14, x7, #28\n\t" - "add v6.2d, v6.2d, v9.2d\n\t" - "ror x13, x11, #18\n\t" "shl v8.2d, v5.2d, #45\n\t" - "ror x15, x7, #34\n\t" + "ror x14, x7, #28\n\t" "sri v8.2d, v5.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x11, ror 18\n\t" "shl v9.2d, v5.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x7, ror 34\n\t" "sri v9.2d, v5.2d, #61\n\t" - "ror x13, x11, #41\n\t" + "eor x12, x12, x11, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x7, #39\n\t" + "eor x15, x14, x7, ror 39\n\t" "ushr v8.2d, v5.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v6.2d, v6.2d, v9.2d\n\t" "add x6, x6, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x7, x8\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x4, x5\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x11\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x6, x6, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x5\n\t" "add v6.2d, v6.2d, v9.2d\n\t" + "eor x12, x4, x5\n\t" + "ext v9.16b, v2.16b, v3.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" + "and x12, x12, x11\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x6, x6, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x5\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x6, x6, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x8\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" "add x10, x10, x6\n\t" "add x6, x6, x15\n\t" /* Round 14 */ @@ -648,14 +596,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" - "ror x13, x10, #18\n\t" - "ror x15, x6, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x10, #41\n\t" - "ror x15, x6, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" @@ -674,43 +618,39 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ext v10.16b, v7.16b, v0.16b, #8\n\t" "ror x12, x9, #14\n\t" - "ext v9.16b, v3.16b, v4.16b, #8\n\t" - "ror x14, x5, #28\n\t" - "add v7.2d, v7.2d, v9.2d\n\t" - "ror x13, x9, #18\n\t" "shl v8.2d, v6.2d, #45\n\t" - "ror x15, x5, #34\n\t" + "ror x14, x5, #28\n\t" "sri v8.2d, v6.2d, #19\n\t" - "eor x12, x13, x12\n\t" + "eor x12, x12, x9, ror 18\n\t" "shl v9.2d, v6.2d, #3\n\t" - "eor x14, x15, x14\n\t" + "eor x14, x14, x5, ror 34\n\t" "sri v9.2d, v6.2d, #61\n\t" - "ror x13, x9, #41\n\t" + "eor x12, x12, x9, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "ror x15, x5, #39\n\t" + "eor x15, x14, x5, ror 39\n\t" "ushr v8.2d, v6.2d, #6\n\t" - "eor x12, x13, x12\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x15, x14\n\t" - "add v7.2d, v7.2d, v9.2d\n\t" "add x4, x4, x12\n\t" - "shl v8.2d, v10.2d, #63\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x16, x5, x6\n\t" - "sri v8.2d, v10.2d, #1\n\t" - "eor x12, x10, x11\n\t" - "tbl v9.16b, { v10.16b }, v11.16b\n\t" - "and x17, x16, x17\n\t" - "eor v9.16b, v9.16b, v8.16b\n\t" - "and x12, x12, x9\n\t" - "ushr v10.2d, v10.2d, #7\n\t" - "add x4, x4, x18\n\t" - "eor v9.16b, v9.16b, v10.16b\n\t" - "eor x12, x12, x11\n\t" "add v7.2d, v7.2d, v9.2d\n\t" + "eor x12, x10, x11\n\t" + "ext v9.16b, v3.16b, v4.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" + "and x12, x12, x9\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x4, x4, x18\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x11\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" "add x4, x4, x19\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x6\n\t" + "ushr v10.2d, v10.2d, #7\n\t" "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" "add x15, x15, x17\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" "add x8, x8, x4\n\t" "add x4, x4, x15\n\t" "subs x28, x28, #1\n\t" @@ -720,14 +660,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" - "ror x13, x8, #18\n\t" - "ror x15, x4, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x8, #41\n\t" - "ror x15, x4, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor x17, x4, x5\n\t" "eor x12, x9, x10\n\t" @@ -746,14 +682,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x7, #14\n\t" "ror x14, x11, #28\n\t" - "ror x13, x7, #18\n\t" - "ror x15, x11, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x7, #41\n\t" - "ror x15, x11, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x7, ror 18\n\t" + "eor x14, x14, x11, ror 34\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor x15, x14, x11, ror 39\n\t" "add x10, x10, x12\n\t" "eor x16, x11, x4\n\t" "eor x12, x8, x9\n\t" @@ -772,14 +704,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" - "ror x13, x6, #18\n\t" - "ror x15, x10, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x6, #41\n\t" - "ror x15, x10, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" @@ -798,14 +726,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x5, #14\n\t" "ror x14, x9, #28\n\t" - "ror x13, x5, #18\n\t" - "ror x15, x9, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x5, #41\n\t" - "ror x15, x9, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x5, ror 18\n\t" + "eor x14, x14, x9, ror 34\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor x15, x14, x9, ror 39\n\t" "add x8, x8, x12\n\t" "eor x16, x9, x10\n\t" "eor x12, x6, x7\n\t" @@ -824,14 +748,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" - "ror x13, x4, #18\n\t" - "ror x15, x8, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x4, #41\n\t" - "ror x15, x8, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" @@ -850,14 +770,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x11, #14\n\t" "ror x14, x7, #28\n\t" - "ror x13, x11, #18\n\t" - "ror x15, x7, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x11, #41\n\t" - "ror x15, x7, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x11, ror 18\n\t" + "eor x14, x14, x7, ror 34\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor x15, x14, x7, ror 39\n\t" "add x6, x6, x12\n\t" "eor x16, x7, x8\n\t" "eor x12, x4, x5\n\t" @@ -876,14 +792,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" - "ror x13, x10, #18\n\t" - "ror x15, x6, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x10, #41\n\t" - "ror x15, x6, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" @@ -902,14 +814,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x9, #14\n\t" "ror x14, x5, #28\n\t" - "ror x13, x9, #18\n\t" - "ror x15, x5, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x9, #41\n\t" - "ror x15, x5, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x9, ror 18\n\t" + "eor x14, x14, x5, ror 34\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor x15, x14, x5, ror 39\n\t" "add x4, x4, x12\n\t" "eor x16, x5, x6\n\t" "eor x12, x10, x11\n\t" @@ -928,14 +836,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" - "ror x13, x8, #18\n\t" - "ror x15, x4, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x8, #41\n\t" - "ror x15, x4, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x15, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor x17, x4, x5\n\t" "eor x12, x9, x10\n\t" @@ -954,14 +858,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x7, #14\n\t" "ror x14, x11, #28\n\t" - "ror x13, x7, #18\n\t" - "ror x15, x11, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x7, #41\n\t" - "ror x15, x11, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x7, ror 18\n\t" + "eor x14, x14, x11, ror 34\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor x15, x14, x11, ror 39\n\t" "add x10, x10, x12\n\t" "eor x16, x11, x4\n\t" "eor x12, x8, x9\n\t" @@ -980,14 +880,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" - "ror x13, x6, #18\n\t" - "ror x15, x10, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x6, #41\n\t" - "ror x15, x10, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x15, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" @@ -1006,14 +902,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x5, #14\n\t" "ror x14, x9, #28\n\t" - "ror x13, x5, #18\n\t" - "ror x15, x9, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x5, #41\n\t" - "ror x15, x9, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x5, ror 18\n\t" + "eor x14, x14, x9, ror 34\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor x15, x14, x9, ror 39\n\t" "add x8, x8, x12\n\t" "eor x16, x9, x10\n\t" "eor x12, x6, x7\n\t" @@ -1032,14 +924,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" - "ror x13, x4, #18\n\t" - "ror x15, x8, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x4, #41\n\t" - "ror x15, x8, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x15, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" @@ -1058,14 +946,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x11, #14\n\t" "ror x14, x7, #28\n\t" - "ror x13, x11, #18\n\t" - "ror x15, x7, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x11, #41\n\t" - "ror x15, x7, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x11, ror 18\n\t" + "eor x14, x14, x7, ror 34\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor x15, x14, x7, ror 39\n\t" "add x6, x6, x12\n\t" "eor x16, x7, x8\n\t" "eor x12, x4, x5\n\t" @@ -1084,14 +968,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" - "ror x13, x10, #18\n\t" - "ror x15, x6, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x10, #41\n\t" - "ror x15, x6, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x15, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" @@ -1110,14 +990,10 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldr x19, [x3], #8\n\t" "ror x12, x9, #14\n\t" "ror x14, x5, #28\n\t" - "ror x13, x9, #18\n\t" - "ror x15, x5, #34\n\t" - "eor x12, x13, x12\n\t" - "eor x14, x15, x14\n\t" - "ror x13, x9, #41\n\t" - "ror x15, x5, #39\n\t" - "eor x12, x13, x12\n\t" - "eor x15, x15, x14\n\t" + "eor x12, x12, x9, ror 18\n\t" + "eor x14, x14, x5, ror 34\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor x15, x14, x5, ror 39\n\t" "add x4, x4, x12\n\t" "eor x16, x5, x6\n\t" "eor x12, x10, x11\n\t" @@ -1140,17 +1016,17 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add x5, x5, x21\n\t" "add x4, x4, x20\n\t" "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" - "subs %[len], %[len], #0x80\n\t" + "subs %w[len], %w[len], #0x80\n\t" "bne L_sha512_len_neon_begin_%=\n\t" - "stp x4, x5, [%[sha512]]\n\t" - "stp x6, x7, [%[sha512], #16]\n\t" - "stp x8, x9, [%[sha512], #32]\n\t" - "stp x10, x11, [%[sha512], #48]\n\t" + "stp x4, x5, [%x[sha512]]\n\t" + "stp x6, x7, [%x[sha512], #16]\n\t" + "stp x8, x9, [%x[sha512], #32]\n\t" + "stp x10, x11, [%x[sha512], #48]\n\t" "eor x0, x0, x0\n\t" "ldp x29, x30, [sp], #16\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" ); return (uint32_t)(size_t)sha512; }