diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index fb417b50c..f68ccfca4 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -23,8 +23,6 @@ * cd ../scripts * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S */ - -#ifdef WOLFSSL_ARMASM #ifdef __aarch64__ .text .align 2 @@ -219,223 +217,225 @@ fe_isnegative: .globl fe_cmov_table .type fe_cmov_table, %function fe_cmov_table: - stp x29, x30, [sp, #-112]! + stp x29, x30, [sp, #-128]! add x29, sp, #0 - str x17, [x29, #16] - stp x18, x19, [x29, #24] - stp x20, x21, [x29, #40] - stp x22, x23, [x29, #56] - stp x24, x25, [x29, #72] - stp x26, x27, [x29, #88] - str x28, [x29, #104] + str x17, [x29, #40] + str x19, [x29, #48] + stp x20, x21, [x29, #56] + stp x22, x23, [x29, #72] + stp x24, x25, [x29, #88] + stp x26, x27, [x29, #104] + str x28, [x29, #120] + str x0, [x29, #16] sxtb x2, w2 - sbfx x15, x2, #7, #1 - eor x16, x2, x15 - sub x16, x16, x15 - mov x3, #1 - mov x4, xzr + sbfx x3, x2, #7, #1 + eor x0, x2, x3 + sub x0, x0, x3 + mov x4, #1 mov x5, xzr mov x6, xzr - mov x7, #1 - mov x8, xzr + mov x7, xzr + mov x8, #1 mov x9, xzr mov x10, xzr mov x11, xzr mov x12, xzr mov x13, xzr mov x14, xzr - cmp x16, #1 - ldp x17, x18, [x1] + mov x15, xzr + cmp x0, #1 + ldp x16, x17, [x1] ldp x19, x20, [x1, #16] ldp x21, x22, [x1, #32] ldp x23, x24, [x1, #48] ldp x25, x26, [x1, #64] ldp x27, x28, [x1, #80] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq - cmp x16, #2 - ldp x17, x18, [x1, #96] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #2 + ldp x16, x17, [x1, #96] ldp x19, x20, [x1, #112] ldp x21, x22, [x1, #128] ldp x23, x24, [x1, #144] ldp x25, x26, [x1, #160] ldp x27, x28, [x1, #176] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq - cmp x16, #3 - ldp x17, x18, [x1, #192] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #3 + ldp x16, x17, [x1, #192] ldp x19, x20, [x1, #208] ldp x21, x22, [x1, #224] ldp x23, x24, [x1, #240] ldp x25, x26, [x1, #256] ldp x27, x28, [x1, #272] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq - cmp x16, #4 - ldp x17, x18, [x1, #288] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel 
x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #4 + ldp x16, x17, [x1, #288] ldp x19, x20, [x1, #304] ldp x21, x22, [x1, #320] ldp x23, x24, [x1, #336] ldp x25, x26, [x1, #352] ldp x27, x28, [x1, #368] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq add x1, x1, #0x180 - cmp x16, #5 - ldp x17, x18, [x1] + cmp x0, #5 + ldp x16, x17, [x1] ldp x19, x20, [x1, #16] ldp x21, x22, [x1, #32] ldp x23, x24, [x1, #48] ldp x25, x26, [x1, #64] ldp x27, x28, [x1, #80] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq - cmp x16, #6 - ldp x17, x18, [x1, #96] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #6 + ldp x16, x17, [x1, #96] ldp x19, x20, [x1, #112] ldp x21, x22, [x1, #128] ldp x23, x24, [x1, #144] ldp x25, x26, [x1, #160] ldp x27, x28, [x1, #176] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq - cmp x16, #7 - ldp x17, x18, [x1, #192] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #7 + ldp x16, x17, [x1, #192] ldp x19, x20, [x1, #208] ldp x21, x22, [x1, #224] ldp x23, x24, [x1, #240] ldp x25, x26, [x1, #256] ldp x27, x28, [x1, #272] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq - cmp x16, #8 - ldp x17, x18, [x1, #288] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #8 + ldp x16, x17, [x1, #288] ldp x19, x20, [x1, #304] ldp x21, x22, [x1, #320] ldp x23, x24, [x1, #336] ldp x25, x26, [x1, #352] ldp x27, x28, [x1, #368] - csel x3, x17, x3, eq - csel x4, x18, x4, eq - csel x5, x19, x5, eq - csel x6, x20, x6, eq - csel x7, x21, x7, eq - csel x8, x22, x8, eq - csel x9, 
x23, x9, eq - csel x10, x24, x10, eq - csel x11, x25, x11, eq - csel x12, x26, x12, eq - csel x13, x27, x13, eq - csel x14, x28, x14, eq - mov x17, #-19 - mov x18, #-1 + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + mov x16, #-19 + mov x17, #-1 mov x19, #-1 mov x20, #0x7fffffffffffffff - subs x17, x17, x11 - sbcs x18, x18, x12 - sbcs x19, x19, x13 - sbc x20, x20, x14 + subs x16, x16, x12 + sbcs x17, x17, x13 + sbcs x19, x19, x14 + sbc x20, x20, x15 cmp x2, #0 - mov x15, x3 - csel x3, x7, x3, lt - csel x7, x15, x7, lt - mov x15, x4 + mov x3, x4 csel x4, x8, x4, lt - csel x8, x15, x8, lt - mov x15, x5 + csel x8, x3, x8, lt + mov x3, x5 csel x5, x9, x5, lt - csel x9, x15, x9, lt - mov x15, x6 + csel x9, x3, x9, lt + mov x3, x6 csel x6, x10, x6, lt - csel x10, x15, x10, lt - csel x11, x17, x11, lt - csel x12, x18, x12, lt - csel x13, x19, x13, lt - csel x14, x20, x14, lt - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - stp x7, x8, [x0, #32] - stp x9, x10, [x0, #48] - stp x11, x12, [x0, #64] - stp x13, x14, [x0, #80] - ldr x17, [x29, #16] - ldp x18, x19, [x29, #24] - ldp x20, x21, [x29, #40] - ldp x22, x23, [x29, #56] - ldp x24, x25, [x29, #72] - ldp x26, x27, [x29, #88] - ldr x28, [x29, #104] - ldp x29, x30, [sp], #0x70 + csel x10, x3, x10, lt + mov x3, x7 + csel x7, x11, x7, lt + csel x11, x3, x11, lt + csel x12, x16, x12, lt + csel x13, x17, x13, lt + csel x14, x19, x14, lt + csel x15, x20, x15, lt + ldr x0, [x29, #16] + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x8, x9, [x0, #32] + stp x10, x11, [x0, #48] + stp x12, x13, [x0, #64] + stp x14, x15, [x0, #80] + ldr x17, [x29, #40] + ldr x19, [x29, #48] + ldp x20, x21, [x29, #56] + ldp x22, x23, [x29, #72] + ldp x24, x25, [x29, #88] + ldp x26, x27, [x29, #104] + ldr x28, [x29, #120] + ldp x29, x30, [sp], #0x80 ret .size fe_cmov_table,.-fe_cmov_table .text @@ -446,101 +446,102 @@ fe_mul: stp x29, x30, [sp, #-64]! 
add x29, sp, #0 str x17, [x29, #24] - stp x18, x19, [x29, #32] - stp x20, x21, [x29, #48] + str x19, [x29, #32] + stp x20, x21, [x29, #40] + str x22, [x29, #56] # Multiply ldp x14, x15, [x1] ldp x16, x17, [x1, #16] - ldp x18, x19, [x2] - ldp x20, x21, [x2, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] # A[0] * B[0] - mul x6, x14, x18 - umulh x7, x14, x18 + mul x6, x14, x19 + umulh x7, x14, x19 # A[0] * B[1] - mul x3, x14, x19 - umulh x8, x14, x19 + mul x3, x14, x20 + umulh x8, x14, x20 adds x7, x7, x3 adc x8, x8, xzr # A[1] * B[0] - mul x3, x15, x18 - umulh x4, x15, x18 + mul x3, x15, x19 + umulh x4, x15, x19 adds x7, x7, x3 adcs x8, x8, x4 adc x9, xzr, xzr # A[0] * B[2] - mul x3, x14, x20 - umulh x4, x14, x20 + mul x3, x14, x21 + umulh x4, x14, x21 adds x8, x8, x3 adc x9, x9, x4 # A[1] * B[1] - mul x3, x15, x19 - umulh x4, x15, x19 + mul x3, x15, x20 + umulh x4, x15, x20 adds x8, x8, x3 adcs x9, x9, x4 adc x10, xzr, xzr # A[2] * B[0] - mul x3, x16, x18 - umulh x4, x16, x18 + mul x3, x16, x19 + umulh x4, x16, x19 adds x8, x8, x3 adcs x9, x9, x4 adc x10, x10, xzr # A[0] * B[3] - mul x3, x14, x21 - umulh x4, x14, x21 + mul x3, x14, x22 + umulh x4, x14, x22 adds x9, x9, x3 adcs x10, x10, x4 adc x11, xzr, xzr # A[1] * B[2] - mul x3, x15, x20 - umulh x4, x15, x20 + mul x3, x15, x21 + umulh x4, x15, x21 adds x9, x9, x3 adcs x10, x10, x4 adc x11, x11, xzr # A[2] * B[1] - mul x3, x16, x19 - umulh x4, x16, x19 + mul x3, x16, x20 + umulh x4, x16, x20 adds x9, x9, x3 adcs x10, x10, x4 adc x11, x11, xzr # A[3] * B[0] - mul x3, x17, x18 - umulh x4, x17, x18 + mul x3, x17, x19 + umulh x4, x17, x19 adds x9, x9, x3 adcs x10, x10, x4 adc x11, x11, xzr # A[1] * B[3] - mul x3, x15, x21 - umulh x4, x15, x21 + mul x3, x15, x22 + umulh x4, x15, x22 adds x10, x10, x3 adcs x11, x11, x4 adc x12, xzr, xzr # A[2] * B[2] - mul x3, x16, x20 - umulh x4, x16, x20 + mul x3, x16, x21 + umulh x4, x16, x21 adds x10, x10, x3 adcs x11, x11, x4 adc x12, x12, xzr # A[3] * B[1] - mul x3, x17, x19 - umulh x4, x17, x19 + mul x3, x17, x20 + umulh x4, x17, x20 adds x10, x10, x3 adcs x11, x11, x4 adc x12, x12, xzr # A[2] * B[3] - mul x3, x16, x21 - umulh x4, x16, x21 + mul x3, x16, x22 + umulh x4, x16, x22 adds x11, x11, x3 adcs x12, x12, x4 adc x13, xzr, xzr # A[3] * B[2] - mul x3, x17, x20 - umulh x4, x17, x20 + mul x3, x17, x21 + umulh x4, x17, x21 adds x11, x11, x3 adcs x12, x12, x4 adc x13, x13, xzr # A[3] * B[3] - mul x3, x17, x21 - umulh x4, x17, x21 + mul x3, x17, x22 + umulh x4, x17, x22 adds x12, x12, x3 adc x13, x13, x4 # Reduce @@ -589,8 +590,9 @@ fe_mul: stp x6, x7, [x0] stp x8, x9, [x0, #16] ldr x17, [x29, #24] - ldp x18, x19, [x29, #32] - ldp x20, x21, [x29, #48] + ldr x19, [x29, #32] + ldp x20, x21, [x29, #40] + ldr x22, [x29, #56] ldp x29, x30, [sp], #0x40 ret .size fe_mul,.-fe_mul @@ -839,41 +841,42 @@ L_fe_invert8: curve25519: stp x29, x30, [sp, #-288]! 
add x29, sp, #0 - str x17, [x29, #192] - stp x18, x19, [x29, #200] + str x17, [x29, #200] + str x19, [x29, #208] stp x20, x21, [x29, #216] stp x22, x23, [x29, #232] stp x24, x25, [x29, #248] stp x26, x27, [x29, #264] str x28, [x29, #280] - mov x22, xzr + mov x23, xzr str x0, [x29, #176] - # Set one - mov x23, #1 - stp x23, xzr, [x0] - stp xzr, xzr, [x0, #16] - # Set zero - stp xzr, xzr, [x29, #16] - stp xzr, xzr, [x29, #32] - # Set one - mov x23, #1 - stp x23, xzr, [x29, #48] - stp xzr, xzr, [x29, #64] + str x2, [x29, #184] # Copy ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] + # Set one + mov x2, #1 + stp x2, xzr, [x0] + stp xzr, xzr, [x0, #16] + # Set zero + stp xzr, xzr, [x29, #16] + stp xzr, xzr, [x29, #32] + # Set one + mov x2, #1 + stp x2, xzr, [x29, #48] + stp xzr, xzr, [x29, #64] mov x25, #62 mov x24, #24 L_curve25519_words: L_curve25519_bits: - ldr x23, [x1, x24] - lsr x23, x23, x25 - and x23, x23, #1 - eor x22, x22, x23 + ldr x2, [x1, x24] + lsr x2, x2, x25 + and x2, x2, #1 + eor x23, x23, x2 # Conditional Swap - cmp x22, #1 + cmp x23, #1 ldp x10, x11, [x0] ldp x12, x13, [x0, #16] ldp x6, x7, [x29, #80] @@ -887,66 +890,66 @@ L_curve25519_bits: csel x17, x13, x9, eq csel x13, x9, x13, eq # Conditional Swap - cmp x22, #1 - ldp x18, x19, [x29, #16] - ldp x20, x21, [x29, #32] + cmp x23, #1 + ldp x19, x20, [x29, #16] + ldp x21, x22, [x29, #32] ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] - csel x5, x18, x6, eq - csel x18, x6, x18, eq - csel x26, x19, x7, eq - csel x19, x7, x19, eq - csel x27, x20, x8, eq - csel x20, x8, x20, eq - csel x28, x21, x9, eq - csel x21, x9, x21, eq - mov x22, x23 + csel x5, x19, x6, eq + csel x19, x6, x19, eq + csel x26, x20, x7, eq + csel x20, x7, x20, eq + csel x27, x21, x8, eq + csel x21, x8, x21, eq + csel x28, x22, x9, eq + csel x22, x9, x22, eq + mov x23, x2 # Add - adds x6, x10, x18 - adcs x7, x11, x19 - adcs x8, x12, x20 - adc x9, x13, x21 + adds x6, x10, x19 + adcs x7, x11, x20 + adcs x8, x12, x21 + adc x9, x13, x22 mov x3, #-19 - asr x23, x9, #63 + asr x2, x9, #63 # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff # Sub modulus (if overflow) subs x6, x6, x3 - sbcs x7, x7, x23 - sbcs x8, x8, x23 + sbcs x7, x7, x2 + sbcs x8, x8, x2 sbc x9, x9, x4 # Sub - subs x18, x10, x18 - sbcs x19, x11, x19 - sbcs x20, x12, x20 - sbcs x21, x13, x21 + subs x19, x10, x19 + sbcs x20, x11, x20 + sbcs x21, x12, x21 + sbcs x22, x13, x22 mov x3, #-19 - csetm x23, cc + csetm x2, cc # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff # Add modulus (if underflow) - adds x18, x18, x3 - adcs x19, x19, x23 - adcs x20, x20, x23 - adc x21, x21, x4 - stp x18, x19, [x29, #144] - stp x20, x21, [x29, #160] + adds x19, x19, x3 + adcs x20, x20, x2 + adcs x21, x21, x2 + adc x22, x22, x4 + stp x19, x20, [x29, #144] + stp x21, x22, [x29, #160] # Add adds x10, x14, x5 adcs x11, x15, x26 adcs x12, x16, x27 adc x13, x17, x28 mov x3, #-19 - asr x23, x13, #63 + asr x2, x13, #63 # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff # Sub modulus (if overflow) subs x10, x10, x3 - sbcs x11, x11, x23 - sbcs x12, x12, x23 + sbcs x11, x11, x2 + sbcs x12, x12, x2 sbc x13, x13, x4 # Sub subs x14, x14, x5 @@ -954,87 +957,87 @@ L_curve25519_bits: sbcs x16, x16, x27 sbcs x17, x17, x28 mov x3, #-19 - csetm x23, cc + csetm x2, cc # Mask the modulus - and x3, 
x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff # Add modulus (if underflow) adds x14, x14, x3 - adcs x15, x15, x23 - adcs x16, x16, x23 + adcs x15, x15, x2 + adcs x16, x16, x2 adc x17, x17, x4 # Multiply # A[0] * B[0] - mul x18, x14, x6 - umulh x19, x14, x6 + mul x19, x14, x6 + umulh x20, x14, x6 # A[0] * B[1] mul x3, x14, x7 - umulh x20, x14, x7 - adds x19, x19, x3 - adc x20, x20, xzr + umulh x21, x14, x7 + adds x20, x20, x3 + adc x21, x21, xzr # A[1] * B[0] mul x3, x15, x6 umulh x4, x15, x6 - adds x19, x19, x3 - adcs x20, x20, x4 - adc x21, xzr, xzr + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr # A[0] * B[2] mul x3, x14, x8 umulh x4, x14, x8 - adds x20, x20, x3 - adc x21, x21, x4 + adds x21, x21, x3 + adc x22, x22, x4 # A[1] * B[1] mul x3, x15, x7 umulh x4, x15, x7 - adds x20, x20, x3 - adcs x21, x21, x4 - adc x23, xzr, xzr + adds x21, x21, x3 + adcs x22, x22, x4 + adc x2, xzr, xzr # A[2] * B[0] mul x3, x16, x6 umulh x4, x16, x6 - adds x20, x20, x3 - adcs x21, x21, x4 - adc x23, x23, xzr + adds x21, x21, x3 + adcs x22, x22, x4 + adc x2, x2, xzr # A[0] * B[3] mul x3, x14, x9 umulh x4, x14, x9 - adds x21, x21, x3 - adcs x23, x23, x4 + adds x22, x22, x3 + adcs x2, x2, x4 adc x26, xzr, xzr # A[1] * B[2] mul x3, x15, x8 umulh x4, x15, x8 - adds x21, x21, x3 - adcs x23, x23, x4 + adds x22, x22, x3 + adcs x2, x2, x4 adc x26, x26, xzr # A[2] * B[1] mul x3, x16, x7 umulh x4, x16, x7 - adds x21, x21, x3 - adcs x23, x23, x4 + adds x22, x22, x3 + adcs x2, x2, x4 adc x26, x26, xzr # A[3] * B[0] mul x3, x17, x6 umulh x4, x17, x6 - adds x21, x21, x3 - adcs x23, x23, x4 + adds x22, x22, x3 + adcs x2, x2, x4 adc x26, x26, xzr # A[1] * B[3] mul x3, x15, x9 umulh x4, x15, x9 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, xzr, xzr # A[2] * B[2] mul x3, x16, x8 umulh x4, x16, x8 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[3] * B[1] mul x3, x17, x7 umulh x4, x17, x7 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[3] @@ -1058,103 +1061,103 @@ L_curve25519_bits: # Move top half into t4-t7 and remove top bit from t3 extr x28, x28, x27, #63 extr x27, x27, x26, #63 - extr x26, x26, x23, #63 - extr x23, x23, x21, #63 - and x21, x21, #0x7fffffffffffffff + extr x26, x26, x2, #63 + extr x2, x2, x22, #63 + and x22, x22, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x23 - umulh x23, x3, x23 - adds x18, x18, x4 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x19, x19, x4 mul x4, x3, x26 umulh x26, x3, x26 - adcs x19, x19, x4 + adcs x20, x20, x4 mul x4, x3, x27 umulh x27, x3, x27 - adcs x20, x20, x4 + adcs x21, x21, x4 mul x4, x3, x28 umulh x5, x3, x28 - adcs x21, x21, x4 + adcs x22, x22, x4 adc x5, x5, xzr # Add remaining product results in - adds x19, x19, x23 - adcs x20, x20, x26 - adcs x21, x21, x27 + adds x20, x20, x2 + adcs x21, x21, x26 + adcs x22, x22, x27 adc x5, x5, xzr # Overflow - extr x5, x5, x21, #63 + extr x5, x5, x22, #63 mul x5, x5, x3 - and x21, x21, #0x7fffffffffffffff - adds x18, x18, x5 - adcs x19, x19, xzr + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 adcs x20, x20, xzr - adc x21, x21, xzr + adcs x21, x21, xzr + adc x22, x22, xzr # Reduce if top bit set - and x5, x3, x21, asr 63 - and x21, x21, #0x7fffffffffffffff - adds x18, x18, x5 - adcs x19, x19, xzr + and x5, x3, x22, asr 63 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 adcs x20, x20, xzr - adc x21, x21, xzr + adcs x21, x21, xzr + adc x22, x22, xzr # Store - 
stp x18, x19, [x29, #112] - stp x20, x21, [x29, #128] + stp x19, x20, [x29, #112] + stp x21, x22, [x29, #128] # Multiply - ldp x23, x26, [x29, #144] + ldp x2, x26, [x29, #144] ldp x27, x28, [x29, #160] # A[0] * B[0] - mul x18, x10, x23 - umulh x19, x10, x23 + mul x19, x10, x2 + umulh x20, x10, x2 # A[0] * B[1] mul x3, x10, x26 - umulh x20, x10, x26 - adds x19, x19, x3 - adc x20, x20, xzr + umulh x21, x10, x26 + adds x20, x20, x3 + adc x21, x21, xzr # A[1] * B[0] - mul x3, x11, x23 - umulh x4, x11, x23 - adds x19, x19, x3 - adcs x20, x20, x4 - adc x21, xzr, xzr + mul x3, x11, x2 + umulh x4, x11, x2 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr # A[0] * B[2] mul x3, x10, x27 umulh x4, x10, x27 - adds x20, x20, x3 - adc x21, x21, x4 + adds x21, x21, x3 + adc x22, x22, x4 # A[1] * B[1] mul x3, x11, x26 umulh x4, x11, x26 - adds x20, x20, x3 - adcs x21, x21, x4 + adds x21, x21, x3 + adcs x22, x22, x4 adc x14, xzr, xzr # A[2] * B[0] - mul x3, x12, x23 - umulh x4, x12, x23 - adds x20, x20, x3 - adcs x21, x21, x4 + mul x3, x12, x2 + umulh x4, x12, x2 + adds x21, x21, x3 + adcs x22, x22, x4 adc x14, x14, xzr # A[0] * B[3] mul x3, x10, x28 umulh x4, x10, x28 - adds x21, x21, x3 + adds x22, x22, x3 adcs x14, x14, x4 adc x15, xzr, xzr # A[1] * B[2] mul x3, x11, x27 umulh x4, x11, x27 - adds x21, x21, x3 + adds x22, x22, x3 adcs x14, x14, x4 adc x15, x15, xzr # A[2] * B[1] mul x3, x12, x26 umulh x4, x12, x26 - adds x21, x21, x3 + adds x22, x22, x3 adcs x14, x14, x4 adc x15, x15, xzr # A[3] * B[0] - mul x3, x13, x23 - umulh x4, x13, x23 - adds x21, x21, x3 + mul x3, x13, x2 + umulh x4, x13, x2 + adds x22, x22, x3 adcs x14, x14, x4 adc x15, x15, xzr # A[1] * B[3] @@ -1197,56 +1200,56 @@ L_curve25519_bits: extr x17, x17, x16, #63 extr x16, x16, x15, #63 extr x15, x15, x14, #63 - extr x14, x14, x21, #63 - and x21, x21, #0x7fffffffffffffff + extr x14, x14, x22, #63 + and x22, x22, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 mul x4, x3, x14 umulh x14, x3, x14 - adds x18, x18, x4 + adds x19, x19, x4 mul x4, x3, x15 umulh x15, x3, x15 - adcs x19, x19, x4 + adcs x20, x20, x4 mul x4, x3, x16 umulh x16, x3, x16 - adcs x20, x20, x4 + adcs x21, x21, x4 mul x4, x3, x17 umulh x5, x3, x17 - adcs x21, x21, x4 + adcs x22, x22, x4 adc x5, x5, xzr # Add remaining product results in - adds x19, x19, x14 - adcs x20, x20, x15 - adcs x21, x21, x16 + adds x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 adc x5, x5, xzr # Overflow - extr x5, x5, x21, #63 + extr x5, x5, x22, #63 mul x5, x5, x3 - and x21, x21, #0x7fffffffffffffff - adds x18, x18, x5 - adcs x19, x19, xzr + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 adcs x20, x20, xzr - adc x21, x21, xzr + adcs x21, x21, xzr + adc x22, x22, xzr # Reduce if top bit set - and x5, x3, x21, asr 63 - and x21, x21, #0x7fffffffffffffff - adds x18, x18, x5 - adcs x19, x19, xzr + and x5, x3, x22, asr 63 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 adcs x20, x20, xzr - adc x21, x21, xzr + adcs x21, x21, xzr + adc x22, x22, xzr # Store # Square # A[0] * A[1] - mul x11, x23, x26 - umulh x12, x23, x26 + mul x11, x2, x26 + umulh x12, x2, x26 # A[0] * A[2] - mul x3, x23, x27 - umulh x13, x23, x27 + mul x3, x2, x27 + umulh x13, x2, x27 adds x12, x12, x3 adc x13, x13, xzr # A[0] * A[3] - mul x3, x23, x28 - umulh x14, x23, x28 + mul x3, x2, x28 + umulh x14, x2, x28 adds x13, x13, x3 adc x14, x14, xzr # A[1] * A[2] @@ -1274,8 +1277,8 @@ L_curve25519_bits: adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] - mul x10, x23, x23 - umulh x5, x23, 
x23 + mul x10, x2, x2 + umulh x5, x2, x2 # A[1] * A[1] mul x3, x26, x26 umulh x4, x26, x26 @@ -1348,19 +1351,19 @@ L_curve25519_bits: adc x17, x17, xzr # A[0] * A[3] mul x3, x6, x9 - umulh x23, x6, x9 + umulh x2, x6, x9 adds x17, x17, x3 - adc x23, x23, xzr + adc x2, x2, xzr # A[1] * A[2] mul x3, x7, x8 umulh x4, x7, x8 adds x17, x17, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, xzr, xzr # A[1] * A[3] mul x3, x7, x9 umulh x4, x7, x9 - adds x23, x23, x3 + adds x2, x2, x3 adc x26, x26, x4 # A[2] * A[3] mul x3, x8, x9 @@ -1371,7 +1374,7 @@ L_curve25519_bits: adds x15, x15, x15 adcs x16, x16, x16 adcs x17, x17, x17 - adcs x23, x23, x23 + adcs x2, x2, x2 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr @@ -1388,7 +1391,7 @@ L_curve25519_bits: mul x3, x8, x8 umulh x4, x8, x8 adds x17, x17, x5 - adcs x23, x23, x3 + adcs x2, x2, x3 adc x5, x4, xzr # A[3] * A[3] mul x3, x9, x9 @@ -1400,13 +1403,13 @@ L_curve25519_bits: # Move top half into t4-t7 and remove top bit from t3 extr x28, x28, x27, #63 extr x27, x27, x26, #63 - extr x26, x26, x23, #63 - extr x23, x23, x17, #63 + extr x26, x26, x2, #63 + extr x2, x2, x17, #63 and x17, x17, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x23 - umulh x23, x3, x23 + mul x4, x3, x2 + umulh x2, x3, x2 adds x14, x14, x4 mul x4, x3, x26 umulh x26, x3, x26 @@ -1419,7 +1422,7 @@ L_curve25519_bits: adcs x17, x17, x4 adc x5, x5, xzr # Add remaining product results in - adds x15, x15, x23 + adds x15, x15, x2 adcs x16, x16, x26 adcs x17, x17, x27 adc x5, x5, xzr @@ -1464,53 +1467,53 @@ L_curve25519_bits: umulh x4, x15, x11 adds x8, x8, x3 adcs x9, x9, x4 - adc x23, xzr, xzr + adc x2, xzr, xzr # A[2] * B[0] mul x3, x16, x10 umulh x4, x16, x10 adds x8, x8, x3 adcs x9, x9, x4 - adc x23, x23, xzr + adc x2, x2, xzr # A[0] * B[3] mul x3, x14, x13 umulh x4, x14, x13 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, xzr, xzr # A[1] * B[2] mul x3, x15, x12 umulh x4, x15, x12 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[2] * B[1] mul x3, x16, x11 umulh x4, x16, x11 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[3] * B[0] mul x3, x17, x10 umulh x4, x17, x10 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[1] * B[3] mul x3, x15, x13 umulh x4, x15, x13 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, xzr, xzr # A[2] * B[2] mul x3, x16, x12 umulh x4, x16, x12 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[3] * B[1] mul x3, x17, x11 umulh x4, x17, x11 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[3] @@ -1534,13 +1537,13 @@ L_curve25519_bits: # Move top half into t4-t7 and remove top bit from t3 extr x28, x28, x27, #63 extr x27, x27, x26, #63 - extr x26, x26, x23, #63 - extr x23, x23, x9, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 and x9, x9, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x23 - umulh x23, x3, x23 + mul x4, x3, x2 + umulh x2, x3, x2 adds x6, x6, x4 mul x4, x3, x26 umulh x26, x3, x26 @@ -1553,7 +1556,7 @@ L_curve25519_bits: adcs x9, x9, x4 adc x5, x5, xzr # Add remaining product results in - adds x7, x7, x23 + adds x7, x7, x2 adcs x8, x8, x26 adcs x9, x9, x27 adc x5, x5, xzr @@ -1581,14 +1584,14 @@ L_curve25519_bits: sbcs x16, x16, x12 sbcs x17, x17, x13 mov x3, #-19 - csetm x23, cc + csetm x2, cc # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff 
# Add modulus (if underflow) adds x14, x14, x3 - adcs x15, x15, x23 - adcs x16, x16, x23 + adcs x15, x15, x2 + adcs x16, x16, x2 adc x17, x17, x4 # Multiply by 121666 mov x5, #0xdb42 @@ -1621,14 +1624,14 @@ L_curve25519_bits: adcs x12, x12, x8 adc x13, x13, x9 mov x3, #-19 - asr x23, x13, #63 + asr x2, x13, #63 # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff # Sub modulus (if overflow) subs x10, x10, x3 - sbcs x11, x11, x23 - sbcs x12, x12, x23 + sbcs x11, x11, x2 + sbcs x12, x12, x2 sbc x13, x13, x4 # Multiply # A[0] * B[0] @@ -1655,53 +1658,53 @@ L_curve25519_bits: umulh x4, x15, x11 adds x8, x8, x3 adcs x9, x9, x4 - adc x23, xzr, xzr + adc x2, xzr, xzr # A[2] * B[0] mul x3, x16, x10 umulh x4, x16, x10 adds x8, x8, x3 adcs x9, x9, x4 - adc x23, x23, xzr + adc x2, x2, xzr # A[0] * B[3] mul x3, x14, x13 umulh x4, x14, x13 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, xzr, xzr # A[1] * B[2] mul x3, x15, x12 umulh x4, x15, x12 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[2] * B[1] mul x3, x16, x11 umulh x4, x16, x11 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[3] * B[0] mul x3, x17, x10 umulh x4, x17, x10 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[1] * B[3] mul x3, x15, x13 umulh x4, x15, x13 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, xzr, xzr # A[2] * B[2] mul x3, x16, x12 umulh x4, x16, x12 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[3] * B[1] mul x3, x17, x11 umulh x4, x17, x11 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[3] @@ -1725,13 +1728,13 @@ L_curve25519_bits: # Move top half into t4-t7 and remove top bit from t3 extr x28, x28, x27, #63 extr x27, x27, x26, #63 - extr x26, x26, x23, #63 - extr x23, x23, x9, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 and x9, x9, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x23 - umulh x23, x3, x23 + mul x4, x3, x2 + umulh x2, x3, x2 adds x6, x6, x4 mul x4, x3, x26 umulh x26, x3, x26 @@ -1744,7 +1747,7 @@ L_curve25519_bits: adcs x9, x9, x4 adc x5, x5, xzr # Add remaining product results in - adds x7, x7, x23 + adds x7, x7, x2 adcs x8, x8, x26 adcs x9, x9, x27 adc x5, x5, xzr @@ -1769,35 +1772,35 @@ L_curve25519_bits: # Add ldp x6, x7, [x29, #112] ldp x8, x9, [x29, #128] - adds x10, x6, x18 - adcs x11, x7, x19 - adcs x12, x8, x20 - adc x13, x9, x21 + adds x10, x6, x19 + adcs x11, x7, x20 + adcs x12, x8, x21 + adc x13, x9, x22 mov x3, #-19 - asr x23, x13, #63 + asr x2, x13, #63 # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff # Sub modulus (if overflow) subs x10, x10, x3 - sbcs x11, x11, x23 - sbcs x12, x12, x23 + sbcs x11, x11, x2 + sbcs x12, x12, x2 sbc x13, x13, x4 # Sub - subs x18, x6, x18 - sbcs x19, x7, x19 - sbcs x20, x8, x20 - sbcs x21, x9, x21 + subs x19, x6, x19 + sbcs x20, x7, x20 + sbcs x21, x8, x21 + sbcs x22, x9, x22 mov x3, #-19 - csetm x23, cc + csetm x2, cc # Mask the modulus - and x3, x23, x3 - and x4, x23, #0x7fffffffffffffff + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff # Add modulus (if underflow) - adds x18, x18, x3 - adcs x19, x19, x23 - adcs x20, x20, x23 - adc x21, x21, x4 + adds x19, x19, x3 + adcs x20, x20, x2 + adcs x21, x21, x2 + adc x22, x22, x4 # Square # A[0] * A[1] mul x7, x10, x11 @@ -1809,19 +1812,19 @@ L_curve25519_bits: adc 
x9, x9, xzr # A[0] * A[3] mul x3, x10, x13 - umulh x23, x10, x13 + umulh x2, x10, x13 adds x9, x9, x3 - adc x23, x23, xzr + adc x2, x2, xzr # A[1] * A[2] mul x3, x11, x12 umulh x4, x11, x12 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, xzr, xzr # A[1] * A[3] mul x3, x11, x13 umulh x4, x11, x13 - adds x23, x23, x3 + adds x2, x2, x3 adc x26, x26, x4 # A[2] * A[3] mul x3, x12, x13 @@ -1832,7 +1835,7 @@ L_curve25519_bits: adds x7, x7, x7 adcs x8, x8, x8 adcs x9, x9, x9 - adcs x23, x23, x23 + adcs x2, x2, x2 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr @@ -1849,7 +1852,7 @@ L_curve25519_bits: mul x3, x12, x12 umulh x4, x12, x12 adds x9, x9, x5 - adcs x23, x23, x3 + adcs x2, x2, x3 adc x5, x4, xzr # A[3] * A[3] mul x3, x13, x13 @@ -1861,13 +1864,13 @@ L_curve25519_bits: # Move top half into t4-t7 and remove top bit from t3 extr x28, x28, x27, #63 extr x27, x27, x26, #63 - extr x26, x26, x23, #63 - extr x23, x23, x9, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 and x9, x9, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x23 - umulh x23, x3, x23 + mul x4, x3, x2 + umulh x2, x3, x2 adds x6, x6, x4 mul x4, x3, x26 umulh x26, x3, x26 @@ -1880,7 +1883,7 @@ L_curve25519_bits: adcs x9, x9, x4 adc x5, x5, xzr # Add remaining product results in - adds x7, x7, x23 + adds x7, x7, x2 adcs x8, x8, x26 adcs x9, x9, x27 adc x5, x5, xzr @@ -1904,60 +1907,60 @@ L_curve25519_bits: stp x8, x9, [x29, #96] # Square # A[0] * A[1] - mul x7, x18, x19 - umulh x8, x18, x19 + mul x7, x19, x20 + umulh x8, x19, x20 # A[0] * A[2] - mul x3, x18, x20 - umulh x9, x18, x20 + mul x3, x19, x21 + umulh x9, x19, x21 adds x8, x8, x3 adc x9, x9, xzr # A[0] * A[3] - mul x3, x18, x21 - umulh x23, x18, x21 + mul x3, x19, x22 + umulh x2, x19, x22 adds x9, x9, x3 - adc x23, x23, xzr + adc x2, x2, xzr # A[1] * A[2] - mul x3, x19, x20 - umulh x4, x19, x20 + mul x3, x20, x21 + umulh x4, x20, x21 adds x9, x9, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, xzr, xzr # A[1] * A[3] - mul x3, x19, x21 - umulh x4, x19, x21 - adds x23, x23, x3 + mul x3, x20, x22 + umulh x4, x20, x22 + adds x2, x2, x3 adc x26, x26, x4 # A[2] * A[3] - mul x3, x20, x21 - umulh x27, x20, x21 + mul x3, x21, x22 + umulh x27, x21, x22 adds x26, x26, x3 adc x27, x27, xzr # Double adds x7, x7, x7 adcs x8, x8, x8 adcs x9, x9, x9 - adcs x23, x23, x23 + adcs x2, x2, x2 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr # A[0] * A[0] - mul x6, x18, x18 - umulh x5, x18, x18 + mul x6, x19, x19 + umulh x5, x19, x19 # A[1] * A[1] - mul x3, x19, x19 - umulh x4, x19, x19 + mul x3, x20, x20 + umulh x4, x20, x20 adds x7, x7, x5 adcs x8, x8, x3 adc x5, x4, xzr # A[2] * A[2] - mul x3, x20, x20 - umulh x4, x20, x20 - adds x9, x9, x5 - adcs x23, x23, x3 - adc x5, x4, xzr - # A[3] * A[3] mul x3, x21, x21 umulh x4, x21, x21 + adds x9, x9, x5 + adcs x2, x2, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x22, x22 + umulh x4, x22, x22 adds x26, x26, x5 adcs x27, x27, x3 adc x28, x28, x4 @@ -1965,13 +1968,13 @@ L_curve25519_bits: # Move top half into t4-t7 and remove top bit from t3 extr x28, x28, x27, #63 extr x27, x27, x26, #63 - extr x26, x26, x23, #63 - extr x23, x23, x9, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 and x9, x9, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x23 - umulh x23, x3, x23 + mul x4, x3, x2 + umulh x2, x3, x2 adds x6, x6, x4 mul x4, x3, x26 umulh x26, x3, x26 @@ -1984,7 +1987,7 @@ L_curve25519_bits: adcs x9, x9, x4 adc x5, x5, xzr # Add remaining product results in - adds 
x7, x7, x23 + adds x7, x7, x2 adcs x8, x8, x26 adcs x9, x9, x27 adc x5, x5, xzr @@ -2004,6 +2007,7 @@ L_curve25519_bits: adcs x8, x8, xzr adc x9, x9, xzr # Store + ldr x2, [x29, #184] # Multiply ldp x14, x15, [x2] ldp x16, x17, [x2, #16] @@ -2031,53 +2035,53 @@ L_curve25519_bits: umulh x4, x15, x7 adds x12, x12, x3 adcs x13, x13, x4 - adc x23, xzr, xzr + adc x2, xzr, xzr # A[2] * B[0] mul x3, x16, x6 umulh x4, x16, x6 adds x12, x12, x3 adcs x13, x13, x4 - adc x23, x23, xzr + adc x2, x2, xzr # A[0] * B[3] mul x3, x14, x9 umulh x4, x14, x9 adds x13, x13, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, xzr, xzr # A[1] * B[2] mul x3, x15, x8 umulh x4, x15, x8 adds x13, x13, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[2] * B[1] mul x3, x16, x7 umulh x4, x16, x7 adds x13, x13, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[3] * B[0] mul x3, x17, x6 umulh x4, x17, x6 adds x13, x13, x3 - adcs x23, x23, x4 + adcs x2, x2, x4 adc x26, x26, xzr # A[1] * B[3] mul x3, x15, x9 umulh x4, x15, x9 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, xzr, xzr # A[2] * B[2] mul x3, x16, x8 umulh x4, x16, x8 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[3] * B[1] mul x3, x17, x7 umulh x4, x17, x7 - adds x23, x23, x3 + adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[3] @@ -2101,13 +2105,13 @@ L_curve25519_bits: # Move top half into t4-t7 and remove top bit from t3 extr x28, x28, x27, #63 extr x27, x27, x26, #63 - extr x26, x26, x23, #63 - extr x23, x23, x13, #63 + extr x26, x26, x2, #63 + extr x2, x2, x13, #63 and x13, x13, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x23 - umulh x23, x3, x23 + mul x4, x3, x2 + umulh x2, x3, x2 adds x10, x10, x4 mul x4, x3, x26 umulh x26, x3, x26 @@ -2120,7 +2124,7 @@ L_curve25519_bits: adcs x13, x13, x4 adc x5, x5, xzr # Add remaining product results in - adds x11, x11, x23 + adds x11, x11, x2 adcs x12, x12, x26 adcs x13, x13, x27 adc x5, x5, xzr @@ -2291,98 +2295,98 @@ L_curve25519_inv_8: umulh x4, x7, x11 adds x16, x16, x3 adcs x17, x17, x4 - adc x18, xzr, xzr + adc x19, xzr, xzr # A[2] * B[0] mul x3, x8, x10 umulh x4, x8, x10 adds x16, x16, x3 adcs x17, x17, x4 - adc x18, x18, xzr + adc x19, x19, xzr # A[0] * B[3] mul x3, x6, x13 umulh x4, x6, x13 adds x17, x17, x3 - adcs x18, x18, x4 - adc x19, xzr, xzr + adcs x19, x19, x4 + adc x20, xzr, xzr # A[1] * B[2] mul x3, x7, x12 umulh x4, x7, x12 adds x17, x17, x3 - adcs x18, x18, x4 - adc x19, x19, xzr + adcs x19, x19, x4 + adc x20, x20, xzr # A[2] * B[1] mul x3, x8, x11 umulh x4, x8, x11 adds x17, x17, x3 - adcs x18, x18, x4 - adc x19, x19, xzr + adcs x19, x19, x4 + adc x20, x20, xzr # A[3] * B[0] mul x3, x9, x10 umulh x4, x9, x10 adds x17, x17, x3 - adcs x18, x18, x4 - adc x19, x19, xzr + adcs x19, x19, x4 + adc x20, x20, xzr # A[1] * B[3] mul x3, x7, x13 umulh x4, x7, x13 - adds x18, x18, x3 - adcs x19, x19, x4 - adc x20, xzr, xzr - # A[2] * B[2] - mul x3, x8, x12 - umulh x4, x8, x12 - adds x18, x18, x3 - adcs x19, x19, x4 - adc x20, x20, xzr - # A[3] * B[1] - mul x3, x9, x11 - umulh x4, x9, x11 - adds x18, x18, x3 - adcs x19, x19, x4 - adc x20, x20, xzr - # A[2] * B[3] - mul x3, x8, x13 - umulh x4, x8, x13 adds x19, x19, x3 adcs x20, x20, x4 adc x21, xzr, xzr - # A[3] * B[2] - mul x3, x9, x12 - umulh x4, x9, x12 + # A[2] * B[2] + mul x3, x8, x12 + umulh x4, x8, x12 adds x19, x19, x3 adcs x20, x20, x4 adc x21, x21, xzr + # A[3] * B[1] + mul x3, x9, x11 + umulh x4, x9, x11 + adds x19, x19, x3 + adcs x20, 
x20, x4 + adc x21, x21, xzr + # A[2] * B[3] + mul x3, x8, x13 + umulh x4, x8, x13 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr + # A[3] * B[2] + mul x3, x9, x12 + umulh x4, x9, x12 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, x22, xzr # A[3] * B[3] mul x3, x9, x13 umulh x4, x9, x13 - adds x20, x20, x3 - adc x21, x21, x4 + adds x21, x21, x3 + adc x22, x22, x4 # Reduce # Move top half into t4-t7 and remove top bit from t3 + extr x22, x22, x21, #63 extr x21, x21, x20, #63 extr x20, x20, x19, #63 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x19, x19, x17, #63 and x17, x17, #0x7fffffffffffffff # Multiply top half by 19 mov x3, #19 - mul x4, x3, x18 - umulh x18, x3, x18 - adds x14, x14, x4 mul x4, x3, x19 umulh x19, x3, x19 - adcs x15, x15, x4 + adds x14, x14, x4 mul x4, x3, x20 umulh x20, x3, x20 - adcs x16, x16, x4 + adcs x15, x15, x4 mul x4, x3, x21 - umulh x5, x3, x21 + umulh x21, x3, x21 + adcs x16, x16, x4 + mul x4, x3, x22 + umulh x5, x3, x22 adcs x17, x17, x4 adc x5, x5, xzr # Add remaining product results in - adds x15, x15, x18 - adcs x16, x16, x19 - adcs x17, x17, x20 + adds x15, x15, x19 + adcs x16, x16, x20 + adcs x17, x17, x21 adc x5, x5, xzr # Overflow extr x5, x5, x17, #63 @@ -2403,8 +2407,8 @@ L_curve25519_inv_8: stp x14, x15, [x0] stp x16, x17, [x0, #16] mov x0, xzr - ldr x17, [x29, #192] - ldp x18, x19, [x29, #200] + ldr x17, [x29, #200] + ldr x19, [x29, #208] ldp x20, x21, [x29, #216] ldp x22, x23, [x29, #232] ldp x24, x25, [x29, #248] @@ -2545,8 +2549,9 @@ fe_ge_to_p2: stp x29, x30, [sp, #-112]! add x29, sp, #0 str x17, [x29, #72] - stp x18, x19, [x29, #80] - stp x20, x21, [x29, #96] + str x19, [x29, #80] + stp x20, x21, [x29, #88] + str x22, [x29, #104] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -2559,97 +2564,97 @@ fe_ge_to_p2: ldp x11, x12, [x1] ldp x13, x14, [x1, #16] ldp x15, x16, [x2] - ldp x17, x18, [x2, #16] + ldp x17, x19, [x2, #16] # A[0] * B[0] mul x3, x11, x15 umulh x4, x11, x15 # A[0] * B[1] - mul x19, x11, x16 + mul x20, x11, x16 umulh x5, x11, x16 - adds x4, x4, x19 + adds x4, x4, x20 adc x5, x5, xzr # A[1] * B[0] - mul x19, x12, x15 - umulh x20, x12, x15 - adds x4, x4, x19 - adcs x5, x5, x20 + mul x20, x12, x15 + umulh x21, x12, x15 + adds x4, x4, x20 + adcs x5, x5, x21 adc x6, xzr, xzr # A[0] * B[2] - mul x19, x11, x17 - umulh x20, x11, x17 - adds x5, x5, x19 - adc x6, x6, x20 + mul x20, x11, x17 + umulh x21, x11, x17 + adds x5, x5, x20 + adc x6, x6, x21 # A[1] * B[1] - mul x19, x12, x16 - umulh x20, x12, x16 - adds x5, x5, x19 - adcs x6, x6, x20 + mul x20, x12, x16 + umulh x21, x12, x16 + adds x5, x5, x20 + adcs x6, x6, x21 adc x7, xzr, xzr # A[2] * B[0] - mul x19, x13, x15 - umulh x20, x13, x15 - adds x5, x5, x19 - adcs x6, x6, x20 + mul x20, x13, x15 + umulh x21, x13, x15 + adds x5, x5, x20 + adcs x6, x6, x21 adc x7, x7, xzr # A[0] * B[3] - mul x19, x11, x18 - umulh x20, x11, x18 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x11, x19 + umulh x21, x11, x19 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, xzr, xzr # A[1] * B[2] - mul x19, x12, x17 - umulh x20, x12, x17 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x12, x17 + umulh x21, x12, x17 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[2] * B[1] - mul x19, x13, x16 - umulh x20, x13, x16 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x13, x16 + umulh x21, x13, x16 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[3] * B[0] - mul x19, x14, x15 - umulh x20, x14, x15 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x14, x15 + umulh 
x21, x14, x15 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[1] * B[3] - mul x19, x12, x18 - umulh x20, x12, x18 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x12, x19 + umulh x21, x12, x19 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, xzr, xzr # A[2] * B[2] - mul x19, x13, x17 - umulh x20, x13, x17 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x13, x17 + umulh x21, x13, x17 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, x9, xzr # A[3] * B[1] - mul x19, x14, x16 - umulh x20, x14, x16 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x14, x16 + umulh x21, x14, x16 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, x9, xzr # A[2] * B[3] - mul x19, x13, x18 - umulh x20, x13, x18 - adds x8, x8, x19 - adcs x9, x9, x20 + mul x20, x13, x19 + umulh x21, x13, x19 + adds x8, x8, x20 + adcs x9, x9, x21 adc x10, xzr, xzr # A[3] * B[2] - mul x19, x14, x17 - umulh x20, x14, x17 - adds x8, x8, x19 - adcs x9, x9, x20 + mul x20, x14, x17 + umulh x21, x14, x17 + adds x8, x8, x20 + adcs x9, x9, x21 adc x10, x10, xzr # A[3] * B[3] - mul x19, x14, x18 - umulh x20, x14, x18 - adds x9, x9, x19 - adc x10, x10, x20 + mul x20, x14, x19 + umulh x21, x14, x19 + adds x9, x9, x20 + adc x10, x10, x21 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -2658,37 +2663,37 @@ fe_ge_to_p2: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x19, #19 - mul x20, x19, x7 - umulh x7, x19, x7 - adds x3, x3, x20 - mul x20, x19, x8 - umulh x8, x19, x8 - adcs x4, x4, x20 - mul x20, x19, x9 - umulh x9, x19, x9 - adcs x5, x5, x20 - mul x20, x19, x10 - umulh x21, x19, x10 - adcs x6, x6, x20 - adc x21, x21, xzr + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x21, x21, xzr + adc x22, x22, xzr # Overflow - extr x21, x21, x6, #63 - mul x21, x21, x19 + extr x22, x22, x6, #63 + mul x22, x22, x20 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x21 + adds x3, x3, x22 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - and x21, x19, x6, asr 63 + and x22, x20, x6, asr 63 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x21 + adds x3, x3, x22 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -2702,97 +2707,97 @@ fe_ge_to_p2: ldp x11, x12, [x1] ldp x13, x14, [x1, #16] ldp x15, x16, [x2] - ldp x17, x18, [x2, #16] + ldp x17, x19, [x2, #16] # A[0] * B[0] mul x3, x11, x15 umulh x4, x11, x15 # A[0] * B[1] - mul x19, x11, x16 + mul x20, x11, x16 umulh x5, x11, x16 - adds x4, x4, x19 + adds x4, x4, x20 adc x5, x5, xzr # A[1] * B[0] - mul x19, x12, x15 - umulh x20, x12, x15 - adds x4, x4, x19 - adcs x5, x5, x20 + mul x20, x12, x15 + umulh x21, x12, x15 + adds x4, x4, x20 + adcs x5, x5, x21 adc x6, xzr, xzr # A[0] * B[2] - mul x19, x11, x17 - umulh x20, x11, x17 - adds x5, x5, x19 - adc x6, x6, x20 + mul x20, x11, x17 + umulh x21, x11, x17 + adds x5, x5, x20 + adc x6, x6, x21 # A[1] * B[1] - mul x19, x12, x16 - umulh x20, x12, x16 - adds x5, x5, x19 - adcs x6, x6, x20 + mul x20, x12, x16 + umulh x21, x12, x16 + adds x5, x5, x20 + adcs x6, x6, x21 adc x7, xzr, xzr # A[2] * B[0] - mul x19, x13, x15 - umulh x20, x13, x15 - adds x5, x5, x19 - adcs x6, x6, x20 + mul x20, x13, x15 + umulh x21, x13, x15 + adds x5, x5, x20 + adcs x6, x6, x21 adc x7, x7, xzr 
# A[0] * B[3] - mul x19, x11, x18 - umulh x20, x11, x18 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x11, x19 + umulh x21, x11, x19 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, xzr, xzr # A[1] * B[2] - mul x19, x12, x17 - umulh x20, x12, x17 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x12, x17 + umulh x21, x12, x17 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[2] * B[1] - mul x19, x13, x16 - umulh x20, x13, x16 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x13, x16 + umulh x21, x13, x16 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[3] * B[0] - mul x19, x14, x15 - umulh x20, x14, x15 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x14, x15 + umulh x21, x14, x15 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[1] * B[3] - mul x19, x12, x18 - umulh x20, x12, x18 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x12, x19 + umulh x21, x12, x19 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, xzr, xzr # A[2] * B[2] - mul x19, x13, x17 - umulh x20, x13, x17 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x13, x17 + umulh x21, x13, x17 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, x9, xzr # A[3] * B[1] - mul x19, x14, x16 - umulh x20, x14, x16 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x14, x16 + umulh x21, x14, x16 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, x9, xzr # A[2] * B[3] - mul x19, x13, x18 - umulh x20, x13, x18 - adds x8, x8, x19 - adcs x9, x9, x20 + mul x20, x13, x19 + umulh x21, x13, x19 + adds x8, x8, x20 + adcs x9, x9, x21 adc x10, xzr, xzr # A[3] * B[2] - mul x19, x14, x17 - umulh x20, x14, x17 - adds x8, x8, x19 - adcs x9, x9, x20 + mul x20, x14, x17 + umulh x21, x14, x17 + adds x8, x8, x20 + adcs x9, x9, x21 adc x10, x10, xzr # A[3] * B[3] - mul x19, x14, x18 - umulh x20, x14, x18 - adds x9, x9, x19 - adc x10, x10, x20 + mul x20, x14, x19 + umulh x21, x14, x19 + adds x9, x9, x20 + adc x10, x10, x21 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -2801,37 +2806,37 @@ fe_ge_to_p2: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x19, #19 - mul x20, x19, x7 - umulh x7, x19, x7 - adds x3, x3, x20 - mul x20, x19, x8 - umulh x8, x19, x8 - adcs x4, x4, x20 - mul x20, x19, x9 - umulh x9, x19, x9 - adcs x5, x5, x20 - mul x20, x19, x10 - umulh x21, x19, x10 - adcs x6, x6, x20 - adc x21, x21, xzr + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x21, x21, xzr + adc x22, x22, xzr # Overflow - extr x21, x21, x6, #63 - mul x21, x21, x19 + extr x22, x22, x6, #63 + mul x22, x22, x20 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x21 + adds x3, x3, x22 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - and x21, x19, x6, asr 63 + and x22, x20, x6, asr 63 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x21 + adds x3, x3, x22 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -2847,92 +2852,92 @@ fe_ge_to_p2: mul x3, x15, x11 umulh x4, x15, x11 # A[0] * B[1] - mul x19, x15, x12 + mul x20, x15, x12 umulh x5, x15, x12 - adds x4, x4, x19 + adds x4, x4, x20 adc x5, x5, xzr # A[1] * B[0] - mul x19, x16, x11 - umulh x20, x16, x11 - adds x4, x4, x19 - adcs x5, x5, x20 + mul x20, x16, x11 + umulh x21, x16, x11 + adds x4, x4, x20 + adcs x5, x5, x21 adc 
x6, xzr, xzr # A[0] * B[2] - mul x19, x15, x13 - umulh x20, x15, x13 - adds x5, x5, x19 - adc x6, x6, x20 + mul x20, x15, x13 + umulh x21, x15, x13 + adds x5, x5, x20 + adc x6, x6, x21 # A[1] * B[1] - mul x19, x16, x12 - umulh x20, x16, x12 - adds x5, x5, x19 - adcs x6, x6, x20 + mul x20, x16, x12 + umulh x21, x16, x12 + adds x5, x5, x20 + adcs x6, x6, x21 adc x7, xzr, xzr # A[2] * B[0] - mul x19, x17, x11 - umulh x20, x17, x11 - adds x5, x5, x19 - adcs x6, x6, x20 + mul x20, x17, x11 + umulh x21, x17, x11 + adds x5, x5, x20 + adcs x6, x6, x21 adc x7, x7, xzr # A[0] * B[3] - mul x19, x15, x14 - umulh x20, x15, x14 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x15, x14 + umulh x21, x15, x14 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, xzr, xzr # A[1] * B[2] - mul x19, x16, x13 - umulh x20, x16, x13 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x16, x13 + umulh x21, x16, x13 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[2] * B[1] - mul x19, x17, x12 - umulh x20, x17, x12 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x17, x12 + umulh x21, x17, x12 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[3] * B[0] - mul x19, x18, x11 - umulh x20, x18, x11 - adds x6, x6, x19 - adcs x7, x7, x20 + mul x20, x19, x11 + umulh x21, x19, x11 + adds x6, x6, x20 + adcs x7, x7, x21 adc x8, x8, xzr # A[1] * B[3] - mul x19, x16, x14 - umulh x20, x16, x14 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x16, x14 + umulh x21, x16, x14 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, xzr, xzr # A[2] * B[2] - mul x19, x17, x13 - umulh x20, x17, x13 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x17, x13 + umulh x21, x17, x13 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, x9, xzr # A[3] * B[1] - mul x19, x18, x12 - umulh x20, x18, x12 - adds x7, x7, x19 - adcs x8, x8, x20 + mul x20, x19, x12 + umulh x21, x19, x12 + adds x7, x7, x20 + adcs x8, x8, x21 adc x9, x9, xzr # A[2] * B[3] - mul x19, x17, x14 - umulh x20, x17, x14 - adds x8, x8, x19 - adcs x9, x9, x20 + mul x20, x17, x14 + umulh x21, x17, x14 + adds x8, x8, x20 + adcs x9, x9, x21 adc x10, xzr, xzr # A[3] * B[2] - mul x19, x18, x13 - umulh x20, x18, x13 - adds x8, x8, x19 - adcs x9, x9, x20 + mul x20, x19, x13 + umulh x21, x19, x13 + adds x8, x8, x20 + adcs x9, x9, x21 adc x10, x10, xzr # A[3] * B[3] - mul x19, x18, x14 - umulh x20, x18, x14 - adds x9, x9, x19 - adc x10, x10, x20 + mul x20, x19, x14 + umulh x21, x19, x14 + adds x9, x9, x20 + adc x10, x10, x21 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -2941,37 +2946,37 @@ fe_ge_to_p2: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x19, #19 - mul x20, x19, x7 - umulh x7, x19, x7 - adds x3, x3, x20 - mul x20, x19, x8 - umulh x8, x19, x8 - adcs x4, x4, x20 - mul x20, x19, x9 - umulh x9, x19, x9 - adcs x5, x5, x20 - mul x20, x19, x10 - umulh x21, x19, x10 - adcs x6, x6, x20 - adc x21, x21, xzr + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x21, x21, xzr + adc x22, x22, xzr # Overflow - extr x21, x21, x6, #63 - mul x21, x21, x19 + extr x22, x22, x6, #63 + mul x22, x22, x20 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x21 + adds x3, x3, x22 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce 
if top bit set - and x21, x19, x6, asr 63 + and x22, x20, x6, asr 63 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x21 + adds x3, x3, x22 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -2979,8 +2984,9 @@ fe_ge_to_p2: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x17, [x29, #72] - ldp x18, x19, [x29, #80] - ldp x20, x21, [x29, #96] + ldr x19, [x29, #80] + ldp x20, x21, [x29, #88] + ldr x22, [x29, #104] ldp x29, x30, [sp], #0x70 ret .size fe_ge_to_p2,.-fe_ge_to_p2 @@ -2992,10 +2998,11 @@ fe_ge_to_p3: stp x29, x30, [sp, #-160]! add x29, sp, #0 str x17, [x29, #88] - stp x18, x19, [x29, #96] - stp x20, x21, [x29, #112] - stp x22, x23, [x29, #128] - stp x24, x25, [x29, #144] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + str x26, [x29, #152] str x1, [x29, #16] str x2, [x29, #24] str x3, [x29, #32] @@ -3009,97 +3016,97 @@ fe_ge_to_p3: ldp x11, x12, [x1] ldp x13, x14, [x1, #16] ldp x15, x16, [x2] - ldp x17, x18, [x2, #16] + ldp x17, x19, [x2, #16] # A[0] * B[0] mul x3, x11, x15 umulh x4, x11, x15 # A[0] * B[1] - mul x23, x11, x16 + mul x24, x11, x16 umulh x5, x11, x16 - adds x4, x4, x23 + adds x4, x4, x24 adc x5, x5, xzr # A[1] * B[0] - mul x23, x12, x15 - umulh x24, x12, x15 - adds x4, x4, x23 - adcs x5, x5, x24 + mul x24, x12, x15 + umulh x25, x12, x15 + adds x4, x4, x24 + adcs x5, x5, x25 adc x6, xzr, xzr # A[0] * B[2] - mul x23, x11, x17 - umulh x24, x11, x17 - adds x5, x5, x23 - adc x6, x6, x24 + mul x24, x11, x17 + umulh x25, x11, x17 + adds x5, x5, x24 + adc x6, x6, x25 # A[1] * B[1] - mul x23, x12, x16 - umulh x24, x12, x16 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x12, x16 + umulh x25, x12, x16 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[2] * B[0] - mul x23, x13, x15 - umulh x24, x13, x15 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x13, x15 + umulh x25, x13, x15 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, x7, xzr # A[0] * B[3] - mul x23, x11, x18 - umulh x24, x11, x18 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x11, x19 + umulh x25, x11, x19 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[1] * B[2] - mul x23, x12, x17 - umulh x24, x12, x17 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x12, x17 + umulh x25, x12, x17 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[2] * B[1] - mul x23, x13, x16 - umulh x24, x13, x16 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x13, x16 + umulh x25, x13, x16 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[3] * B[0] - mul x23, x14, x15 - umulh x24, x14, x15 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x14, x15 + umulh x25, x14, x15 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[1] * B[3] - mul x23, x12, x18 - umulh x24, x12, x18 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[2] * B[2] - mul x23, x13, x17 - umulh x24, x13, x17 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x13, x17 + umulh x25, x13, x17 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[1] - mul x23, x14, x16 - umulh x24, x14, x16 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x14, x16 + umulh x25, x14, x16 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[3] - mul x23, x13, x18 - umulh x24, x13, x18 - adds x8, x8, x23 - adcs x9, x9, x24 + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[3] * B[2] - mul x23, x14, x17 - umulh x24, x14, x17 - adds x8, x8, x23 - adcs x9, x9, 
x24 + mul x24, x14, x17 + umulh x25, x14, x17 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[3] - mul x23, x14, x18 - umulh x24, x14, x18 - adds x9, x9, x23 - adc x10, x10, x24 + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adc x10, x10, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3108,37 +3115,37 @@ fe_ge_to_p3: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x23, #19 - mul x24, x23, x7 - umulh x7, x23, x7 - adds x3, x3, x24 - mul x24, x23, x8 - umulh x8, x23, x8 - adcs x4, x4, x24 - mul x24, x23, x9 - umulh x9, x23, x9 - adcs x5, x5, x24 - mul x24, x23, x10 - umulh x25, x23, x10 - adcs x6, x6, x24 - adc x25, x25, xzr + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x25, x25, xzr + adc x26, x26, xzr # Overflow - extr x25, x25, x6, #63 - mul x25, x25, x23 + extr x26, x26, x6, #63 + mul x26, x26, x24 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - and x25, x23, x6, asr 63 + and x26, x24, x6, asr 63 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -3148,98 +3155,98 @@ fe_ge_to_p3: ldr x0, [x29, #32] ldr x2, [x29, #48] # Multiply - ldp x19, x20, [x2] - ldp x21, x22, [x2, #16] + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] # A[0] * B[0] - mul x3, x11, x19 - umulh x4, x11, x19 + mul x3, x11, x20 + umulh x4, x11, x20 # A[0] * B[1] - mul x23, x11, x20 - umulh x5, x11, x20 - adds x4, x4, x23 + mul x24, x11, x21 + umulh x5, x11, x21 + adds x4, x4, x24 adc x5, x5, xzr # A[1] * B[0] - mul x23, x12, x19 - umulh x24, x12, x19 - adds x4, x4, x23 - adcs x5, x5, x24 + mul x24, x12, x20 + umulh x25, x12, x20 + adds x4, x4, x24 + adcs x5, x5, x25 adc x6, xzr, xzr # A[0] * B[2] - mul x23, x11, x21 - umulh x24, x11, x21 - adds x5, x5, x23 - adc x6, x6, x24 + mul x24, x11, x22 + umulh x25, x11, x22 + adds x5, x5, x24 + adc x6, x6, x25 # A[1] * B[1] - mul x23, x12, x20 - umulh x24, x12, x20 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x12, x21 + umulh x25, x12, x21 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[2] * B[0] - mul x23, x13, x19 - umulh x24, x13, x19 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x13, x20 + umulh x25, x13, x20 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, x7, xzr # A[0] * B[3] - mul x23, x11, x22 - umulh x24, x11, x22 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x11, x23 + umulh x25, x11, x23 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[1] * B[2] - mul x23, x12, x21 - umulh x24, x12, x21 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x12, x22 + umulh x25, x12, x22 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[2] * B[1] - mul x23, x13, x20 - umulh x24, x13, x20 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x13, x21 + umulh x25, x13, x21 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[3] * B[0] - mul x23, x14, x19 - umulh x24, x14, x19 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x14, x20 + umulh x25, x14, x20 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[1] * B[3] - mul x23, x12, x22 - umulh x24, x12, x22 - adds 
x7, x7, x23 - adcs x8, x8, x24 + mul x24, x12, x23 + umulh x25, x12, x23 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[2] * B[2] - mul x23, x13, x21 - umulh x24, x13, x21 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x13, x22 + umulh x25, x13, x22 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[1] - mul x23, x14, x20 - umulh x24, x14, x20 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x14, x21 + umulh x25, x14, x21 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[3] - mul x23, x13, x22 - umulh x24, x13, x22 - adds x8, x8, x23 - adcs x9, x9, x24 + mul x24, x13, x23 + umulh x25, x13, x23 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[3] * B[2] - mul x23, x14, x21 - umulh x24, x14, x21 - adds x8, x8, x23 - adcs x9, x9, x24 + mul x24, x14, x22 + umulh x25, x14, x22 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[3] - mul x23, x14, x22 - umulh x24, x14, x22 - adds x9, x9, x23 - adc x10, x10, x24 + mul x24, x14, x23 + umulh x25, x14, x23 + adds x9, x9, x24 + adc x10, x10, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3248,37 +3255,37 @@ fe_ge_to_p3: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x23, #19 - mul x24, x23, x7 - umulh x7, x23, x7 - adds x3, x3, x24 - mul x24, x23, x8 - umulh x8, x23, x8 - adcs x4, x4, x24 - mul x24, x23, x9 - umulh x9, x23, x9 - adcs x5, x5, x24 - mul x24, x23, x10 - umulh x25, x23, x10 - adcs x6, x6, x24 - adc x25, x25, xzr + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x25, x25, xzr + adc x26, x26, xzr # Overflow - extr x25, x25, x6, #63 - mul x25, x25, x23 + extr x26, x26, x6, #63 + mul x26, x26, x24 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - and x25, x23, x6, asr 63 + and x26, x24, x6, asr 63 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -3291,95 +3298,95 @@ fe_ge_to_p3: ldp x11, x12, [x2] ldp x13, x14, [x2, #16] # A[0] * B[0] - mul x3, x19, x11 - umulh x4, x19, x11 + mul x3, x20, x11 + umulh x4, x20, x11 # A[0] * B[1] - mul x23, x19, x12 - umulh x5, x19, x12 - adds x4, x4, x23 + mul x24, x20, x12 + umulh x5, x20, x12 + adds x4, x4, x24 adc x5, x5, xzr # A[1] * B[0] - mul x23, x20, x11 - umulh x24, x20, x11 - adds x4, x4, x23 - adcs x5, x5, x24 + mul x24, x21, x11 + umulh x25, x21, x11 + adds x4, x4, x24 + adcs x5, x5, x25 adc x6, xzr, xzr # A[0] * B[2] - mul x23, x19, x13 - umulh x24, x19, x13 - adds x5, x5, x23 - adc x6, x6, x24 + mul x24, x20, x13 + umulh x25, x20, x13 + adds x5, x5, x24 + adc x6, x6, x25 # A[1] * B[1] - mul x23, x20, x12 - umulh x24, x20, x12 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x21, x12 + umulh x25, x21, x12 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[2] * B[0] - mul x23, x21, x11 - umulh x24, x21, x11 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x22, x11 + umulh x25, x22, x11 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, x7, xzr # A[0] * B[3] - mul x23, x19, x14 - umulh x24, x19, x14 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x20, x14 + umulh x25, x20, 
x14 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[1] * B[2] - mul x23, x20, x13 - umulh x24, x20, x13 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x21, x13 + umulh x25, x21, x13 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[2] * B[1] - mul x23, x21, x12 - umulh x24, x21, x12 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x22, x12 + umulh x25, x22, x12 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[3] * B[0] - mul x23, x22, x11 - umulh x24, x22, x11 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x23, x11 + umulh x25, x23, x11 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[1] * B[3] - mul x23, x20, x14 - umulh x24, x20, x14 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x21, x14 + umulh x25, x21, x14 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[2] * B[2] - mul x23, x21, x13 - umulh x24, x21, x13 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x22, x13 + umulh x25, x22, x13 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[1] - mul x23, x22, x12 - umulh x24, x22, x12 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x23, x12 + umulh x25, x23, x12 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[3] - mul x23, x21, x14 - umulh x24, x21, x14 - adds x8, x8, x23 - adcs x9, x9, x24 + mul x24, x22, x14 + umulh x25, x22, x14 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[3] * B[2] - mul x23, x22, x13 - umulh x24, x22, x13 - adds x8, x8, x23 - adcs x9, x9, x24 + mul x24, x23, x13 + umulh x25, x23, x13 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[3] - mul x23, x22, x14 - umulh x24, x22, x14 - adds x9, x9, x23 - adc x10, x10, x24 + mul x24, x23, x14 + umulh x25, x23, x14 + adds x9, x9, x24 + adc x10, x10, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3388,37 +3395,37 @@ fe_ge_to_p3: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x23, #19 - mul x24, x23, x7 - umulh x7, x23, x7 - adds x3, x3, x24 - mul x24, x23, x8 - umulh x8, x23, x8 - adcs x4, x4, x24 - mul x24, x23, x9 - umulh x9, x23, x9 - adcs x5, x5, x24 - mul x24, x23, x10 - umulh x25, x23, x10 - adcs x6, x6, x24 - adc x25, x25, xzr + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x25, x25, xzr + adc x26, x26, xzr # Overflow - extr x25, x25, x6, #63 - mul x25, x25, x23 + extr x26, x26, x6, #63 + mul x26, x26, x24 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - and x25, x23, x6, asr 63 + and x26, x24, x6, asr 63 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -3431,92 +3438,92 @@ fe_ge_to_p3: mul x3, x11, x15 umulh x4, x11, x15 # A[0] * B[1] - mul x23, x11, x16 + mul x24, x11, x16 umulh x5, x11, x16 - adds x4, x4, x23 + adds x4, x4, x24 adc x5, x5, xzr # A[1] * B[0] - mul x23, x12, x15 - umulh x24, x12, x15 - adds x4, x4, x23 - adcs x5, x5, x24 + mul x24, x12, x15 + umulh x25, x12, x15 + adds x4, x4, x24 + adcs x5, x5, x25 adc x6, xzr, xzr # A[0] * B[2] - mul x23, x11, x17 - umulh x24, x11, x17 - adds x5, x5, x23 - adc x6, x6, x24 + mul x24, x11, x17 + 
umulh x25, x11, x17 + adds x5, x5, x24 + adc x6, x6, x25 # A[1] * B[1] - mul x23, x12, x16 - umulh x24, x12, x16 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x12, x16 + umulh x25, x12, x16 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, xzr, xzr # A[2] * B[0] - mul x23, x13, x15 - umulh x24, x13, x15 - adds x5, x5, x23 - adcs x6, x6, x24 + mul x24, x13, x15 + umulh x25, x13, x15 + adds x5, x5, x24 + adcs x6, x6, x25 adc x7, x7, xzr # A[0] * B[3] - mul x23, x11, x18 - umulh x24, x11, x18 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x11, x19 + umulh x25, x11, x19 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, xzr, xzr # A[1] * B[2] - mul x23, x12, x17 - umulh x24, x12, x17 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x12, x17 + umulh x25, x12, x17 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[2] * B[1] - mul x23, x13, x16 - umulh x24, x13, x16 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x13, x16 + umulh x25, x13, x16 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[3] * B[0] - mul x23, x14, x15 - umulh x24, x14, x15 - adds x6, x6, x23 - adcs x7, x7, x24 + mul x24, x14, x15 + umulh x25, x14, x15 + adds x6, x6, x24 + adcs x7, x7, x25 adc x8, x8, xzr # A[1] * B[3] - mul x23, x12, x18 - umulh x24, x12, x18 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, xzr, xzr # A[2] * B[2] - mul x23, x13, x17 - umulh x24, x13, x17 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x13, x17 + umulh x25, x13, x17 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[3] * B[1] - mul x23, x14, x16 - umulh x24, x14, x16 - adds x7, x7, x23 - adcs x8, x8, x24 + mul x24, x14, x16 + umulh x25, x14, x16 + adds x7, x7, x24 + adcs x8, x8, x25 adc x9, x9, xzr # A[2] * B[3] - mul x23, x13, x18 - umulh x24, x13, x18 - adds x8, x8, x23 - adcs x9, x9, x24 + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, xzr, xzr # A[3] * B[2] - mul x23, x14, x17 - umulh x24, x14, x17 - adds x8, x8, x23 - adcs x9, x9, x24 + mul x24, x14, x17 + umulh x25, x14, x17 + adds x8, x8, x24 + adcs x9, x9, x25 adc x10, x10, xzr # A[3] * B[3] - mul x23, x14, x18 - umulh x24, x14, x18 - adds x9, x9, x23 - adc x10, x10, x24 + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adc x10, x10, x25 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x10, x10, x9, #63 @@ -3525,37 +3532,37 @@ fe_ge_to_p3: extr x7, x7, x6, #63 and x6, x6, #0x7fffffffffffffff # Multiply top half by 19 - mov x23, #19 - mul x24, x23, x7 - umulh x7, x23, x7 - adds x3, x3, x24 - mul x24, x23, x8 - umulh x8, x23, x8 - adcs x4, x4, x24 - mul x24, x23, x9 - umulh x9, x23, x9 - adcs x5, x5, x24 - mul x24, x23, x10 - umulh x25, x23, x10 - adcs x6, x6, x24 - adc x25, x25, xzr + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr # Add remaining product results in adds x4, x4, x7 adcs x5, x5, x8 adcs x6, x6, x9 - adc x25, x25, xzr + adc x26, x26, xzr # Overflow - extr x25, x25, x6, #63 - mul x25, x25, x23 + extr x26, x26, x6, #63 + mul x26, x26, x24 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # Reduce if top bit set - and x25, x23, x6, asr 63 + and x26, x24, x6, asr 63 and x6, x6, #0x7fffffffffffffff - adds x3, x3, x25 + adds 
x3, x3, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr @@ -3563,10 +3570,11 @@ fe_ge_to_p3: stp x3, x4, [x0] stp x5, x6, [x0, #16] ldr x17, [x29, #88] - ldp x18, x19, [x29, #96] - ldp x20, x21, [x29, #112] - ldp x22, x23, [x29, #128] - ldp x24, x25, [x29, #144] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldr x26, [x29, #152] ldp x29, x30, [sp], #0xa0 ret .size fe_ge_to_p3,.-fe_ge_to_p3 @@ -3578,11 +3586,12 @@ fe_ge_dbl: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - stp x18, x19, [x29, #96] - stp x20, x21, [x29, #112] - stp x22, x23, [x29, #128] - stp x24, x25, [x29, #144] - stp x26, x27, [x29, #160] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -3598,421 +3607,30 @@ fe_ge_dbl: mul x5, x12, x13 umulh x6, x12, x13 # A[0] * A[2] - mul x24, x12, x14 + mul x25, x12, x14 umulh x7, x12, x14 - adds x6, x6, x24 + adds x6, x6, x25 adc x7, x7, xzr # A[0] * A[3] - mul x24, x12, x15 + mul x25, x12, x15 umulh x8, x12, x15 - adds x7, x7, x24 + adds x7, x7, x25 adc x8, x8, xzr # A[1] * A[2] - mul x24, x13, x14 - umulh x25, x13, x14 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x13, x14 + umulh x26, x13, x14 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, xzr, xzr # A[1] * A[3] - mul x24, x13, x15 - umulh x25, x13, x15 - adds x8, x8, x24 - adc x9, x9, x25 - # A[2] * A[3] - mul x24, x14, x15 - umulh x10, x14, x15 - adds x9, x9, x24 - adc x10, x10, xzr - # Double - adds x5, x5, x5 - adcs x6, x6, x6 - adcs x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adc x11, xzr, xzr - # A[0] * A[0] - mul x4, x12, x12 - umulh x26, x12, x12 - # A[1] * A[1] - mul x24, x13, x13 - umulh x25, x13, x13 - adds x5, x5, x26 - adcs x6, x6, x24 - adc x26, x25, xzr - # A[2] * A[2] - mul x24, x14, x14 - umulh x25, x14, x14 - adds x7, x7, x26 - adcs x8, x8, x24 - adc x26, x25, xzr - # A[3] * A[3] - mul x24, x15, x15 - umulh x25, x15, x15 - adds x9, x9, x26 - adcs x10, x10, x24 - adc x11, x11, x25 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x8 - umulh x8, x24, x8 - adds x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x10, x24, x10 - adcs x6, x6, x25 - mul x25, x24, x11 - umulh x26, x24, x11 - adcs x7, x7, x25 - adc x26, x26, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x26, x26, xzr - # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - and x26, x24, x7, asr 63 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ldr x0, [x29, #32] - ldr x1, [x29, #56] - # Square - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] - # A[0] * A[1] - mul x9, x20, x21 - umulh x10, x20, x21 - # A[0] * A[2] - mul x24, x20, x22 - umulh x11, x20, x22 - adds x10, x10, x24 - adc x11, x11, xzr - # A[0] * A[3] - mul x24, x20, x23 - umulh x16, x20, x23 - adds x11, x11, x24 - adc x16, x16, xzr - # A[1] * A[2] - mul x24, x21, x22 - 
umulh x25, x21, x22 - adds x11, x11, x24 - adcs x16, x16, x25 - adc x17, xzr, xzr - # A[1] * A[3] - mul x24, x21, x23 - umulh x25, x21, x23 - adds x16, x16, x24 - adc x17, x17, x25 - # A[2] * A[3] - mul x24, x22, x23 - umulh x18, x22, x23 - adds x17, x17, x24 - adc x18, x18, xzr - # Double - adds x9, x9, x9 - adcs x10, x10, x10 - adcs x11, x11, x11 - adcs x16, x16, x16 - adcs x17, x17, x17 - adcs x18, x18, x18 - adc x19, xzr, xzr - # A[0] * A[0] - mul x8, x20, x20 - umulh x26, x20, x20 - # A[1] * A[1] - mul x24, x21, x21 - umulh x25, x21, x21 - adds x9, x9, x26 - adcs x10, x10, x24 - adc x26, x25, xzr - # A[2] * A[2] - mul x24, x22, x22 - umulh x25, x22, x22 - adds x11, x11, x26 - adcs x16, x16, x24 - adc x26, x25, xzr - # A[3] * A[3] - mul x24, x23, x23 - umulh x25, x23, x23 - adds x17, x17, x26 - adcs x18, x18, x24 - adc x19, x19, x25 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x11, #63 - and x11, x11, #0x7fffffffffffffff - # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 + mul x25, x13, x15 + umulh x26, x13, x15 adds x8, x8, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x9, x9, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x10, x10, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x11, x11, x25 - adc x26, x26, xzr - # Add remaining product results in - adds x9, x9, x16 - adcs x10, x10, x17 - adcs x11, x11, x18 - adc x26, x26, xzr - # Overflow - extr x26, x26, x11, #63 - mul x26, x26, x24 - and x11, x11, #0x7fffffffffffffff - adds x8, x8, x26 - adcs x9, x9, xzr - adcs x10, x10, xzr - adc x11, x11, xzr - # Reduce if top bit set - and x26, x24, x11, asr 63 - and x11, x11, #0x7fffffffffffffff - adds x8, x8, x26 - adcs x9, x9, xzr - adcs x10, x10, xzr - adc x11, x11, xzr - # Store - stp x8, x9, [x0] - stp x10, x11, [x0, #16] - ldr x0, [x29, #24] - # Add - adds x12, x12, x20 - adcs x13, x13, x21 - adcs x14, x14, x22 - adc x15, x15, x23 - mov x24, #-19 - asr x27, x15, #63 - # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x12, x12, x24 - sbcs x13, x13, x27 - sbcs x14, x14, x27 - sbc x15, x15, x25 - ldr x0, [x29, #40] - # Square - # A[0] * A[1] - mul x17, x12, x13 - umulh x18, x12, x13 - # A[0] * A[2] - mul x24, x12, x14 - umulh x19, x12, x14 - adds x18, x18, x24 - adc x19, x19, xzr - # A[0] * A[3] - mul x24, x12, x15 - umulh x20, x12, x15 - adds x19, x19, x24 - adc x20, x20, xzr - # A[1] * A[2] - mul x24, x13, x14 - umulh x25, x13, x14 - adds x19, x19, x24 - adcs x20, x20, x25 - adc x21, xzr, xzr - # A[1] * A[3] - mul x24, x13, x15 - umulh x25, x13, x15 - adds x20, x20, x24 - adc x21, x21, x25 + adc x9, x9, x26 # A[2] * A[3] - mul x24, x14, x15 - umulh x22, x14, x15 - adds x21, x21, x24 - adc x22, x22, xzr - # Double - adds x17, x17, x17 - adcs x18, x18, x18 - adcs x19, x19, x19 - adcs x20, x20, x20 - adcs x21, x21, x21 - adcs x22, x22, x22 - adc x23, xzr, xzr - # A[0] * A[0] - mul x16, x12, x12 - umulh x26, x12, x12 - # A[1] * A[1] - mul x24, x13, x13 - umulh x25, x13, x13 - adds x17, x17, x26 - adcs x18, x18, x24 - adc x26, x25, xzr - # A[2] * A[2] - mul x24, x14, x14 - umulh x25, x14, x14 - adds x19, x19, x26 - adcs x20, x20, x24 - adc x26, x25, xzr - # A[3] * A[3] - mul x24, x15, x15 - umulh x25, x15, x15 - adds x21, x21, x26 - adcs x22, x22, x24 - adc x23, x23, x25 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x23, x23, x22, #63 - 
extr x22, x22, x21, #63 - extr x21, x21, x20, #63 - extr x20, x20, x19, #63 - and x19, x19, #0x7fffffffffffffff - # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x20 - umulh x20, x24, x20 - adds x16, x16, x25 - mul x25, x24, x21 - umulh x21, x24, x21 - adcs x17, x17, x25 - mul x25, x24, x22 - umulh x22, x24, x22 - adcs x18, x18, x25 - mul x25, x24, x23 - umulh x26, x24, x23 - adcs x19, x19, x25 - adc x26, x26, xzr - # Add remaining product results in - adds x17, x17, x20 - adcs x18, x18, x21 - adcs x19, x19, x22 - adc x26, x26, xzr - # Overflow - extr x26, x26, x19, #63 - mul x26, x26, x24 - and x19, x19, #0x7fffffffffffffff - adds x16, x16, x26 - adcs x17, x17, xzr - adcs x18, x18, xzr - adc x19, x19, xzr - # Reduce if top bit set - and x26, x24, x19, asr 63 - and x19, x19, #0x7fffffffffffffff - adds x16, x16, x26 - adcs x17, x17, xzr - adcs x18, x18, xzr - adc x19, x19, xzr - # Store - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - ldr x0, [x29, #24] - ldr x1, [x29, #32] - # Add - adds x12, x8, x4 - adcs x13, x9, x5 - adcs x14, x10, x6 - adc x15, x11, x7 - mov x24, #-19 - asr x27, x15, #63 - # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x12, x12, x24 - sbcs x13, x13, x27 - sbcs x14, x14, x27 - sbc x15, x15, x25 - # Sub - subs x20, x8, x4 - sbcs x21, x9, x5 - sbcs x22, x10, x6 - sbcs x23, x11, x7 - mov x24, #-19 - csetm x27, cc - # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x20, x20, x24 - adcs x21, x21, x27 - adcs x22, x22, x27 - adc x23, x23, x25 - stp x12, x13, [x0] - stp x14, x15, [x0, #16] - stp x20, x21, [x1] - stp x22, x23, [x1, #16] - ldr x0, [x29, #16] - # Sub - subs x16, x16, x12 - sbcs x17, x17, x13 - sbcs x18, x18, x14 - sbcs x19, x19, x15 - mov x24, #-19 - csetm x27, cc - # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 - stp x16, x17, [x0] - stp x18, x19, [x0, #16] - ldr x0, [x29, #40] - ldr x1, [x29, #64] - # Square * 2 - ldp x12, x13, [x1] - ldp x14, x15, [x1, #16] - # A[0] * A[1] - mul x5, x12, x13 - umulh x6, x12, x13 - # A[0] * A[2] - mul x24, x12, x14 - umulh x7, x12, x14 - adds x6, x6, x24 - adc x7, x7, xzr - # A[0] * A[3] - mul x24, x12, x15 - umulh x8, x12, x15 - adds x7, x7, x24 - adc x8, x8, xzr - # A[1] * A[2] - mul x24, x13, x14 - umulh x25, x13, x14 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, xzr, xzr - # A[1] * A[3] - mul x24, x13, x15 - umulh x25, x13, x15 - adds x8, x8, x24 - adc x9, x9, x25 - # A[2] * A[3] - mul x24, x14, x15 + mul x25, x14, x15 umulh x10, x14, x15 - adds x9, x9, x24 + adds x9, x9, x25 adc x10, x10, xzr # Double adds x5, x5, x5 @@ -4026,27 +3644,418 @@ fe_ge_dbl: mul x4, x12, x12 umulh x27, x12, x12 # A[1] * A[1] - mul x24, x13, x13 - umulh x25, x13, x13 + mul x25, x13, x13 + umulh x26, x13, x13 adds x5, x5, x27 - adcs x6, x6, x24 - adc x27, x25, xzr + adcs x6, x6, x25 + adc x27, x26, xzr # A[2] * A[2] - mul x24, x14, x14 - umulh x25, x14, x14 + mul x25, x14, x14 + umulh x26, x14, x14 adds x7, x7, x27 - adcs x8, x8, x24 - adc x27, x25, xzr + adcs x8, x8, x25 + adc x27, x26, xzr # A[3] * A[3] - mul x24, x15, x15 - umulh x25, x15, x15 + mul x25, x15, x15 + umulh x26, x15, x15 adds x9, x9, x27 - adcs x10, x10, x24 - adc x11, x11, x25 - # Double and Reduce - mov x24, #0x169 + adcs x10, x10, x25 + adc x11, x11, x26 + # Reduce # Move top half into t4-t7 and remove 
top bit from t3 - lsr x27, x11, #61 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #32] + ldr x1, [x29, #56] + # Square + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * A[1] + mul x9, x21, x22 + umulh x10, x21, x22 + # A[0] * A[2] + mul x25, x21, x23 + umulh x11, x21, x23 + adds x10, x10, x25 + adc x11, x11, xzr + # A[0] * A[3] + mul x25, x21, x24 + umulh x16, x21, x24 + adds x11, x11, x25 + adc x16, x16, xzr + # A[1] * A[2] + mul x25, x22, x23 + umulh x26, x22, x23 + adds x11, x11, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * A[3] + mul x25, x22, x24 + umulh x26, x22, x24 + adds x16, x16, x25 + adc x17, x17, x26 + # A[2] * A[3] + mul x25, x23, x24 + umulh x19, x23, x24 + adds x17, x17, x25 + adc x19, x19, xzr + # Double + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x19, x19, x19 + adc x20, xzr, xzr + # A[0] * A[0] + mul x8, x21, x21 + umulh x27, x21, x21 + # A[1] * A[1] + mul x25, x22, x22 + umulh x26, x22, x22 + adds x9, x9, x27 + adcs x10, x10, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x23, x23 + umulh x26, x23, x23 + adds x11, x11, x27 + adcs x16, x16, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x24, x24 + umulh x26, x24, x24 + adds x17, x17, x27 + adcs x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x8, x8, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x9, x9, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x10, x10, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x11, x11, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x9, x9, x16 + adcs x10, x10, x17 + adcs x11, x11, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + and x27, x25, x11, asr 63 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Store + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + ldr x0, [x29, #24] + # Add + adds x12, x12, x21 + adcs x13, x13, x22 + adcs x14, x14, x23 + adc x15, x15, x24 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, 
x28 + sbc x15, x15, x26 + ldr x0, [x29, #40] + # Square + # A[0] * A[1] + mul x17, x12, x13 + umulh x19, x12, x13 + # A[0] * A[2] + mul x25, x12, x14 + umulh x20, x12, x14 + adds x19, x19, x25 + adc x20, x20, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x21, x12, x15 + adds x20, x20, x25 + adc x21, x21, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x20, x20, x25 + adcs x21, x21, x26 + adc x22, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x21, x21, x25 + adc x22, x22, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x23, x14, x15 + adds x22, x22, x25 + adc x23, x23, xzr + # Double + adds x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x24, xzr, xzr + # A[0] * A[0] + mul x16, x12, x12 + umulh x27, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x17, x17, x27 + adcs x19, x19, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x20, x20, x27 + adcs x21, x21, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x22, x22, x27 + adcs x23, x23, x25 + adc x24, x24, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x24, x24, x23, #63 + extr x23, x23, x22, #63 + extr x22, x22, x21, #63 + extr x21, x21, x20, #63 + and x20, x20, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x21 + umulh x21, x25, x21 + adds x16, x16, x26 + mul x26, x25, x22 + umulh x22, x25, x22 + adcs x17, x17, x26 + mul x26, x25, x23 + umulh x23, x25, x23 + adcs x19, x19, x26 + mul x26, x25, x24 + umulh x27, x25, x24 + adcs x20, x20, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x17, x17, x21 + adcs x19, x19, x22 + adcs x20, x20, x23 + adc x27, x27, xzr + # Overflow + extr x27, x27, x20, #63 + mul x27, x27, x25 + and x20, x20, #0x7fffffffffffffff + adds x16, x16, x27 + adcs x17, x17, xzr + adcs x19, x19, xzr + adc x20, x20, xzr + # Reduce if top bit set + and x27, x25, x20, asr 63 + and x20, x20, #0x7fffffffffffffff + adds x16, x16, x27 + adcs x17, x17, xzr + adcs x19, x19, xzr + adc x20, x20, xzr + # Store + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #32] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x21, x8, x4 + sbcs x22, x9, x5 + sbcs x23, x10, x6 + sbcs x24, x11, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x21, x21, x25 + adcs x22, x22, x28 + adcs x23, x23, x28 + adc x24, x24, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x21, x22, [x1] + stp x23, x24, [x1, #16] + ldr x0, [x29, #16] + # Sub + subs x16, x16, x12 + sbcs x17, x17, x13 + sbcs x19, x19, x14 + sbcs x20, x20, x15 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #64] + # Square * 2 + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + # A[0] * A[1] + mul x5, x12, x13 + umulh x6, x12, x13 
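    # Squaring note (descriptive comment, not generated by x25519.rb):
    # only the cross products A[i] * A[j] with i < j are formed in this
    # sequence; they are doubled before the diagonal A[i] * A[i] terms
    # are added in, since each cross product occurs twice in the square.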
+ # A[0] * A[2] + mul x25, x12, x14 + umulh x7, x12, x14 + adds x6, x6, x25 + adc x7, x7, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x8, x12, x15 + adds x7, x7, x25 + adc x8, x8, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x8, x8, x25 + adc x9, x9, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x10, x14, x15 + adds x9, x9, x25 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x12, x12 + umulh x28, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x5, x5, x28 + adcs x6, x6, x25 + adc x28, x26, xzr + # A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x7, x7, x28 + adcs x8, x8, x25 + adc x28, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x9, x9, x28 + adcs x10, x10, x25 + adc x11, x11, x26 + # Double and Reduce + mov x25, #0x169 + # Move top half into t4-t7 and remove top bit from t3 + lsr x28, x11, #61 extr x11, x11, x10, #62 extr x10, x10, x9, #62 extr x9, x9, x8, #62 @@ -4059,68 +4068,69 @@ fe_ge_dbl: # Two left, only one right and x11, x11, #0x7fffffffffffffff # Multiply top bits by 19*19 - mul x27, x27, x24 + mul x28, x28, x25 # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x8 - umulh x8, x24, x8 - adds x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x10, x24, x10 - adcs x6, x6, x25 - mul x25, x24, x11 - umulh x26, x24, x11 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in - adds x4, x4, x27 + adds x4, x4, x28 adcs x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x26, x26, xzr + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Store ldr x0, [x29, #40] # Sub - subs x4, x4, x20 - sbcs x5, x5, x21 - sbcs x6, x6, x22 - sbcs x7, x7, x23 - mov x24, #-19 - csetm x27, cc + subs x4, x4, x21 + sbcs x5, x5, x22 + sbcs x6, x6, x23 + sbcs x7, x7, x24 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x4, x4, x24 - adcs x5, x5, x27 - adcs x6, x6, x27 - adc x7, x7, x25 + adds x4, x4, x25 + adcs x5, x5, x28 + adcs x6, x6, x28 + adc x7, x7, x26 stp x4, x5, [x0] stp x6, x7, [x0, #16] ldr x17, [x29, #88] - ldp x18, x19, [x29, #96] - ldp x20, x21, [x29, #112] - ldp x22, x23, [x29, #128] - ldp x24, x25, [x29, #144] - ldp x26, x27, [x29, #160] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] ldp x29, x30, [sp], #0xb0 ret .size fe_ge_dbl,.-fe_ge_dbl @@ -4132,11 +4142,12 @@ fe_ge_madd: 
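    # Descriptive comment (not generated by x25519.rb): as in fe_ge_dbl
    # above, fe_ge_madd now keeps its temporaries in x19..x28 and spills
    # x28 in the prologue; x18 is left untouched, presumably because the
    # AAPCS64 reserves it as the platform register on some targets. The
    # multiplies below are 4x4 64-bit schoolbook products, and each
    # "Reduce" block folds the 512-bit result using 2^255 == 19 (mod p),
    # p = 2^255 - 19, i.e. lo + 2^255*hi == lo + 19*hi (mod p).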
stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - stp x18, x19, [x29, #96] - stp x20, x21, [x29, #112] - stp x22, x23, [x29, #128] - stp x24, x25, [x29, #144] - stp x26, x27, [x29, #160] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -4151,170 +4162,170 @@ fe_ge_madd: ldp x12, x13, [x2] ldp x14, x15, [x2, #16] ldp x16, x17, [x3] - ldp x18, x19, [x3, #16] + ldp x19, x20, [x3, #16] adds x4, x12, x16 adcs x5, x13, x17 - adcs x6, x14, x18 - adc x7, x15, x19 - mov x24, #-19 - asr x27, x7, #63 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x24 - sbcs x5, x5, x27 - sbcs x6, x6, x27 - sbc x7, x7, x25 + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 # Sub subs x8, x12, x16 sbcs x9, x13, x17 - sbcs x10, x14, x18 - sbcs x11, x15, x19 - mov x24, #-19 - csetm x27, cc + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x8, x8, x24 - adcs x9, x9, x27 - adcs x10, x10, x27 - adc x11, x11, x25 + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 ldr x0, [x29, #32] ldr x2, [x29, #184] # Multiply - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] # A[0] * B[0] - mul x12, x4, x20 - umulh x13, x4, x20 + mul x12, x4, x21 + umulh x13, x4, x21 # A[0] * B[1] - mul x24, x4, x21 - umulh x14, x4, x21 - adds x13, x13, x24 + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 adc x14, x14, xzr # A[1] * B[0] - mul x24, x5, x20 - umulh x25, x5, x20 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, xzr, xzr # A[0] * B[2] - mul x24, x4, x22 - umulh x25, x4, x22 - adds x14, x14, x24 - adc x15, x15, x25 + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 # A[1] * B[1] - mul x24, x5, x21 - umulh x25, x5, x21 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x6, x20 - umulh x25, x6, x20 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x4, x23 - umulh x25, x4, x23 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x5, x22 - umulh x25, x5, x22 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x6, x21 - umulh x25, x6, x21 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x7, x20 - umulh x25, x7, x20 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x5, x23 - umulh x25, 
x5, x23 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x6, x22 - umulh x25, x6, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x7, x21 - umulh x25, x7, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x6, x23 - umulh x25, x6, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x7, x22 - umulh x25, x7, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x7, x23 - umulh x25, x7, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x15, #63 and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x12, x12, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x13, x13, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x14, x14, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x15, x15, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr # Add remaining product results in adds x13, x13, x16 adcs x14, x14, x17 - adcs x15, x15, x18 - adc x26, x26, xzr + adcs x15, x15, x19 + adc x27, x27, xzr # Overflow - extr x26, x26, x15, #63 - mul x26, x26, x24 + extr x27, x27, x15, #63 + mul x27, x27, x25 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - and x26, x24, x15, asr 63 + and x27, x25, x15, asr 63 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr @@ -4322,137 +4333,137 @@ fe_ge_madd: ldr x0, [x29, #24] ldr x1, [x29, #192] # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] # A[0] * B[0] - mul x4, x8, x20 - umulh x5, x8, x20 + mul x4, x8, x21 + umulh x5, x8, x21 # A[0] * B[1] - mul x24, x8, x21 - umulh x6, x8, x21 - adds x5, x5, x24 + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x9, x20 - umulh x25, x9, x20 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x8, x22 - umulh x25, x8, x22 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x9, x21 - umulh x25, x9, x21 - adds x6, x6, x24 - adcs 
x7, x7, x25 + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x10, x20 - umulh x25, x10, x20 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x8, x23 - umulh x25, x8, x23 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x9, x22 - umulh x25, x9, x22 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x10, x21 - umulh x25, x10, x21 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x11, x20 - umulh x25, x11, x20 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x9, x23 - umulh x25, x9, x23 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x10, x22 - umulh x25, x10, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x11, x21 - umulh x25, x11, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x10, x23 - umulh x25, x10, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x11, x22 - umulh x25, x11, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x11, x23 - umulh x25, x11, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x4, x4, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x5, x5, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x6, x6, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x16 adcs x6, x6, x17 - adcs x7, x7, x18 - adc x26, x26, xzr + adcs x7, x7, x19 + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr 
adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -4464,133 +4475,133 @@ fe_ge_madd: adcs x9, x13, x5 adcs x10, x14, x6 adc x11, x15, x7 - mov x24, #-19 - asr x27, x11, #63 + mov x25, #-19 + asr x28, x11, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x8, x8, x24 - sbcs x9, x9, x27 - sbcs x10, x10, x27 - sbc x11, x11, x25 + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 # Sub subs x16, x12, x4 sbcs x17, x13, x5 - sbcs x18, x14, x6 - sbcs x19, x15, x7 - mov x24, #-19 - csetm x27, cc + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x16, x17, [x1] - stp x18, x19, [x1, #16] + stp x19, x20, [x1, #16] ldr x0, [x29, #40] ldr x1, [x29, #176] ldr x3, [x29, #72] # Multiply ldp x16, x17, [x1] - ldp x18, x19, [x1, #16] - ldp x20, x21, [x3] - ldp x22, x23, [x3, #16] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x3] + ldp x23, x24, [x3, #16] # A[0] * B[0] - mul x4, x16, x20 - umulh x5, x16, x20 + mul x4, x16, x21 + umulh x5, x16, x21 # A[0] * B[1] - mul x24, x16, x21 - umulh x6, x16, x21 - adds x5, x5, x24 + mul x25, x16, x22 + umulh x6, x16, x22 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x17, x20 - umulh x25, x17, x20 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x17, x21 + umulh x26, x17, x21 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x16, x22 - umulh x25, x16, x22 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x16, x23 + umulh x26, x16, x23 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x17, x21 - umulh x25, x17, x21 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x17, x22 + umulh x26, x17, x22 + adds x6, x6, x25 + adcs x7, x7, x26 adc x8, xzr, xzr # A[2] * B[0] - mul x24, x18, x20 - umulh x25, x18, x20 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x19, x21 + umulh x26, x19, x21 + adds x6, x6, x25 + adcs x7, x7, x26 adc x8, x8, xzr # A[0] * B[3] - mul x24, x16, x23 - umulh x25, x16, x23 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x16, x24 + umulh x26, x16, x24 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, xzr, xzr # A[1] * B[2] - mul x24, x17, x22 - umulh x25, x17, x22 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x17, x23 + umulh x26, x17, x23 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[2] * B[1] - mul x24, x18, x21 - umulh x25, x18, x21 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x19, x22 + umulh x26, x19, x22 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[3] * B[0] - mul x24, x19, x20 - umulh x25, x19, x20 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x20, x21 + umulh x26, x20, x21 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[3] - mul x24, x17, x23 - umulh x25, x17, x23 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x17, x24 + umulh x26, x17, x24 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, xzr, xzr # A[2] * B[2] - mul x24, x18, x22 - umulh 
x25, x18, x22 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x19, x23 + umulh x26, x19, x23 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, x10, xzr # A[3] * B[1] - mul x24, x19, x21 - umulh x25, x19, x21 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x20, x22 + umulh x26, x20, x22 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[3] - mul x24, x18, x23 - umulh x25, x18, x23 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x19, x24 + umulh x26, x19, x24 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, xzr, xzr # A[3] * B[2] - mul x24, x19, x22 - umulh x25, x19, x22 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x20, x23 + umulh x26, x20, x23 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[3] - mul x24, x19, x23 - umulh x25, x19, x23 - adds x10, x10, x24 - adc x11, x11, x25 + mul x25, x20, x24 + umulh x26, x20, x24 + adds x10, x10, x25 + adc x11, x11, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -4599,37 +4610,37 @@ fe_ge_madd: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x8 - umulh x8, x24, x8 - adds x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x10, x24, x10 - adcs x6, x6, x25 - mul x25, x24, x11 - umulh x26, x24, x11 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x26, x26, xzr + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -4643,57 +4654,58 @@ fe_ge_madd: adcs x9, x9, x9 adcs x10, x10, x10 adc x11, x11, x11 - mov x24, #-19 - asr x27, x11, #63 + mov x25, #-19 + asr x28, x11, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x8, x8, x24 - sbcs x9, x9, x27 - sbcs x10, x10, x27 - sbc x11, x11, x25 + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 ldr x1, [x29, #40] # Add adds x12, x8, x4 adcs x13, x9, x5 adcs x14, x10, x6 adc x15, x11, x7 - mov x24, #-19 - asr x27, x15, #63 + mov x25, #-19 + asr x28, x15, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x12, x12, x24 - sbcs x13, x13, x27 - sbcs x14, x14, x27 - sbc x15, x15, x25 + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 # Sub subs x16, x8, x4 sbcs x17, x9, x5 - sbcs x18, x10, x6 - sbcs x19, x11, x7 - mov x24, #-19 - csetm x27, cc + sbcs x19, x10, x6 + sbcs x20, x11, x7 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, 
x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x12, x13, [x0] stp x14, x15, [x0, #16] stp x16, x17, [x1] - stp x18, x19, [x1, #16] + stp x19, x20, [x1, #16] ldr x17, [x29, #88] - ldp x18, x19, [x29, #96] - ldp x20, x21, [x29, #112] - ldp x22, x23, [x29, #128] - ldp x24, x25, [x29, #144] - ldp x26, x27, [x29, #160] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] ldp x29, x30, [sp], #0xb0 ret .size fe_ge_madd,.-fe_ge_madd @@ -4705,11 +4717,12 @@ fe_ge_msub: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - stp x18, x19, [x29, #96] - stp x20, x21, [x29, #112] - stp x22, x23, [x29, #128] - stp x24, x25, [x29, #144] - stp x26, x27, [x29, #160] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -4724,170 +4737,170 @@ fe_ge_msub: ldp x12, x13, [x2] ldp x14, x15, [x2, #16] ldp x16, x17, [x3] - ldp x18, x19, [x3, #16] + ldp x19, x20, [x3, #16] adds x4, x12, x16 adcs x5, x13, x17 - adcs x6, x14, x18 - adc x7, x15, x19 - mov x24, #-19 - asr x27, x7, #63 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x24 - sbcs x5, x5, x27 - sbcs x6, x6, x27 - sbc x7, x7, x25 + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 # Sub subs x8, x12, x16 sbcs x9, x13, x17 - sbcs x10, x14, x18 - sbcs x11, x15, x19 - mov x24, #-19 - csetm x27, cc + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x8, x8, x24 - adcs x9, x9, x27 - adcs x10, x10, x27 - adc x11, x11, x25 + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 ldr x0, [x29, #32] ldr x2, [x29, #192] # Multiply - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] # A[0] * B[0] - mul x12, x4, x20 - umulh x13, x4, x20 + mul x12, x4, x21 + umulh x13, x4, x21 # A[0] * B[1] - mul x24, x4, x21 - umulh x14, x4, x21 - adds x13, x13, x24 + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 adc x14, x14, xzr # A[1] * B[0] - mul x24, x5, x20 - umulh x25, x5, x20 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, xzr, xzr # A[0] * B[2] - mul x24, x4, x22 - umulh x25, x4, x22 - adds x14, x14, x24 - adc x15, x15, x25 + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 # A[1] * B[1] - mul x24, x5, x21 - umulh x25, x5, x21 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x6, x20 - umulh x25, x6, x20 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x4, x23 - umulh x25, x4, x23 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, 
x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x5, x22 - umulh x25, x5, x22 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x6, x21 - umulh x25, x6, x21 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x7, x20 - umulh x25, x7, x20 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x5, x23 - umulh x25, x5, x23 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x6, x22 - umulh x25, x6, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x7, x21 - umulh x25, x7, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x6, x23 - umulh x25, x6, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x7, x22 - umulh x25, x7, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x7, x23 - umulh x25, x7, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x15, #63 and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x12, x12, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x13, x13, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x14, x14, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x15, x15, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr # Add remaining product results in adds x13, x13, x16 adcs x14, x14, x17 - adcs x15, x15, x18 - adc x26, x26, xzr + adcs x15, x15, x19 + adc x27, x27, xzr # Overflow - extr x26, x26, x15, #63 - mul x26, x26, x24 + extr x27, x27, x15, #63 + mul x27, x27, x25 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - and x26, x24, x15, asr 63 + and x27, x25, x15, asr 63 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr @@ -4895,137 +4908,137 @@ fe_ge_msub: ldr x0, [x29, #24] ldr x1, [x29, #184] # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] # A[0] * 
B[0] - mul x4, x8, x20 - umulh x5, x8, x20 + mul x4, x8, x21 + umulh x5, x8, x21 # A[0] * B[1] - mul x24, x8, x21 - umulh x6, x8, x21 - adds x5, x5, x24 + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x9, x20 - umulh x25, x9, x20 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x8, x22 - umulh x25, x8, x22 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x9, x21 - umulh x25, x9, x21 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x10, x20 - umulh x25, x10, x20 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x8, x23 - umulh x25, x8, x23 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x9, x22 - umulh x25, x9, x22 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x10, x21 - umulh x25, x10, x21 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x11, x20 - umulh x25, x11, x20 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x9, x23 - umulh x25, x9, x23 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x10, x22 - umulh x25, x10, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x11, x21 - umulh x25, x11, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x10, x23 - umulh x25, x10, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x11, x22 - umulh x25, x11, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x11, x23 - umulh x25, x11, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x4, x4, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x5, x5, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x6, x6, x25 - mul x25, x24, x19 - umulh x26, 
x24, x19 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x16 adcs x6, x6, x17 - adcs x7, x7, x18 - adc x26, x26, xzr + adcs x7, x7, x19 + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -5037,133 +5050,133 @@ fe_ge_msub: adcs x9, x13, x5 adcs x10, x14, x6 adc x11, x15, x7 - mov x24, #-19 - asr x27, x11, #63 + mov x25, #-19 + asr x28, x11, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x8, x8, x24 - sbcs x9, x9, x27 - sbcs x10, x10, x27 - sbc x11, x11, x25 + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 # Sub subs x16, x12, x4 sbcs x17, x13, x5 - sbcs x18, x14, x6 - sbcs x19, x15, x7 - mov x24, #-19 - csetm x27, cc + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x16, x17, [x1] - stp x18, x19, [x1, #16] + stp x19, x20, [x1, #16] ldr x0, [x29, #40] ldr x1, [x29, #176] ldr x3, [x29, #72] # Multiply ldp x16, x17, [x1] - ldp x18, x19, [x1, #16] - ldp x20, x21, [x3] - ldp x22, x23, [x3, #16] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x3] + ldp x23, x24, [x3, #16] # A[0] * B[0] - mul x4, x16, x20 - umulh x5, x16, x20 + mul x4, x16, x21 + umulh x5, x16, x21 # A[0] * B[1] - mul x24, x16, x21 - umulh x6, x16, x21 - adds x5, x5, x24 + mul x25, x16, x22 + umulh x6, x16, x22 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x17, x20 - umulh x25, x17, x20 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x17, x21 + umulh x26, x17, x21 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x16, x22 - umulh x25, x16, x22 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x16, x23 + umulh x26, x16, x23 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x17, x21 - umulh x25, x17, x21 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x17, x22 + umulh x26, x17, x22 + adds x6, x6, x25 + adcs x7, x7, x26 adc x8, xzr, xzr # A[2] * B[0] - mul x24, x18, x20 - umulh x25, x18, x20 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x19, x21 + umulh x26, x19, x21 + adds x6, x6, x25 + adcs x7, x7, x26 adc x8, x8, xzr # A[0] * B[3] - mul x24, x16, x23 - umulh x25, x16, x23 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x16, x24 + umulh x26, x16, x24 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, xzr, xzr # A[1] * B[2] - mul x24, x17, x22 - umulh x25, x17, x22 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x17, x23 + umulh x26, x17, x23 + adds 
x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[2] * B[1] - mul x24, x18, x21 - umulh x25, x18, x21 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x19, x22 + umulh x26, x19, x22 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[3] * B[0] - mul x24, x19, x20 - umulh x25, x19, x20 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x20, x21 + umulh x26, x20, x21 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[3] - mul x24, x17, x23 - umulh x25, x17, x23 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x17, x24 + umulh x26, x17, x24 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, xzr, xzr # A[2] * B[2] - mul x24, x18, x22 - umulh x25, x18, x22 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x19, x23 + umulh x26, x19, x23 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, x10, xzr # A[3] * B[1] - mul x24, x19, x21 - umulh x25, x19, x21 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x20, x22 + umulh x26, x20, x22 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[3] - mul x24, x18, x23 - umulh x25, x18, x23 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x19, x24 + umulh x26, x19, x24 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, xzr, xzr # A[3] * B[2] - mul x24, x19, x22 - umulh x25, x19, x22 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x20, x23 + umulh x26, x20, x23 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[3] - mul x24, x19, x23 - umulh x25, x19, x23 - adds x10, x10, x24 - adc x11, x11, x25 + mul x25, x20, x24 + umulh x26, x20, x24 + adds x10, x10, x25 + adc x11, x11, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -5172,37 +5185,37 @@ fe_ge_msub: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x8 - umulh x8, x24, x8 - adds x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x10, x24, x10 - adcs x6, x6, x25 - mul x25, x24, x11 - umulh x26, x24, x11 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x26, x26, xzr + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -5216,57 +5229,58 @@ fe_ge_msub: adcs x9, x9, x9 adcs x10, x10, x10 adc x11, x11, x11 - mov x24, #-19 - asr x27, x11, #63 + mov x25, #-19 + asr x28, x11, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x8, x8, x24 - sbcs x9, x9, x27 - sbcs x10, x10, x27 - sbc x11, x11, x25 + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 ldr x1, [x29, #40] # Add adds x12, x8, x4 adcs x13, x9, x5 adcs x14, x10, x6 adc x15, x11, x7 - mov x24, #-19 - asr x27, x15, #63 + mov x25, #-19 + asr x28, x15, #63 # Mask the modulus - and x24, x27, x24 - and 
x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x12, x12, x24 - sbcs x13, x13, x27 - sbcs x14, x14, x27 - sbc x15, x15, x25 + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 # Sub subs x16, x8, x4 sbcs x17, x9, x5 - sbcs x18, x10, x6 - sbcs x19, x11, x7 - mov x24, #-19 - csetm x27, cc + sbcs x19, x10, x6 + sbcs x20, x11, x7 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x12, x13, [x1] stp x14, x15, [x1, #16] stp x16, x17, [x0] - stp x18, x19, [x0, #16] + stp x19, x20, [x0, #16] ldr x17, [x29, #88] - ldp x18, x19, [x29, #96] - ldp x20, x21, [x29, #112] - ldp x22, x23, [x29, #128] - ldp x24, x25, [x29, #144] - ldp x26, x27, [x29, #160] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] ldp x29, x30, [sp], #0xb0 ret .size fe_ge_msub,.-fe_ge_msub @@ -5278,11 +5292,12 @@ fe_ge_add: stp x29, x30, [sp, #-176]! add x29, sp, #0 str x17, [x29, #88] - stp x18, x19, [x29, #96] - stp x20, x21, [x29, #112] - stp x22, x23, [x29, #128] - stp x24, x25, [x29, #144] - stp x26, x27, [x29, #160] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -5297,170 +5312,170 @@ fe_ge_add: ldp x12, x13, [x2] ldp x14, x15, [x2, #16] ldp x16, x17, [x3] - ldp x18, x19, [x3, #16] + ldp x19, x20, [x3, #16] adds x4, x12, x16 adcs x5, x13, x17 - adcs x6, x14, x18 - adc x7, x15, x19 - mov x24, #-19 - asr x27, x7, #63 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x24 - sbcs x5, x5, x27 - sbcs x6, x6, x27 - sbc x7, x7, x25 + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 # Sub subs x8, x12, x16 sbcs x9, x13, x17 - sbcs x10, x14, x18 - sbcs x11, x15, x19 - mov x24, #-19 - csetm x27, cc + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x8, x8, x24 - adcs x9, x9, x27 - adcs x10, x10, x27 - adc x11, x11, x25 + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 ldr x0, [x29, #32] ldr x2, [x29, #192] # Multiply - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] # A[0] * B[0] - mul x12, x4, x20 - umulh x13, x4, x20 + mul x12, x4, x21 + umulh x13, x4, x21 # A[0] * B[1] - mul x24, x4, x21 - umulh x14, x4, x21 - adds x13, x13, x24 + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 adc x14, x14, xzr # A[1] * B[0] - mul x24, x5, x20 - umulh x25, x5, x20 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, xzr, xzr # A[0] * B[2] - mul x24, x4, x22 - umulh x25, x4, x22 - adds x14, x14, x24 - adc 
x15, x15, x25 + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 # A[1] * B[1] - mul x24, x5, x21 - umulh x25, x5, x21 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x6, x20 - umulh x25, x6, x20 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x4, x23 - umulh x25, x4, x23 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x5, x22 - umulh x25, x5, x22 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x6, x21 - umulh x25, x6, x21 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x7, x20 - umulh x25, x7, x20 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x5, x23 - umulh x25, x5, x23 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x6, x22 - umulh x25, x6, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x7, x21 - umulh x25, x7, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x6, x23 - umulh x25, x6, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x7, x22 - umulh x25, x7, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x7, x23 - umulh x25, x7, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x15, #63 and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x12, x12, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x13, x13, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x14, x14, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x15, x15, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr # Add remaining product results in adds x13, x13, x16 adcs x14, x14, x17 - adcs x15, x15, x18 - adc x26, x26, xzr + adcs x15, x15, x19 + adc 
x27, x27, xzr # Overflow - extr x26, x26, x15, #63 - mul x26, x26, x24 + extr x27, x27, x15, #63 + mul x27, x27, x25 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - and x26, x24, x15, asr 63 + and x27, x25, x15, asr 63 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr @@ -5468,137 +5483,137 @@ fe_ge_add: ldr x0, [x29, #24] ldr x1, [x29, #200] # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] # A[0] * B[0] - mul x4, x8, x20 - umulh x5, x8, x20 + mul x4, x8, x21 + umulh x5, x8, x21 # A[0] * B[1] - mul x24, x8, x21 - umulh x6, x8, x21 - adds x5, x5, x24 + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x9, x20 - umulh x25, x9, x20 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x8, x22 - umulh x25, x8, x22 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x9, x21 - umulh x25, x9, x21 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x10, x20 - umulh x25, x10, x20 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x8, x23 - umulh x25, x8, x23 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x9, x22 - umulh x25, x9, x22 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x10, x21 - umulh x25, x10, x21 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x11, x20 - umulh x25, x11, x20 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x9, x23 - umulh x25, x9, x23 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x10, x22 - umulh x25, x10, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x11, x21 - umulh x25, x11, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x10, x23 - umulh x25, x10, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x11, x22 - umulh x25, x11, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x11, 
x23 - umulh x25, x11, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x4, x4, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x5, x5, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x6, x6, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x16 adcs x6, x6, x17 - adcs x7, x7, x18 - adc x26, x26, xzr + adcs x7, x7, x19 + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -5610,35 +5625,35 @@ fe_ge_add: adcs x9, x13, x5 adcs x10, x14, x6 adc x11, x15, x7 - mov x24, #-19 - asr x27, x11, #63 + mov x25, #-19 + asr x28, x11, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x8, x8, x24 - sbcs x9, x9, x27 - sbcs x10, x10, x27 - sbc x11, x11, x25 + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 # Sub subs x16, x12, x4 sbcs x17, x13, x5 - sbcs x18, x14, x6 - sbcs x19, x15, x7 - mov x24, #-19 - csetm x27, cc + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x16, x17, [x1] - stp x18, x19, [x1, #16] + stp x19, x20, [x1, #16] ldr x0, [x29, #48] ldr x1, [x29, #64] ldr x2, [x29, #176] @@ -5646,97 +5661,97 @@ fe_ge_add: ldp x12, x13, [x1] ldp x14, x15, [x1, #16] ldp x16, x17, [x2] - ldp x18, x19, [x2, #16] + ldp x19, x20, [x2, #16] # A[0] * B[0] mul x4, x12, x16 umulh x5, x12, x16 # A[0] * B[1] - mul x24, x12, x17 + mul x25, x12, x17 umulh x6, x12, x17 - adds x5, x5, x24 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x13, x16 - umulh x25, x13, x16 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x13, x16 + umulh x26, x13, x16 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x12, x18 - umulh x25, x12, x18 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x12, x19 + umulh x26, x12, x19 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x13, x17 - umulh x25, x13, x17 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x13, x17 + umulh x26, x13, x17 + adds x6, x6, x25 + adcs 
x7, x7, x26 adc x8, xzr, xzr # A[2] * B[0] - mul x24, x14, x16 - umulh x25, x14, x16 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x14, x16 + umulh x26, x14, x16 + adds x6, x6, x25 + adcs x7, x7, x26 adc x8, x8, xzr # A[0] * B[3] - mul x24, x12, x19 - umulh x25, x12, x19 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x12, x20 + umulh x26, x12, x20 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, xzr, xzr # A[1] * B[2] - mul x24, x13, x18 - umulh x25, x13, x18 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x13, x19 + umulh x26, x13, x19 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[2] * B[1] - mul x24, x14, x17 - umulh x25, x14, x17 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x14, x17 + umulh x26, x14, x17 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[3] * B[0] - mul x24, x15, x16 - umulh x25, x15, x16 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x15, x16 + umulh x26, x15, x16 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[3] - mul x24, x13, x19 - umulh x25, x13, x19 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x13, x20 + umulh x26, x13, x20 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, xzr, xzr # A[2] * B[2] - mul x24, x14, x18 - umulh x25, x14, x18 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x14, x19 + umulh x26, x14, x19 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, x10, xzr # A[3] * B[1] - mul x24, x15, x17 - umulh x25, x15, x17 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x15, x17 + umulh x26, x15, x17 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[3] - mul x24, x14, x19 - umulh x25, x14, x19 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x14, x20 + umulh x26, x14, x20 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, xzr, xzr # A[3] * B[2] - mul x24, x15, x18 - umulh x25, x15, x18 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x15, x19 + umulh x26, x15, x19 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[3] - mul x24, x15, x19 - umulh x25, x15, x19 - adds x10, x10, x24 - adc x11, x11, x25 + mul x25, x15, x20 + umulh x26, x15, x20 + adds x10, x10, x25 + adc x11, x11, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -5745,37 +5760,37 @@ fe_ge_add: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x8 - umulh x8, x24, x8 - adds x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x10, x24, x10 - adcs x6, x6, x25 - mul x25, x24, x11 - umulh x26, x24, x11 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x26, x26, xzr + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -5786,114 +5801,114 @@ fe_ge_add: adcs x5, x5, x5 adcs x6, x6, x6 adc x7, x7, x7 - mov x24, #-19 - asr x27, x7, #63 + mov x25, #-19 + 
asr x28, x7, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x24 - sbcs x5, x5, x27 - sbcs x6, x6, x27 - sbc x7, x7, x25 + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 ldr x0, [x29, #40] ldr x1, [x29, #184] ldr x2, [x29, #72] # Multiply ldp x16, x17, [x1] - ldp x18, x19, [x1, #16] - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] # A[0] * B[0] - mul x8, x16, x20 - umulh x9, x16, x20 + mul x8, x16, x21 + umulh x9, x16, x21 # A[0] * B[1] - mul x24, x16, x21 - umulh x10, x16, x21 - adds x9, x9, x24 + mul x25, x16, x22 + umulh x10, x16, x22 + adds x9, x9, x25 adc x10, x10, xzr # A[1] * B[0] - mul x24, x17, x20 - umulh x25, x17, x20 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x17, x21 + umulh x26, x17, x21 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, xzr, xzr # A[0] * B[2] - mul x24, x16, x22 - umulh x25, x16, x22 - adds x10, x10, x24 - adc x11, x11, x25 + mul x25, x16, x23 + umulh x26, x16, x23 + adds x10, x10, x25 + adc x11, x11, x26 # A[1] * B[1] - mul x24, x17, x21 - umulh x25, x17, x21 - adds x10, x10, x24 - adcs x11, x11, x25 + mul x25, x17, x22 + umulh x26, x17, x22 + adds x10, x10, x25 + adcs x11, x11, x26 adc x12, xzr, xzr # A[2] * B[0] - mul x24, x18, x20 - umulh x25, x18, x20 - adds x10, x10, x24 - adcs x11, x11, x25 + mul x25, x19, x21 + umulh x26, x19, x21 + adds x10, x10, x25 + adcs x11, x11, x26 adc x12, x12, xzr # A[0] * B[3] - mul x24, x16, x23 - umulh x25, x16, x23 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x16, x24 + umulh x26, x16, x24 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, xzr, xzr # A[1] * B[2] - mul x24, x17, x22 - umulh x25, x17, x22 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x17, x23 + umulh x26, x17, x23 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, x13, xzr # A[2] * B[1] - mul x24, x18, x21 - umulh x25, x18, x21 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x19, x22 + umulh x26, x19, x22 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, x13, xzr # A[3] * B[0] - mul x24, x19, x20 - umulh x25, x19, x20 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x20, x21 + umulh x26, x20, x21 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, x13, xzr # A[1] * B[3] - mul x24, x17, x23 - umulh x25, x17, x23 - adds x12, x12, x24 - adcs x13, x13, x25 + mul x25, x17, x24 + umulh x26, x17, x24 + adds x12, x12, x25 + adcs x13, x13, x26 adc x14, xzr, xzr # A[2] * B[2] - mul x24, x18, x22 - umulh x25, x18, x22 - adds x12, x12, x24 - adcs x13, x13, x25 + mul x25, x19, x23 + umulh x26, x19, x23 + adds x12, x12, x25 + adcs x13, x13, x26 adc x14, x14, xzr # A[3] * B[1] - mul x24, x19, x21 - umulh x25, x19, x21 - adds x12, x12, x24 - adcs x13, x13, x25 + mul x25, x20, x22 + umulh x26, x20, x22 + adds x12, x12, x25 + adcs x13, x13, x26 adc x14, x14, xzr # A[2] * B[3] - mul x24, x18, x23 - umulh x25, x18, x23 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x19, x24 + umulh x26, x19, x24 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, xzr, xzr # A[3] * B[2] - mul x24, x19, x22 - umulh x25, x19, x22 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x20, x23 + umulh x26, x20, x23 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, x15, xzr # A[3] * B[3] - mul x24, x19, x23 - umulh x25, x19, x23 - adds x14, x14, x24 - adc x15, x15, x25 + mul x25, x20, x24 + umulh x26, x20, x24 + adds x14, x14, x25 + adc x15, 
x15, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x15, x15, x14, #63 @@ -5902,37 +5917,37 @@ fe_ge_add: extr x12, x12, x11, #63 and x11, x11, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x12 - umulh x12, x24, x12 - adds x8, x8, x25 - mul x25, x24, x13 - umulh x13, x24, x13 - adcs x9, x9, x25 - mul x25, x24, x14 - umulh x14, x24, x14 - adcs x10, x10, x25 - mul x25, x24, x15 - umulh x26, x24, x15 - adcs x11, x11, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x12 + umulh x12, x25, x12 + adds x8, x8, x26 + mul x26, x25, x13 + umulh x13, x25, x13 + adcs x9, x9, x26 + mul x26, x25, x14 + umulh x14, x25, x14 + adcs x10, x10, x26 + mul x26, x25, x15 + umulh x27, x25, x15 + adcs x11, x11, x26 + adc x27, x27, xzr # Add remaining product results in adds x9, x9, x12 adcs x10, x10, x13 adcs x11, x11, x14 - adc x26, x26, xzr + adc x27, x27, xzr # Overflow - extr x26, x26, x11, #63 - mul x26, x26, x24 + extr x27, x27, x11, #63 + mul x27, x27, x25 and x11, x11, #0x7fffffffffffffff - adds x8, x8, x26 + adds x8, x8, x27 adcs x9, x9, xzr adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - and x26, x24, x11, asr 63 + and x27, x25, x11, asr 63 and x11, x11, #0x7fffffffffffffff - adds x8, x8, x26 + adds x8, x8, x27 adcs x9, x9, xzr adcs x10, x10, xzr adc x11, x11, xzr @@ -5944,41 +5959,42 @@ fe_ge_add: adcs x13, x5, x9 adcs x14, x6, x10 adc x15, x7, x11 - mov x24, #-19 - asr x27, x15, #63 + mov x25, #-19 + asr x28, x15, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x12, x12, x24 - sbcs x13, x13, x27 - sbcs x14, x14, x27 - sbc x15, x15, x25 + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 # Sub subs x16, x4, x8 sbcs x17, x5, x9 - sbcs x18, x6, x10 - sbcs x19, x7, x11 - mov x24, #-19 - csetm x27, cc + sbcs x19, x6, x10 + sbcs x20, x7, x11 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x12, x13, [x0] stp x14, x15, [x0, #16] stp x16, x17, [x1] - stp x18, x19, [x1, #16] + stp x19, x20, [x1, #16] ldr x17, [x29, #88] - ldp x18, x19, [x29, #96] - ldp x20, x21, [x29, #112] - ldp x22, x23, [x29, #128] - ldp x24, x25, [x29, #144] - ldp x26, x27, [x29, #160] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] ldp x29, x30, [sp], #0xb0 ret .size fe_ge_add,.-fe_ge_add @@ -5990,11 +6006,12 @@ fe_ge_sub: stp x29, x30, [sp, #-176]! 
add x29, sp, #0 str x17, [x29, #88] - stp x18, x19, [x29, #96] - stp x20, x21, [x29, #112] - stp x22, x23, [x29, #128] - stp x24, x25, [x29, #144] - stp x26, x27, [x29, #160] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] @@ -6009,170 +6026,170 @@ fe_ge_sub: ldp x12, x13, [x2] ldp x14, x15, [x2, #16] ldp x16, x17, [x3] - ldp x18, x19, [x3, #16] + ldp x19, x20, [x3, #16] adds x4, x12, x16 adcs x5, x13, x17 - adcs x6, x14, x18 - adc x7, x15, x19 - mov x24, #-19 - asr x27, x7, #63 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x24 - sbcs x5, x5, x27 - sbcs x6, x6, x27 - sbc x7, x7, x25 + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 # Sub subs x8, x12, x16 sbcs x9, x13, x17 - sbcs x10, x14, x18 - sbcs x11, x15, x19 - mov x24, #-19 - csetm x27, cc + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x8, x8, x24 - adcs x9, x9, x27 - adcs x10, x10, x27 - adc x11, x11, x25 + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 ldr x0, [x29, #32] ldr x2, [x29, #200] # Multiply - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] # A[0] * B[0] - mul x12, x4, x20 - umulh x13, x4, x20 + mul x12, x4, x21 + umulh x13, x4, x21 # A[0] * B[1] - mul x24, x4, x21 - umulh x14, x4, x21 - adds x13, x13, x24 + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 adc x14, x14, xzr # A[1] * B[0] - mul x24, x5, x20 - umulh x25, x5, x20 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, xzr, xzr # A[0] * B[2] - mul x24, x4, x22 - umulh x25, x4, x22 - adds x14, x14, x24 - adc x15, x15, x25 + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 # A[1] * B[1] - mul x24, x5, x21 - umulh x25, x5, x21 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x6, x20 - umulh x25, x6, x20 - adds x14, x14, x24 - adcs x15, x15, x25 + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x4, x23 - umulh x25, x4, x23 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x5, x22 - umulh x25, x5, x22 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x6, x21 - umulh x25, x6, x21 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x7, x20 - umulh x25, x7, x20 - adds x15, x15, x24 - adcs x16, x16, x25 + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x5, x23 - umulh x25, x5, x23 - adds x16, x16, 
x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x6, x22 - umulh x25, x6, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x7, x21 - umulh x25, x7, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x6, x23 - umulh x25, x6, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x7, x22 - umulh x25, x7, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x7, x23 - umulh x25, x7, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x15, #63 and x15, x15, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x12, x12, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x13, x13, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x14, x14, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x15, x15, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr # Add remaining product results in adds x13, x13, x16 adcs x14, x14, x17 - adcs x15, x15, x18 - adc x26, x26, xzr + adcs x15, x15, x19 + adc x27, x27, xzr # Overflow - extr x26, x26, x15, #63 - mul x26, x26, x24 + extr x27, x27, x15, #63 + mul x27, x27, x25 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr # Reduce if top bit set - and x26, x24, x15, asr 63 + and x27, x25, x15, asr 63 and x15, x15, #0x7fffffffffffffff - adds x12, x12, x26 + adds x12, x12, x27 adcs x13, x13, xzr adcs x14, x14, xzr adc x15, x15, xzr @@ -6180,137 +6197,137 @@ fe_ge_sub: ldr x0, [x29, #24] ldr x1, [x29, #192] # Multiply - ldp x20, x21, [x1] - ldp x22, x23, [x1, #16] + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] # A[0] * B[0] - mul x4, x8, x20 - umulh x5, x8, x20 + mul x4, x8, x21 + umulh x5, x8, x21 # A[0] * B[1] - mul x24, x8, x21 - umulh x6, x8, x21 - adds x5, x5, x24 + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x9, x20 - umulh x25, x9, x20 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x8, x22 - umulh x25, x8, x22 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x9, x21 - umulh x25, x9, x21 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x9, 
x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, xzr, xzr # A[2] * B[0] - mul x24, x10, x20 - umulh x25, x10, x20 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 adc x16, x16, xzr # A[0] * B[3] - mul x24, x8, x23 - umulh x25, x8, x23 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, xzr, xzr # A[1] * B[2] - mul x24, x9, x22 - umulh x25, x9, x22 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[2] * B[1] - mul x24, x10, x21 - umulh x25, x10, x21 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[3] * B[0] - mul x24, x11, x20 - umulh x25, x11, x20 - adds x7, x7, x24 - adcs x16, x16, x25 + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[3] - mul x24, x9, x23 - umulh x25, x9, x23 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, xzr, xzr - # A[2] * B[2] - mul x24, x10, x22 - umulh x25, x10, x22 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[3] * B[1] - mul x24, x11, x21 - umulh x25, x11, x21 - adds x16, x16, x24 - adcs x17, x17, x25 - adc x18, x18, xzr - # A[2] * B[3] - mul x24, x10, x23 - umulh x25, x10, x23 - adds x17, x17, x24 - adcs x18, x18, x25 + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, xzr, xzr - # A[3] * B[2] - mul x24, x11, x22 - umulh x25, x11, x22 - adds x17, x17, x24 - adcs x18, x18, x25 + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr # A[3] * B[3] - mul x24, x11, x23 - umulh x25, x11, x23 - adds x18, x18, x24 - adc x19, x19, x25 + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 - extr x19, x19, x18, #63 - extr x18, x18, x17, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 extr x17, x17, x16, #63 extr x16, x16, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x16 - umulh x16, x24, x16 - adds x4, x4, x25 - mul x25, x24, x17 - umulh x17, x24, x17 - adcs x5, x5, x25 - mul x25, x24, x18 - umulh x18, x24, x18 - adcs x6, x6, x25 - mul x25, x24, x19 - umulh x26, x24, x19 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x16 adcs x6, x6, x17 - adcs x7, x7, x18 - adc x26, x26, xzr + adcs x7, x7, x19 + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, 
x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -6322,35 +6339,35 @@ fe_ge_sub: adcs x9, x13, x5 adcs x10, x14, x6 adc x11, x15, x7 - mov x24, #-19 - asr x27, x11, #63 + mov x25, #-19 + asr x28, x11, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x8, x8, x24 - sbcs x9, x9, x27 - sbcs x10, x10, x27 - sbc x11, x11, x25 + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 # Sub subs x16, x12, x4 sbcs x17, x13, x5 - sbcs x18, x14, x6 - sbcs x19, x15, x7 - mov x24, #-19 - csetm x27, cc + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x16, x17, [x1] - stp x18, x19, [x1, #16] + stp x19, x20, [x1, #16] ldr x0, [x29, #48] ldr x1, [x29, #64] ldr x2, [x29, #176] @@ -6358,97 +6375,97 @@ fe_ge_sub: ldp x12, x13, [x1] ldp x14, x15, [x1, #16] ldp x16, x17, [x2] - ldp x18, x19, [x2, #16] + ldp x19, x20, [x2, #16] # A[0] * B[0] mul x4, x12, x16 umulh x5, x12, x16 # A[0] * B[1] - mul x24, x12, x17 + mul x25, x12, x17 umulh x6, x12, x17 - adds x5, x5, x24 + adds x5, x5, x25 adc x6, x6, xzr # A[1] * B[0] - mul x24, x13, x16 - umulh x25, x13, x16 - adds x5, x5, x24 - adcs x6, x6, x25 + mul x25, x13, x16 + umulh x26, x13, x16 + adds x5, x5, x25 + adcs x6, x6, x26 adc x7, xzr, xzr # A[0] * B[2] - mul x24, x12, x18 - umulh x25, x12, x18 - adds x6, x6, x24 - adc x7, x7, x25 + mul x25, x12, x19 + umulh x26, x12, x19 + adds x6, x6, x25 + adc x7, x7, x26 # A[1] * B[1] - mul x24, x13, x17 - umulh x25, x13, x17 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x13, x17 + umulh x26, x13, x17 + adds x6, x6, x25 + adcs x7, x7, x26 adc x8, xzr, xzr # A[2] * B[0] - mul x24, x14, x16 - umulh x25, x14, x16 - adds x6, x6, x24 - adcs x7, x7, x25 + mul x25, x14, x16 + umulh x26, x14, x16 + adds x6, x6, x25 + adcs x7, x7, x26 adc x8, x8, xzr # A[0] * B[3] - mul x24, x12, x19 - umulh x25, x12, x19 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x12, x20 + umulh x26, x12, x20 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, xzr, xzr # A[1] * B[2] - mul x24, x13, x18 - umulh x25, x13, x18 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x13, x19 + umulh x26, x13, x19 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[2] * B[1] - mul x24, x14, x17 - umulh x25, x14, x17 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x14, x17 + umulh x26, x14, x17 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[3] * B[0] - mul x24, x15, x16 - umulh x25, x15, x16 - adds x7, x7, x24 - adcs x8, x8, x25 + mul x25, x15, x16 + umulh x26, x15, x16 + adds x7, x7, x25 + adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[3] - mul x24, x13, x19 - umulh x25, x13, x19 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x13, x20 + umulh x26, x13, x20 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, xzr, xzr # A[2] * B[2] - mul x24, x14, x18 - umulh x25, x14, x18 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x14, x19 + umulh x26, x14, x19 + adds x8, x8, x25 + adcs x9, 
x9, x26 adc x10, x10, xzr # A[3] * B[1] - mul x24, x15, x17 - umulh x25, x15, x17 - adds x8, x8, x24 - adcs x9, x9, x25 + mul x25, x15, x17 + umulh x26, x15, x17 + adds x8, x8, x25 + adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[3] - mul x24, x14, x19 - umulh x25, x14, x19 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x14, x20 + umulh x26, x14, x20 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, xzr, xzr # A[3] * B[2] - mul x24, x15, x18 - umulh x25, x15, x18 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x15, x19 + umulh x26, x15, x19 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[3] - mul x24, x15, x19 - umulh x25, x15, x19 - adds x10, x10, x24 - adc x11, x11, x25 + mul x25, x15, x20 + umulh x26, x15, x20 + adds x10, x10, x25 + adc x11, x11, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x11, x11, x10, #63 @@ -6457,37 +6474,37 @@ fe_ge_sub: extr x8, x8, x7, #63 and x7, x7, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x8 - umulh x8, x24, x8 - adds x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x10, x24, x10 - adcs x6, x6, x25 - mul x25, x24, x11 - umulh x26, x24, x11 - adcs x7, x7, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr # Add remaining product results in adds x5, x5, x8 adcs x6, x6, x9 adcs x7, x7, x10 - adc x26, x26, xzr + adc x27, x27, xzr # Overflow - extr x26, x26, x7, #63 - mul x26, x26, x24 + extr x27, x27, x7, #63 + mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr # Reduce if top bit set - and x26, x24, x7, asr 63 + and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x26 + adds x4, x4, x27 adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr @@ -6498,114 +6515,114 @@ fe_ge_sub: adcs x5, x5, x5 adcs x6, x6, x6 adc x7, x7, x7 - mov x24, #-19 - asr x27, x7, #63 + mov x25, #-19 + asr x28, x7, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x4, x4, x24 - sbcs x5, x5, x27 - sbcs x6, x6, x27 - sbc x7, x7, x25 + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 ldr x0, [x29, #40] ldr x1, [x29, #184] ldr x2, [x29, #72] # Multiply ldp x16, x17, [x1] - ldp x18, x19, [x1, #16] - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] # A[0] * B[0] - mul x8, x16, x20 - umulh x9, x16, x20 + mul x8, x16, x21 + umulh x9, x16, x21 # A[0] * B[1] - mul x24, x16, x21 - umulh x10, x16, x21 - adds x9, x9, x24 + mul x25, x16, x22 + umulh x10, x16, x22 + adds x9, x9, x25 adc x10, x10, xzr # A[1] * B[0] - mul x24, x17, x20 - umulh x25, x17, x20 - adds x9, x9, x24 - adcs x10, x10, x25 + mul x25, x17, x21 + umulh x26, x17, x21 + adds x9, x9, x25 + adcs x10, x10, x26 adc x11, xzr, xzr # A[0] * B[2] - mul x24, x16, x22 - umulh x25, x16, x22 - adds x10, x10, x24 - adc x11, x11, x25 + mul x25, x16, x23 + umulh x26, x16, x23 + adds x10, x10, x25 + adc x11, x11, x26 # A[1] * B[1] - mul x24, x17, x21 - umulh x25, x17, x21 - adds x10, x10, x24 - adcs x11, x11, x25 + mul x25, x17, x22 + umulh x26, x17, x22 + adds x10, 
x10, x25 + adcs x11, x11, x26 adc x12, xzr, xzr # A[2] * B[0] - mul x24, x18, x20 - umulh x25, x18, x20 - adds x10, x10, x24 - adcs x11, x11, x25 + mul x25, x19, x21 + umulh x26, x19, x21 + adds x10, x10, x25 + adcs x11, x11, x26 adc x12, x12, xzr # A[0] * B[3] - mul x24, x16, x23 - umulh x25, x16, x23 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x16, x24 + umulh x26, x16, x24 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, xzr, xzr # A[1] * B[2] - mul x24, x17, x22 - umulh x25, x17, x22 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x17, x23 + umulh x26, x17, x23 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, x13, xzr # A[2] * B[1] - mul x24, x18, x21 - umulh x25, x18, x21 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x19, x22 + umulh x26, x19, x22 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, x13, xzr # A[3] * B[0] - mul x24, x19, x20 - umulh x25, x19, x20 - adds x11, x11, x24 - adcs x12, x12, x25 + mul x25, x20, x21 + umulh x26, x20, x21 + adds x11, x11, x25 + adcs x12, x12, x26 adc x13, x13, xzr # A[1] * B[3] - mul x24, x17, x23 - umulh x25, x17, x23 - adds x12, x12, x24 - adcs x13, x13, x25 + mul x25, x17, x24 + umulh x26, x17, x24 + adds x12, x12, x25 + adcs x13, x13, x26 adc x14, xzr, xzr # A[2] * B[2] - mul x24, x18, x22 - umulh x25, x18, x22 - adds x12, x12, x24 - adcs x13, x13, x25 + mul x25, x19, x23 + umulh x26, x19, x23 + adds x12, x12, x25 + adcs x13, x13, x26 adc x14, x14, xzr # A[3] * B[1] - mul x24, x19, x21 - umulh x25, x19, x21 - adds x12, x12, x24 - adcs x13, x13, x25 + mul x25, x20, x22 + umulh x26, x20, x22 + adds x12, x12, x25 + adcs x13, x13, x26 adc x14, x14, xzr # A[2] * B[3] - mul x24, x18, x23 - umulh x25, x18, x23 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x19, x24 + umulh x26, x19, x24 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, xzr, xzr # A[3] * B[2] - mul x24, x19, x22 - umulh x25, x19, x22 - adds x13, x13, x24 - adcs x14, x14, x25 + mul x25, x20, x23 + umulh x26, x20, x23 + adds x13, x13, x25 + adcs x14, x14, x26 adc x15, x15, xzr # A[3] * B[3] - mul x24, x19, x23 - umulh x25, x19, x23 - adds x14, x14, x24 - adc x15, x15, x25 + mul x25, x20, x24 + umulh x26, x20, x24 + adds x14, x14, x25 + adc x15, x15, x26 # Reduce # Move top half into t4-t7 and remove top bit from t3 extr x15, x15, x14, #63 @@ -6614,37 +6631,37 @@ fe_ge_sub: extr x12, x12, x11, #63 and x11, x11, #0x7fffffffffffffff # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x12 - umulh x12, x24, x12 - adds x8, x8, x25 - mul x25, x24, x13 - umulh x13, x24, x13 - adcs x9, x9, x25 - mul x25, x24, x14 - umulh x14, x24, x14 - adcs x10, x10, x25 - mul x25, x24, x15 - umulh x26, x24, x15 - adcs x11, x11, x25 - adc x26, x26, xzr + mov x25, #19 + mul x26, x25, x12 + umulh x12, x25, x12 + adds x8, x8, x26 + mul x26, x25, x13 + umulh x13, x25, x13 + adcs x9, x9, x26 + mul x26, x25, x14 + umulh x14, x25, x14 + adcs x10, x10, x26 + mul x26, x25, x15 + umulh x27, x25, x15 + adcs x11, x11, x26 + adc x27, x27, xzr # Add remaining product results in adds x9, x9, x12 adcs x10, x10, x13 adcs x11, x11, x14 - adc x26, x26, xzr + adc x27, x27, xzr # Overflow - extr x26, x26, x11, #63 - mul x26, x26, x24 + extr x27, x27, x11, #63 + mul x27, x27, x25 and x11, x11, #0x7fffffffffffffff - adds x8, x8, x26 + adds x8, x8, x27 adcs x9, x9, xzr adcs x10, x10, xzr adc x11, x11, xzr # Reduce if top bit set - and x26, x24, x11, asr 63 + and x27, x25, x11, asr 63 and x11, x11, #0x7fffffffffffffff - adds x8, x8, x26 + adds x8, x8, x27 adcs x9, x9, xzr adcs x10, x10, xzr adc 
x11, x11, xzr @@ -6656,43 +6673,43 @@ fe_ge_sub: adcs x13, x5, x9 adcs x14, x6, x10 adc x15, x7, x11 - mov x24, #-19 - asr x27, x15, #63 + mov x25, #-19 + asr x28, x15, #63 # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x12, x12, x24 - sbcs x13, x13, x27 - sbcs x14, x14, x27 - sbc x15, x15, x25 + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 # Sub subs x16, x4, x8 sbcs x17, x5, x9 - sbcs x18, x6, x10 - sbcs x19, x7, x11 - mov x24, #-19 - csetm x27, cc + sbcs x19, x6, x10 + sbcs x20, x7, x11 + mov x25, #-19 + csetm x28, cc # Mask the modulus - and x24, x27, x24 - and x25, x27, #0x7fffffffffffffff + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff # Add modulus (if underflow) - adds x16, x16, x24 - adcs x17, x17, x27 - adcs x18, x18, x27 - adc x19, x19, x25 + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 stp x12, x13, [x0] stp x14, x15, [x0, #16] stp x16, x17, [x1] - stp x18, x19, [x1, #16] + stp x19, x20, [x1, #16] ldr x17, [x29, #88] - ldp x18, x19, [x29, #96] - ldp x20, x21, [x29, #112] - ldp x22, x23, [x29, #128] - ldp x24, x25, [x29, #144] - ldp x26, x27, [x29, #160] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] ldp x29, x30, [sp], #0xb0 ret .size fe_ge_sub,.-fe_ge_sub #endif /* __aarch64__ */ -#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c index 9d04bc6e2..6c62a50dc 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519.c @@ -24,15 +24,12 @@ * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c */ #ifdef __aarch64__ - #include #ifdef HAVE_CONFIG_H #include #endif #include - -#ifdef WOLFSSL_ARMASM #include #include @@ -285,212 +282,214 @@ int fe_isnegative(const fe a) void fe_cmov_table(fe* r, fe* base, signed char b) { __asm__ __volatile__ ( - "stp x29, x30, [sp, #-16]!\n\t" + "stp x29, x30, [sp, #-32]!\n\t" "add x29, sp, #0\n\t" + "str %x[r], [x29, #16]\n\t" "sxtb %x[b], %w[b]\n\t" - "sbfx x15, %x[b], #7, #1\n\t" - "eor x16, %x[b], x15\n\t" - "sub x16, x16, x15\n\t" - "mov x3, #1\n\t" - "mov x4, xzr\n\t" + "sbfx x3, %x[b], #7, #1\n\t" + "eor %x[r], %x[b], x3\n\t" + "sub %x[r], %x[r], x3\n\t" + "mov x4, #1\n\t" "mov x5, xzr\n\t" "mov x6, xzr\n\t" - "mov x7, #1\n\t" - "mov x8, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, #1\n\t" "mov x9, xzr\n\t" "mov x10, xzr\n\t" "mov x11, xzr\n\t" "mov x12, xzr\n\t" "mov x13, xzr\n\t" "mov x14, xzr\n\t" - "cmp x16, #1\n\t" - "ldp x17, x18, [%x[base]]\n\t" + "mov x15, xzr\n\t" + "cmp %x[r], #1\n\t" + "ldp x16, x17, [%x[base]]\n\t" "ldp x19, x20, [%x[base], #16]\n\t" "ldp x21, x22, [%x[base], #32]\n\t" "ldp x23, x24, [%x[base], #48]\n\t" "ldp x25, x26, [%x[base], #64]\n\t" "ldp x27, x28, [%x[base], #80]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" - "cmp x16, #2\n\t" - "ldp x17, x18, [%x[base], #96]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + 
"csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #2\n\t" + "ldp x16, x17, [%x[base], #96]\n\t" "ldp x19, x20, [%x[base], #112]\n\t" "ldp x21, x22, [%x[base], #128]\n\t" "ldp x23, x24, [%x[base], #144]\n\t" "ldp x25, x26, [%x[base], #160]\n\t" "ldp x27, x28, [%x[base], #176]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" - "cmp x16, #3\n\t" - "ldp x17, x18, [%x[base], #192]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #3\n\t" + "ldp x16, x17, [%x[base], #192]\n\t" "ldp x19, x20, [%x[base], #208]\n\t" "ldp x21, x22, [%x[base], #224]\n\t" "ldp x23, x24, [%x[base], #240]\n\t" "ldp x25, x26, [%x[base], #256]\n\t" "ldp x27, x28, [%x[base], #272]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" - "cmp x16, #4\n\t" - "ldp x17, x18, [%x[base], #288]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #4\n\t" + "ldp x16, x17, [%x[base], #288]\n\t" "ldp x19, x20, [%x[base], #304]\n\t" "ldp x21, x22, [%x[base], #320]\n\t" "ldp x23, x24, [%x[base], #336]\n\t" "ldp x25, x26, [%x[base], #352]\n\t" "ldp x27, x28, [%x[base], #368]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" "add %x[base], %x[base], #0x180\n\t" - "cmp x16, #5\n\t" - "ldp x17, x18, [%x[base]]\n\t" + "cmp %x[r], #5\n\t" + "ldp x16, x17, [%x[base]]\n\t" "ldp x19, x20, [%x[base], #16]\n\t" "ldp x21, x22, [%x[base], #32]\n\t" "ldp x23, x24, [%x[base], #48]\n\t" "ldp x25, x26, 
[%x[base], #64]\n\t" "ldp x27, x28, [%x[base], #80]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" - "cmp x16, #6\n\t" - "ldp x17, x18, [%x[base], #96]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #6\n\t" + "ldp x16, x17, [%x[base], #96]\n\t" "ldp x19, x20, [%x[base], #112]\n\t" "ldp x21, x22, [%x[base], #128]\n\t" "ldp x23, x24, [%x[base], #144]\n\t" "ldp x25, x26, [%x[base], #160]\n\t" "ldp x27, x28, [%x[base], #176]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" - "cmp x16, #7\n\t" - "ldp x17, x18, [%x[base], #192]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #7\n\t" + "ldp x16, x17, [%x[base], #192]\n\t" "ldp x19, x20, [%x[base], #208]\n\t" "ldp x21, x22, [%x[base], #224]\n\t" "ldp x23, x24, [%x[base], #240]\n\t" "ldp x25, x26, [%x[base], #256]\n\t" "ldp x27, x28, [%x[base], #272]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" - "cmp x16, #8\n\t" - "ldp x17, x18, [%x[base], #288]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #8\n\t" + "ldp x16, x17, [%x[base], #288]\n\t" "ldp x19, x20, [%x[base], #304]\n\t" "ldp x21, x22, [%x[base], #320]\n\t" "ldp x23, x24, [%x[base], #336]\n\t" "ldp x25, x26, [%x[base], #352]\n\t" "ldp x27, x28, [%x[base], #368]\n\t" - "csel x3, x17, x3, eq\n\t" - "csel x4, x18, x4, eq\n\t" - "csel x5, x19, x5, eq\n\t" - "csel x6, x20, x6, eq\n\t" - "csel x7, x21, x7, eq\n\t" - "csel x8, x22, x8, eq\n\t" - "csel x9, x23, x9, eq\n\t" - "csel x10, x24, x10, eq\n\t" - "csel x11, x25, x11, eq\n\t" - "csel x12, x26, x12, eq\n\t" - "csel x13, x27, x13, eq\n\t" - "csel x14, x28, x14, eq\n\t" - "mov x17, #-19\n\t" - "mov x18, #-1\n\t" + "csel x4, x16, x4, eq\n\t" 
+ "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "mov x16, #-19\n\t" + "mov x17, #-1\n\t" "mov x19, #-1\n\t" "mov x20, #0x7fffffffffffffff\n\t" - "subs x17, x17, x11\n\t" - "sbcs x18, x18, x12\n\t" - "sbcs x19, x19, x13\n\t" - "sbc x20, x20, x14\n\t" + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "sbcs x19, x19, x14\n\t" + "sbc x20, x20, x15\n\t" "cmp %x[b], #0\n\t" - "mov x15, x3\n\t" - "csel x3, x7, x3, lt\n\t" - "csel x7, x15, x7, lt\n\t" - "mov x15, x4\n\t" + "mov x3, x4\n\t" "csel x4, x8, x4, lt\n\t" - "csel x8, x15, x8, lt\n\t" - "mov x15, x5\n\t" + "csel x8, x3, x8, lt\n\t" + "mov x3, x5\n\t" "csel x5, x9, x5, lt\n\t" - "csel x9, x15, x9, lt\n\t" - "mov x15, x6\n\t" + "csel x9, x3, x9, lt\n\t" + "mov x3, x6\n\t" "csel x6, x10, x6, lt\n\t" - "csel x10, x15, x10, lt\n\t" - "csel x11, x17, x11, lt\n\t" - "csel x12, x18, x12, lt\n\t" - "csel x13, x19, x13, lt\n\t" - "csel x14, x20, x14, lt\n\t" - "stp x3, x4, [%x[r]]\n\t" - "stp x5, x6, [%x[r], #16]\n\t" - "stp x7, x8, [%x[r], #32]\n\t" - "stp x9, x10, [%x[r], #48]\n\t" - "stp x11, x12, [%x[r], #64]\n\t" - "stp x13, x14, [%x[r], #80]\n\t" - "ldp x29, x30, [sp], #16\n\t" + "csel x10, x3, x10, lt\n\t" + "mov x3, x7\n\t" + "csel x7, x11, x7, lt\n\t" + "csel x11, x3, x11, lt\n\t" + "csel x12, x16, x12, lt\n\t" + "csel x13, x17, x13, lt\n\t" + "csel x14, x19, x14, lt\n\t" + "csel x15, x20, x15, lt\n\t" + "ldr %x[r], [x29, #16]\n\t" + "stp x4, x5, [%x[r]]\n\t" + "stp x6, x7, [%x[r], #16]\n\t" + "stp x8, x9, [%x[r], #32]\n\t" + "stp x10, x11, [%x[r], #48]\n\t" + "stp x12, x13, [%x[r], #64]\n\t" + "stp x14, x15, [%x[r], #80]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } @@ -502,96 +501,96 @@ void fe_mul(fe r, const fe a, const fe b) /* Multiply */ "ldp x14, x15, [%x[a]]\n\t" "ldp x16, x17, [%x[a], #16]\n\t" - "ldp x18, x19, [%x[b]]\n\t" - "ldp x20, x21, [%x[b], #16]\n\t" + "ldp x19, x20, [%x[b]]\n\t" + "ldp x21, x22, [%x[b], #16]\n\t" /* A[0] * B[0] */ - "mul x6, x14, x18\n\t" - "umulh x7, x14, x18\n\t" + "mul x6, x14, x19\n\t" + "umulh x7, x14, x19\n\t" /* A[0] * B[1] */ - "mul x3, x14, x19\n\t" - "umulh x8, x14, x19\n\t" + "mul x3, x14, x20\n\t" + "umulh x8, x14, x20\n\t" "adds x7, x7, x3\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[0] */ - "mul x3, x15, x18\n\t" - "umulh x4, x15, x18\n\t" + "mul x3, x15, x19\n\t" + "umulh x4, x15, x19\n\t" "adds x7, x7, x3\n\t" "adcs x8, x8, x4\n\t" "adc x9, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x3, x14, x20\n\t" - "umulh x4, x14, x20\n\t" + "mul x3, x14, x21\n\t" + "umulh x4, x14, x21\n\t" "adds x8, x8, x3\n\t" "adc x9, x9, x4\n\t" /* A[1] * B[1] */ - "mul x3, x15, x19\n\t" - "umulh x4, x15, x19\n\t" + "mul x3, x15, x20\n\t" + "umulh x4, x15, x20\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x3, x16, x18\n\t" - "umulh x4, x16, x18\n\t" + "mul x3, x16, x19\n\t" + 
"umulh x4, x16, x19\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" "adc x10, x10, xzr\n\t" /* A[0] * B[3] */ - "mul x3, x14, x21\n\t" - "umulh x4, x14, x21\n\t" + "mul x3, x14, x22\n\t" + "umulh x4, x14, x22\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x3, x15, x20\n\t" - "umulh x4, x15, x20\n\t" + "mul x3, x15, x21\n\t" + "umulh x4, x15, x21\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, x11, xzr\n\t" /* A[2] * B[1] */ - "mul x3, x16, x19\n\t" - "umulh x4, x16, x19\n\t" + "mul x3, x16, x20\n\t" + "umulh x4, x16, x20\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[0] */ - "mul x3, x17, x18\n\t" - "umulh x4, x17, x18\n\t" + "mul x3, x17, x19\n\t" + "umulh x4, x17, x19\n\t" "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, x11, xzr\n\t" /* A[1] * B[3] */ - "mul x3, x15, x21\n\t" - "umulh x4, x15, x21\n\t" + "mul x3, x15, x22\n\t" + "umulh x4, x15, x22\n\t" "adds x10, x10, x3\n\t" "adcs x11, x11, x4\n\t" "adc x12, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x3, x16, x20\n\t" - "umulh x4, x16, x20\n\t" + "mul x3, x16, x21\n\t" + "umulh x4, x16, x21\n\t" "adds x10, x10, x3\n\t" "adcs x11, x11, x4\n\t" "adc x12, x12, xzr\n\t" /* A[3] * B[1] */ - "mul x3, x17, x19\n\t" - "umulh x4, x17, x19\n\t" + "mul x3, x17, x20\n\t" + "umulh x4, x17, x20\n\t" "adds x10, x10, x3\n\t" "adcs x11, x11, x4\n\t" "adc x12, x12, xzr\n\t" /* A[2] * B[3] */ - "mul x3, x16, x21\n\t" - "umulh x4, x16, x21\n\t" + "mul x3, x16, x22\n\t" + "umulh x4, x16, x22\n\t" "adds x11, x11, x3\n\t" "adcs x12, x12, x4\n\t" "adc x13, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x3, x17, x20\n\t" - "umulh x4, x17, x20\n\t" + "mul x3, x17, x21\n\t" + "umulh x4, x17, x21\n\t" "adds x11, x11, x3\n\t" "adcs x12, x12, x4\n\t" "adc x13, x13, xzr\n\t" /* A[3] * B[3] */ - "mul x3, x17, x21\n\t" - "umulh x4, x17, x21\n\t" + "mul x3, x17, x22\n\t" + "umulh x4, x17, x22\n\t" "adds x12, x12, x3\n\t" "adc x13, x13, x4\n\t" /* Reduce */ @@ -642,7 +641,7 @@ void fe_mul(fe r, const fe a, const fe b) "ldp x29, x30, [sp], #16\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" ); } @@ -902,36 +901,37 @@ int curve25519(byte* r, byte* n, byte* a) __asm__ __volatile__ ( "stp x29, x30, [sp, #-192]!\n\t" "add x29, sp, #0\n\t" - "mov x22, xzr\n\t" + "mov x23, xzr\n\t" "str %x[r], [x29, #176]\n\t" - /* Set one */ - "mov x23, #1\n\t" - "stp x23, xzr, [%x[r]]\n\t" - "stp xzr, xzr, [%x[r], #16]\n\t" - /* Set zero */ - "stp xzr, xzr, [x29, #16]\n\t" - "stp xzr, xzr, [x29, #32]\n\t" - /* Set one */ - "mov x23, #1\n\t" - "stp x23, xzr, [x29, #48]\n\t" - "stp xzr, xzr, [x29, #64]\n\t" + "str %x[a], [x29, #184]\n\t" /* Copy */ "ldp x6, x7, [%x[a]]\n\t" "ldp x8, x9, [%x[a], #16]\n\t" "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" + /* Set one */ + "mov %x[a], #1\n\t" + "stp %x[a], xzr, [%x[r]]\n\t" + "stp xzr, xzr, [%x[r], #16]\n\t" + /* Set zero */ + "stp xzr, xzr, [x29, #16]\n\t" + "stp xzr, xzr, [x29, #32]\n\t" + /* Set one */ + "mov %x[a], #1\n\t" + "stp %x[a], xzr, [x29, #48]\n\t" + "stp xzr, xzr, [x29, #64]\n\t" "mov x25, #62\n\t" "mov x24, #24\n\t" "\n" "L_curve25519_words_%=: \n\t" "\n" "L_curve25519_bits_%=: \n\t" - "ldr x23, [%x[n], x24]\n\t" - "lsr x23, x23, x25\n\t" - "and 
x23, x23, #1\n\t" - "eor x22, x22, x23\n\t" + "ldr %x[a], [%x[n], x24]\n\t" + "lsr %x[a], %x[a], x25\n\t" + "and %x[a], %x[a], #1\n\t" + "eor x23, x23, %x[a]\n\t" /* Conditional Swap */ - "cmp x22, #1\n\t" + "cmp x23, #1\n\t" "ldp x10, x11, [%x[r]]\n\t" "ldp x12, x13, [%x[r], #16]\n\t" "ldp x6, x7, [x29, #80]\n\t" @@ -945,66 +945,66 @@ int curve25519(byte* r, byte* n, byte* a) "csel x17, x13, x9, eq\n\t" "csel x13, x9, x13, eq\n\t" /* Conditional Swap */ - "cmp x22, #1\n\t" - "ldp x18, x19, [x29, #16]\n\t" - "ldp x20, x21, [x29, #32]\n\t" + "cmp x23, #1\n\t" + "ldp x19, x20, [x29, #16]\n\t" + "ldp x21, x22, [x29, #32]\n\t" "ldp x6, x7, [x29, #48]\n\t" "ldp x8, x9, [x29, #64]\n\t" - "csel x5, x18, x6, eq\n\t" - "csel x18, x6, x18, eq\n\t" - "csel x26, x19, x7, eq\n\t" - "csel x19, x7, x19, eq\n\t" - "csel x27, x20, x8, eq\n\t" - "csel x20, x8, x20, eq\n\t" - "csel x28, x21, x9, eq\n\t" - "csel x21, x9, x21, eq\n\t" - "mov x22, x23\n\t" + "csel x5, x19, x6, eq\n\t" + "csel x19, x6, x19, eq\n\t" + "csel x26, x20, x7, eq\n\t" + "csel x20, x7, x20, eq\n\t" + "csel x27, x21, x8, eq\n\t" + "csel x21, x8, x21, eq\n\t" + "csel x28, x22, x9, eq\n\t" + "csel x22, x9, x22, eq\n\t" + "mov x23, %x[a]\n\t" /* Add */ - "adds x6, x10, x18\n\t" - "adcs x7, x11, x19\n\t" - "adcs x8, x12, x20\n\t" - "adc x9, x13, x21\n\t" + "adds x6, x10, x19\n\t" + "adcs x7, x11, x20\n\t" + "adcs x8, x12, x21\n\t" + "adc x9, x13, x22\n\t" "mov x3, #-19\n\t" - "asr x23, x9, #63\n\t" + "asr %x[a], x9, #63\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ "subs x6, x6, x3\n\t" - "sbcs x7, x7, x23\n\t" - "sbcs x8, x8, x23\n\t" + "sbcs x7, x7, %x[a]\n\t" + "sbcs x8, x8, %x[a]\n\t" "sbc x9, x9, x4\n\t" /* Sub */ - "subs x18, x10, x18\n\t" - "sbcs x19, x11, x19\n\t" - "sbcs x20, x12, x20\n\t" - "sbcs x21, x13, x21\n\t" + "subs x19, x10, x19\n\t" + "sbcs x20, x11, x20\n\t" + "sbcs x21, x12, x21\n\t" + "sbcs x22, x13, x22\n\t" "mov x3, #-19\n\t" - "csetm x23, cc\n\t" + "csetm %x[a], cc\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x18, x18, x3\n\t" - "adcs x19, x19, x23\n\t" - "adcs x20, x20, x23\n\t" - "adc x21, x21, x4\n\t" - "stp x18, x19, [x29, #144]\n\t" - "stp x20, x21, [x29, #160]\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, %x[a]\n\t" + "adcs x21, x21, %x[a]\n\t" + "adc x22, x22, x4\n\t" + "stp x19, x20, [x29, #144]\n\t" + "stp x21, x22, [x29, #160]\n\t" /* Add */ "adds x10, x14, x5\n\t" "adcs x11, x15, x26\n\t" "adcs x12, x16, x27\n\t" "adc x13, x17, x28\n\t" "mov x3, #-19\n\t" - "asr x23, x13, #63\n\t" + "asr %x[a], x13, #63\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ "subs x10, x10, x3\n\t" - "sbcs x11, x11, x23\n\t" - "sbcs x12, x12, x23\n\t" + "sbcs x11, x11, %x[a]\n\t" + "sbcs x12, x12, %x[a]\n\t" "sbc x13, x13, x4\n\t" /* Sub */ "subs x14, x14, x5\n\t" @@ -1012,87 +1012,87 @@ int curve25519(byte* r, byte* n, byte* a) "sbcs x16, x16, x27\n\t" "sbcs x17, x17, x28\n\t" "mov x3, #-19\n\t" - "csetm x23, cc\n\t" + "csetm %x[a], cc\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], 
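(Editorial sketch, not part of the patch.) The "Conditional Swap" blocks implement the standard Montgomery-ladder bookkeeping: each iteration extracts one scalar bit, XORs it into a running swap flag, and swaps the two working points with csel so the executed instruction stream is identical for every key. A C sketch of the same idea, with a hypothetical name (ct_cswap is not a wolfSSL function):

    #include <stdint.h>

    /* Swap a and b when bit == 1, using a mask instead of a branch. */
    static void ct_cswap(uint64_t a[4], uint64_t b[4], uint64_t bit)
    {
        uint64_t mask = (uint64_t)0 - bit;  /* all ones iff bit == 1 */
        for (int i = 0; i < 4; i++) {
            uint64_t t = mask & (a[i] ^ b[i]);
            a[i] ^= t;
            b[i] ^= t;
        }
    }

    /* Ladder-step usage (sketch; the assembly reads the scalar 64 bits at
     * a time rather than per byte):
     *   bit   = (n[t >> 3] >> (t & 7)) & 1;
     *   swap ^= bit;
     *   ct_cswap(x2, x3, swap);
     *   ct_cswap(z2, z3, swap);
     *   swap  = bit;
     */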
#0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ "adds x14, x14, x3\n\t" - "adcs x15, x15, x23\n\t" - "adcs x16, x16, x23\n\t" + "adcs x15, x15, %x[a]\n\t" + "adcs x16, x16, %x[a]\n\t" "adc x17, x17, x4\n\t" /* Multiply */ /* A[0] * B[0] */ - "mul x18, x14, x6\n\t" - "umulh x19, x14, x6\n\t" + "mul x19, x14, x6\n\t" + "umulh x20, x14, x6\n\t" /* A[0] * B[1] */ "mul x3, x14, x7\n\t" - "umulh x20, x14, x7\n\t" - "adds x19, x19, x3\n\t" - "adc x20, x20, xzr\n\t" + "umulh x21, x14, x7\n\t" + "adds x20, x20, x3\n\t" + "adc x21, x21, xzr\n\t" /* A[1] * B[0] */ "mul x3, x15, x6\n\t" "umulh x4, x15, x6\n\t" - "adds x19, x19, x3\n\t" - "adcs x20, x20, x4\n\t" - "adc x21, xzr, xzr\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, xzr, xzr\n\t" /* A[0] * B[2] */ "mul x3, x14, x8\n\t" "umulh x4, x14, x8\n\t" - "adds x20, x20, x3\n\t" - "adc x21, x21, x4\n\t" + "adds x21, x21, x3\n\t" + "adc x22, x22, x4\n\t" /* A[1] * B[1] */ "mul x3, x15, x7\n\t" "umulh x4, x15, x7\n\t" - "adds x20, x20, x3\n\t" - "adcs x21, x21, x4\n\t" - "adc x23, xzr, xzr\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" + "adc %x[a], xzr, xzr\n\t" /* A[2] * B[0] */ "mul x3, x16, x6\n\t" "umulh x4, x16, x6\n\t" - "adds x20, x20, x3\n\t" - "adcs x21, x21, x4\n\t" - "adc x23, x23, xzr\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" + "adc %x[a], %x[a], xzr\n\t" /* A[0] * B[3] */ "mul x3, x14, x9\n\t" "umulh x4, x14, x9\n\t" - "adds x21, x21, x3\n\t" - "adcs x23, x23, x4\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, xzr, xzr\n\t" /* A[1] * B[2] */ "mul x3, x15, x8\n\t" "umulh x4, x15, x8\n\t" - "adds x21, x21, x3\n\t" - "adcs x23, x23, x4\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[2] * B[1] */ "mul x3, x16, x7\n\t" "umulh x4, x16, x7\n\t" - "adds x21, x21, x3\n\t" - "adcs x23, x23, x4\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[3] * B[0] */ "mul x3, x17, x6\n\t" "umulh x4, x17, x6\n\t" - "adds x21, x21, x3\n\t" - "adcs x23, x23, x4\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[1] * B[3] */ "mul x3, x15, x9\n\t" "umulh x4, x15, x9\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, xzr, xzr\n\t" /* A[2] * B[2] */ "mul x3, x16, x8\n\t" "umulh x4, x16, x8\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[3] * B[1] */ "mul x3, x17, x7\n\t" "umulh x4, x17, x7\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[2] * B[3] */ @@ -1116,103 +1116,103 @@ int curve25519(byte* r, byte* n, byte* a) /* Move top half into t4-t7 and remove top bit from t3 */ "extr x28, x28, x27, #63\n\t" "extr x27, x27, x26, #63\n\t" - "extr x26, x26, x23, #63\n\t" - "extr x23, x23, x21, #63\n\t" - "and x21, x21, #0x7fffffffffffffff\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x22, #63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x23\n\t" - "umulh x23, x3, x23\n\t" - "adds x18, x18, x4\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x19, x19, x4\n\t" "mul x4, x3, x26\n\t" "umulh x26, x3, x26\n\t" - "adcs x19, x19, x4\n\t" + "adcs x20, x20, x4\n\t" "mul x4, x3, x27\n\t" "umulh x27, x3, x27\n\t" - "adcs x20, x20, x4\n\t" + "adcs x21, x21, x4\n\t" "mul x4, x3, x28\n\t" "umulh x5, x3, x28\n\t" - "adcs 
x21, x21, x4\n\t" + "adcs x22, x22, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x19, x19, x23\n\t" - "adcs x20, x20, x26\n\t" - "adcs x21, x21, x27\n\t" + "adds x20, x20, %x[a]\n\t" + "adcs x21, x21, x26\n\t" + "adcs x22, x22, x27\n\t" "adc x5, x5, xzr\n\t" /* Overflow */ - "extr x5, x5, x21, #63\n\t" + "extr x5, x5, x22, #63\n\t" "mul x5, x5, x3\n\t" - "and x21, x21, #0x7fffffffffffffff\n\t" - "adds x18, x18, x5\n\t" - "adcs x19, x19, xzr\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" "adcs x20, x20, xzr\n\t" - "adc x21, x21, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" /* Reduce if top bit set */ - "and x5, x3, x21, asr 63\n\t" - "and x21, x21, #0x7fffffffffffffff\n\t" - "adds x18, x18, x5\n\t" - "adcs x19, x19, xzr\n\t" + "and x5, x3, x22, asr 63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" "adcs x20, x20, xzr\n\t" - "adc x21, x21, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" /* Store */ - "stp x18, x19, [x29, #112]\n\t" - "stp x20, x21, [x29, #128]\n\t" + "stp x19, x20, [x29, #112]\n\t" + "stp x21, x22, [x29, #128]\n\t" /* Multiply */ - "ldp x23, x26, [x29, #144]\n\t" + "ldp %x[a], x26, [x29, #144]\n\t" "ldp x27, x28, [x29, #160]\n\t" /* A[0] * B[0] */ - "mul x18, x10, x23\n\t" - "umulh x19, x10, x23\n\t" + "mul x19, x10, %x[a]\n\t" + "umulh x20, x10, %x[a]\n\t" /* A[0] * B[1] */ "mul x3, x10, x26\n\t" - "umulh x20, x10, x26\n\t" - "adds x19, x19, x3\n\t" - "adc x20, x20, xzr\n\t" + "umulh x21, x10, x26\n\t" + "adds x20, x20, x3\n\t" + "adc x21, x21, xzr\n\t" /* A[1] * B[0] */ - "mul x3, x11, x23\n\t" - "umulh x4, x11, x23\n\t" - "adds x19, x19, x3\n\t" - "adcs x20, x20, x4\n\t" - "adc x21, xzr, xzr\n\t" + "mul x3, x11, %x[a]\n\t" + "umulh x4, x11, %x[a]\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, xzr, xzr\n\t" /* A[0] * B[2] */ "mul x3, x10, x27\n\t" "umulh x4, x10, x27\n\t" - "adds x20, x20, x3\n\t" - "adc x21, x21, x4\n\t" + "adds x21, x21, x3\n\t" + "adc x22, x22, x4\n\t" /* A[1] * B[1] */ "mul x3, x11, x26\n\t" "umulh x4, x11, x26\n\t" - "adds x20, x20, x3\n\t" - "adcs x21, x21, x4\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" "adc x14, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x3, x12, x23\n\t" - "umulh x4, x12, x23\n\t" - "adds x20, x20, x3\n\t" - "adcs x21, x21, x4\n\t" + "mul x3, x12, %x[a]\n\t" + "umulh x4, x12, %x[a]\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" "adc x14, x14, xzr\n\t" /* A[0] * B[3] */ "mul x3, x10, x28\n\t" "umulh x4, x10, x28\n\t" - "adds x21, x21, x3\n\t" + "adds x22, x22, x3\n\t" "adcs x14, x14, x4\n\t" "adc x15, xzr, xzr\n\t" /* A[1] * B[2] */ "mul x3, x11, x27\n\t" "umulh x4, x11, x27\n\t" - "adds x21, x21, x3\n\t" + "adds x22, x22, x3\n\t" "adcs x14, x14, x4\n\t" "adc x15, x15, xzr\n\t" /* A[2] * B[1] */ "mul x3, x12, x26\n\t" "umulh x4, x12, x26\n\t" - "adds x21, x21, x3\n\t" + "adds x22, x22, x3\n\t" "adcs x14, x14, x4\n\t" "adc x15, x15, xzr\n\t" /* A[3] * B[0] */ - "mul x3, x13, x23\n\t" - "umulh x4, x13, x23\n\t" - "adds x21, x21, x3\n\t" + "mul x3, x13, %x[a]\n\t" + "umulh x4, x13, %x[a]\n\t" + "adds x22, x22, x3\n\t" "adcs x14, x14, x4\n\t" "adc x15, x15, xzr\n\t" /* A[1] * B[3] */ @@ -1255,56 +1255,56 @@ int curve25519(byte* r, byte* n, byte* a) "extr x17, x17, x16, #63\n\t" "extr x16, x16, x15, #63\n\t" "extr x15, x15, x14, #63\n\t" - "extr x14, x14, x21, #63\n\t" - "and x21, x21, #0x7fffffffffffffff\n\t" + "extr x14, x14, x22, #63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" 
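(Editorial sketch, not part of the patch.) Every "Multiply top half by 19" / "Overflow" / "Reduce if top bit set" sequence is the same fold: with p = 2^255 - 19 we have 2^255 = 19 (mod p), so the 512-bit intermediate is split at bit 255, 19 times the high part is added to the low part, and the small overflow this can itself create is folded once more. A C version of the fold, assuming unsigned __int128 (fold_p25519 is a hypothetical name; the result is congruent mod p and fits in four limbs, with the assembly's final conditional fold left to the caller):

    #include <stdint.h>

    static void fold_p25519(uint64_t r[4], const uint64_t t[8])
    {
        unsigned __int128 acc;
        uint64_t hi[4], of, c = 0;

        /* hi = t >> 255; as in the assembly, the very top bit of t[7] is
         * taken to be clear for field-sized inputs. */
        hi[0] = (t[4] << 1) | (t[3] >> 63);
        hi[1] = (t[5] << 1) | (t[4] >> 63);
        hi[2] = (t[6] << 1) | (t[5] >> 63);
        hi[3] = (t[7] << 1) | (t[6] >> 63);

        r[0] = t[0];
        r[1] = t[1];
        r[2] = t[2];
        r[3] = t[3] & 0x7fffffffffffffffULL;

        /* r += 19 * hi */
        for (int i = 0; i < 4; i++) {
            acc  = (unsigned __int128)19 * hi[i] + r[i] + c;
            r[i] = (uint64_t)acc;
            c    = (uint64_t)(acc >> 64);
        }

        /* fold the few bits above 2^255 back in once more */
        of    = (c << 1) | (r[3] >> 63);
        r[3] &= 0x7fffffffffffffffULL;
        acc   = (unsigned __int128)19 * of + r[0];
        r[0]  = (uint64_t)acc;
        c     = (uint64_t)(acc >> 64);
        for (int i = 1; i < 4; i++) {
            acc  = (unsigned __int128)r[i] + c;
            r[i] = (uint64_t)acc;
            c    = (uint64_t)(acc >> 64);
        }
    }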
/* Multiply top half by 19 */ "mov x3, #19\n\t" "mul x4, x3, x14\n\t" "umulh x14, x3, x14\n\t" - "adds x18, x18, x4\n\t" + "adds x19, x19, x4\n\t" "mul x4, x3, x15\n\t" "umulh x15, x3, x15\n\t" - "adcs x19, x19, x4\n\t" + "adcs x20, x20, x4\n\t" "mul x4, x3, x16\n\t" "umulh x16, x3, x16\n\t" - "adcs x20, x20, x4\n\t" + "adcs x21, x21, x4\n\t" "mul x4, x3, x17\n\t" "umulh x5, x3, x17\n\t" - "adcs x21, x21, x4\n\t" + "adcs x22, x22, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x19, x19, x14\n\t" - "adcs x20, x20, x15\n\t" - "adcs x21, x21, x16\n\t" + "adds x20, x20, x14\n\t" + "adcs x21, x21, x15\n\t" + "adcs x22, x22, x16\n\t" "adc x5, x5, xzr\n\t" /* Overflow */ - "extr x5, x5, x21, #63\n\t" + "extr x5, x5, x22, #63\n\t" "mul x5, x5, x3\n\t" - "and x21, x21, #0x7fffffffffffffff\n\t" - "adds x18, x18, x5\n\t" - "adcs x19, x19, xzr\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" "adcs x20, x20, xzr\n\t" - "adc x21, x21, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" /* Reduce if top bit set */ - "and x5, x3, x21, asr 63\n\t" - "and x21, x21, #0x7fffffffffffffff\n\t" - "adds x18, x18, x5\n\t" - "adcs x19, x19, xzr\n\t" + "and x5, x3, x22, asr 63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" "adcs x20, x20, xzr\n\t" - "adc x21, x21, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" /* Store */ /* Square */ /* A[0] * A[1] */ - "mul x11, x23, x26\n\t" - "umulh x12, x23, x26\n\t" + "mul x11, %x[a], x26\n\t" + "umulh x12, %x[a], x26\n\t" /* A[0] * A[2] */ - "mul x3, x23, x27\n\t" - "umulh x13, x23, x27\n\t" + "mul x3, %x[a], x27\n\t" + "umulh x13, %x[a], x27\n\t" "adds x12, x12, x3\n\t" "adc x13, x13, xzr\n\t" /* A[0] * A[3] */ - "mul x3, x23, x28\n\t" - "umulh x14, x23, x28\n\t" + "mul x3, %x[a], x28\n\t" + "umulh x14, %x[a], x28\n\t" "adds x13, x13, x3\n\t" "adc x14, x14, xzr\n\t" /* A[1] * A[2] */ @@ -1332,8 +1332,8 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x16, x16, x16\n\t" "adc x17, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x10, x23, x23\n\t" - "umulh x5, x23, x23\n\t" + "mul x10, %x[a], %x[a]\n\t" + "umulh x5, %x[a], %x[a]\n\t" /* A[1] * A[1] */ "mul x3, x26, x26\n\t" "umulh x4, x26, x26\n\t" @@ -1406,19 +1406,19 @@ int curve25519(byte* r, byte* n, byte* a) "adc x17, x17, xzr\n\t" /* A[0] * A[3] */ "mul x3, x6, x9\n\t" - "umulh x23, x6, x9\n\t" + "umulh %x[a], x6, x9\n\t" "adds x17, x17, x3\n\t" - "adc x23, x23, xzr\n\t" + "adc %x[a], %x[a], xzr\n\t" /* A[1] * A[2] */ "mul x3, x7, x8\n\t" "umulh x4, x7, x8\n\t" "adds x17, x17, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, xzr, xzr\n\t" /* A[1] * A[3] */ "mul x3, x7, x9\n\t" "umulh x4, x7, x9\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adc x26, x26, x4\n\t" /* A[2] * A[3] */ "mul x3, x8, x9\n\t" @@ -1429,7 +1429,7 @@ int curve25519(byte* r, byte* n, byte* a) "adds x15, x15, x15\n\t" "adcs x16, x16, x16\n\t" "adcs x17, x17, x17\n\t" - "adcs x23, x23, x23\n\t" + "adcs %x[a], %x[a], %x[a]\n\t" "adcs x26, x26, x26\n\t" "adcs x27, x27, x27\n\t" "adc x28, xzr, xzr\n\t" @@ -1446,7 +1446,7 @@ int curve25519(byte* r, byte* n, byte* a) "mul x3, x8, x8\n\t" "umulh x4, x8, x8\n\t" "adds x17, x17, x5\n\t" - "adcs x23, x23, x3\n\t" + "adcs %x[a], %x[a], x3\n\t" "adc x5, x4, xzr\n\t" /* A[3] * A[3] */ "mul x3, x9, x9\n\t" @@ -1458,13 +1458,13 @@ int curve25519(byte* r, byte* n, byte* a) /* Move top half into t4-t7 and remove top bit from t3 */ "extr x28, x28, x27, #63\n\t" "extr x27, x27, x26, 
#63\n\t" - "extr x26, x26, x23, #63\n\t" - "extr x23, x23, x17, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x17, #63\n\t" "and x17, x17, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x23\n\t" - "umulh x23, x3, x23\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" "adds x14, x14, x4\n\t" "mul x4, x3, x26\n\t" "umulh x26, x3, x26\n\t" @@ -1477,7 +1477,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x17, x17, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x15, x15, x23\n\t" + "adds x15, x15, %x[a]\n\t" "adcs x16, x16, x26\n\t" "adcs x17, x17, x27\n\t" "adc x5, x5, xzr\n\t" @@ -1522,53 +1522,53 @@ int curve25519(byte* r, byte* n, byte* a) "umulh x4, x15, x11\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x23, xzr, xzr\n\t" + "adc %x[a], xzr, xzr\n\t" /* A[2] * B[0] */ "mul x3, x16, x10\n\t" "umulh x4, x16, x10\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x23, x23, xzr\n\t" + "adc %x[a], %x[a], xzr\n\t" /* A[0] * B[3] */ "mul x3, x14, x13\n\t" "umulh x4, x14, x13\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, xzr, xzr\n\t" /* A[1] * B[2] */ "mul x3, x15, x12\n\t" "umulh x4, x15, x12\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[2] * B[1] */ "mul x3, x16, x11\n\t" "umulh x4, x16, x11\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[3] * B[0] */ "mul x3, x17, x10\n\t" "umulh x4, x17, x10\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[1] * B[3] */ "mul x3, x15, x13\n\t" "umulh x4, x15, x13\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, xzr, xzr\n\t" /* A[2] * B[2] */ "mul x3, x16, x12\n\t" "umulh x4, x16, x12\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[3] * B[1] */ "mul x3, x17, x11\n\t" "umulh x4, x17, x11\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[2] * B[3] */ @@ -1592,13 +1592,13 @@ int curve25519(byte* r, byte* n, byte* a) /* Move top half into t4-t7 and remove top bit from t3 */ "extr x28, x28, x27, #63\n\t" "extr x27, x27, x26, #63\n\t" - "extr x26, x26, x23, #63\n\t" - "extr x23, x23, x9, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x23\n\t" - "umulh x23, x3, x23\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" "adds x6, x6, x4\n\t" "mul x4, x3, x26\n\t" "umulh x26, x3, x26\n\t" @@ -1611,7 +1611,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x9, x9, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x7, x7, x23\n\t" + "adds x7, x7, %x[a]\n\t" "adcs x8, x8, x26\n\t" "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" @@ -1639,14 +1639,14 @@ int curve25519(byte* r, byte* n, byte* a) "sbcs x16, x16, x12\n\t" "sbcs x17, x17, x13\n\t" "mov x3, #-19\n\t" - "csetm x23, cc\n\t" + "csetm %x[a], cc\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ "adds x14, x14, x3\n\t" - "adcs x15, x15, x23\n\t" - "adcs x16, x16, x23\n\t" + "adcs x15, x15, 
%x[a]\n\t" + "adcs x16, x16, %x[a]\n\t" "adc x17, x17, x4\n\t" /* Multiply by 121666 */ "mov x5, #0xdb42\n\t" @@ -1679,14 +1679,14 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x12, x12, x8\n\t" "adc x13, x13, x9\n\t" "mov x3, #-19\n\t" - "asr x23, x13, #63\n\t" + "asr %x[a], x13, #63\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ "subs x10, x10, x3\n\t" - "sbcs x11, x11, x23\n\t" - "sbcs x12, x12, x23\n\t" + "sbcs x11, x11, %x[a]\n\t" + "sbcs x12, x12, %x[a]\n\t" "sbc x13, x13, x4\n\t" /* Multiply */ /* A[0] * B[0] */ @@ -1713,53 +1713,53 @@ int curve25519(byte* r, byte* n, byte* a) "umulh x4, x15, x11\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x23, xzr, xzr\n\t" + "adc %x[a], xzr, xzr\n\t" /* A[2] * B[0] */ "mul x3, x16, x10\n\t" "umulh x4, x16, x10\n\t" "adds x8, x8, x3\n\t" "adcs x9, x9, x4\n\t" - "adc x23, x23, xzr\n\t" + "adc %x[a], %x[a], xzr\n\t" /* A[0] * B[3] */ "mul x3, x14, x13\n\t" "umulh x4, x14, x13\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, xzr, xzr\n\t" /* A[1] * B[2] */ "mul x3, x15, x12\n\t" "umulh x4, x15, x12\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[2] * B[1] */ "mul x3, x16, x11\n\t" "umulh x4, x16, x11\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[3] * B[0] */ "mul x3, x17, x10\n\t" "umulh x4, x17, x10\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[1] * B[3] */ "mul x3, x15, x13\n\t" "umulh x4, x15, x13\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, xzr, xzr\n\t" /* A[2] * B[2] */ "mul x3, x16, x12\n\t" "umulh x4, x16, x12\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[3] * B[1] */ "mul x3, x17, x11\n\t" "umulh x4, x17, x11\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[2] * B[3] */ @@ -1783,13 +1783,13 @@ int curve25519(byte* r, byte* n, byte* a) /* Move top half into t4-t7 and remove top bit from t3 */ "extr x28, x28, x27, #63\n\t" "extr x27, x27, x26, #63\n\t" - "extr x26, x26, x23, #63\n\t" - "extr x23, x23, x9, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x23\n\t" - "umulh x23, x3, x23\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" "adds x6, x6, x4\n\t" "mul x4, x3, x26\n\t" "umulh x26, x3, x26\n\t" @@ -1802,7 +1802,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x9, x9, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x7, x7, x23\n\t" + "adds x7, x7, %x[a]\n\t" "adcs x8, x8, x26\n\t" "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" @@ -1827,35 +1827,35 @@ int curve25519(byte* r, byte* n, byte* a) /* Add */ "ldp x6, x7, [x29, #112]\n\t" "ldp x8, x9, [x29, #128]\n\t" - "adds x10, x6, x18\n\t" - "adcs x11, x7, x19\n\t" - "adcs x12, x8, x20\n\t" - "adc x13, x9, x21\n\t" + "adds x10, x6, x19\n\t" + "adcs x11, x7, x20\n\t" + "adcs x12, x8, x21\n\t" + "adc x13, x9, x22\n\t" "mov x3, #-19\n\t" - "asr x23, x13, #63\n\t" + "asr %x[a], x13, #63\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, 
x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ "subs x10, x10, x3\n\t" - "sbcs x11, x11, x23\n\t" - "sbcs x12, x12, x23\n\t" + "sbcs x11, x11, %x[a]\n\t" + "sbcs x12, x12, %x[a]\n\t" "sbc x13, x13, x4\n\t" /* Sub */ - "subs x18, x6, x18\n\t" - "sbcs x19, x7, x19\n\t" - "sbcs x20, x8, x20\n\t" - "sbcs x21, x9, x21\n\t" + "subs x19, x6, x19\n\t" + "sbcs x20, x7, x20\n\t" + "sbcs x21, x8, x21\n\t" + "sbcs x22, x9, x22\n\t" "mov x3, #-19\n\t" - "csetm x23, cc\n\t" + "csetm %x[a], cc\n\t" /* Mask the modulus */ - "and x3, x23, x3\n\t" - "and x4, x23, #0x7fffffffffffffff\n\t" + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x18, x18, x3\n\t" - "adcs x19, x19, x23\n\t" - "adcs x20, x20, x23\n\t" - "adc x21, x21, x4\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, %x[a]\n\t" + "adcs x21, x21, %x[a]\n\t" + "adc x22, x22, x4\n\t" /* Square */ /* A[0] * A[1] */ "mul x7, x10, x11\n\t" @@ -1867,19 +1867,19 @@ int curve25519(byte* r, byte* n, byte* a) "adc x9, x9, xzr\n\t" /* A[0] * A[3] */ "mul x3, x10, x13\n\t" - "umulh x23, x10, x13\n\t" + "umulh %x[a], x10, x13\n\t" "adds x9, x9, x3\n\t" - "adc x23, x23, xzr\n\t" + "adc %x[a], %x[a], xzr\n\t" /* A[1] * A[2] */ "mul x3, x11, x12\n\t" "umulh x4, x11, x12\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, xzr, xzr\n\t" /* A[1] * A[3] */ "mul x3, x11, x13\n\t" "umulh x4, x11, x13\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adc x26, x26, x4\n\t" /* A[2] * A[3] */ "mul x3, x12, x13\n\t" @@ -1890,7 +1890,7 @@ int curve25519(byte* r, byte* n, byte* a) "adds x7, x7, x7\n\t" "adcs x8, x8, x8\n\t" "adcs x9, x9, x9\n\t" - "adcs x23, x23, x23\n\t" + "adcs %x[a], %x[a], %x[a]\n\t" "adcs x26, x26, x26\n\t" "adcs x27, x27, x27\n\t" "adc x28, xzr, xzr\n\t" @@ -1907,7 +1907,7 @@ int curve25519(byte* r, byte* n, byte* a) "mul x3, x12, x12\n\t" "umulh x4, x12, x12\n\t" "adds x9, x9, x5\n\t" - "adcs x23, x23, x3\n\t" + "adcs %x[a], %x[a], x3\n\t" "adc x5, x4, xzr\n\t" /* A[3] * A[3] */ "mul x3, x13, x13\n\t" @@ -1919,13 +1919,13 @@ int curve25519(byte* r, byte* n, byte* a) /* Move top half into t4-t7 and remove top bit from t3 */ "extr x28, x28, x27, #63\n\t" "extr x27, x27, x26, #63\n\t" - "extr x26, x26, x23, #63\n\t" - "extr x23, x23, x9, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x23\n\t" - "umulh x23, x3, x23\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" "adds x6, x6, x4\n\t" "mul x4, x3, x26\n\t" "umulh x26, x3, x26\n\t" @@ -1938,7 +1938,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x9, x9, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x7, x7, x23\n\t" + "adds x7, x7, %x[a]\n\t" "adcs x8, x8, x26\n\t" "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" @@ -1962,60 +1962,60 @@ int curve25519(byte* r, byte* n, byte* a) "stp x8, x9, [x29, #96]\n\t" /* Square */ /* A[0] * A[1] */ - "mul x7, x18, x19\n\t" - "umulh x8, x18, x19\n\t" + "mul x7, x19, x20\n\t" + "umulh x8, x19, x20\n\t" /* A[0] * A[2] */ - "mul x3, x18, x20\n\t" - "umulh x9, x18, x20\n\t" + "mul x3, x19, x21\n\t" + "umulh x9, x19, x21\n\t" "adds x8, x8, x3\n\t" "adc x9, x9, xzr\n\t" /* A[0] * A[3] */ - "mul x3, x18, x21\n\t" - "umulh x23, x18, x21\n\t" + "mul x3, x19, x22\n\t" + "umulh %x[a], x19, x22\n\t" "adds x9, x9, 
x3\n\t" - "adc x23, x23, xzr\n\t" + "adc %x[a], %x[a], xzr\n\t" /* A[1] * A[2] */ - "mul x3, x19, x20\n\t" - "umulh x4, x19, x20\n\t" + "mul x3, x20, x21\n\t" + "umulh x4, x20, x21\n\t" "adds x9, x9, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, xzr, xzr\n\t" /* A[1] * A[3] */ - "mul x3, x19, x21\n\t" - "umulh x4, x19, x21\n\t" - "adds x23, x23, x3\n\t" + "mul x3, x20, x22\n\t" + "umulh x4, x20, x22\n\t" + "adds %x[a], %x[a], x3\n\t" "adc x26, x26, x4\n\t" /* A[2] * A[3] */ - "mul x3, x20, x21\n\t" - "umulh x27, x20, x21\n\t" + "mul x3, x21, x22\n\t" + "umulh x27, x21, x22\n\t" "adds x26, x26, x3\n\t" "adc x27, x27, xzr\n\t" /* Double */ "adds x7, x7, x7\n\t" "adcs x8, x8, x8\n\t" "adcs x9, x9, x9\n\t" - "adcs x23, x23, x23\n\t" + "adcs %x[a], %x[a], %x[a]\n\t" "adcs x26, x26, x26\n\t" "adcs x27, x27, x27\n\t" "adc x28, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x6, x18, x18\n\t" - "umulh x5, x18, x18\n\t" + "mul x6, x19, x19\n\t" + "umulh x5, x19, x19\n\t" /* A[1] * A[1] */ - "mul x3, x19, x19\n\t" - "umulh x4, x19, x19\n\t" + "mul x3, x20, x20\n\t" + "umulh x4, x20, x20\n\t" "adds x7, x7, x5\n\t" "adcs x8, x8, x3\n\t" "adc x5, x4, xzr\n\t" /* A[2] * A[2] */ - "mul x3, x20, x20\n\t" - "umulh x4, x20, x20\n\t" - "adds x9, x9, x5\n\t" - "adcs x23, x23, x3\n\t" - "adc x5, x4, xzr\n\t" - /* A[3] * A[3] */ "mul x3, x21, x21\n\t" "umulh x4, x21, x21\n\t" + "adds x9, x9, x5\n\t" + "adcs %x[a], %x[a], x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x22, x22\n\t" + "umulh x4, x22, x22\n\t" "adds x26, x26, x5\n\t" "adcs x27, x27, x3\n\t" "adc x28, x28, x4\n\t" @@ -2023,13 +2023,13 @@ int curve25519(byte* r, byte* n, byte* a) /* Move top half into t4-t7 and remove top bit from t3 */ "extr x28, x28, x27, #63\n\t" "extr x27, x27, x26, #63\n\t" - "extr x26, x26, x23, #63\n\t" - "extr x23, x23, x9, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x23\n\t" - "umulh x23, x3, x23\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" "adds x6, x6, x4\n\t" "mul x4, x3, x26\n\t" "umulh x26, x3, x26\n\t" @@ -2042,7 +2042,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x9, x9, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x7, x7, x23\n\t" + "adds x7, x7, %x[a]\n\t" "adcs x8, x8, x26\n\t" "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" @@ -2062,6 +2062,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Store */ + "ldr %x[a], [x29, #184]\n\t" /* Multiply */ "ldp x14, x15, [%x[a]]\n\t" "ldp x16, x17, [%x[a], #16]\n\t" @@ -2089,53 +2090,53 @@ int curve25519(byte* r, byte* n, byte* a) "umulh x4, x15, x7\n\t" "adds x12, x12, x3\n\t" "adcs x13, x13, x4\n\t" - "adc x23, xzr, xzr\n\t" + "adc %x[a], xzr, xzr\n\t" /* A[2] * B[0] */ "mul x3, x16, x6\n\t" "umulh x4, x16, x6\n\t" "adds x12, x12, x3\n\t" "adcs x13, x13, x4\n\t" - "adc x23, x23, xzr\n\t" + "adc %x[a], %x[a], xzr\n\t" /* A[0] * B[3] */ "mul x3, x14, x9\n\t" "umulh x4, x14, x9\n\t" "adds x13, x13, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, xzr, xzr\n\t" /* A[1] * B[2] */ "mul x3, x15, x8\n\t" "umulh x4, x15, x8\n\t" "adds x13, x13, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[2] * B[1] */ "mul x3, x16, x7\n\t" "umulh x4, x16, x7\n\t" "adds x13, x13, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" 
/* A[3] * B[0] */ "mul x3, x17, x6\n\t" "umulh x4, x17, x6\n\t" "adds x13, x13, x3\n\t" - "adcs x23, x23, x4\n\t" + "adcs %x[a], %x[a], x4\n\t" "adc x26, x26, xzr\n\t" /* A[1] * B[3] */ "mul x3, x15, x9\n\t" "umulh x4, x15, x9\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, xzr, xzr\n\t" /* A[2] * B[2] */ "mul x3, x16, x8\n\t" "umulh x4, x16, x8\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[3] * B[1] */ "mul x3, x17, x7\n\t" "umulh x4, x17, x7\n\t" - "adds x23, x23, x3\n\t" + "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" /* A[2] * B[3] */ @@ -2159,13 +2160,13 @@ int curve25519(byte* r, byte* n, byte* a) /* Move top half into t4-t7 and remove top bit from t3 */ "extr x28, x28, x27, #63\n\t" "extr x27, x27, x26, #63\n\t" - "extr x26, x26, x23, #63\n\t" - "extr x23, x23, x13, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x13, #63\n\t" "and x13, x13, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x23\n\t" - "umulh x23, x3, x23\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" "adds x10, x10, x4\n\t" "mul x4, x3, x26\n\t" "umulh x26, x3, x26\n\t" @@ -2178,7 +2179,7 @@ int curve25519(byte* r, byte* n, byte* a) "adcs x13, x13, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x11, x11, x23\n\t" + "adds x11, x11, %x[a]\n\t" "adcs x12, x12, x26\n\t" "adcs x13, x13, x27\n\t" "adc x5, x5, xzr\n\t" @@ -2357,98 +2358,98 @@ int curve25519(byte* r, byte* n, byte* a) "umulh x4, x7, x11\n\t" "adds x16, x16, x3\n\t" "adcs x17, x17, x4\n\t" - "adc x18, xzr, xzr\n\t" + "adc x19, xzr, xzr\n\t" /* A[2] * B[0] */ "mul x3, x8, x10\n\t" "umulh x4, x8, x10\n\t" "adds x16, x16, x3\n\t" "adcs x17, x17, x4\n\t" - "adc x18, x18, xzr\n\t" + "adc x19, x19, xzr\n\t" /* A[0] * B[3] */ "mul x3, x6, x13\n\t" "umulh x4, x6, x13\n\t" "adds x17, x17, x3\n\t" - "adcs x18, x18, x4\n\t" - "adc x19, xzr, xzr\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, xzr, xzr\n\t" /* A[1] * B[2] */ "mul x3, x7, x12\n\t" "umulh x4, x7, x12\n\t" "adds x17, x17, x3\n\t" - "adcs x18, x18, x4\n\t" - "adc x19, x19, xzr\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" /* A[2] * B[1] */ "mul x3, x8, x11\n\t" "umulh x4, x8, x11\n\t" "adds x17, x17, x3\n\t" - "adcs x18, x18, x4\n\t" - "adc x19, x19, xzr\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[0] */ "mul x3, x9, x10\n\t" "umulh x4, x9, x10\n\t" "adds x17, x17, x3\n\t" - "adcs x18, x18, x4\n\t" - "adc x19, x19, xzr\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" /* A[1] * B[3] */ "mul x3, x7, x13\n\t" "umulh x4, x7, x13\n\t" - "adds x18, x18, x3\n\t" - "adcs x19, x19, x4\n\t" - "adc x20, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x8, x12\n\t" - "umulh x4, x8, x12\n\t" - "adds x18, x18, x3\n\t" - "adcs x19, x19, x4\n\t" - "adc x20, x20, xzr\n\t" - /* A[3] * B[1] */ - "mul x3, x9, x11\n\t" - "umulh x4, x9, x11\n\t" - "adds x18, x18, x3\n\t" - "adcs x19, x19, x4\n\t" - "adc x20, x20, xzr\n\t" - /* A[2] * B[3] */ - "mul x3, x8, x13\n\t" - "umulh x4, x8, x13\n\t" "adds x19, x19, x3\n\t" "adcs x20, x20, x4\n\t" "adc x21, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x3, x9, x12\n\t" - "umulh x4, x9, x12\n\t" + /* A[2] * B[2] */ + "mul x3, x8, x12\n\t" + "umulh x4, x8, x12\n\t" "adds x19, x19, x3\n\t" "adcs x20, x20, x4\n\t" "adc x21, x21, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x9, x11\n\t" + "umulh x4, x9, x11\n\t" + "adds x19, x19, 
x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x8, x13\n\t" + "umulh x4, x8, x13\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x9, x12\n\t" + "umulh x4, x9, x12\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" /* A[3] * B[3] */ "mul x3, x9, x13\n\t" "umulh x4, x9, x13\n\t" - "adds x20, x20, x3\n\t" - "adc x21, x21, x4\n\t" + "adds x21, x21, x3\n\t" + "adc x22, x22, x4\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x22, x22, x21, #63\n\t" "extr x21, x21, x20, #63\n\t" "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x19, x19, x17, #63\n\t" "and x17, x17, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ "mov x3, #19\n\t" - "mul x4, x3, x18\n\t" - "umulh x18, x3, x18\n\t" - "adds x14, x14, x4\n\t" "mul x4, x3, x19\n\t" "umulh x19, x3, x19\n\t" - "adcs x15, x15, x4\n\t" + "adds x14, x14, x4\n\t" "mul x4, x3, x20\n\t" "umulh x20, x3, x20\n\t" - "adcs x16, x16, x4\n\t" + "adcs x15, x15, x4\n\t" "mul x4, x3, x21\n\t" - "umulh x5, x3, x21\n\t" + "umulh x21, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "mul x4, x3, x22\n\t" + "umulh x5, x3, x22\n\t" "adcs x17, x17, x4\n\t" "adc x5, x5, xzr\n\t" /* Add remaining product results in */ - "adds x15, x15, x18\n\t" - "adcs x16, x16, x19\n\t" - "adcs x17, x17, x20\n\t" + "adds x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adcs x17, x17, x21\n\t" "adc x5, x5, xzr\n\t" /* Overflow */ "extr x5, x5, x17, #63\n\t" @@ -2472,7 +2473,7 @@ int curve25519(byte* r, byte* n, byte* a) "ldp x29, x30, [sp], #0xc0\n\t" : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); return (uint32_t)(size_t)r; } @@ -2626,97 +2627,97 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "ldp x11, x12, [x1]\n\t" "ldp x13, x14, [x1, #16]\n\t" "ldp x15, x16, [x2]\n\t" - "ldp x17, x18, [x2, #16]\n\t" + "ldp x17, x19, [x2, #16]\n\t" /* A[0] * B[0] */ "mul x3, x11, x15\n\t" "umulh x4, x11, x15\n\t" /* A[0] * B[1] */ - "mul x19, x11, x16\n\t" + "mul x20, x11, x16\n\t" "umulh x5, x11, x16\n\t" - "adds x4, x4, x19\n\t" + "adds x4, x4, x20\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x19, x12, x15\n\t" - "umulh x20, x12, x15\n\t" - "adds x4, x4, x19\n\t" - "adcs x5, x5, x20\n\t" + "mul x20, x12, x15\n\t" + "umulh x21, x12, x15\n\t" + "adds x4, x4, x20\n\t" + "adcs x5, x5, x21\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x19, x11, x17\n\t" - "umulh x20, x11, x17\n\t" - "adds x5, x5, x19\n\t" - "adc x6, x6, x20\n\t" + "mul x20, x11, x17\n\t" + "umulh x21, x11, x17\n\t" + "adds x5, x5, x20\n\t" + "adc x6, x6, x21\n\t" /* A[1] * B[1] */ - "mul x19, x12, x16\n\t" - "umulh x20, x12, x16\n\t" - "adds x5, x5, x19\n\t" - "adcs x6, x6, x20\n\t" + "mul x20, x12, x16\n\t" + "umulh x21, x12, x16\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x19, x13, x15\n\t" - "umulh x20, x13, x15\n\t" - "adds x5, x5, x19\n\t" - "adcs x6, x6, x20\n\t" + "mul x20, x13, x15\n\t" + "umulh x21, x13, x15\n\t" + "adds x5, x5, x20\n\t" 
+ "adcs x6, x6, x21\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x19, x11, x18\n\t" - "umulh x20, x11, x18\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x11, x19\n\t" + "umulh x21, x11, x19\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x19, x12, x17\n\t" - "umulh x20, x12, x17\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x12, x17\n\t" + "umulh x21, x12, x17\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x19, x13, x16\n\t" - "umulh x20, x13, x16\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x13, x16\n\t" + "umulh x21, x13, x16\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x19, x14, x15\n\t" - "umulh x20, x14, x15\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x14, x15\n\t" + "umulh x21, x14, x15\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x19, x12, x18\n\t" - "umulh x20, x12, x18\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x12, x19\n\t" + "umulh x21, x12, x19\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x19, x13, x17\n\t" - "umulh x20, x13, x17\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x13, x17\n\t" + "umulh x21, x13, x17\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x19, x14, x16\n\t" - "umulh x20, x14, x16\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x14, x16\n\t" + "umulh x21, x14, x16\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x19, x13, x18\n\t" - "umulh x20, x13, x18\n\t" - "adds x8, x8, x19\n\t" - "adcs x9, x9, x20\n\t" + "mul x20, x13, x19\n\t" + "umulh x21, x13, x19\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x19, x14, x17\n\t" - "umulh x20, x14, x17\n\t" - "adds x8, x8, x19\n\t" - "adcs x9, x9, x20\n\t" + "mul x20, x14, x17\n\t" + "umulh x21, x14, x17\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x19, x14, x18\n\t" - "umulh x20, x14, x18\n\t" - "adds x9, x9, x19\n\t" - "adc x10, x10, x20\n\t" + "mul x20, x14, x19\n\t" + "umulh x21, x14, x19\n\t" + "adds x9, x9, x20\n\t" + "adc x10, x10, x21\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -2725,37 +2726,37 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x19, #19\n\t" - "mul x20, x19, x7\n\t" - "umulh x7, x19, x7\n\t" - "adds x3, x3, x20\n\t" - "mul x20, x19, x8\n\t" - "umulh x8, x19, x8\n\t" - "adcs x4, x4, x20\n\t" - "mul x20, x19, x9\n\t" - "umulh x9, x19, x9\n\t" - "adcs x5, x5, x20\n\t" - "mul x20, x19, x10\n\t" - "umulh x21, x19, x10\n\t" - "adcs x6, x6, x20\n\t" - "adc x21, x21, xzr\n\t" + "mov x20, #19\n\t" + "mul x21, x20, x7\n\t" + "umulh x7, x20, x7\n\t" + "adds x3, x3, x21\n\t" + "mul x21, x20, x8\n\t" + "umulh x8, x20, x8\n\t" + "adcs x4, x4, x21\n\t" + "mul x21, x20, x9\n\t" + "umulh x9, x20, x9\n\t" + "adcs x5, x5, x21\n\t" + "mul x21, x20, x10\n\t" + "umulh x22, x20, x10\n\t" + "adcs x6, x6, x21\n\t" + "adc x22, x22, xzr\n\t" /* Add remaining product results 
in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" /* Overflow */ - "extr x21, x21, x6, #63\n\t" - "mul x21, x21, x19\n\t" + "extr x22, x22, x6, #63\n\t" + "mul x22, x22, x20\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x21\n\t" + "adds x3, x3, x22\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "and x21, x19, x6, asr 63\n\t" + "and x22, x20, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x21\n\t" + "adds x3, x3, x22\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -2769,97 +2770,97 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "ldp x11, x12, [x1]\n\t" "ldp x13, x14, [x1, #16]\n\t" "ldp x15, x16, [x2]\n\t" - "ldp x17, x18, [x2, #16]\n\t" + "ldp x17, x19, [x2, #16]\n\t" /* A[0] * B[0] */ "mul x3, x11, x15\n\t" "umulh x4, x11, x15\n\t" /* A[0] * B[1] */ - "mul x19, x11, x16\n\t" + "mul x20, x11, x16\n\t" "umulh x5, x11, x16\n\t" - "adds x4, x4, x19\n\t" + "adds x4, x4, x20\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x19, x12, x15\n\t" - "umulh x20, x12, x15\n\t" - "adds x4, x4, x19\n\t" - "adcs x5, x5, x20\n\t" + "mul x20, x12, x15\n\t" + "umulh x21, x12, x15\n\t" + "adds x4, x4, x20\n\t" + "adcs x5, x5, x21\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x19, x11, x17\n\t" - "umulh x20, x11, x17\n\t" - "adds x5, x5, x19\n\t" - "adc x6, x6, x20\n\t" + "mul x20, x11, x17\n\t" + "umulh x21, x11, x17\n\t" + "adds x5, x5, x20\n\t" + "adc x6, x6, x21\n\t" /* A[1] * B[1] */ - "mul x19, x12, x16\n\t" - "umulh x20, x12, x16\n\t" - "adds x5, x5, x19\n\t" - "adcs x6, x6, x20\n\t" + "mul x20, x12, x16\n\t" + "umulh x21, x12, x16\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x19, x13, x15\n\t" - "umulh x20, x13, x15\n\t" - "adds x5, x5, x19\n\t" - "adcs x6, x6, x20\n\t" + "mul x20, x13, x15\n\t" + "umulh x21, x13, x15\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x19, x11, x18\n\t" - "umulh x20, x11, x18\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x11, x19\n\t" + "umulh x21, x11, x19\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x19, x12, x17\n\t" - "umulh x20, x12, x17\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x12, x17\n\t" + "umulh x21, x12, x17\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x19, x13, x16\n\t" - "umulh x20, x13, x16\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x13, x16\n\t" + "umulh x21, x13, x16\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x19, x14, x15\n\t" - "umulh x20, x14, x15\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x14, x15\n\t" + "umulh x21, x14, x15\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x19, x12, x18\n\t" - "umulh x20, x12, x18\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x12, x19\n\t" + "umulh x21, x12, x19\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x19, x13, x17\n\t" - "umulh x20, x13, x17\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x13, x17\n\t" + "umulh x21, x13, x17\n\t" + "adds x7, x7, x20\n\t" 
+ "adcs x8, x8, x21\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x19, x14, x16\n\t" - "umulh x20, x14, x16\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x14, x16\n\t" + "umulh x21, x14, x16\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x19, x13, x18\n\t" - "umulh x20, x13, x18\n\t" - "adds x8, x8, x19\n\t" - "adcs x9, x9, x20\n\t" + "mul x20, x13, x19\n\t" + "umulh x21, x13, x19\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x19, x14, x17\n\t" - "umulh x20, x14, x17\n\t" - "adds x8, x8, x19\n\t" - "adcs x9, x9, x20\n\t" + "mul x20, x14, x17\n\t" + "umulh x21, x14, x17\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x19, x14, x18\n\t" - "umulh x20, x14, x18\n\t" - "adds x9, x9, x19\n\t" - "adc x10, x10, x20\n\t" + "mul x20, x14, x19\n\t" + "umulh x21, x14, x19\n\t" + "adds x9, x9, x20\n\t" + "adc x10, x10, x21\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -2868,37 +2869,37 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x19, #19\n\t" - "mul x20, x19, x7\n\t" - "umulh x7, x19, x7\n\t" - "adds x3, x3, x20\n\t" - "mul x20, x19, x8\n\t" - "umulh x8, x19, x8\n\t" - "adcs x4, x4, x20\n\t" - "mul x20, x19, x9\n\t" - "umulh x9, x19, x9\n\t" - "adcs x5, x5, x20\n\t" - "mul x20, x19, x10\n\t" - "umulh x21, x19, x10\n\t" - "adcs x6, x6, x20\n\t" - "adc x21, x21, xzr\n\t" + "mov x20, #19\n\t" + "mul x21, x20, x7\n\t" + "umulh x7, x20, x7\n\t" + "adds x3, x3, x21\n\t" + "mul x21, x20, x8\n\t" + "umulh x8, x20, x8\n\t" + "adcs x4, x4, x21\n\t" + "mul x21, x20, x9\n\t" + "umulh x9, x20, x9\n\t" + "adcs x5, x5, x21\n\t" + "mul x21, x20, x10\n\t" + "umulh x22, x20, x10\n\t" + "adcs x6, x6, x21\n\t" + "adc x22, x22, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" /* Overflow */ - "extr x21, x21, x6, #63\n\t" - "mul x21, x21, x19\n\t" + "extr x22, x22, x6, #63\n\t" + "mul x22, x22, x20\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x21\n\t" + "adds x3, x3, x22\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "and x21, x19, x6, asr 63\n\t" + "and x22, x20, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x21\n\t" + "adds x3, x3, x22\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -2914,92 +2915,92 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "mul x3, x15, x11\n\t" "umulh x4, x15, x11\n\t" /* A[0] * B[1] */ - "mul x19, x15, x12\n\t" + "mul x20, x15, x12\n\t" "umulh x5, x15, x12\n\t" - "adds x4, x4, x19\n\t" + "adds x4, x4, x20\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x19, x16, x11\n\t" - "umulh x20, x16, x11\n\t" - "adds x4, x4, x19\n\t" - "adcs x5, x5, x20\n\t" + "mul x20, x16, x11\n\t" + "umulh x21, x16, x11\n\t" + "adds x4, x4, x20\n\t" + "adcs x5, x5, x21\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x19, x15, x13\n\t" - "umulh x20, x15, x13\n\t" - "adds x5, x5, x19\n\t" - "adc x6, x6, x20\n\t" + "mul x20, x15, x13\n\t" + "umulh x21, x15, x13\n\t" + "adds x5, x5, x20\n\t" + "adc x6, x6, x21\n\t" /* A[1] * B[1] */ - "mul x19, x16, 
x12\n\t" - "umulh x20, x16, x12\n\t" - "adds x5, x5, x19\n\t" - "adcs x6, x6, x20\n\t" + "mul x20, x16, x12\n\t" + "umulh x21, x16, x12\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x19, x17, x11\n\t" - "umulh x20, x17, x11\n\t" - "adds x5, x5, x19\n\t" - "adcs x6, x6, x20\n\t" + "mul x20, x17, x11\n\t" + "umulh x21, x17, x11\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x19, x15, x14\n\t" - "umulh x20, x15, x14\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x15, x14\n\t" + "umulh x21, x15, x14\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x19, x16, x13\n\t" - "umulh x20, x16, x13\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x16, x13\n\t" + "umulh x21, x16, x13\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x19, x17, x12\n\t" - "umulh x20, x17, x12\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x17, x12\n\t" + "umulh x21, x17, x12\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x19, x18, x11\n\t" - "umulh x20, x18, x11\n\t" - "adds x6, x6, x19\n\t" - "adcs x7, x7, x20\n\t" + "mul x20, x19, x11\n\t" + "umulh x21, x19, x11\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x19, x16, x14\n\t" - "umulh x20, x16, x14\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x16, x14\n\t" + "umulh x21, x16, x14\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x19, x17, x13\n\t" - "umulh x20, x17, x13\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x17, x13\n\t" + "umulh x21, x17, x13\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x19, x18, x12\n\t" - "umulh x20, x18, x12\n\t" - "adds x7, x7, x19\n\t" - "adcs x8, x8, x20\n\t" + "mul x20, x19, x12\n\t" + "umulh x21, x19, x12\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x19, x17, x14\n\t" - "umulh x20, x17, x14\n\t" - "adds x8, x8, x19\n\t" - "adcs x9, x9, x20\n\t" + "mul x20, x17, x14\n\t" + "umulh x21, x17, x14\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x19, x18, x13\n\t" - "umulh x20, x18, x13\n\t" - "adds x8, x8, x19\n\t" - "adcs x9, x9, x20\n\t" + "mul x20, x19, x13\n\t" + "umulh x21, x19, x13\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x19, x18, x14\n\t" - "umulh x20, x18, x14\n\t" - "adds x9, x9, x19\n\t" - "adc x10, x10, x20\n\t" + "mul x20, x19, x14\n\t" + "umulh x21, x19, x14\n\t" + "adds x9, x9, x20\n\t" + "adc x10, x10, x21\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3008,37 +3009,37 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x19, #19\n\t" - "mul x20, x19, x7\n\t" - "umulh x7, x19, x7\n\t" - "adds x3, x3, x20\n\t" - "mul x20, x19, x8\n\t" - "umulh x8, x19, x8\n\t" - "adcs x4, x4, x20\n\t" - "mul x20, x19, x9\n\t" - "umulh x9, x19, x9\n\t" - "adcs x5, x5, x20\n\t" - "mul x20, x19, x10\n\t" - "umulh x21, x19, x10\n\t" - "adcs x6, x6, 
x20\n\t" - "adc x21, x21, xzr\n\t" + "mov x20, #19\n\t" + "mul x21, x20, x7\n\t" + "umulh x7, x20, x7\n\t" + "adds x3, x3, x21\n\t" + "mul x21, x20, x8\n\t" + "umulh x8, x20, x8\n\t" + "adcs x4, x4, x21\n\t" + "mul x21, x20, x9\n\t" + "umulh x9, x20, x9\n\t" + "adcs x5, x5, x21\n\t" + "mul x21, x20, x10\n\t" + "umulh x22, x20, x10\n\t" + "adcs x6, x6, x21\n\t" + "adc x22, x22, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" /* Overflow */ - "extr x21, x21, x6, #63\n\t" - "mul x21, x21, x19\n\t" + "extr x22, x22, x6, #63\n\t" + "mul x22, x22, x20\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x21\n\t" + "adds x3, x3, x22\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "and x21, x19, x6, asr 63\n\t" + "and x22, x20, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x21\n\t" + "adds x3, x3, x22\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3048,7 +3049,7 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "ldp x29, x30, [sp], #0x40\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21" + : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" ); } @@ -3070,97 +3071,97 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "ldp x11, x12, [x1]\n\t" "ldp x13, x14, [x1, #16]\n\t" "ldp x15, x16, [x2]\n\t" - "ldp x17, x18, [x2, #16]\n\t" + "ldp x17, x19, [x2, #16]\n\t" /* A[0] * B[0] */ "mul x3, x11, x15\n\t" "umulh x4, x11, x15\n\t" /* A[0] * B[1] */ - "mul x23, x11, x16\n\t" + "mul x24, x11, x16\n\t" "umulh x5, x11, x16\n\t" - "adds x4, x4, x23\n\t" + "adds x4, x4, x24\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x23, x12, x15\n\t" - "umulh x24, x12, x15\n\t" - "adds x4, x4, x23\n\t" - "adcs x5, x5, x24\n\t" + "mul x24, x12, x15\n\t" + "umulh x25, x12, x15\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x23, x11, x17\n\t" - "umulh x24, x11, x17\n\t" - "adds x5, x5, x23\n\t" - "adc x6, x6, x24\n\t" + "mul x24, x11, x17\n\t" + "umulh x25, x11, x17\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" /* A[1] * B[1] */ - "mul x23, x12, x16\n\t" - "umulh x24, x12, x16\n\t" - "adds x5, x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x12, x16\n\t" + "umulh x25, x12, x16\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x23, x13, x15\n\t" - "umulh x24, x13, x15\n\t" - "adds x5, x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x13, x15\n\t" + "umulh x25, x13, x15\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x23, x11, x18\n\t" - "umulh x24, x11, x18\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x11, x19\n\t" + "umulh x25, x11, x19\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x23, x12, x17\n\t" - "umulh x24, x12, x17\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x12, x17\n\t" + "umulh x25, x12, x17\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x23, x13, 
x16\n\t" - "umulh x24, x13, x16\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x13, x16\n\t" + "umulh x25, x13, x16\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x23, x14, x15\n\t" - "umulh x24, x14, x15\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x14, x15\n\t" + "umulh x25, x14, x15\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x23, x12, x18\n\t" - "umulh x24, x12, x18\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x12, x19\n\t" + "umulh x25, x12, x19\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x23, x13, x17\n\t" - "umulh x24, x13, x17\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x13, x17\n\t" + "umulh x25, x13, x17\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x23, x14, x16\n\t" - "umulh x24, x14, x16\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x14, x16\n\t" + "umulh x25, x14, x16\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x23, x13, x18\n\t" - "umulh x24, x13, x18\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x13, x19\n\t" + "umulh x25, x13, x19\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x23, x14, x17\n\t" - "umulh x24, x14, x17\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x14, x17\n\t" + "umulh x25, x14, x17\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x23, x14, x18\n\t" - "umulh x24, x14, x18\n\t" - "adds x9, x9, x23\n\t" - "adc x10, x10, x24\n\t" + "mul x24, x14, x19\n\t" + "umulh x25, x14, x19\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3169,37 +3170,37 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x23, #19\n\t" - "mul x24, x23, x7\n\t" - "umulh x7, x23, x7\n\t" - "adds x3, x3, x24\n\t" - "mul x24, x23, x8\n\t" - "umulh x8, x23, x8\n\t" - "adcs x4, x4, x24\n\t" - "mul x24, x23, x9\n\t" - "umulh x9, x23, x9\n\t" - "adcs x5, x5, x24\n\t" - "mul x24, x23, x10\n\t" - "umulh x25, x23, x10\n\t" - "adcs x6, x6, x24\n\t" - "adc x25, x25, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x25, x25, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x25, x25, x6, #63\n\t" - "mul x25, x25, x23\n\t" + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "and x25, x23, x6, asr 63\n\t" + "and x26, x24, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, 
x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3209,98 +3210,98 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "ldr x0, [x29, #32]\n\t" "ldr x2, [x29, #48]\n\t" /* Multiply */ - "ldp x19, x20, [x2]\n\t" - "ldp x21, x22, [x2, #16]\n\t" + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x3, x11, x19\n\t" - "umulh x4, x11, x19\n\t" + "mul x3, x11, x20\n\t" + "umulh x4, x11, x20\n\t" /* A[0] * B[1] */ - "mul x23, x11, x20\n\t" - "umulh x5, x11, x20\n\t" - "adds x4, x4, x23\n\t" + "mul x24, x11, x21\n\t" + "umulh x5, x11, x21\n\t" + "adds x4, x4, x24\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x23, x12, x19\n\t" - "umulh x24, x12, x19\n\t" - "adds x4, x4, x23\n\t" - "adcs x5, x5, x24\n\t" + "mul x24, x12, x20\n\t" + "umulh x25, x12, x20\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x23, x11, x21\n\t" - "umulh x24, x11, x21\n\t" - "adds x5, x5, x23\n\t" - "adc x6, x6, x24\n\t" + "mul x24, x11, x22\n\t" + "umulh x25, x11, x22\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" /* A[1] * B[1] */ - "mul x23, x12, x20\n\t" - "umulh x24, x12, x20\n\t" - "adds x5, x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x12, x21\n\t" + "umulh x25, x12, x21\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x23, x13, x19\n\t" - "umulh x24, x13, x19\n\t" - "adds x5, x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x13, x20\n\t" + "umulh x25, x13, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x23, x11, x22\n\t" - "umulh x24, x11, x22\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x11, x23\n\t" + "umulh x25, x11, x23\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x23, x12, x21\n\t" - "umulh x24, x12, x21\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x12, x22\n\t" + "umulh x25, x12, x22\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x23, x13, x20\n\t" - "umulh x24, x13, x20\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x13, x21\n\t" + "umulh x25, x13, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x23, x14, x19\n\t" - "umulh x24, x14, x19\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x14, x20\n\t" + "umulh x25, x14, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x23, x12, x22\n\t" - "umulh x24, x12, x22\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x12, x23\n\t" + "umulh x25, x12, x23\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x23, x13, x21\n\t" - "umulh x24, x13, x21\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x13, x22\n\t" + "umulh x25, x13, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x23, x14, x20\n\t" - "umulh x24, x14, x20\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x14, x21\n\t" + "umulh x25, x14, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x23, x13, x22\n\t" - "umulh x24, x13, x22\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x13, x23\n\t" + "umulh 
x25, x13, x23\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x23, x14, x21\n\t" - "umulh x24, x14, x21\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x14, x22\n\t" + "umulh x25, x14, x22\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x23, x14, x22\n\t" - "umulh x24, x14, x22\n\t" - "adds x9, x9, x23\n\t" - "adc x10, x10, x24\n\t" + "mul x24, x14, x23\n\t" + "umulh x25, x14, x23\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3309,37 +3310,37 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x23, #19\n\t" - "mul x24, x23, x7\n\t" - "umulh x7, x23, x7\n\t" - "adds x3, x3, x24\n\t" - "mul x24, x23, x8\n\t" - "umulh x8, x23, x8\n\t" - "adcs x4, x4, x24\n\t" - "mul x24, x23, x9\n\t" - "umulh x9, x23, x9\n\t" - "adcs x5, x5, x24\n\t" - "mul x24, x23, x10\n\t" - "umulh x25, x23, x10\n\t" - "adcs x6, x6, x24\n\t" - "adc x25, x25, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x25, x25, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x25, x25, x6, #63\n\t" - "mul x25, x25, x23\n\t" + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "and x25, x23, x6, asr 63\n\t" + "and x26, x24, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3352,95 +3353,95 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "ldp x11, x12, [x2]\n\t" "ldp x13, x14, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x3, x19, x11\n\t" - "umulh x4, x19, x11\n\t" + "mul x3, x20, x11\n\t" + "umulh x4, x20, x11\n\t" /* A[0] * B[1] */ - "mul x23, x19, x12\n\t" - "umulh x5, x19, x12\n\t" - "adds x4, x4, x23\n\t" + "mul x24, x20, x12\n\t" + "umulh x5, x20, x12\n\t" + "adds x4, x4, x24\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x23, x20, x11\n\t" - "umulh x24, x20, x11\n\t" - "adds x4, x4, x23\n\t" - "adcs x5, x5, x24\n\t" + "mul x24, x21, x11\n\t" + "umulh x25, x21, x11\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x23, x19, x13\n\t" - "umulh x24, x19, x13\n\t" - "adds x5, x5, x23\n\t" - "adc x6, x6, x24\n\t" + "mul x24, x20, x13\n\t" + "umulh x25, x20, x13\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" /* A[1] * B[1] */ - "mul x23, x20, x12\n\t" - "umulh x24, x20, x12\n\t" - "adds x5, x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x21, x12\n\t" + "umulh x25, x21, x12\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x23, x21, x11\n\t" - "umulh x24, x21, x11\n\t" - "adds x5, 
x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x22, x11\n\t" + "umulh x25, x22, x11\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x23, x19, x14\n\t" - "umulh x24, x19, x14\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x20, x14\n\t" + "umulh x25, x20, x14\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x23, x20, x13\n\t" - "umulh x24, x20, x13\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x21, x13\n\t" + "umulh x25, x21, x13\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x23, x21, x12\n\t" - "umulh x24, x21, x12\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x22, x12\n\t" + "umulh x25, x22, x12\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x23, x22, x11\n\t" - "umulh x24, x22, x11\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x23, x11\n\t" + "umulh x25, x23, x11\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x23, x20, x14\n\t" - "umulh x24, x20, x14\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x21, x14\n\t" + "umulh x25, x21, x14\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x23, x21, x13\n\t" - "umulh x24, x21, x13\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x22, x13\n\t" + "umulh x25, x22, x13\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[1] */ - "mul x23, x22, x12\n\t" - "umulh x24, x22, x12\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x23, x12\n\t" + "umulh x25, x23, x12\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x23, x21, x14\n\t" - "umulh x24, x21, x14\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x22, x14\n\t" + "umulh x25, x22, x14\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x23, x22, x13\n\t" - "umulh x24, x22, x13\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x23, x13\n\t" + "umulh x25, x23, x13\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x23, x22, x14\n\t" - "umulh x24, x22, x14\n\t" - "adds x9, x9, x23\n\t" - "adc x10, x10, x24\n\t" + "mul x24, x23, x14\n\t" + "umulh x25, x23, x14\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3449,37 +3450,37 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x23, #19\n\t" - "mul x24, x23, x7\n\t" - "umulh x7, x23, x7\n\t" - "adds x3, x3, x24\n\t" - "mul x24, x23, x8\n\t" - "umulh x8, x23, x8\n\t" - "adcs x4, x4, x24\n\t" - "mul x24, x23, x9\n\t" - "umulh x9, x23, x9\n\t" - "adcs x5, x5, x24\n\t" - "mul x24, x23, x10\n\t" - "umulh x25, x23, x10\n\t" - "adcs x6, x6, x24\n\t" - "adc x25, x25, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, 
x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x25, x25, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x25, x25, x6, #63\n\t" - "mul x25, x25, x23\n\t" + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "and x25, x23, x6, asr 63\n\t" + "and x26, x24, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3492,92 +3493,92 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "mul x3, x11, x15\n\t" "umulh x4, x11, x15\n\t" /* A[0] * B[1] */ - "mul x23, x11, x16\n\t" + "mul x24, x11, x16\n\t" "umulh x5, x11, x16\n\t" - "adds x4, x4, x23\n\t" + "adds x4, x4, x24\n\t" "adc x5, x5, xzr\n\t" /* A[1] * B[0] */ - "mul x23, x12, x15\n\t" - "umulh x24, x12, x15\n\t" - "adds x4, x4, x23\n\t" - "adcs x5, x5, x24\n\t" + "mul x24, x12, x15\n\t" + "umulh x25, x12, x15\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" "adc x6, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x23, x11, x17\n\t" - "umulh x24, x11, x17\n\t" - "adds x5, x5, x23\n\t" - "adc x6, x6, x24\n\t" + "mul x24, x11, x17\n\t" + "umulh x25, x11, x17\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" /* A[1] * B[1] */ - "mul x23, x12, x16\n\t" - "umulh x24, x12, x16\n\t" - "adds x5, x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x12, x16\n\t" + "umulh x25, x12, x16\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x23, x13, x15\n\t" - "umulh x24, x13, x15\n\t" - "adds x5, x5, x23\n\t" - "adcs x6, x6, x24\n\t" + "mul x24, x13, x15\n\t" + "umulh x25, x13, x15\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" "adc x7, x7, xzr\n\t" /* A[0] * B[3] */ - "mul x23, x11, x18\n\t" - "umulh x24, x11, x18\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x11, x19\n\t" + "umulh x25, x11, x19\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x23, x12, x17\n\t" - "umulh x24, x12, x17\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x12, x17\n\t" + "umulh x25, x12, x17\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[2] * B[1] */ - "mul x23, x13, x16\n\t" - "umulh x24, x13, x16\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x13, x16\n\t" + "umulh x25, x13, x16\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[3] * B[0] */ - "mul x23, x14, x15\n\t" - "umulh x24, x14, x15\n\t" - "adds x6, x6, x23\n\t" - "adcs x7, x7, x24\n\t" + "mul x24, x14, x15\n\t" + "umulh x25, x14, x15\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[1] * B[3] */ - "mul x23, x12, x18\n\t" - "umulh x24, x12, x18\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x12, x19\n\t" + "umulh x25, x12, x19\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x23, x13, x17\n\t" - "umulh x24, x13, x17\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x13, x17\n\t" + "umulh x25, x13, x17\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, 
xzr\n\t" /* A[3] * B[1] */ - "mul x23, x14, x16\n\t" - "umulh x24, x14, x16\n\t" - "adds x7, x7, x23\n\t" - "adcs x8, x8, x24\n\t" + "mul x24, x14, x16\n\t" + "umulh x25, x14, x16\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[3] */ - "mul x23, x13, x18\n\t" - "umulh x24, x13, x18\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x13, x19\n\t" + "umulh x25, x13, x19\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x23, x14, x17\n\t" - "umulh x24, x14, x17\n\t" - "adds x8, x8, x23\n\t" - "adcs x9, x9, x24\n\t" + "mul x24, x14, x17\n\t" + "umulh x25, x14, x17\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[3] */ - "mul x23, x14, x18\n\t" - "umulh x24, x14, x18\n\t" - "adds x9, x9, x23\n\t" - "adc x10, x10, x24\n\t" + "mul x24, x14, x19\n\t" + "umulh x25, x14, x19\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x10, x10, x9, #63\n\t" @@ -3586,37 +3587,37 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "extr x7, x7, x6, #63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x23, #19\n\t" - "mul x24, x23, x7\n\t" - "umulh x7, x23, x7\n\t" - "adds x3, x3, x24\n\t" - "mul x24, x23, x8\n\t" - "umulh x8, x23, x8\n\t" - "adcs x4, x4, x24\n\t" - "mul x24, x23, x9\n\t" - "umulh x9, x23, x9\n\t" - "adcs x5, x5, x24\n\t" - "mul x24, x23, x10\n\t" - "umulh x25, x23, x10\n\t" - "adcs x6, x6, x24\n\t" - "adc x25, x25, xzr\n\t" + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" /* Add remaining product results in */ "adds x4, x4, x7\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" - "adc x25, x25, xzr\n\t" + "adc x26, x26, xzr\n\t" /* Overflow */ - "extr x25, x25, x6, #63\n\t" - "mul x25, x25, x23\n\t" + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" /* Reduce if top bit set */ - "and x25, x23, x6, asr 63\n\t" + "and x26, x24, x6, asr 63\n\t" "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x25\n\t" + "adds x3, x3, x26\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "adc x6, x6, xzr\n\t" @@ -3626,7 +3627,7 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26" ); } @@ -3650,421 +3651,30 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "mul x5, x12, x13\n\t" "umulh x6, x12, x13\n\t" /* A[0] * A[2] */ - "mul x24, x12, x14\n\t" + "mul x25, x12, x14\n\t" "umulh x7, x12, x14\n\t" - "adds x6, x6, x24\n\t" + "adds x6, x6, x25\n\t" "adc x7, x7, xzr\n\t" /* 
A[0] * A[3] */ - "mul x24, x12, x15\n\t" + "mul x25, x12, x15\n\t" "umulh x8, x12, x15\n\t" - "adds x7, x7, x24\n\t" + "adds x7, x7, x25\n\t" "adc x8, x8, xzr\n\t" /* A[1] * A[2] */ - "mul x24, x13, x14\n\t" - "umulh x25, x13, x14\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x13, x14\n\t" + "umulh x26, x13, x14\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * A[3] */ - "mul x24, x13, x15\n\t" - "umulh x25, x13, x15\n\t" - "adds x8, x8, x24\n\t" - "adc x9, x9, x25\n\t" - /* A[2] * A[3] */ - "mul x24, x14, x15\n\t" - "umulh x10, x14, x15\n\t" - "adds x9, x9, x24\n\t" - "adc x10, x10, xzr\n\t" - /* Double */ - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adc x11, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x4, x12, x12\n\t" - "umulh x26, x12, x12\n\t" - /* A[1] * A[1] */ - "mul x24, x13, x13\n\t" - "umulh x25, x13, x13\n\t" - "adds x5, x5, x26\n\t" - "adcs x6, x6, x24\n\t" - "adc x26, x25, xzr\n\t" - /* A[2] * A[2] */ - "mul x24, x14, x14\n\t" - "umulh x25, x14, x14\n\t" - "adds x7, x7, x26\n\t" - "adcs x8, x8, x24\n\t" - "adc x26, x25, xzr\n\t" - /* A[3] * A[3] */ - "mul x24, x15, x15\n\t" - "umulh x25, x15, x15\n\t" - "adds x9, x9, x26\n\t" - "adcs x10, x10, x24\n\t" - "adc x11, x11, x25\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x10, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x11\n\t" - "umulh x26, x24, x11\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x26, x26, xzr\n\t" - /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldr x0, [x29, #32]\n\t" - "ldr x1, [x29, #56]\n\t" - /* Square */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" - /* A[0] * A[1] */ - "mul x9, x20, x21\n\t" - "umulh x10, x20, x21\n\t" - /* A[0] * A[2] */ - "mul x24, x20, x22\n\t" - "umulh x11, x20, x22\n\t" - "adds x10, x10, x24\n\t" - "adc x11, x11, xzr\n\t" - /* A[0] * A[3] */ - "mul x24, x20, x23\n\t" - "umulh x16, x20, x23\n\t" - "adds x11, x11, x24\n\t" - "adc x16, x16, xzr\n\t" - /* A[1] * A[2] */ - "mul x24, x21, x22\n\t" - "umulh x25, x21, x22\n\t" - "adds x11, x11, x24\n\t" - "adcs x16, x16, x25\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x24, x21, x23\n\t" - "umulh x25, x21, x23\n\t" - "adds x16, x16, x24\n\t" - "adc x17, x17, x25\n\t" - /* A[2] * A[3] */ - "mul x24, x22, x23\n\t" - "umulh x18, x22, x23\n\t" - "adds x17, x17, x24\n\t" - "adc x18, x18, xzr\n\t" - /* Double */ - "adds x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - 
"adcs x11, x11, x11\n\t" - "adcs x16, x16, x16\n\t" - "adcs x17, x17, x17\n\t" - "adcs x18, x18, x18\n\t" - "adc x19, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x8, x20, x20\n\t" - "umulh x26, x20, x20\n\t" - /* A[1] * A[1] */ - "mul x24, x21, x21\n\t" - "umulh x25, x21, x21\n\t" - "adds x9, x9, x26\n\t" - "adcs x10, x10, x24\n\t" - "adc x26, x25, xzr\n\t" - /* A[2] * A[2] */ - "mul x24, x22, x22\n\t" - "umulh x25, x22, x22\n\t" - "adds x11, x11, x26\n\t" - "adcs x16, x16, x24\n\t" - "adc x26, x25, xzr\n\t" - /* A[3] * A[3] */ - "mul x24, x23, x23\n\t" - "umulh x25, x23, x23\n\t" - "adds x17, x17, x26\n\t" - "adcs x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x11, #63\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" + "mul x25, x13, x15\n\t" + "umulh x26, x13, x15\n\t" "adds x8, x8, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x9, x9, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x10, x10, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x11, x11, x25\n\t" - "adc x26, x26, xzr\n\t" - /* Add remaining product results in */ - "adds x9, x9, x16\n\t" - "adcs x10, x10, x17\n\t" - "adcs x11, x11, x18\n\t" - "adc x26, x26, xzr\n\t" - /* Overflow */ - "extr x26, x26, x11, #63\n\t" - "mul x26, x26, x24\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x26\n\t" - "adcs x9, x9, xzr\n\t" - "adcs x10, x10, xzr\n\t" - "adc x11, x11, xzr\n\t" - /* Reduce if top bit set */ - "and x26, x24, x11, asr 63\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x26\n\t" - "adcs x9, x9, xzr\n\t" - "adcs x10, x10, xzr\n\t" - "adc x11, x11, xzr\n\t" - /* Store */ - "stp x8, x9, [x0]\n\t" - "stp x10, x11, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" - /* Add */ - "adds x12, x12, x20\n\t" - "adcs x13, x13, x21\n\t" - "adcs x14, x14, x22\n\t" - "adc x15, x15, x23\n\t" - "mov x24, #-19\n\t" - "asr x27, x15, #63\n\t" - /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x12, x12, x24\n\t" - "sbcs x13, x13, x27\n\t" - "sbcs x14, x14, x27\n\t" - "sbc x15, x15, x25\n\t" - "ldr x0, [x29, #40]\n\t" - /* Square */ - /* A[0] * A[1] */ - "mul x17, x12, x13\n\t" - "umulh x18, x12, x13\n\t" - /* A[0] * A[2] */ - "mul x24, x12, x14\n\t" - "umulh x19, x12, x14\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, xzr\n\t" - /* A[0] * A[3] */ - "mul x24, x12, x15\n\t" - "umulh x20, x12, x15\n\t" - "adds x19, x19, x24\n\t" - "adc x20, x20, xzr\n\t" - /* A[1] * A[2] */ - "mul x24, x13, x14\n\t" - "umulh x25, x13, x14\n\t" - "adds x19, x19, x24\n\t" - "adcs x20, x20, x25\n\t" - "adc x21, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x24, x13, x15\n\t" - "umulh x25, x13, x15\n\t" - "adds x20, x20, x24\n\t" - "adc x21, x21, x25\n\t" + "adc x9, x9, x26\n\t" /* A[2] * A[3] */ - "mul x24, x14, x15\n\t" - "umulh x22, x14, x15\n\t" - "adds x21, x21, x24\n\t" - "adc x22, x22, xzr\n\t" - /* Double */ - "adds x17, x17, x17\n\t" - "adcs x18, x18, x18\n\t" - "adcs x19, x19, x19\n\t" - "adcs x20, x20, x20\n\t" - "adcs x21, x21, x21\n\t" - "adcs x22, x22, x22\n\t" - "adc x23, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x16, x12, x12\n\t" - "umulh x26, x12, x12\n\t" - /* A[1] * A[1] */ - "mul x24, x13, 
x13\n\t" - "umulh x25, x13, x13\n\t" - "adds x17, x17, x26\n\t" - "adcs x18, x18, x24\n\t" - "adc x26, x25, xzr\n\t" - /* A[2] * A[2] */ - "mul x24, x14, x14\n\t" - "umulh x25, x14, x14\n\t" - "adds x19, x19, x26\n\t" - "adcs x20, x20, x24\n\t" - "adc x26, x25, xzr\n\t" - /* A[3] * A[3] */ - "mul x24, x15, x15\n\t" - "umulh x25, x15, x15\n\t" - "adds x21, x21, x26\n\t" - "adcs x22, x22, x24\n\t" - "adc x23, x23, x25\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x23, x23, x22, #63\n\t" - "extr x22, x22, x21, #63\n\t" - "extr x21, x21, x20, #63\n\t" - "extr x20, x20, x19, #63\n\t" - "and x19, x19, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x20\n\t" - "umulh x20, x24, x20\n\t" - "adds x16, x16, x25\n\t" - "mul x25, x24, x21\n\t" - "umulh x21, x24, x21\n\t" - "adcs x17, x17, x25\n\t" - "mul x25, x24, x22\n\t" - "umulh x22, x24, x22\n\t" - "adcs x18, x18, x25\n\t" - "mul x25, x24, x23\n\t" - "umulh x26, x24, x23\n\t" - "adcs x19, x19, x25\n\t" - "adc x26, x26, xzr\n\t" - /* Add remaining product results in */ - "adds x17, x17, x20\n\t" - "adcs x18, x18, x21\n\t" - "adcs x19, x19, x22\n\t" - "adc x26, x26, xzr\n\t" - /* Overflow */ - "extr x26, x26, x19, #63\n\t" - "mul x26, x26, x24\n\t" - "and x19, x19, #0x7fffffffffffffff\n\t" - "adds x16, x16, x26\n\t" - "adcs x17, x17, xzr\n\t" - "adcs x18, x18, xzr\n\t" - "adc x19, x19, xzr\n\t" - /* Reduce if top bit set */ - "and x26, x24, x19, asr 63\n\t" - "and x19, x19, #0x7fffffffffffffff\n\t" - "adds x16, x16, x26\n\t" - "adcs x17, x17, xzr\n\t" - "adcs x18, x18, xzr\n\t" - "adc x19, x19, xzr\n\t" - /* Store */ - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #32]\n\t" - /* Add */ - "adds x12, x8, x4\n\t" - "adcs x13, x9, x5\n\t" - "adcs x14, x10, x6\n\t" - "adc x15, x11, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x15, #63\n\t" - /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x12, x12, x24\n\t" - "sbcs x13, x13, x27\n\t" - "sbcs x14, x14, x27\n\t" - "sbc x15, x15, x25\n\t" - /* Sub */ - "subs x20, x8, x4\n\t" - "sbcs x21, x9, x5\n\t" - "sbcs x22, x10, x6\n\t" - "sbcs x23, x11, x7\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" - /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x20, x20, x24\n\t" - "adcs x21, x21, x27\n\t" - "adcs x22, x22, x27\n\t" - "adc x23, x23, x25\n\t" - "stp x12, x13, [x0]\n\t" - "stp x14, x15, [x0, #16]\n\t" - "stp x20, x21, [x1]\n\t" - "stp x22, x23, [x1, #16]\n\t" - "ldr x0, [x29, #16]\n\t" - /* Sub */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x13\n\t" - "sbcs x18, x18, x14\n\t" - "sbcs x19, x19, x15\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" - /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" - "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" - "ldr x0, [x29, #40]\n\t" - "ldr x1, [x29, #64]\n\t" - /* Square * 2 */ - "ldp x12, x13, [x1]\n\t" - "ldp x14, x15, [x1, #16]\n\t" - /* A[0] * A[1] */ - "mul x5, x12, x13\n\t" - "umulh x6, x12, x13\n\t" - /* A[0] * A[2] */ - "mul x24, x12, x14\n\t" - "umulh x7, x12, x14\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * A[3] */ - "mul x24, x12, x15\n\t" - 
"umulh x8, x12, x15\n\t" - "adds x7, x7, x24\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * A[2] */ - "mul x24, x13, x14\n\t" - "umulh x25, x13, x14\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x24, x13, x15\n\t" - "umulh x25, x13, x15\n\t" - "adds x8, x8, x24\n\t" - "adc x9, x9, x25\n\t" - /* A[2] * A[3] */ - "mul x24, x14, x15\n\t" + "mul x25, x14, x15\n\t" "umulh x10, x14, x15\n\t" - "adds x9, x9, x24\n\t" + "adds x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* Double */ "adds x5, x5, x5\n\t" @@ -4078,27 +3688,418 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "mul x4, x12, x12\n\t" "umulh x27, x12, x12\n\t" /* A[1] * A[1] */ - "mul x24, x13, x13\n\t" - "umulh x25, x13, x13\n\t" + "mul x25, x13, x13\n\t" + "umulh x26, x13, x13\n\t" "adds x5, x5, x27\n\t" - "adcs x6, x6, x24\n\t" - "adc x27, x25, xzr\n\t" + "adcs x6, x6, x25\n\t" + "adc x27, x26, xzr\n\t" /* A[2] * A[2] */ - "mul x24, x14, x14\n\t" - "umulh x25, x14, x14\n\t" + "mul x25, x14, x14\n\t" + "umulh x26, x14, x14\n\t" "adds x7, x7, x27\n\t" - "adcs x8, x8, x24\n\t" - "adc x27, x25, xzr\n\t" + "adcs x8, x8, x25\n\t" + "adc x27, x26, xzr\n\t" /* A[3] * A[3] */ - "mul x24, x15, x15\n\t" - "umulh x25, x15, x15\n\t" + "mul x25, x15, x15\n\t" + "umulh x26, x15, x15\n\t" "adds x9, x9, x27\n\t" - "adcs x10, x10, x24\n\t" - "adc x11, x11, x25\n\t" - /* Double and Reduce */ - "mov x24, #0x169\n\t" + "adcs x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "lsr x27, x11, #61\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #56]\n\t" + /* Square */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x9, x21, x22\n\t" + "umulh x10, x21, x22\n\t" + /* A[0] * A[2] */ + "mul x25, x21, x23\n\t" + "umulh x11, x21, x23\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * A[3] */ + "mul x25, x21, x24\n\t" + "umulh x16, x21, x24\n\t" + "adds x11, x11, x25\n\t" + "adc x16, x16, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x22, x23\n\t" + "umulh x26, x22, x23\n\t" + "adds x11, x11, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x25, x22, x24\n\t" + "umulh x26, x22, x24\n\t" + "adds x16, x16, x25\n\t" + "adc x17, x17, x26\n\t" + /* A[2] * A[3] */ + "mul 
x25, x23, x24\n\t" + "umulh x19, x23, x24\n\t" + "adds x17, x17, x25\n\t" + "adc x19, x19, xzr\n\t" + /* Double */ + "adds x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x19, x19, x19\n\t" + "adc x20, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x8, x21, x21\n\t" + "umulh x27, x21, x21\n\t" + /* A[1] * A[1] */ + "mul x25, x22, x22\n\t" + "umulh x26, x22, x22\n\t" + "adds x9, x9, x27\n\t" + "adcs x10, x10, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[2] * A[2] */ + "mul x25, x23, x23\n\t" + "umulh x26, x23, x23\n\t" + "adds x11, x11, x27\n\t" + "adcs x16, x16, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[3] * A[3] */ + "mul x25, x24, x24\n\t" + "umulh x26, x24, x24\n\t" + "adds x17, x17, x27\n\t" + "adcs x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x11, #63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x8, x8, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x9, x9, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x10, x10, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x11, x11, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x9, x9, x16\n\t" + "adcs x10, x10, x17\n\t" + "adcs x11, x11, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x11, #63\n\t" + "mul x27, x27, x25\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x11, asr 63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Store */ + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + /* Add */ + "adds x12, x12, x21\n\t" + "adcs x13, x13, x22\n\t" + "adcs x14, x14, x23\n\t" + "adc x15, x15, x24\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + "ldr x0, [x29, #40]\n\t" + /* Square */ + /* A[0] * A[1] */ + "mul x17, x12, x13\n\t" + "umulh x19, x12, x13\n\t" + /* A[0] * A[2] */ + "mul x25, x12, x14\n\t" + "umulh x20, x12, x14\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, xzr\n\t" + /* A[0] * A[3] */ + "mul x25, x12, x15\n\t" + "umulh x21, x12, x15\n\t" + "adds x20, x20, x25\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x13, x14\n\t" + "umulh x26, x13, x14\n\t" + "adds x20, x20, x25\n\t" + "adcs x21, x21, x26\n\t" + "adc x22, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x25, x13, x15\n\t" + "umulh x26, x13, x15\n\t" + "adds x21, x21, x25\n\t" + "adc x22, x22, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x14, x15\n\t" + "umulh x23, x14, x15\n\t" + "adds x22, x22, x25\n\t" + "adc x23, x23, xzr\n\t" + /* Double */ + "adds x17, x17, x17\n\t" + "adcs x19, x19, x19\n\t" + "adcs x20, x20, x20\n\t" + "adcs x21, x21, x21\n\t" + "adcs x22, x22, x22\n\t" + "adcs x23, x23, x23\n\t" + "adc x24, xzr, xzr\n\t" + /* A[0] * A[0] */ + 
"mul x16, x12, x12\n\t" + "umulh x27, x12, x12\n\t" + /* A[1] * A[1] */ + "mul x25, x13, x13\n\t" + "umulh x26, x13, x13\n\t" + "adds x17, x17, x27\n\t" + "adcs x19, x19, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[2] * A[2] */ + "mul x25, x14, x14\n\t" + "umulh x26, x14, x14\n\t" + "adds x20, x20, x27\n\t" + "adcs x21, x21, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[3] * A[3] */ + "mul x25, x15, x15\n\t" + "umulh x26, x15, x15\n\t" + "adds x22, x22, x27\n\t" + "adcs x23, x23, x25\n\t" + "adc x24, x24, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x24, x24, x23, #63\n\t" + "extr x23, x23, x22, #63\n\t" + "extr x22, x22, x21, #63\n\t" + "extr x21, x21, x20, #63\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x21\n\t" + "umulh x21, x25, x21\n\t" + "adds x16, x16, x26\n\t" + "mul x26, x25, x22\n\t" + "umulh x22, x25, x22\n\t" + "adcs x17, x17, x26\n\t" + "mul x26, x25, x23\n\t" + "umulh x23, x25, x23\n\t" + "adcs x19, x19, x26\n\t" + "mul x26, x25, x24\n\t" + "umulh x27, x25, x24\n\t" + "adcs x20, x20, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x17, x17, x21\n\t" + "adcs x19, x19, x22\n\t" + "adcs x20, x20, x23\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x20, #63\n\t" + "mul x27, x27, x25\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adds x16, x16, x27\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x19, x19, xzr\n\t" + "adc x20, x20, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x20, asr 63\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adds x16, x16, x27\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x19, x19, xzr\n\t" + "adc x20, x20, xzr\n\t" + /* Store */ + "stp x16, x17, [x0]\n\t" + "stp x19, x20, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #32]\n\t" + /* Add */ + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adc x15, x11, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + /* Sub */ + "subs x21, x8, x4\n\t" + "sbcs x22, x9, x5\n\t" + "sbcs x23, x10, x6\n\t" + "sbcs x24, x11, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x21, x21, x25\n\t" + "adcs x22, x22, x28\n\t" + "adcs x23, x23, x28\n\t" + "adc x24, x24, x26\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x21, x22, [x1]\n\t" + "stp x23, x24, [x1, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + /* Sub */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "sbcs x19, x19, x14\n\t" + "sbcs x20, x20, x15\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x16, x17, [x0]\n\t" + "stp x19, x20, [x0, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #64]\n\t" + /* Square * 2 */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x5, x12, x13\n\t" + "umulh x6, x12, x13\n\t" + /* A[0] * A[2] */ + "mul x25, x12, x14\n\t" + "umulh x7, x12, x14\n\t" + "adds x6, 
x6, x25\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * A[3] */ + "mul x25, x12, x15\n\t" + "umulh x8, x12, x15\n\t" + "adds x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x13, x14\n\t" + "umulh x26, x13, x14\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x25, x13, x15\n\t" + "umulh x26, x13, x15\n\t" + "adds x8, x8, x25\n\t" + "adc x9, x9, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x14, x15\n\t" + "umulh x10, x14, x15\n\t" + "adds x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x4, x12, x12\n\t" + "umulh x28, x12, x12\n\t" + /* A[1] * A[1] */ + "mul x25, x13, x13\n\t" + "umulh x26, x13, x13\n\t" + "adds x5, x5, x28\n\t" + "adcs x6, x6, x25\n\t" + "adc x28, x26, xzr\n\t" + /* A[2] * A[2] */ + "mul x25, x14, x14\n\t" + "umulh x26, x14, x14\n\t" + "adds x7, x7, x28\n\t" + "adcs x8, x8, x25\n\t" + "adc x28, x26, xzr\n\t" + /* A[3] * A[3] */ + "mul x25, x15, x15\n\t" + "umulh x26, x15, x15\n\t" + "adds x9, x9, x28\n\t" + "adcs x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Double and Reduce */ + "mov x25, #0x169\n\t" + /* Move top half into t4-t7 and remove top bit from t3 */ + "lsr x28, x11, #61\n\t" "extr x11, x11, x10, #62\n\t" "extr x10, x10, x9, #62\n\t" "extr x9, x9, x8, #62\n\t" @@ -4111,66 +4112,66 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz /* Two left, only one right */ "and x11, x11, #0x7fffffffffffffff\n\t" /* Multiply top bits by 19*19 */ - "mul x27, x27, x24\n\t" + "mul x28, x28, x25\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x10, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x11\n\t" - "umulh x26, x24, x11\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ - "adds x4, x4, x27\n\t" + "adds x4, x4, x28\n\t" "adcs x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ "ldr x0, [x29, #40]\n\t" /* Sub */ - "subs x4, x4, x20\n\t" - "sbcs x5, x5, x21\n\t" - "sbcs x6, x6, x22\n\t" - "sbcs x7, x7, x23\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "subs x4, x4, x21\n\t" + "sbcs x5, x5, x22\n\t" + "sbcs x6, x6, x23\n\t" + "sbcs x7, x7, x24\n\t" + "mov x25, #-19\n\t" + 
"csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x4, x4, x24\n\t" - "adcs x5, x5, x27\n\t" - "adcs x6, x6, x27\n\t" - "adc x7, x7, x25\n\t" + "adds x4, x4, x25\n\t" + "adcs x5, x5, x28\n\t" + "adcs x6, x6, x28\n\t" + "adc x7, x7, x26\n\t" "stp x4, x5, [x0]\n\t" "stp x6, x7, [x0, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) : - : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } @@ -4193,170 +4194,170 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldp x12, x13, [x2]\n\t" "ldp x14, x15, [x2, #16]\n\t" "ldp x16, x17, [x3]\n\t" - "ldp x18, x19, [x3, #16]\n\t" + "ldp x19, x20, [x3, #16]\n\t" "adds x4, x12, x16\n\t" "adcs x5, x13, x17\n\t" - "adcs x6, x14, x18\n\t" - "adc x7, x15, x19\n\t" - "mov x24, #-19\n\t" - "asr x27, x7, #63\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x24\n\t" - "sbcs x5, x5, x27\n\t" - "sbcs x6, x6, x27\n\t" - "sbc x7, x7, x25\n\t" + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" /* Sub */ "subs x8, x12, x16\n\t" "sbcs x9, x13, x17\n\t" - "sbcs x10, x14, x18\n\t" - "sbcs x11, x15, x19\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x8, x8, x24\n\t" - "adcs x9, x9, x27\n\t" - "adcs x10, x10, x27\n\t" - "adc x11, x11, x25\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" "ldr x0, [x29, #32]\n\t" "ldr x2, [x29, #168]\n\t" /* Multiply */ - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x12, x4, x20\n\t" - "umulh x13, x4, x20\n\t" + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" /* A[0] * B[1] */ - "mul x24, x4, x21\n\t" - "umulh x14, x4, x21\n\t" - "adds x13, x13, x24\n\t" + "mul x25, x4, x22\n\t" + "umulh x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x5, x20\n\t" - "umulh x25, x5, x20\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x4, x22\n\t" - "umulh x25, x4, x22\n\t" - "adds x14, x14, x24\n\t" - "adc x15, x15, x25\n\t" + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" /* A[1] * B[1] */ - "mul x24, x5, x21\n\t" - "umulh x25, x5, x21\n\t" - "adds x14, x14, x24\n\t" - 
"adcs x15, x15, x25\n\t" + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x6, x20\n\t" - "umulh x25, x6, x20\n\t" - "adds x14, x14, x24\n\t" - "adcs x15, x15, x25\n\t" + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x4, x23\n\t" - "umulh x25, x4, x23\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x5, x22\n\t" - "umulh x25, x5, x22\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x6, x21\n\t" - "umulh x25, x6, x21\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x6, x22\n\t" + "umulh x26, x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x7, x20\n\t" - "umulh x25, x7, x20\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x5, x23\n\t" - "umulh x25, x5, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x6, x22\n\t" - "umulh x25, x6, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x7, x21\n\t" - "umulh x25, x7, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x6, x23\n\t" - "umulh x25, x6, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + "mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x7, x22\n\t" - "umulh x25, x7, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x7, x23\n\t" - "umulh x25, x7, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x7, x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x15, #63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x12, x12, x25\n\t" - "mul x25, x24, x17\n\t" - 
"umulh x17, x24, x17\n\t" - "adcs x13, x13, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x14, x14, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x15, x15, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x13, x13, x16\n\t" "adcs x14, x14, x17\n\t" - "adcs x15, x15, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x15, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x15, asr 63\n\t" + "and x27, x25, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" @@ -4364,137 +4365,137 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr x0, [x29, #24]\n\t" "ldr x1, [x29, #176]\n\t" /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x8, x20\n\t" - "umulh x5, x8, x20\n\t" + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" /* A[0] * B[1] */ - "mul x24, x8, x21\n\t" - "umulh x6, x8, x21\n\t" - "adds x5, x5, x24\n\t" + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x9, x20\n\t" - "umulh x25, x9, x20\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x8, x22\n\t" - "umulh x25, x8, x22\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x9, x21\n\t" - "umulh x25, x9, x21\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x10, x20\n\t" - "umulh x25, x10, x20\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x8, x23\n\t" - "umulh x25, x8, x23\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x9, x22\n\t" - "umulh x25, x9, x22\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x10, x21\n\t" - "umulh x25, x10, x21\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, 
x25\n\t" + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x11, x20\n\t" - "umulh x25, x11, x20\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x9, x23\n\t" - "umulh x25, x9, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x10, x22\n\t" - "umulh x25, x10, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x11, x21\n\t" - "umulh x25, x11, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x10, x23\n\t" - "umulh x25, x10, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x11, x22\n\t" - "umulh x25, x11, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x11, x23\n\t" - "umulh x25, x11, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x11, x24\n\t" + "umulh x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x16\n\t" "adcs x6, x6, x17\n\t" - "adcs x7, x7, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, 
xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -4506,133 +4507,133 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x9, x13, x5\n\t" "adcs x10, x14, x6\n\t" "adc x11, x15, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x11, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x24\n\t" - "sbcs x9, x9, x27\n\t" - "sbcs x10, x10, x27\n\t" - "sbc x11, x11, x25\n\t" + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" /* Sub */ "subs x16, x12, x4\n\t" "sbcs x17, x13, x5\n\t" - "sbcs x18, x14, x6\n\t" - "sbcs x19, x15, x7\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x8, x9, [x0]\n\t" "stp x10, x11, [x0, #16]\n\t" "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" + "stp x19, x20, [x1, #16]\n\t" "ldr x0, [x29, #40]\n\t" "ldr x1, [x29, #160]\n\t" "ldr x3, [x29, #72]\n\t" /* Multiply */ "ldp x16, x17, [x1]\n\t" - "ldp x18, x19, [x1, #16]\n\t" - "ldp x20, x21, [x3]\n\t" - "ldp x22, x23, [x3, #16]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x3]\n\t" + "ldp x23, x24, [x3, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x16, x20\n\t" - "umulh x5, x16, x20\n\t" + "mul x4, x16, x21\n\t" + "umulh x5, x16, x21\n\t" /* A[0] * B[1] */ - "mul x24, x16, x21\n\t" - "umulh x6, x16, x21\n\t" - "adds x5, x5, x24\n\t" + "mul x25, x16, x22\n\t" + "umulh x6, x16, x22\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x17, x20\n\t" - "umulh x25, x17, x20\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x16, x22\n\t" - "umulh x25, x16, x22\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x17, x21\n\t" - "umulh x25, x17, x21\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x18, x20\n\t" - "umulh x25, x18, x20\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x16, x23\n\t" - "umulh x25, x16, x23\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x7, x7, 
x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x17, x22\n\t" - "umulh x25, x17, x22\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x18, x21\n\t" - "umulh x25, x18, x21\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x19, x20\n\t" - "umulh x25, x19, x20\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x17, x23\n\t" - "umulh x25, x17, x23\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x24, x18, x22\n\t" - "umulh x25, x18, x22\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x24, x19, x21\n\t" - "umulh x25, x19, x21\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x20, x22\n\t" + "umulh x26, x20, x22\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x24, x18, x23\n\t" - "umulh x25, x18, x23\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x24, x19, x22\n\t" - "umulh x25, x19, x22\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x19, x23\n\t" - "umulh x25, x19, x23\n\t" - "adds x10, x10, x24\n\t" - "adc x11, x11, x25\n\t" + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -4641,37 +4642,37 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x10, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x11\n\t" - "umulh x26, x24, x11\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, 
x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -4685,55 +4686,55 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x9, x9, x9\n\t" "adcs x10, x10, x10\n\t" "adc x11, x11, x11\n\t" - "mov x24, #-19\n\t" - "asr x27, x11, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x24\n\t" - "sbcs x9, x9, x27\n\t" - "sbcs x10, x10, x27\n\t" - "sbc x11, x11, x25\n\t" + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" "ldr x1, [x29, #40]\n\t" /* Add */ "adds x12, x8, x4\n\t" "adcs x13, x9, x5\n\t" "adcs x14, x10, x6\n\t" "adc x15, x11, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x15, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x12, x12, x24\n\t" - "sbcs x13, x13, x27\n\t" - "sbcs x14, x14, x27\n\t" - "sbc x15, x15, x25\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" /* Sub */ "subs x16, x8, x4\n\t" "sbcs x17, x9, x5\n\t" - "sbcs x18, x10, x6\n\t" - "sbcs x19, x11, x7\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x10, x6\n\t" + "sbcs x20, x11, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x12, x13, [x0]\n\t" "stp x14, x15, [x0, #16]\n\t" "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" + "stp x19, x20, [x1, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); (void)qxy2d; (void)qyplusx; @@ -4759,170 +4760,170 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldp x12, x13, [x2]\n\t" "ldp x14, x15, [x2, #16]\n\t" "ldp x16, x17, [x3]\n\t" - "ldp x18, x19, [x3, #16]\n\t" + "ldp x19, x20, [x3, #16]\n\t" "adds x4, x12, x16\n\t" "adcs x5, x13, x17\n\t" - "adcs x6, x14, x18\n\t" - "adc x7, x15, x19\n\t" - "mov x24, #-19\n\t" - "asr x27, x7, #63\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" /* Mask the modulus */ 
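Note on the change pattern in these hunks: the renumbering is mechanical. The scratch registers x24-x27 become x25-x28 and the top limb pair x18/x19 becomes x19/x20, so that x18, which some AArch64 ABIs reserve as the platform register, is never written; that appears to be why x28 joins each clobber list while x18 drops out. The arithmetic is unchanged. The "Mask the modulus" correction that follows every field add/sub builds an all-ones mask from the top bit of the result (or, on the subtract path, from the borrow via csetm), ANDs it with the limbs of p = 2^255 - 19, and applies the masked modulus in one carry chain. A rough C sketch of the addition path, illustrative only: fe_add_sketch is a made-up name, not the wolfSSL API, and unsigned __int128 (GCC/Clang) is assumed.

#include <stdint.h>

typedef unsigned __int128 u128;

static void fe_add_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t s[4];
    u128 c = 0;

    /* adds/adcs/adc: limb-wise addition with carry propagation. */
    for (int i = 0; i < 4; i++) {
        c += (u128)a[i] + b[i];
        s[i] = (uint64_t)c;
        c >>= 64;
    }

    /* asr #63: all-ones mask iff bit 255 of the sum is set. */
    uint64_t mask = (uint64_t)((int64_t)s[3] >> 63);

    /* Limbs of p = 2^255 - 19; the masked low/high limbs are what the
     * renamed scratch registers hold around this point in the assembly. */
    const uint64_t p[4] = { (uint64_t)-19, (uint64_t)-1, (uint64_t)-1,
                            0x7fffffffffffffffULL };

    /* subs/sbcs/sbc: subtract (mask & p) with a single borrow chain. */
    uint64_t borrow = 0;
    for (int i = 0; i < 4; i++) {
        u128 d = (u128)s[i] - (p[i] & mask) - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
}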
- "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x24\n\t" - "sbcs x5, x5, x27\n\t" - "sbcs x6, x6, x27\n\t" - "sbc x7, x7, x25\n\t" + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" /* Sub */ "subs x8, x12, x16\n\t" "sbcs x9, x13, x17\n\t" - "sbcs x10, x14, x18\n\t" - "sbcs x11, x15, x19\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x8, x8, x24\n\t" - "adcs x9, x9, x27\n\t" - "adcs x10, x10, x27\n\t" - "adc x11, x11, x25\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" "ldr x0, [x29, #32]\n\t" "ldr x2, [x29, #176]\n\t" /* Multiply */ - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x12, x4, x20\n\t" - "umulh x13, x4, x20\n\t" + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" /* A[0] * B[1] */ - "mul x24, x4, x21\n\t" - "umulh x14, x4, x21\n\t" - "adds x13, x13, x24\n\t" + "mul x25, x4, x22\n\t" + "umulh x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x5, x20\n\t" - "umulh x25, x5, x20\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x4, x22\n\t" - "umulh x25, x4, x22\n\t" - "adds x14, x14, x24\n\t" - "adc x15, x15, x25\n\t" + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" /* A[1] * B[1] */ - "mul x24, x5, x21\n\t" - "umulh x25, x5, x21\n\t" - "adds x14, x14, x24\n\t" - "adcs x15, x15, x25\n\t" + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x6, x20\n\t" - "umulh x25, x6, x20\n\t" - "adds x14, x14, x24\n\t" - "adcs x15, x15, x25\n\t" + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x4, x23\n\t" - "umulh x25, x4, x23\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x5, x22\n\t" - "umulh x25, x5, x22\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x6, x21\n\t" - "umulh x25, x6, x21\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x6, x22\n\t" + "umulh x26, x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x7, x20\n\t" - "umulh x25, x7, x20\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul 
x24, x5, x23\n\t" - "umulh x25, x5, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x6, x22\n\t" - "umulh x25, x6, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x7, x21\n\t" - "umulh x25, x7, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x6, x23\n\t" - "umulh x25, x6, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + "mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x7, x22\n\t" - "umulh x25, x7, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x7, x23\n\t" - "umulh x25, x7, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x7, x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x15, #63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x12, x12, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x13, x13, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x14, x14, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x15, x15, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x13, x13, x16\n\t" "adcs x14, x14, x17\n\t" - "adcs x15, x15, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x15, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x15, asr 63\n\t" + "and x27, x25, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" @@ -4930,137 +4931,137 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe 
px, const fe py, const fe p "ldr x0, [x29, #24]\n\t" "ldr x1, [x29, #168]\n\t" /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x8, x20\n\t" - "umulh x5, x8, x20\n\t" + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" /* A[0] * B[1] */ - "mul x24, x8, x21\n\t" - "umulh x6, x8, x21\n\t" - "adds x5, x5, x24\n\t" + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x9, x20\n\t" - "umulh x25, x9, x20\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x8, x22\n\t" - "umulh x25, x8, x22\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x9, x21\n\t" - "umulh x25, x9, x21\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x10, x20\n\t" - "umulh x25, x10, x20\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x8, x23\n\t" - "umulh x25, x8, x23\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x9, x22\n\t" - "umulh x25, x9, x22\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x10, x21\n\t" - "umulh x25, x10, x21\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x11, x20\n\t" - "umulh x25, x11, x20\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x9, x23\n\t" - "umulh x25, x9, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x10, x22\n\t" - "umulh x25, x10, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x11, x21\n\t" - "umulh x25, x11, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x10, x23\n\t" - "umulh x25, x10, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x11, x22\n\t" - "umulh x25, x11, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, 
x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x11, x23\n\t" - "umulh x25, x11, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x11, x24\n\t" + "umulh x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x16\n\t" "adcs x6, x6, x17\n\t" - "adcs x7, x7, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -5072,133 +5073,133 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x9, x13, x5\n\t" "adcs x10, x14, x6\n\t" "adc x11, x15, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x11, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x24\n\t" - "sbcs x9, x9, x27\n\t" - "sbcs x10, x10, x27\n\t" - "sbc x11, x11, x25\n\t" + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" /* Sub */ "subs x16, x12, x4\n\t" "sbcs x17, x13, x5\n\t" - "sbcs x18, x14, x6\n\t" - "sbcs x19, x15, x7\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, 
x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x8, x9, [x0]\n\t" "stp x10, x11, [x0, #16]\n\t" "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" + "stp x19, x20, [x1, #16]\n\t" "ldr x0, [x29, #40]\n\t" "ldr x1, [x29, #160]\n\t" "ldr x3, [x29, #72]\n\t" /* Multiply */ "ldp x16, x17, [x1]\n\t" - "ldp x18, x19, [x1, #16]\n\t" - "ldp x20, x21, [x3]\n\t" - "ldp x22, x23, [x3, #16]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x3]\n\t" + "ldp x23, x24, [x3, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x16, x20\n\t" - "umulh x5, x16, x20\n\t" + "mul x4, x16, x21\n\t" + "umulh x5, x16, x21\n\t" /* A[0] * B[1] */ - "mul x24, x16, x21\n\t" - "umulh x6, x16, x21\n\t" - "adds x5, x5, x24\n\t" + "mul x25, x16, x22\n\t" + "umulh x6, x16, x22\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x17, x20\n\t" - "umulh x25, x17, x20\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x16, x22\n\t" - "umulh x25, x16, x22\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x17, x21\n\t" - "umulh x25, x17, x21\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x18, x20\n\t" - "umulh x25, x18, x20\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x16, x23\n\t" - "umulh x25, x16, x23\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x17, x22\n\t" - "umulh x25, x17, x22\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x18, x21\n\t" - "umulh x25, x18, x21\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x19, x20\n\t" - "umulh x25, x19, x20\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x17, x23\n\t" - "umulh x25, x17, x23\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x24, x18, x22\n\t" - "umulh x25, x18, x22\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x24, x19, x21\n\t" - "umulh x25, x19, x21\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x20, x22\n\t" + "umulh 
x26, x20, x22\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x24, x18, x23\n\t" - "umulh x25, x18, x23\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x24, x19, x22\n\t" - "umulh x25, x19, x22\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x19, x23\n\t" - "umulh x25, x19, x23\n\t" - "adds x10, x10, x24\n\t" - "adc x11, x11, x25\n\t" + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -5207,37 +5208,37 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x10, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x11\n\t" - "umulh x26, x24, x11\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -5251,55 +5252,55 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adcs x9, x9, x9\n\t" "adcs x10, x10, x10\n\t" "adc x11, x11, x11\n\t" - "mov x24, #-19\n\t" - "asr x27, x11, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x24\n\t" - "sbcs x9, x9, x27\n\t" - "sbcs x10, x10, x27\n\t" - "sbc x11, x11, x25\n\t" + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" "ldr x1, [x29, #40]\n\t" /* Add */ "adds x12, x8, x4\n\t" "adcs x13, x9, x5\n\t" "adcs x14, x10, x6\n\t" "adc x15, x11, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x15, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + 
"and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x12, x12, x24\n\t" - "sbcs x13, x13, x27\n\t" - "sbcs x14, x14, x27\n\t" - "sbc x15, x15, x25\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" /* Sub */ "subs x16, x8, x4\n\t" "sbcs x17, x9, x5\n\t" - "sbcs x18, x10, x6\n\t" - "sbcs x19, x11, x7\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x10, x6\n\t" + "sbcs x20, x11, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x12, x13, [x1]\n\t" "stp x14, x15, [x1, #16]\n\t" "stp x16, x17, [x0]\n\t" - "stp x18, x19, [x0, #16]\n\t" + "stp x19, x20, [x0, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); (void)qxy2d; (void)qyplusx; @@ -5325,170 +5326,170 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldp x12, x13, [x2]\n\t" "ldp x14, x15, [x2, #16]\n\t" "ldp x16, x17, [x3]\n\t" - "ldp x18, x19, [x3, #16]\n\t" + "ldp x19, x20, [x3, #16]\n\t" "adds x4, x12, x16\n\t" "adcs x5, x13, x17\n\t" - "adcs x6, x14, x18\n\t" - "adc x7, x15, x19\n\t" - "mov x24, #-19\n\t" - "asr x27, x7, #63\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x24\n\t" - "sbcs x5, x5, x27\n\t" - "sbcs x6, x6, x27\n\t" - "sbc x7, x7, x25\n\t" + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" /* Sub */ "subs x8, x12, x16\n\t" "sbcs x9, x13, x17\n\t" - "sbcs x10, x14, x18\n\t" - "sbcs x11, x15, x19\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x8, x8, x24\n\t" - "adcs x9, x9, x27\n\t" - "adcs x10, x10, x27\n\t" - "adc x11, x11, x25\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" "ldr x0, [x29, #32]\n\t" "ldr x2, [x29, #176]\n\t" /* Multiply */ - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x12, x4, x20\n\t" - "umulh x13, x4, x20\n\t" + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" /* A[0] * B[1] */ - "mul x24, x4, x21\n\t" - "umulh x14, x4, x21\n\t" - "adds x13, x13, x24\n\t" + "mul x25, x4, x22\n\t" + "umulh 
x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x5, x20\n\t" - "umulh x25, x5, x20\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x4, x22\n\t" - "umulh x25, x4, x22\n\t" - "adds x14, x14, x24\n\t" - "adc x15, x15, x25\n\t" + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" /* A[1] * B[1] */ - "mul x24, x5, x21\n\t" - "umulh x25, x5, x21\n\t" - "adds x14, x14, x24\n\t" - "adcs x15, x15, x25\n\t" + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x6, x20\n\t" - "umulh x25, x6, x20\n\t" - "adds x14, x14, x24\n\t" - "adcs x15, x15, x25\n\t" + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x4, x23\n\t" - "umulh x25, x4, x23\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x5, x22\n\t" - "umulh x25, x5, x22\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x6, x21\n\t" - "umulh x25, x6, x21\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x6, x22\n\t" + "umulh x26, x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x7, x20\n\t" - "umulh x25, x7, x20\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x5, x23\n\t" - "umulh x25, x5, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x6, x22\n\t" - "umulh x25, x6, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x7, x21\n\t" - "umulh x25, x7, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x6, x23\n\t" - "umulh x25, x6, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + "mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x7, x22\n\t" - "umulh x25, x7, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x7, 
x23\n\t" - "umulh x25, x7, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x7, x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x15, #63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x12, x12, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x13, x13, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x14, x14, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x15, x15, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x13, x13, x16\n\t" "adcs x14, x14, x17\n\t" - "adcs x15, x15, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x15, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x15, asr 63\n\t" + "and x27, x25, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" @@ -5496,137 +5497,137 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr x0, [x29, #24]\n\t" "ldr x1, [x29, #184]\n\t" /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x8, x20\n\t" - "umulh x5, x8, x20\n\t" + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" /* A[0] * B[1] */ - "mul x24, x8, x21\n\t" - "umulh x6, x8, x21\n\t" - "adds x5, x5, x24\n\t" + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x9, x20\n\t" - "umulh x25, x9, x20\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x8, x22\n\t" - "umulh x25, x8, x22\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x9, x21\n\t" - "umulh x25, x9, x21\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x10, x20\n\t" - "umulh x25, x10, x20\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, 
x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x8, x23\n\t" - "umulh x25, x8, x23\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x9, x22\n\t" - "umulh x25, x9, x22\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x10, x21\n\t" - "umulh x25, x10, x21\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x11, x20\n\t" - "umulh x25, x11, x20\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x9, x23\n\t" - "umulh x25, x9, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x10, x22\n\t" - "umulh x25, x10, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x11, x21\n\t" - "umulh x25, x11, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x10, x23\n\t" - "umulh x25, x10, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x11, x22\n\t" - "umulh x25, x11, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x11, x23\n\t" - "umulh x25, x11, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x11, x24\n\t" + "umulh x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh 
x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x16\n\t" "adcs x6, x6, x17\n\t" - "adcs x7, x7, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -5638,35 +5639,35 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x9, x13, x5\n\t" "adcs x10, x14, x6\n\t" "adc x11, x15, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x11, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x24\n\t" - "sbcs x9, x9, x27\n\t" - "sbcs x10, x10, x27\n\t" - "sbc x11, x11, x25\n\t" + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" /* Sub */ "subs x16, x12, x4\n\t" "sbcs x17, x13, x5\n\t" - "sbcs x18, x14, x6\n\t" - "sbcs x19, x15, x7\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x8, x9, [x0]\n\t" "stp x10, x11, [x0, #16]\n\t" "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" + "stp x19, x20, [x1, #16]\n\t" "ldr x0, [x29, #48]\n\t" "ldr x1, [x29, #64]\n\t" "ldr x2, [x29, #160]\n\t" @@ -5674,97 +5675,97 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldp x12, x13, [x1]\n\t" "ldp x14, x15, [x1, #16]\n\t" "ldp x16, x17, [x2]\n\t" - "ldp x18, x19, [x2, #16]\n\t" + "ldp x19, x20, [x2, #16]\n\t" /* A[0] * B[0] */ "mul x4, x12, x16\n\t" "umulh x5, x12, x16\n\t" /* A[0] * B[1] */ - "mul x24, x12, x17\n\t" + "mul x25, x12, x17\n\t" "umulh x6, x12, x17\n\t" - "adds x5, x5, x24\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x13, x16\n\t" - "umulh x25, x13, x16\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x13, x16\n\t" + "umulh x26, x13, x16\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x12, x18\n\t" - "umulh x25, x12, x18\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x12, x19\n\t" + "umulh x26, x12, x19\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x13, x17\n\t" - "umulh x25, x13, x17\n\t" - "adds x6, x6, x24\n\t" - "adcs 
x7, x7, x25\n\t" + "mul x25, x13, x17\n\t" + "umulh x26, x13, x17\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x14, x16\n\t" - "umulh x25, x14, x16\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x14, x16\n\t" + "umulh x26, x14, x16\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x12, x19\n\t" - "umulh x25, x12, x19\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x12, x20\n\t" + "umulh x26, x12, x20\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x13, x18\n\t" - "umulh x25, x13, x18\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x13, x19\n\t" + "umulh x26, x13, x19\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x14, x17\n\t" - "umulh x25, x14, x17\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x14, x17\n\t" + "umulh x26, x14, x17\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x15, x16\n\t" - "umulh x25, x15, x16\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x15, x16\n\t" + "umulh x26, x15, x16\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x13, x19\n\t" - "umulh x25, x13, x19\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x13, x20\n\t" + "umulh x26, x13, x20\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x24, x14, x18\n\t" - "umulh x25, x14, x18\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x14, x19\n\t" + "umulh x26, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x24, x15, x17\n\t" - "umulh x25, x15, x17\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x15, x17\n\t" + "umulh x26, x15, x17\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x24, x14, x19\n\t" - "umulh x25, x14, x19\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x14, x20\n\t" + "umulh x26, x14, x20\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x24, x15, x18\n\t" - "umulh x25, x15, x18\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x15, x19\n\t" + "umulh x26, x15, x19\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x15, x19\n\t" - "umulh x25, x15, x19\n\t" - "adds x10, x10, x24\n\t" - "adc x11, x11, x25\n\t" + "mul x25, x15, x20\n\t" + "umulh x26, x15, x20\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -5773,37 +5774,37 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x10, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x11\n\t" - "umulh x26, x24, x11\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" 
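For context on the step being renamed here: each /* Multiply */ block is a plain 4x4 schoolbook product, sixteen mul/umulh pairs accumulated with adds/adcs/adc into eight 64-bit limbs, and the /* Reduce */ stage folds that product back to four limbs using 2^255 = 19 + p. The extr ... #63 instructions shift the top 257 bits down, that high half is multiplied by 19 and added into the low half, the spill-over limb is folded once more, and a final conditional +19 clears bit 255. A hedged C sketch of just the reduction: fe_reduce_sketch is a made-up name, not the wolfSSL API, and unsigned __int128 (GCC/Clang) is assumed.

#include <stdint.h>

typedef unsigned __int128 u128;

static void fe_reduce_sketch(uint64_t r[4], const uint64_t t[8])
{
    const uint64_t mask63 = 0x7fffffffffffffffULL;
    /* Low half: bits 0..254 of the 512-bit product. */
    uint64_t l[4] = { t[0], t[1], t[2], t[3] & mask63 };
    /* High half (extr ... #63): the product shifted right by 255 bits. */
    uint64_t h[4] = {
        (t[4] << 1) | (t[3] >> 63),
        (t[5] << 1) | (t[4] >> 63),
        (t[6] << 1) | (t[5] >> 63),
        (t[7] << 1) | (t[6] >> 63),
    };

    /* 2^255 == 19 (mod p): add 19 * high half into the low half. */
    u128 c = 0;
    for (int i = 0; i < 4; i++) {
        c += (u128)l[i] + (u128)h[i] * 19;
        l[i] = (uint64_t)c;
        c >>= 64;
    }
    uint64_t spill = (uint64_t)c;              /* limb above l[3] */

    /* "Overflow": fold bits >= 2^255 (spill and the top bit of l[3]). */
    uint64_t fold = (spill << 1) | (l[3] >> 63);
    l[3] &= mask63;
    c = (u128)l[0] + (u128)fold * 19;
    l[0] = (uint64_t)c; c >>= 64;
    for (int i = 1; i < 4; i++) { c += l[i]; l[i] = (uint64_t)c; c >>= 64; }

    /* "Reduce if top bit set": one last conditional + 19. */
    uint64_t top = l[3] >> 63;
    l[3] &= mask63;
    c = (u128)l[0] + 19 * top;
    l[0] = (uint64_t)c; c >>= 64;
    for (int i = 1; i < 4; i++) { c += l[i]; l[i] = (uint64_t)c; c >>= 64; }

    for (int i = 0; i < 4; i++)
        r[i] = l[i];
}

As in ref10-style implementations, the result need not be fully reduced at this point; a canonical reduction typically happens only when the field element is serialized.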
+ "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -5814,114 +5815,114 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" "adc x7, x7, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x7, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x24\n\t" - "sbcs x5, x5, x27\n\t" - "sbcs x6, x6, x27\n\t" - "sbc x7, x7, x25\n\t" + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" "ldr x0, [x29, #40]\n\t" "ldr x1, [x29, #168]\n\t" "ldr x2, [x29, #72]\n\t" /* Multiply */ "ldp x16, x17, [x1]\n\t" - "ldp x18, x19, [x1, #16]\n\t" - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x8, x16, x20\n\t" - "umulh x9, x16, x20\n\t" + "mul x8, x16, x21\n\t" + "umulh x9, x16, x21\n\t" /* A[0] * B[1] */ - "mul x24, x16, x21\n\t" - "umulh x10, x16, x21\n\t" - "adds x9, x9, x24\n\t" + "mul x25, x16, x22\n\t" + "umulh x10, x16, x22\n\t" + "adds x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x17, x20\n\t" - "umulh x25, x17, x20\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x16, x22\n\t" - "umulh x25, x16, x22\n\t" - "adds x10, x10, x24\n\t" - "adc x11, x11, x25\n\t" + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" /* A[1] * B[1] */ - "mul x24, x17, x21\n\t" - "umulh x25, x17, x21\n\t" - "adds x10, x10, x24\n\t" - "adcs x11, x11, x25\n\t" + "mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" "adc x12, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x18, x20\n\t" - "umulh x25, x18, x20\n\t" - "adds x10, x10, x24\n\t" - "adcs x11, x11, x25\n\t" + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" "adc x12, x12, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x16, x23\n\t" - "umulh x25, x16, x23\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, xzr, 
xzr\n\t" /* A[1] * B[2] */ - "mul x24, x17, x22\n\t" - "umulh x25, x17, x22\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, x13, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x18, x21\n\t" - "umulh x25, x18, x21\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, x13, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x19, x20\n\t" - "umulh x25, x19, x20\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, x13, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x17, x23\n\t" - "umulh x25, x17, x23\n\t" - "adds x12, x12, x24\n\t" - "adcs x13, x13, x25\n\t" + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" "adc x14, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x24, x18, x22\n\t" - "umulh x25, x18, x22\n\t" - "adds x12, x12, x24\n\t" - "adcs x13, x13, x25\n\t" + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" "adc x14, x14, xzr\n\t" /* A[3] * B[1] */ - "mul x24, x19, x21\n\t" - "umulh x25, x19, x21\n\t" - "adds x12, x12, x24\n\t" - "adcs x13, x13, x25\n\t" + "mul x25, x20, x22\n\t" + "umulh x26, x20, x22\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" "adc x14, x14, xzr\n\t" /* A[2] * B[3] */ - "mul x24, x18, x23\n\t" - "umulh x25, x18, x23\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x24, x19, x22\n\t" - "umulh x25, x19, x22\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, x15, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x19, x23\n\t" - "umulh x25, x19, x23\n\t" - "adds x14, x14, x24\n\t" - "adc x15, x15, x25\n\t" + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x15, x15, x14, #63\n\t" @@ -5930,37 +5931,37 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "extr x12, x12, x11, #63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x12\n\t" - "umulh x12, x24, x12\n\t" - "adds x8, x8, x25\n\t" - "mul x25, x24, x13\n\t" - "umulh x13, x24, x13\n\t" - "adcs x9, x9, x25\n\t" - "mul x25, x24, x14\n\t" - "umulh x14, x24, x14\n\t" - "adcs x10, x10, x25\n\t" - "mul x25, x24, x15\n\t" - "umulh x26, x24, x15\n\t" - "adcs x11, x11, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x12\n\t" + "umulh x12, x25, x12\n\t" + "adds x8, x8, x26\n\t" + "mul x26, x25, x13\n\t" + "umulh x13, x25, x13\n\t" + "adcs x9, x9, x26\n\t" + "mul x26, x25, x14\n\t" + "umulh x14, x25, x14\n\t" + "adcs x10, x10, x26\n\t" + "mul x26, x25, x15\n\t" + "umulh x27, x25, x15\n\t" + "adcs x11, x11, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x9, x9, x12\n\t" "adcs x10, x10, x13\n\t" "adcs x11, x11, x14\n\t" - "adc x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x11, 
#63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x11, #63\n\t" + "mul x27, x27, x25\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x26\n\t" + "adds x8, x8, x27\n\t" "adcs x9, x9, xzr\n\t" "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x11, asr 63\n\t" + "and x27, x25, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x26\n\t" + "adds x8, x8, x27\n\t" "adcs x9, x9, xzr\n\t" "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" @@ -5972,39 +5973,39 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x13, x5, x9\n\t" "adcs x14, x6, x10\n\t" "adc x15, x7, x11\n\t" - "mov x24, #-19\n\t" - "asr x27, x15, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x12, x12, x24\n\t" - "sbcs x13, x13, x27\n\t" - "sbcs x14, x14, x27\n\t" - "sbc x15, x15, x25\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" /* Sub */ "subs x16, x4, x8\n\t" "sbcs x17, x5, x9\n\t" - "sbcs x18, x6, x10\n\t" - "sbcs x19, x7, x11\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x6, x10\n\t" + "sbcs x20, x7, x11\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x12, x13, [x0]\n\t" "stp x14, x15, [x0, #16]\n\t" "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" + "stp x19, x20, [x1, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); (void)qz; (void)qt2d; @@ -6031,170 +6032,170 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldp x12, x13, [x2]\n\t" "ldp x14, x15, [x2, #16]\n\t" "ldp x16, x17, [x3]\n\t" - "ldp x18, x19, [x3, #16]\n\t" + "ldp x19, x20, [x3, #16]\n\t" "adds x4, x12, x16\n\t" "adcs x5, x13, x17\n\t" - "adcs x6, x14, x18\n\t" - "adc x7, x15, x19\n\t" - "mov x24, #-19\n\t" - "asr x27, x7, #63\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x24\n\t" - "sbcs x5, x5, x27\n\t" - "sbcs x6, x6, x27\n\t" - "sbc x7, x7, x25\n\t" + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" /* Sub */ "subs x8, x12, x16\n\t" "sbcs x9, x13, x17\n\t" - "sbcs x10, x14, x18\n\t" - "sbcs x11, x15, x19\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs 
x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x8, x8, x24\n\t" - "adcs x9, x9, x27\n\t" - "adcs x10, x10, x27\n\t" - "adc x11, x11, x25\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" "ldr x0, [x29, #32]\n\t" "ldr x2, [x29, #184]\n\t" /* Multiply */ - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x12, x4, x20\n\t" - "umulh x13, x4, x20\n\t" + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" /* A[0] * B[1] */ - "mul x24, x4, x21\n\t" - "umulh x14, x4, x21\n\t" - "adds x13, x13, x24\n\t" + "mul x25, x4, x22\n\t" + "umulh x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" "adc x14, x14, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x5, x20\n\t" - "umulh x25, x5, x20\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x4, x22\n\t" - "umulh x25, x4, x22\n\t" - "adds x14, x14, x24\n\t" - "adc x15, x15, x25\n\t" + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" /* A[1] * B[1] */ - "mul x24, x5, x21\n\t" - "umulh x25, x5, x21\n\t" - "adds x14, x14, x24\n\t" - "adcs x15, x15, x25\n\t" + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x6, x20\n\t" - "umulh x25, x6, x20\n\t" - "adds x14, x14, x24\n\t" - "adcs x15, x15, x25\n\t" + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x4, x23\n\t" - "umulh x25, x4, x23\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x5, x22\n\t" - "umulh x25, x5, x22\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x6, x21\n\t" - "umulh x25, x6, x21\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x6, x22\n\t" + "umulh x26, x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x7, x20\n\t" - "umulh x25, x7, x20\n\t" - "adds x15, x15, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x5, x23\n\t" - "umulh x25, x5, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x6, x22\n\t" - "umulh x25, x6, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x7, x21\n\t" - "umulh x25, x7, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x6, x23\n\t" - "umulh x25, x6, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + 
"mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x7, x22\n\t" - "umulh x25, x7, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x7, x23\n\t" - "umulh x25, x7, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x7, x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x15, #63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x12, x12, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x13, x13, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x14, x14, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x15, x15, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x13, x13, x16\n\t" "adcs x14, x14, x17\n\t" - "adcs x15, x15, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x15, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x15, asr 63\n\t" + "and x27, x25, x15, asr 63\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x26\n\t" + "adds x12, x12, x27\n\t" "adcs x13, x13, xzr\n\t" "adcs x14, x14, xzr\n\t" "adc x15, x15, xzr\n\t" @@ -6202,137 +6203,137 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr x0, [x29, #24]\n\t" "ldr x1, [x29, #176]\n\t" /* Multiply */ - "ldp x20, x21, [x1]\n\t" - "ldp x22, x23, [x1, #16]\n\t" + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" /* A[0] * B[0] */ - "mul x4, x8, x20\n\t" - "umulh x5, x8, x20\n\t" + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" /* A[0] * B[1] */ - "mul x24, x8, x21\n\t" - "umulh x6, x8, x21\n\t" - "adds x5, x5, x24\n\t" + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x9, 
x20\n\t" - "umulh x25, x9, x20\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x8, x22\n\t" - "umulh x25, x8, x22\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x9, x21\n\t" - "umulh x25, x9, x21\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x16, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x10, x20\n\t" - "umulh x25, x10, x20\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x16, x16, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x8, x23\n\t" - "umulh x25, x8, x23\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x9, x22\n\t" - "umulh x25, x9, x22\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x10, x21\n\t" - "umulh x25, x10, x21\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x11, x20\n\t" - "umulh x25, x11, x20\n\t" - "adds x7, x7, x24\n\t" - "adcs x16, x16, x25\n\t" + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x9, x23\n\t" - "umulh x25, x9, x23\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x10, x22\n\t" - "umulh x25, x10, x22\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x11, x21\n\t" - "umulh x25, x11, x21\n\t" - "adds x16, x16, x24\n\t" - "adcs x17, x17, x25\n\t" - "adc x18, x18, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x10, x23\n\t" - "umulh x25, x10, x23\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x11, x22\n\t" - "umulh x25, x11, x22\n\t" - "adds x17, x17, x24\n\t" - "adcs x18, x18, x25\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x11, x23\n\t" - "umulh x25, x11, x23\n\t" - "adds x18, x18, x24\n\t" - "adc x19, x19, x25\n\t" + "mul x25, x11, x24\n\t" + "umulh 
x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x19, x19, x18, #63\n\t" - "extr x18, x18, x17, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" "extr x17, x17, x16, #63\n\t" "extr x16, x16, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x16\n\t" - "umulh x16, x24, x16\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x17\n\t" - "umulh x17, x24, x17\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x18\n\t" - "umulh x18, x24, x18\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x19\n\t" - "umulh x26, x24, x19\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x16\n\t" "adcs x6, x6, x17\n\t" - "adcs x7, x7, x18\n\t" - "adc x26, x26, xzr\n\t" + "adcs x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -6344,35 +6345,35 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x9, x13, x5\n\t" "adcs x10, x14, x6\n\t" "adc x11, x15, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x11, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x24\n\t" - "sbcs x9, x9, x27\n\t" - "sbcs x10, x10, x27\n\t" - "sbc x11, x11, x25\n\t" + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" /* Sub */ "subs x16, x12, x4\n\t" "sbcs x17, x13, x5\n\t" - "sbcs x18, x14, x6\n\t" - "sbcs x19, x15, x7\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x8, x9, [x0]\n\t" "stp x10, x11, [x0, #16]\n\t" "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" + "stp x19, x20, [x1, #16]\n\t" "ldr x0, [x29, #48]\n\t" "ldr x1, [x29, #64]\n\t" "ldr x2, [x29, #160]\n\t" @@ -6380,97 +6381,97 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldp x12, x13, [x1]\n\t" "ldp x14, x15, [x1, #16]\n\t" "ldp 
x16, x17, [x2]\n\t" - "ldp x18, x19, [x2, #16]\n\t" + "ldp x19, x20, [x2, #16]\n\t" /* A[0] * B[0] */ "mul x4, x12, x16\n\t" "umulh x5, x12, x16\n\t" /* A[0] * B[1] */ - "mul x24, x12, x17\n\t" + "mul x25, x12, x17\n\t" "umulh x6, x12, x17\n\t" - "adds x5, x5, x24\n\t" + "adds x5, x5, x25\n\t" "adc x6, x6, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x13, x16\n\t" - "umulh x25, x13, x16\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" + "mul x25, x13, x16\n\t" + "umulh x26, x13, x16\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" "adc x7, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x12, x18\n\t" - "umulh x25, x12, x18\n\t" - "adds x6, x6, x24\n\t" - "adc x7, x7, x25\n\t" + "mul x25, x12, x19\n\t" + "umulh x26, x12, x19\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" /* A[1] * B[1] */ - "mul x24, x13, x17\n\t" - "umulh x25, x13, x17\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x13, x17\n\t" + "umulh x26, x13, x17\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x14, x16\n\t" - "umulh x25, x14, x16\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" + "mul x25, x14, x16\n\t" + "umulh x26, x14, x16\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" "adc x8, x8, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x12, x19\n\t" - "umulh x25, x12, x19\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x12, x20\n\t" + "umulh x26, x12, x20\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x13, x18\n\t" - "umulh x25, x13, x18\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x13, x19\n\t" + "umulh x26, x13, x19\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x14, x17\n\t" - "umulh x25, x14, x17\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x14, x17\n\t" + "umulh x26, x14, x17\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x15, x16\n\t" - "umulh x25, x15, x16\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" + "mul x25, x15, x16\n\t" + "umulh x26, x15, x16\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x13, x19\n\t" - "umulh x25, x13, x19\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x13, x20\n\t" + "umulh x26, x13, x20\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x24, x14, x18\n\t" - "umulh x25, x14, x18\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x14, x19\n\t" + "umulh x26, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[3] * B[1] */ - "mul x24, x15, x17\n\t" - "umulh x25, x15, x17\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" + "mul x25, x15, x17\n\t" + "umulh x26, x15, x17\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" /* A[2] * B[3] */ - "mul x24, x14, x19\n\t" - "umulh x25, x14, x19\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x14, x20\n\t" + "umulh x26, x14, x20\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x24, x15, x18\n\t" - "umulh x25, x15, x18\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x15, x19\n\t" + "umulh x26, x15, x19\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, x11, xzr\n\t" /* A[3] * B[3] 
*/ - "mul x24, x15, x19\n\t" - "umulh x25, x15, x19\n\t" - "adds x10, x10, x24\n\t" - "adc x11, x11, x25\n\t" + "mul x25, x15, x20\n\t" + "umulh x26, x15, x20\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x11, x11, x10, #63\n\t" @@ -6479,37 +6480,37 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "extr x8, x8, x7, #63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adds x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x10, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "mul x25, x24, x11\n\t" - "umulh x26, x24, x11\n\t" - "adcs x7, x7, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adcs x7, x7, x10\n\t" - "adc x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x7, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x7, asr 63\n\t" + "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x26\n\t" + "adds x4, x4, x27\n\t" "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" @@ -6520,114 +6521,114 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" "adc x7, x7, x7\n\t" - "mov x24, #-19\n\t" - "asr x27, x7, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x24\n\t" - "sbcs x5, x5, x27\n\t" - "sbcs x6, x6, x27\n\t" - "sbc x7, x7, x25\n\t" + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" "ldr x0, [x29, #40]\n\t" "ldr x1, [x29, #168]\n\t" "ldr x2, [x29, #72]\n\t" /* Multiply */ "ldp x16, x17, [x1]\n\t" - "ldp x18, x19, [x1, #16]\n\t" - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" /* A[0] * B[0] */ - "mul x8, x16, x20\n\t" - "umulh x9, x16, x20\n\t" + "mul x8, x16, x21\n\t" + "umulh x9, x16, x21\n\t" /* A[0] * B[1] */ - "mul x24, x16, x21\n\t" - "umulh x10, x16, x21\n\t" - "adds x9, x9, x24\n\t" + "mul x25, x16, x22\n\t" + "umulh x10, x16, x22\n\t" + "adds x9, x9, x25\n\t" "adc x10, x10, xzr\n\t" /* A[1] * B[0] */ - "mul x24, x17, x20\n\t" - "umulh x25, x17, x20\n\t" - "adds x9, x9, x24\n\t" - "adcs x10, x10, x25\n\t" + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" "adc x11, xzr, xzr\n\t" /* A[0] * B[2] */ - "mul x24, x16, x22\n\t" - "umulh x25, x16, x22\n\t" - "adds x10, x10, x24\n\t" - 
"adc x11, x11, x25\n\t" + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" /* A[1] * B[1] */ - "mul x24, x17, x21\n\t" - "umulh x25, x17, x21\n\t" - "adds x10, x10, x24\n\t" - "adcs x11, x11, x25\n\t" + "mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" "adc x12, xzr, xzr\n\t" /* A[2] * B[0] */ - "mul x24, x18, x20\n\t" - "umulh x25, x18, x20\n\t" - "adds x10, x10, x24\n\t" - "adcs x11, x11, x25\n\t" + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" "adc x12, x12, xzr\n\t" /* A[0] * B[3] */ - "mul x24, x16, x23\n\t" - "umulh x25, x16, x23\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, xzr, xzr\n\t" /* A[1] * B[2] */ - "mul x24, x17, x22\n\t" - "umulh x25, x17, x22\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, x13, xzr\n\t" /* A[2] * B[1] */ - "mul x24, x18, x21\n\t" - "umulh x25, x18, x21\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, x13, xzr\n\t" /* A[3] * B[0] */ - "mul x24, x19, x20\n\t" - "umulh x25, x19, x20\n\t" - "adds x11, x11, x24\n\t" - "adcs x12, x12, x25\n\t" + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" "adc x13, x13, xzr\n\t" /* A[1] * B[3] */ - "mul x24, x17, x23\n\t" - "umulh x25, x17, x23\n\t" - "adds x12, x12, x24\n\t" - "adcs x13, x13, x25\n\t" + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" "adc x14, xzr, xzr\n\t" /* A[2] * B[2] */ - "mul x24, x18, x22\n\t" - "umulh x25, x18, x22\n\t" - "adds x12, x12, x24\n\t" - "adcs x13, x13, x25\n\t" + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" "adc x14, x14, xzr\n\t" /* A[3] * B[1] */ - "mul x24, x19, x21\n\t" - "umulh x25, x19, x21\n\t" - "adds x12, x12, x24\n\t" - "adcs x13, x13, x25\n\t" + "mul x25, x20, x22\n\t" + "umulh x26, x20, x22\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" "adc x14, x14, xzr\n\t" /* A[2] * B[3] */ - "mul x24, x18, x23\n\t" - "umulh x25, x18, x23\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, xzr, xzr\n\t" /* A[3] * B[2] */ - "mul x24, x19, x22\n\t" - "umulh x25, x19, x22\n\t" - "adds x13, x13, x24\n\t" - "adcs x14, x14, x25\n\t" + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" "adc x15, x15, xzr\n\t" /* A[3] * B[3] */ - "mul x24, x19, x23\n\t" - "umulh x25, x19, x23\n\t" - "adds x14, x14, x24\n\t" - "adc x15, x15, x25\n\t" + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" /* Reduce */ /* Move top half into t4-t7 and remove top bit from t3 */ "extr x15, x15, x14, #63\n\t" @@ -6636,37 +6637,37 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "extr x12, x12, x11, #63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x12\n\t" 
- "umulh x12, x24, x12\n\t" - "adds x8, x8, x25\n\t" - "mul x25, x24, x13\n\t" - "umulh x13, x24, x13\n\t" - "adcs x9, x9, x25\n\t" - "mul x25, x24, x14\n\t" - "umulh x14, x24, x14\n\t" - "adcs x10, x10, x25\n\t" - "mul x25, x24, x15\n\t" - "umulh x26, x24, x15\n\t" - "adcs x11, x11, x25\n\t" - "adc x26, x26, xzr\n\t" + "mov x25, #19\n\t" + "mul x26, x25, x12\n\t" + "umulh x12, x25, x12\n\t" + "adds x8, x8, x26\n\t" + "mul x26, x25, x13\n\t" + "umulh x13, x25, x13\n\t" + "adcs x9, x9, x26\n\t" + "mul x26, x25, x14\n\t" + "umulh x14, x25, x14\n\t" + "adcs x10, x10, x26\n\t" + "mul x26, x25, x15\n\t" + "umulh x27, x25, x15\n\t" + "adcs x11, x11, x26\n\t" + "adc x27, x27, xzr\n\t" /* Add remaining product results in */ "adds x9, x9, x12\n\t" "adcs x10, x10, x13\n\t" "adcs x11, x11, x14\n\t" - "adc x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" /* Overflow */ - "extr x26, x26, x11, #63\n\t" - "mul x26, x26, x24\n\t" + "extr x27, x27, x11, #63\n\t" + "mul x27, x27, x25\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x26\n\t" + "adds x8, x8, x27\n\t" "adcs x9, x9, xzr\n\t" "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" /* Reduce if top bit set */ - "and x26, x24, x11, asr 63\n\t" + "and x27, x25, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x26\n\t" + "adds x8, x8, x27\n\t" "adcs x9, x9, xzr\n\t" "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" @@ -6678,39 +6679,39 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adcs x13, x5, x9\n\t" "adcs x14, x6, x10\n\t" "adc x15, x7, x11\n\t" - "mov x24, #-19\n\t" - "asr x27, x15, #63\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x12, x12, x24\n\t" - "sbcs x13, x13, x27\n\t" - "sbcs x14, x14, x27\n\t" - "sbc x15, x15, x25\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" /* Sub */ "subs x16, x4, x8\n\t" "sbcs x17, x5, x9\n\t" - "sbcs x18, x6, x10\n\t" - "sbcs x19, x7, x11\n\t" - "mov x24, #-19\n\t" - "csetm x27, cc\n\t" + "sbcs x19, x6, x10\n\t" + "sbcs x20, x7, x11\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" /* Mask the modulus */ - "and x24, x27, x24\n\t" - "and x25, x27, #0x7fffffffffffffff\n\t" + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x24\n\t" - "adcs x17, x17, x27\n\t" - "adcs x18, x18, x27\n\t" - "adc x19, x19, x25\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" "stp x12, x13, [x0]\n\t" "stp x14, x15, [x0, #16]\n\t" "stp x16, x17, [x1]\n\t" - "stp x18, x19, [x1, #16]\n\t" + "stp x19, x20, [x1, #16]\n\t" "ldp x29, x30, [sp], #0x50\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) : - : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); (void)qz; (void)qt2d; @@ -6718,5 +6719,4 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz (void)qyminusx; } -#endif /* WOLFSSL_ARMASM */ #endif /* __aarch64__ */ diff --git 
a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index f9dcf788e..ba2744acb 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -23,8 +23,6 @@ * cd ../scripts * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S */ - -#ifdef WOLFSSL_ARMASM #ifdef __aarch64__ .text .section .rodata @@ -127,16 +125,16 @@ Transform_Sha512_Len: stp x29, x30, [sp, #-128]! add x29, sp, #0 str x17, [x29, #16] - stp x18, x19, [x29, #24] - stp x20, x21, [x29, #40] - stp x22, x23, [x29, #56] - stp x24, x25, [x29, #72] - str x26, [x29, #88] + str x19, [x29, #24] + stp x20, x21, [x29, #32] + stp x22, x23, [x29, #48] + stp x24, x25, [x29, #64] + stp x26, x27, [x29, #80] stp d8, d9, [x29, #96] stp d10, d11, [x29, #112] adr x3, L_SHA512_transform_neon_len_k - adr x26, L_SHA512_transform_neon_len_ror8 - ld1 {v11.16b}, [x26] + adr x27, L_SHA512_transform_neon_len_ror8 + ld1 {v11.16b}, [x27] # Load digest into working vars ldp x4, x5, [x0] ldp x6, x7, [x0, #16] @@ -147,26 +145,26 @@ L_sha512_len_neon_begin: # Load W # Copy digest to add in at end ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #0x40 - mov x18, x4 + mov x19, x4 ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #0x40 - mov x19, x5 + mov x20, x5 rev64 v0.16b, v0.16b - mov x20, x6 + mov x21, x6 rev64 v1.16b, v1.16b - mov x21, x7 + mov x22, x7 rev64 v2.16b, v2.16b - mov x22, x8 + mov x23, x8 rev64 v3.16b, v3.16b - mov x23, x9 + mov x24, x9 rev64 v4.16b, v4.16b - mov x24, x10 + mov x25, x10 rev64 v5.16b, v5.16b - mov x25, x11 + mov x26, x11 rev64 v6.16b, v6.16b rev64 v7.16b, v7.16b # Pre-calc: b ^ c eor x16, x5, x6 - mov x26, #4 + mov x27, #4 # Start of 16 rounds L_sha512_len_neon_start: # Round 0 @@ -665,7 +663,7 @@ L_sha512_len_neon_start: add v7.2d, v7.2d, v9.2d add x8, x8, x4 add x4, x4, x14 - subs x26, x26, #1 + subs x27, x27, #1 bne L_sha512_len_neon_start # Round 0 mov x13, v0.d[0] @@ -1019,14 +1017,14 @@ L_sha512_len_neon_start: add x14, x14, x17 add x8, x8, x4 add x4, x4, x14 - add x11, x11, x25 - add x10, x10, x24 - add x9, x9, x23 - add x8, x8, x22 - add x7, x7, x21 - add x6, x6, x20 - add x5, x5, x19 - add x4, x4, x18 + add x11, x11, x26 + add x10, x10, x25 + add x9, x9, x24 + add x8, x8, x23 + add x7, x7, x22 + add x6, x6, x21 + add x5, x5, x20 + add x4, x4, x19 adr x3, L_SHA512_transform_neon_len_k subs w2, w2, #0x80 bne L_sha512_len_neon_begin @@ -1035,15 +1033,14 @@ L_sha512_len_neon_start: stp x8, x9, [x0, #32] stp x10, x11, [x0, #48] ldr x17, [x29, #16] - ldp x18, x19, [x29, #24] - ldp x20, x21, [x29, #40] - ldp x22, x23, [x29, #56] - ldp x24, x25, [x29, #72] - ldr x26, [x29, #88] + ldr x19, [x29, #24] + ldp x20, x21, [x29, #32] + ldp x22, x23, [x29, #48] + ldp x24, x25, [x29, #64] + ldp x26, x27, [x29, #80] ldp d8, d9, [x29, #96] ldp d10, d11, [x29, #112] ldp x29, x30, [sp], #0x80 ret .size Transform_Sha512_Len,.-Transform_Sha512_Len #endif /* __aarch64__ */ -#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-sha512-asm.c index 4e1807f01..fa490c635 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.c +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.c @@ -24,16 +24,7 @@ * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c */ #ifdef __aarch64__ - #include -#ifdef HAVE_CONFIG_H - #include -#endif - -#include - -#ifdef WOLFSSL_ARMASM - #include static const uint64_t L_SHA512_transform_neon_len_k[] = { @@ -130,8 +121,8 @@ void 
Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" - "adr x26, %[L_SHA512_transform_neon_len_ror8]\n\t" - "ld1 {v11.16b}, [x26]\n\t" + "adr x27, %[L_SHA512_transform_neon_len_ror8]\n\t" + "ld1 {v11.16b}, [x27]\n\t" /* Load digest into working vars */ "ldp x4, x5, [%x[sha512]]\n\t" "ldp x6, x7, [%x[sha512], #16]\n\t" @@ -143,26 +134,26 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) /* Load W */ /* Copy digest to add in at end */ "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t" - "mov x18, x4\n\t" + "mov x19, x4\n\t" "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t" - "mov x19, x5\n\t" + "mov x20, x5\n\t" "rev64 v0.16b, v0.16b\n\t" - "mov x20, x6\n\t" + "mov x21, x6\n\t" "rev64 v1.16b, v1.16b\n\t" - "mov x21, x7\n\t" + "mov x22, x7\n\t" "rev64 v2.16b, v2.16b\n\t" - "mov x22, x8\n\t" + "mov x23, x8\n\t" "rev64 v3.16b, v3.16b\n\t" - "mov x23, x9\n\t" + "mov x24, x9\n\t" "rev64 v4.16b, v4.16b\n\t" - "mov x24, x10\n\t" + "mov x25, x10\n\t" "rev64 v5.16b, v5.16b\n\t" - "mov x25, x11\n\t" + "mov x26, x11\n\t" "rev64 v6.16b, v6.16b\n\t" "rev64 v7.16b, v7.16b\n\t" /* Pre-calc: b ^ c */ "eor x16, x5, x6\n\t" - "mov x26, #4\n\t" + "mov x27, #4\n\t" /* Start of 16 rounds */ "\n" "L_sha512_len_neon_start_%=: \n\t" @@ -662,7 +653,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v7.2d, v7.2d, v9.2d\n\t" "add x8, x8, x4\n\t" "add x4, x4, x14\n\t" - "subs x26, x26, #1\n\t" + "subs x27, x27, #1\n\t" "bne L_sha512_len_neon_start_%=\n\t" /* Round 0 */ "mov x13, v0.d[0]\n\t" @@ -1016,14 +1007,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add x14, x14, x17\n\t" "add x8, x8, x4\n\t" "add x4, x4, x14\n\t" - "add x11, x11, x25\n\t" - "add x10, x10, x24\n\t" - "add x9, x9, x23\n\t" - "add x8, x8, x22\n\t" - "add x7, x7, x21\n\t" - "add x6, x6, x20\n\t" - "add x5, x5, x19\n\t" - "add x4, x4, x18\n\t" + "add x11, x11, x26\n\t" + "add x10, x10, x25\n\t" + "add x9, x9, x24\n\t" + "add x8, x8, x23\n\t" + "add x7, x7, x22\n\t" + "add x6, x6, x21\n\t" + "add x5, x5, x20\n\t" + "add x4, x4, x19\n\t" "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" "subs %w[len], %w[len], #0x80\n\t" "bne L_sha512_len_neon_begin_%=\n\t" @@ -1034,9 +1025,8 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldp x29, x30, [sp], #16\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" ); } -#endif /* WOLFSSL_ARMASM */ #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 0f90efba0..d072703d2 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -194,9 +194,9 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "ldp x12, x13, [%[a], 32]\n\t" "ldp x14, x15, 
[%[a], 48]\n\t" "ldp x16, x17, [%[b], 0]\n\t" - "ldp x18, x19, [%[b], 16]\n\t" - "ldp x20, x21, [%[b], 32]\n\t" - "ldp x22, x23, [%[b], 48]\n\t" + "ldp x19, x20, [%[b], 16]\n\t" + "ldp x21, x22, [%[b], 32]\n\t" + "ldp x23, x24, [%[b], 48]\n\t" "# A[0] * B[0]\n\t" "mul x3, x8, x16\n\t" "umulh x4, x8, x16\n\t" @@ -214,8 +214,8 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x3, xzr, xzr\n\t" "str x4, [%[tmp], 8]\n\t" "# A[0] * B[2]\n\t" - "mul x6, x8, x18\n\t" - "umulh x7, x8, x18\n\t" + "mul x6, x8, x19\n\t" + "umulh x7, x8, x19\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" @@ -233,14 +233,14 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x4, x4, xzr\n\t" "str x5, [%[tmp], 16]\n\t" "# A[0] * B[3]\n\t" - "mul x6, x8, x19\n\t" - "umulh x7, x8, x19\n\t" + "mul x6, x8, x20\n\t" + "umulh x7, x8, x20\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[1] * B[2]\n\t" - "mul x6, x9, x18\n\t" - "umulh x7, x9, x18\n\t" + "mul x6, x9, x19\n\t" + "umulh x7, x9, x19\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" @@ -258,20 +258,20 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x5, x5, xzr\n\t" "str x3, [%[tmp], 24]\n\t" "# A[0] * B[4]\n\t" - "mul x6, x8, x20\n\t" - "umulh x7, x8, x20\n\t" + "mul x6, x8, x21\n\t" + "umulh x7, x8, x21\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[1] * B[3]\n\t" - "mul x6, x9, x19\n\t" - "umulh x7, x9, x19\n\t" + "mul x6, x9, x20\n\t" + "umulh x7, x9, x20\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[2] * B[2]\n\t" - "mul x6, x10, x18\n\t" - "umulh x7, x10, x18\n\t" + "mul x6, x10, x19\n\t" + "umulh x7, x10, x19\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" @@ -289,26 +289,26 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x3, x3, xzr\n\t" "str x4, [%[tmp], 32]\n\t" "# A[0] * B[5]\n\t" - "mul x6, x8, x21\n\t" - "umulh x7, x8, x21\n\t" + "mul x6, x8, x22\n\t" + "umulh x7, x8, x22\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" "# A[1] * B[4]\n\t" - "mul x6, x9, x20\n\t" - "umulh x7, x9, x20\n\t" + "mul x6, x9, x21\n\t" + "umulh x7, x9, x21\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[2] * B[3]\n\t" - "mul x6, x10, x19\n\t" - "umulh x7, x10, x19\n\t" + "mul x6, x10, x20\n\t" + "umulh x7, x10, x20\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[3] * B[2]\n\t" - "mul x6, x11, x18\n\t" - "umulh x7, x11, x18\n\t" + "mul x6, x11, x19\n\t" + "umulh x7, x11, x19\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" @@ -326,32 +326,32 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x4, x4, xzr\n\t" "str x5, [%[tmp], 40]\n\t" "# A[0] * B[6]\n\t" - "mul x6, x8, x22\n\t" - "umulh x7, x8, x22\n\t" + "mul x6, x8, x23\n\t" + "umulh x7, x8, x23\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[1] * B[5]\n\t" - "mul x6, x9, x21\n\t" - "umulh x7, x9, x21\n\t" + "mul x6, x9, x22\n\t" + "umulh x7, x9, x22\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[2] * B[4]\n\t" - "mul x6, x10, x20\n\t" - "umulh x7, x10, x20\n\t" + "mul x6, x10, x21\n\t" + "umulh x7, x10, x21\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[3] * B[3]\n\t" - "mul x6, x11, x19\n\t" - "umulh x7, x11, 
x19\n\t" + "mul x6, x11, x20\n\t" + "umulh x7, x11, x20\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[4] * B[2]\n\t" - "mul x6, x12, x18\n\t" - "umulh x7, x12, x18\n\t" + "mul x6, x12, x19\n\t" + "umulh x7, x12, x19\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" @@ -369,38 +369,38 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x5, x5, xzr\n\t" "str x3, [%[tmp], 48]\n\t" "# A[0] * B[7]\n\t" - "mul x6, x8, x23\n\t" - "umulh x7, x8, x23\n\t" + "mul x6, x8, x24\n\t" + "umulh x7, x8, x24\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[1] * B[6]\n\t" - "mul x6, x9, x22\n\t" - "umulh x7, x9, x22\n\t" + "mul x6, x9, x23\n\t" + "umulh x7, x9, x23\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[2] * B[5]\n\t" - "mul x6, x10, x21\n\t" - "umulh x7, x10, x21\n\t" + "mul x6, x10, x22\n\t" + "umulh x7, x10, x22\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[3] * B[4]\n\t" - "mul x6, x11, x20\n\t" - "umulh x7, x11, x20\n\t" + "mul x6, x11, x21\n\t" + "umulh x7, x11, x21\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[4] * B[3]\n\t" - "mul x6, x12, x19\n\t" - "umulh x7, x12, x19\n\t" + "mul x6, x12, x20\n\t" + "umulh x7, x12, x20\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[5] * B[2]\n\t" - "mul x6, x13, x18\n\t" - "umulh x7, x13, x18\n\t" + "mul x6, x13, x19\n\t" + "umulh x7, x13, x19\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" @@ -418,38 +418,38 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x3, x3, xzr\n\t" "str x4, [%[tmp], 56]\n\t" "# A[1] * B[7]\n\t" - "mul x6, x9, x23\n\t" - "umulh x7, x9, x23\n\t" + "mul x6, x9, x24\n\t" + "umulh x7, x9, x24\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" "# A[2] * B[6]\n\t" - "mul x6, x10, x22\n\t" - "umulh x7, x10, x22\n\t" + "mul x6, x10, x23\n\t" + "umulh x7, x10, x23\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[3] * B[5]\n\t" - "mul x6, x11, x21\n\t" - "umulh x7, x11, x21\n\t" + "mul x6, x11, x22\n\t" + "umulh x7, x11, x22\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[4] * B[4]\n\t" - "mul x6, x12, x20\n\t" - "umulh x7, x12, x20\n\t" + "mul x6, x12, x21\n\t" + "umulh x7, x12, x21\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[5] * B[3]\n\t" - "mul x6, x13, x19\n\t" - "umulh x7, x13, x19\n\t" + "mul x6, x13, x20\n\t" + "umulh x7, x13, x20\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[6] * B[2]\n\t" - "mul x6, x14, x18\n\t" - "umulh x7, x14, x18\n\t" + "mul x6, x14, x19\n\t" + "umulh x7, x14, x19\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" @@ -461,139 +461,139 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x4, x4, xzr\n\t" "str x5, [%[r], 64]\n\t" "# A[2] * B[7]\n\t" - "mul x6, x10, x23\n\t" - "umulh x7, x10, x23\n\t" + "mul x6, x10, x24\n\t" + "umulh x7, x10, x24\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[3] * B[6]\n\t" - "mul x6, x11, x22\n\t" - "umulh x7, x11, x22\n\t" + "mul x6, x11, x23\n\t" + "umulh x7, x11, x23\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[4] * B[5]\n\t" - "mul x6, x12, x21\n\t" - "umulh x7, x12, x21\n\t" + "mul x6, x12, x22\n\t" + "umulh x7, x12, x22\n\t" "adds x3, x3, 
x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[5] * B[4]\n\t" - "mul x6, x13, x20\n\t" - "umulh x7, x13, x20\n\t" + "mul x6, x13, x21\n\t" + "umulh x7, x13, x21\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[6] * B[3]\n\t" - "mul x6, x14, x19\n\t" - "umulh x7, x14, x19\n\t" + "mul x6, x14, x20\n\t" + "umulh x7, x14, x20\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[7] * B[2]\n\t" - "mul x6, x15, x18\n\t" - "umulh x7, x15, x18\n\t" + "mul x6, x15, x19\n\t" + "umulh x7, x15, x19\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "str x3, [%[r], 72]\n\t" "# A[3] * B[7]\n\t" - "mul x6, x11, x23\n\t" - "umulh x7, x11, x23\n\t" + "mul x6, x11, x24\n\t" + "umulh x7, x11, x24\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[4] * B[6]\n\t" - "mul x6, x12, x22\n\t" - "umulh x7, x12, x22\n\t" + "mul x6, x12, x23\n\t" + "umulh x7, x12, x23\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[5] * B[5]\n\t" - "mul x6, x13, x21\n\t" - "umulh x7, x13, x21\n\t" + "mul x6, x13, x22\n\t" + "umulh x7, x13, x22\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[6] * B[4]\n\t" - "mul x6, x14, x20\n\t" - "umulh x7, x14, x20\n\t" + "mul x6, x14, x21\n\t" + "umulh x7, x14, x21\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[7] * B[3]\n\t" - "mul x6, x15, x19\n\t" - "umulh x7, x15, x19\n\t" + "mul x6, x15, x20\n\t" + "umulh x7, x15, x20\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "str x4, [%[r], 80]\n\t" "# A[4] * B[7]\n\t" - "mul x6, x12, x23\n\t" - "umulh x7, x12, x23\n\t" + "mul x6, x12, x24\n\t" + "umulh x7, x12, x24\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" "# A[5] * B[6]\n\t" - "mul x6, x13, x22\n\t" - "umulh x7, x13, x22\n\t" + "mul x6, x13, x23\n\t" + "umulh x7, x13, x23\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[6] * B[5]\n\t" - "mul x6, x14, x21\n\t" - "umulh x7, x14, x21\n\t" + "mul x6, x14, x22\n\t" + "umulh x7, x14, x22\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[7] * B[4]\n\t" - "mul x6, x15, x20\n\t" - "umulh x7, x15, x20\n\t" + "mul x6, x15, x21\n\t" + "umulh x7, x15, x21\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "str x5, [%[r], 88]\n\t" "# A[5] * B[7]\n\t" - "mul x6, x13, x23\n\t" - "umulh x7, x13, x23\n\t" + "mul x6, x13, x24\n\t" + "umulh x7, x13, x24\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[6] * B[6]\n\t" - "mul x6, x14, x22\n\t" - "umulh x7, x14, x22\n\t" + "mul x6, x14, x23\n\t" + "umulh x7, x14, x23\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[7] * B[5]\n\t" - "mul x6, x15, x21\n\t" - "umulh x7, x15, x21\n\t" + "mul x6, x15, x22\n\t" + "umulh x7, x15, x22\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "str x3, [%[r], 96]\n\t" "# A[6] * B[7]\n\t" - "mul x6, x14, x23\n\t" - "umulh x7, x14, x23\n\t" + "mul x6, x14, x24\n\t" + "umulh x7, x14, x24\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[7] * B[6]\n\t" - "mul x6, x15, x22\n\t" - "umulh x7, x15, x22\n\t" + "mul x6, x15, x23\n\t" + "umulh x7, x15, x23\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "str x4, [%[r], 104]\n\t" "# A[7] * B[7]\n\t" - "mul x6, x15, x23\n\t" - "umulh x7, x15, x23\n\t" + "mul x6, x15, x24\n\t" + "umulh x7, x15, x24\n\t" "adds x5, x5, 
x6\n\t" "adc x3, x3, x7\n\t" "stp x5, x3, [%[r], 112]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24" ); XMEMCPY(r, tmp, sizeof(tmp)); @@ -2352,11 +2352,11 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -2414,8 +2414,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2423,8 +2423,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2432,8 +2432,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2441,8 +2441,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2450,8 +2450,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, [%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2459,8 +2459,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2468,8 +2468,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2477,8 +2477,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" 
"ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2486,8 +2486,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2495,8 +2495,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x5, x5, x6\n\t" "adcs x7, x7, %[ca]\n\t" "cset %[ca], cs\n\t" @@ -2515,7 +2515,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca); @@ -3537,11 +3537,11 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -3599,8 +3599,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3608,8 +3608,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3617,8 +3617,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3626,8 +3626,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3635,8 +3635,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, 
[%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3644,8 +3644,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3653,8 +3653,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3662,8 +3662,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" "ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3671,8 +3671,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3680,8 +3680,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3860,7 +3860,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca); @@ -6770,8 +6770,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "ldp x12, x13, [%[a], 16]\n\t" "ldp x14, x15, [%[a], 32]\n\t" "ldp x16, x17, [%[a], 48]\n\t" - "ldp x18, x19, [%[a], 64]\n\t" - "ldp x20, x21, [%[a], 80]\n\t" + "ldp x19, x20, [%[a], 64]\n\t" + "ldp x21, x22, [%[a], 80]\n\t" "# A[0] * A[0]\n\t" "mul x2, x10, x10\n\t" "umulh x3, x10, x10\n\t" @@ -6932,8 +6932,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[tmp], 56]\n\t" "# A[0] * A[8]\n\t" - "mul x5, x10, x18\n\t" - "umulh x6, x10, x18\n\t" + "mul x5, x10, x19\n\t" + "umulh x6, x10, x19\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[7]\n\t" @@ -6968,13 +6968,13 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[tmp], 64]\n\t" "# A[0] * A[9]\n\t" - "mul x5, x10, x19\n\t" - "umulh x6, x10, x19\n\t" + "mul x5, x10, x20\n\t" + "umulh x6, x10, x20\n\t" "mov x4, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[8]\n\t" - "mul x8, x11, x18\n\t" - "umulh x9, x11, x18\n\t" + "mul x8, x11, x19\n\t" + "umulh x9, x11, 
x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7004,19 +7004,19 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, x7\n\t" "str x2, [%[tmp], 72]\n\t" "# A[0] * A[10]\n\t" - "mul x5, x10, x20\n\t" - "umulh x6, x10, x20\n\t" + "mul x5, x10, x21\n\t" + "umulh x6, x10, x21\n\t" "mov x2, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[9]\n\t" - "mul x8, x11, x19\n\t" - "umulh x9, x11, x19\n\t" + "mul x8, x11, x20\n\t" + "umulh x9, x11, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[2] * A[8]\n\t" - "mul x8, x12, x18\n\t" - "umulh x9, x12, x18\n\t" + "mul x8, x12, x19\n\t" + "umulh x9, x12, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7046,25 +7046,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[tmp], 80]\n\t" "# A[0] * A[11]\n\t" - "mul x5, x10, x21\n\t" - "umulh x6, x10, x21\n\t" + "mul x5, x10, x22\n\t" + "umulh x6, x10, x22\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[10]\n\t" - "mul x8, x11, x20\n\t" - "umulh x9, x11, x20\n\t" + "mul x8, x11, x21\n\t" + "umulh x9, x11, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[2] * A[9]\n\t" - "mul x8, x12, x19\n\t" - "umulh x9, x12, x19\n\t" + "mul x8, x12, x20\n\t" + "umulh x9, x12, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[3] * A[8]\n\t" - "mul x8, x13, x18\n\t" - "umulh x9, x13, x18\n\t" + "mul x8, x13, x19\n\t" + "umulh x9, x13, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7088,25 +7088,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[tmp], 88]\n\t" "# A[1] * A[11]\n\t" - "mul x5, x11, x21\n\t" - "umulh x6, x11, x21\n\t" + "mul x5, x11, x22\n\t" + "umulh x6, x11, x22\n\t" "mov x4, 0\n\t" "mov x7, 0\n\t" "# A[2] * A[10]\n\t" - "mul x8, x12, x20\n\t" - "umulh x9, x12, x20\n\t" + "mul x8, x12, x21\n\t" + "umulh x9, x12, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[3] * A[9]\n\t" - "mul x8, x13, x19\n\t" - "umulh x9, x13, x19\n\t" + "mul x8, x13, x20\n\t" + "umulh x9, x13, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[4] * A[8]\n\t" - "mul x8, x14, x18\n\t" - "umulh x9, x14, x18\n\t" + "mul x8, x14, x19\n\t" + "umulh x9, x14, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7130,25 +7130,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, x7\n\t" "str x2, [%[r], 96]\n\t" "# A[2] * A[11]\n\t" - "mul x5, x12, x21\n\t" - "umulh x6, x12, x21\n\t" + "mul x5, x12, x22\n\t" + "umulh x6, x12, x22\n\t" "mov x2, 0\n\t" "mov x7, 0\n\t" "# A[3] * A[10]\n\t" - "mul x8, x13, x20\n\t" - "umulh x9, x13, x20\n\t" + "mul x8, x13, x21\n\t" + "umulh x9, x13, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[4] * A[9]\n\t" - "mul x8, x14, x19\n\t" - "umulh x9, x14, x19\n\t" + "mul x8, x14, x20\n\t" + "umulh x9, x14, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[5] * A[8]\n\t" - "mul x8, x15, x18\n\t" - "umulh x9, x15, x18\n\t" + "mul x8, x15, x19\n\t" + "umulh x9, x15, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7166,25 +7166,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[r], 104]\n\t" "# A[3] * A[11]\n\t" - "mul x5, x13, x21\n\t" - "umulh x6, x13, x21\n\t" + "mul x5, x13, x22\n\t" + "umulh 
x6, x13, x22\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[4] * A[10]\n\t" - "mul x8, x14, x20\n\t" - "umulh x9, x14, x20\n\t" + "mul x8, x14, x21\n\t" + "umulh x9, x14, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[5] * A[9]\n\t" - "mul x8, x15, x19\n\t" - "umulh x9, x15, x19\n\t" + "mul x8, x15, x20\n\t" + "umulh x9, x15, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[6] * A[8]\n\t" - "mul x8, x16, x18\n\t" - "umulh x9, x16, x18\n\t" + "mul x8, x16, x19\n\t" + "umulh x9, x16, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7202,25 +7202,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[r], 112]\n\t" "# A[4] * A[11]\n\t" - "mul x5, x14, x21\n\t" - "umulh x6, x14, x21\n\t" + "mul x5, x14, x22\n\t" + "umulh x6, x14, x22\n\t" "mov x4, 0\n\t" "mov x7, 0\n\t" "# A[5] * A[10]\n\t" - "mul x8, x15, x20\n\t" - "umulh x9, x15, x20\n\t" + "mul x8, x15, x21\n\t" + "umulh x9, x15, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[6] * A[9]\n\t" - "mul x8, x16, x19\n\t" - "umulh x9, x16, x19\n\t" + "mul x8, x16, x20\n\t" + "umulh x9, x16, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[7] * A[8]\n\t" - "mul x8, x17, x18\n\t" - "umulh x9, x17, x18\n\t" + "mul x8, x17, x19\n\t" + "umulh x9, x17, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7232,25 +7232,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, x7\n\t" "str x2, [%[r], 120]\n\t" "# A[5] * A[11]\n\t" - "mul x5, x15, x21\n\t" - "umulh x6, x15, x21\n\t" + "mul x5, x15, x22\n\t" + "umulh x6, x15, x22\n\t" "mov x2, 0\n\t" "mov x7, 0\n\t" "# A[6] * A[10]\n\t" - "mul x8, x16, x20\n\t" - "umulh x9, x16, x20\n\t" + "mul x8, x16, x21\n\t" + "umulh x9, x16, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[7] * A[9]\n\t" - "mul x8, x17, x19\n\t" - "umulh x9, x17, x19\n\t" + "mul x8, x17, x20\n\t" + "umulh x9, x17, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[8] * A[8]\n\t" - "mul x8, x18, x18\n\t" - "umulh x9, x18, x18\n\t" + "mul x8, x19, x19\n\t" + "umulh x9, x19, x19\n\t" "adds x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" "adc x7, x7, x7\n\t" @@ -7262,19 +7262,19 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[r], 128]\n\t" "# A[6] * A[11]\n\t" - "mul x5, x16, x21\n\t" - "umulh x6, x16, x21\n\t" + "mul x5, x16, x22\n\t" + "umulh x6, x16, x22\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[7] * A[10]\n\t" - "mul x8, x17, x20\n\t" - "umulh x9, x17, x20\n\t" + "mul x8, x17, x21\n\t" + "umulh x9, x17, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[8] * A[9]\n\t" - "mul x8, x18, x19\n\t" - "umulh x9, x18, x19\n\t" + "mul x8, x19, x20\n\t" + "umulh x9, x19, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7286,8 +7286,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[r], 136]\n\t" "# A[7] * A[11]\n\t" - "mul x8, x17, x21\n\t" - "umulh x9, x17, x21\n\t" + "mul x8, x17, x22\n\t" + "umulh x9, x17, x22\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, xzr, xzr\n\t" @@ -7295,8 +7295,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" "# A[8] * A[10]\n\t" - "mul x8, x18, x20\n\t" - "umulh x9, x18, x20\n\t" + "mul x8, x19, x21\n\t" + 
"umulh x9, x19, x21\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" @@ -7304,15 +7304,15 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" "# A[9] * A[9]\n\t" - "mul x8, x19, x19\n\t" - "umulh x9, x19, x19\n\t" + "mul x8, x20, x20\n\t" + "umulh x9, x20, x20\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" "str x2, [%[r], 144]\n\t" "# A[8] * A[11]\n\t" - "mul x8, x18, x21\n\t" - "umulh x9, x18, x21\n\t" + "mul x8, x19, x22\n\t" + "umulh x9, x19, x22\n\t" "adds x3, x3, x8\n\t" "adcs x4, x4, x9\n\t" "adc x2, xzr, xzr\n\t" @@ -7320,8 +7320,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x4, x4, x9\n\t" "adc x2, x2, xzr\n\t" "# A[9] * A[10]\n\t" - "mul x8, x19, x20\n\t" - "umulh x9, x19, x20\n\t" + "mul x8, x20, x21\n\t" + "umulh x9, x20, x21\n\t" "adds x3, x3, x8\n\t" "adcs x4, x4, x9\n\t" "adc x2, x2, xzr\n\t" @@ -7330,8 +7330,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, xzr\n\t" "str x3, [%[r], 152]\n\t" "# A[9] * A[11]\n\t" - "mul x8, x19, x21\n\t" - "umulh x9, x19, x21\n\t" + "mul x8, x20, x22\n\t" + "umulh x9, x20, x22\n\t" "adds x4, x4, x8\n\t" "adcs x2, x2, x9\n\t" "adc x3, xzr, xzr\n\t" @@ -7339,15 +7339,15 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x2, x2, x9\n\t" "adc x3, x3, xzr\n\t" "# A[10] * A[10]\n\t" - "mul x8, x20, x20\n\t" - "umulh x9, x20, x20\n\t" + "mul x8, x21, x21\n\t" + "umulh x9, x21, x21\n\t" "adds x4, x4, x8\n\t" "adcs x2, x2, x9\n\t" "adc x3, x3, xzr\n\t" "str x4, [%[r], 160]\n\t" "# A[10] * A[11]\n\t" - "mul x8, x20, x21\n\t" - "umulh x9, x20, x21\n\t" + "mul x8, x21, x22\n\t" + "umulh x9, x21, x22\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, xzr, xzr\n\t" @@ -7356,14 +7356,14 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, xzr\n\t" "str x2, [%[r], 168]\n\t" "# A[11] * A[11]\n\t" - "mul x8, x21, x21\n\t" - "umulh x9, x21, x21\n\t" + "mul x8, x22, x22\n\t" + "umulh x9, x22, x22\n\t" "adds x3, x3, x8\n\t" "adc x4, x4, x9\n\t" "stp x3, x4, [%[r], 176]\n\t" : : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) - : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21" + : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" ); XMEMCPY(r, tmp, sizeof(tmp)); @@ -9129,11 +9129,11 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -9191,8 +9191,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9200,8 +9200,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# 
a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9209,8 +9209,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9218,8 +9218,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9227,8 +9227,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, [%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9236,8 +9236,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9245,8 +9245,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9254,8 +9254,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" "ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9263,8 +9263,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9272,8 +9272,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9372,7 +9372,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_3072_cond_sub_24(a - 24, a, m, (sp_digit)0 - ca); @@ -10610,11 +10610,11 @@ SP_NOINLINE 
static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -10672,8 +10672,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10681,8 +10681,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10690,8 +10690,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10699,8 +10699,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10708,8 +10708,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, [%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10717,8 +10717,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10726,8 +10726,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10735,8 +10735,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" "ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10744,8 +10744,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" 
"adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10753,8 +10753,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -11093,7 +11093,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - ca); @@ -13535,102 +13535,102 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, sp_digit* a, sp_digit* b, (void)mp; __asm__ __volatile__ ( - "ldr x18, [%[a], 0]\n\t" - "ldr x19, [%[a], 8]\n\t" - "ldr x20, [%[a], 16]\n\t" - "ldr x21, [%[a], 24]\n\t" - "ldr x22, [%[b], 0]\n\t" - "ldr x23, [%[b], 8]\n\t" - "ldr x24, [%[b], 16]\n\t" - "ldr x25, [%[b], 24]\n\t" + "ldr x19, [%[a], 0]\n\t" + "ldr x20, [%[a], 8]\n\t" + "ldr x21, [%[a], 16]\n\t" + "ldr x22, [%[a], 24]\n\t" + "ldr x23, [%[b], 0]\n\t" + "ldr x24, [%[b], 8]\n\t" + "ldr x25, [%[b], 16]\n\t" + "ldr x26, [%[b], 24]\n\t" "# A[0] * B[0]\n\t" - "mul x10, x18, x22\n\t" - "umulh x11, x18, x22\n\t" + "mul x10, x19, x23\n\t" + "umulh x11, x19, x23\n\t" "# A[0] * B[1]\n\t" - "mul x5, x18, x23\n\t" - "umulh x6, x18, x23\n\t" + "mul x5, x19, x24\n\t" + "umulh x6, x19, x24\n\t" "adds x11, x11, x5\n\t" "adc x12, xzr, x6\n\t" "# A[1] * B[0]\n\t" - "mul x5, x19, x22\n\t" - "umulh x6, x19, x22\n\t" + "mul x5, x20, x23\n\t" + "umulh x6, x20, x23\n\t" "adds x11, x11, x5\n\t" "adcs x12, x12, x6\n\t" "adc x13, xzr, xzr\n\t" "# A[0] * B[2]\n\t" - "mul x5, x18, x24\n\t" - "umulh x6, x18, x24\n\t" + "mul x5, x19, x25\n\t" + "umulh x6, x19, x25\n\t" "adds x12, x12, x5\n\t" "adc x13, x13, x6\n\t" "# A[1] * B[1]\n\t" - "mul x5, x19, x23\n\t" - "umulh x6, x19, x23\n\t" + "mul x5, x20, x24\n\t" + "umulh x6, x20, x24\n\t" "adds x12, x12, x5\n\t" "adcs x13, x13, x6\n\t" "adc x14, xzr, xzr\n\t" "# A[2] * B[0]\n\t" - "mul x5, x20, x22\n\t" - "umulh x6, x20, x22\n\t" + "mul x5, x21, x23\n\t" + "umulh x6, x21, x23\n\t" "adds x12, x12, x5\n\t" "adcs x13, x13, x6\n\t" "adc x14, x14, xzr\n\t" "# A[0] * B[3]\n\t" - "mul x5, x18, x25\n\t" - "umulh x6, x18, x25\n\t" + "mul x5, x19, x26\n\t" + "umulh x6, x19, x26\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, xzr, xzr\n\t" "# A[1] * B[2]\n\t" - "mul x5, x19, x24\n\t" - "umulh x6, x19, x24\n\t" + "mul x5, x20, x25\n\t" + "umulh x6, x20, x25\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, x15, xzr\n\t" "# A[2] * B[1]\n\t" - "mul x5, x20, x23\n\t" - "umulh x6, x20, x23\n\t" + "mul x5, x21, x24\n\t" + "umulh x6, x21, x24\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, x15, xzr\n\t" "# A[3] * B[0]\n\t" - "mul x5, x21, x22\n\t" - "umulh x6, x21, x22\n\t" + "mul x5, x22, x23\n\t" + "umulh x6, x22, x23\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, x15, xzr\n\t" "# A[1] * B[3]\n\t" - "mul x5, x19, x25\n\t" - "umulh x6, x19, x25\n\t" + "mul x5, x20, x26\n\t" + "umulh x6, x20, x26\n\t" "adds x14, x14, 
x5\n\t" "adcs x15, x15, x6\n\t" "adc x16, xzr, xzr\n\t" "# A[2] * B[2]\n\t" - "mul x5, x20, x24\n\t" - "umulh x6, x20, x24\n\t" + "mul x5, x21, x25\n\t" + "umulh x6, x21, x25\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, x6\n\t" "adc x16, x16, xzr\n\t" "# A[3] * B[1]\n\t" - "mul x5, x21, x23\n\t" - "umulh x6, x21, x23\n\t" + "mul x5, x22, x24\n\t" + "umulh x6, x22, x24\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, x6\n\t" "adc x16, x16, xzr\n\t" "# A[2] * B[3]\n\t" - "mul x5, x20, x25\n\t" - "umulh x6, x20, x25\n\t" + "mul x5, x21, x26\n\t" + "umulh x6, x21, x26\n\t" "adds x15, x15, x5\n\t" "adcs x16, x16, x6\n\t" "adc x17, xzr, xzr\n\t" "# A[3] * B[2]\n\t" - "mul x5, x21, x24\n\t" - "umulh x6, x21, x24\n\t" + "mul x5, x22, x25\n\t" + "umulh x6, x22, x25\n\t" "adds x15, x15, x5\n\t" "adcs x16, x16, x6\n\t" "adc x17, x17, xzr\n\t" "# A[3] * B[3]\n\t" - "mul x5, x21, x25\n\t" - "umulh x6, x21, x25\n\t" + "mul x5, x22, x26\n\t" + "umulh x6, x22, x26\n\t" "adds x16, x16, x5\n\t" "adc x17, x17, x6\n\t" "# Start Reduction\n\t" @@ -13645,12 +13645,12 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, sp_digit* a, sp_digit* b, "add x8, x8, x10\n\t" "# a[0]-a[2] << 32\n\t" "lsl x10, x10, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x11, x6, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x12, x7, 32\n\t" - "eor x11, x11, x18\n\t" - "eor x12, x12, x19\n\t" + "eor x11, x11, x19\n\t" + "eor x12, x12, x20\n\t" "# - a[0] << 32 << 192\n\t" "sub x8, x8, x10\n\t" "# + a[0]-a[2] << 32 << 64\n\t" @@ -13670,47 +13670,47 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, sp_digit* a, sp_digit* b, "adcs x15, x15, x7\n\t" "adcs x16, x16, x8\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# mu <<= 32\n\t" "lsr x9, x8, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x5, x5, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x6, x6, 32\n\t" - "lsr x20, x7, 32\n\t" + "lsr x21, x7, 32\n\t" "lsl x7, x7, 32\n\t" "lsl x8, x8, 32\n\t" - "eor x6, x6, x18\n\t" - "eor x7, x7, x19\n\t" - "eor x8, x8, x20\n\t" + "eor x6, x6, x19\n\t" + "eor x7, x7, x20\n\t" + "eor x8, x8, x21\n\t" "# a += (mu << 32) << 64\n\t" "adds x13, x13, x7\n\t" "adcs x14, x14, x8\n\t" "adcs x15, x15, x9\n\t" "adcs x16, x16, xzr\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# a -= (mu << 32) << 192\n\t" "subs x13, x13, x5\n\t" - "mov x18, 0xffffffff\n\t" + "mov x19, 0xffffffff\n\t" "sbcs x14, x14, x6\n\t" - "mov x19, 0xffffffff00000001\n\t" + "mov x20, 0xffffffff00000001\n\t" "sbcs x15, x15, x7\n\t" "sbcs x16, x16, x8\n\t" "sbcs x17, x17, x9\n\t" - "cset x20, cc\n\t" - "add x10, x10, x20\n\t" + "cset x21, cc\n\t" + "add x10, x10, x21\n\t" "# mask m and sub from result if overflow\n\t" "# m[0] = -1 & mask = mask\n\t" - "and x18, x18, x10\n\t" - "# m[2] = 0 & mask = 0\n\t" "and x19, x19, x10\n\t" + "# m[2] = 0 & mask = 0\n\t" + "and x20, x20, x10\n\t" "subs x14, x14, x10\n\t" - "sbcs x15, x15, x18\n\t" + "sbcs x15, x15, x19\n\t" "sbcs x16, x16, xzr\n\t" - "sbc x17, x17, x19\n\t" + "sbc x17, x17, x20\n\t" "str x14, [%[r], 0]\n\t" "str x15, [%[r], 8]\n\t" "str x16, [%[r], 16]\n\t" @@ -13718,8 +13718,8 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, sp_digit* a, sp_digit* b, : [m] "+r" (m), [a] "+r" (a), [b] "+r" (b) : [r] "r" (r) : "memory", "x5", "x6", "x7", "x8", "x9", - "x18", "x19", "x20", "x21", - "x22", "x23", "x24", 
"x25", + "x19", "x20", "x21", "x22", + "x23", "x24", "x25", "x26", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" ); } @@ -13735,37 +13735,37 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, sp_digit* a, sp_digit* m, sp_digit mp) { __asm__ __volatile__ ( - "ldr x18, [%[a], 0]\n\t" - "ldr x19, [%[a], 8]\n\t" - "ldr x20, [%[a], 16]\n\t" - "ldr x21, [%[a], 24]\n\t" + "ldr x19, [%[a], 0]\n\t" + "ldr x20, [%[a], 8]\n\t" + "ldr x21, [%[a], 16]\n\t" + "ldr x22, [%[a], 24]\n\t" "# A[0] * A[1]\n\t" - "mul x11, x18, x19\n\t" - "umulh x12, x18, x19\n\t" + "mul x11, x19, x20\n\t" + "umulh x12, x19, x20\n\t" "# A[0] * A[2]\n\t" - "mul x5, x18, x20\n\t" - "umulh x6, x18, x20\n\t" + "mul x5, x19, x21\n\t" + "umulh x6, x19, x21\n\t" "adds x12, x12, x5\n\t" "adc x13, xzr, x6\n\t" "# A[0] * A[3]\n\t" - "mul x5, x18, x21\n\t" - "umulh x6, x18, x21\n\t" + "mul x5, x19, x22\n\t" + "umulh x6, x19, x22\n\t" "adds x13, x13, x5\n\t" "adc x14, xzr, x6\n\t" "# A[1] * A[2]\n\t" - "mul x5, x19, x20\n\t" - "umulh x6, x19, x20\n\t" + "mul x5, x20, x21\n\t" + "umulh x6, x20, x21\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, xzr, xzr\n\t" "# A[1] * A[3]\n\t" - "mul x5, x19, x21\n\t" - "umulh x6, x19, x21\n\t" + "mul x5, x20, x22\n\t" + "umulh x6, x20, x22\n\t" "adds x14, x14, x5\n\t" "adc x15, x15, x6\n\t" "# A[2] * A[3]\n\t" - "mul x5, x20, x21\n\t" - "umulh x6, x20, x21\n\t" + "mul x5, x21, x22\n\t" + "umulh x6, x21, x22\n\t" "adds x15, x15, x5\n\t" "adc x16, xzr, x6\n\t" "# Double\n\t" @@ -13777,24 +13777,24 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, sp_digit* a, sp_digit* m, "adcs x16, x16, x16\n\t" "cset x17, cs\n\t" "# A[0] * A[0]\n\t" - "mul x10, x18, x18\n\t" - "umulh x4, x18, x18\n\t" + "mul x10, x19, x19\n\t" + "umulh x4, x19, x19\n\t" "# A[1] * A[1]\n\t" - "mul x5, x19, x19\n\t" - "umulh x6, x19, x19\n\t" + "mul x5, x20, x20\n\t" + "umulh x6, x20, x20\n\t" "# A[2] * A[2]\n\t" - "mul x7, x20, x20\n\t" - "umulh x8, x20, x20\n\t" + "mul x7, x21, x21\n\t" + "umulh x8, x21, x21\n\t" "# A[3] * A[3]\n\t" - "mul x9, x21, x21\n\t" - "umulh x18, x21, x21\n\t" + "mul x9, x22, x22\n\t" + "umulh x19, x22, x22\n\t" "adds x11, x11, x4\n\t" "adcs x12, x12, x5\n\t" "adcs x13, x13, x6\n\t" "adcs x14, x14, x7\n\t" "adcs x15, x15, x8\n\t" "adcs x16, x16, x9\n\t" - "adc x17, x17, x18\n\t" + "adc x17, x17, x19\n\t" "# Start Reduction\n\t" "mov x5, x10\n\t" "mov x6, x11\n\t" @@ -13807,12 +13807,12 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, sp_digit* a, sp_digit* m, "add x8, x8, x10\n\t" "# a[0]-a[2] << 32\n\t" "lsl x10, x10, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x11, x6, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x12, x7, 32\n\t" - "eor x11, x11, x18\n\t" - "eor x12, x12, x19\n\t" + "eor x11, x11, x19\n\t" + "eor x12, x12, x20\n\t" "# - a[0] << 32 << 192\n\t" "sub x8, x8, x10\n\t" "# + a[0]-a[2] << 32 << 64\n\t" @@ -13832,47 +13832,47 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, sp_digit* a, sp_digit* m, "adcs x15, x15, x7\n\t" "adcs x16, x16, x8\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# mu <<= 32\n\t" "lsr x9, x8, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x5, x5, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x6, x6, 32\n\t" - "lsr x20, x7, 32\n\t" + "lsr x21, x7, 32\n\t" "lsl x7, x7, 32\n\t" "lsl x8, x8, 32\n\t" - "eor x6, x6, x18\n\t" - "eor x7, x7, x19\n\t" - "eor x8, x8, x20\n\t" + "eor x6, x6, 
x19\n\t" + "eor x7, x7, x20\n\t" + "eor x8, x8, x21\n\t" "# a += (mu << 32) << 64\n\t" "adds x13, x13, x7\n\t" "adcs x14, x14, x8\n\t" "adcs x15, x15, x9\n\t" "adcs x16, x16, xzr\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# a -= (mu << 32) << 192\n\t" "subs x13, x13, x5\n\t" - "mov x18, 0xffffffff\n\t" + "mov x19, 0xffffffff\n\t" "sbcs x14, x14, x6\n\t" - "mov x19, 0xffffffff00000001\n\t" + "mov x20, 0xffffffff00000001\n\t" "sbcs x15, x15, x7\n\t" "sbcs x16, x16, x8\n\t" "sbcs x17, x17, x9\n\t" - "cset x20, cc\n\t" - "add x10, x10, x20\n\t" + "cset x21, cc\n\t" + "add x10, x10, x21\n\t" "# mask m and sub from result if overflow\n\t" "# m[0] = -1 & mask = mask\n\t" - "and x18, x18, x10\n\t" - "# m[2] = 0 & mask = 0\n\t" "and x19, x19, x10\n\t" + "# m[2] = 0 & mask = 0\n\t" + "and x20, x20, x10\n\t" "subs x14, x14, x10\n\t" - "sbcs x15, x15, x18\n\t" + "sbcs x15, x15, x19\n\t" "sbcs x16, x16, xzr\n\t" - "sbc x17, x17, x19\n\t" + "sbc x17, x17, x20\n\t" "str x14, [%[r], 0]\n\t" "str x15, [%[r], 8]\n\t" "str x16, [%[r], 16]\n\t" @@ -13880,7 +13880,7 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, sp_digit* a, sp_digit* m, : [m] "+r" (m), [a] "+r" (a), [mp] "+r" (mp) : [r] "r" (r) : "memory", "x4", "x5", "x6", "x7", "x8", "x9", - "x18", "x19", "x20", "x21", + "x19", "x20", "x21", "x22", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" ); }