SP ASM: fixes for Wycheproof tests

ARM64 ASM: Fix P256 Montgomery Reduce. Fix div to handle large dividend word.
2022-06-14 16:11:46 +10:00
parent 8899112456
commit 013066ca06
5 changed files with 263 additions and 48 deletions
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -4253,9 +4253,13 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    r1 = sp_2048_cmp_32(&t1[32], d) >= 0;
+    sp_2048_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1);
    for (i = 31; i >= 0; i--) {
-        sp_digit hi = t1[32 + i] - (t1[32 + i] == div);
+        sp_digit mask = 0 - (t1[32 + i] == div);
+        sp_digit hi = t1[32 + i] + mask;
        r1 = div_2048_word_32(hi, t1[32 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_32(t2, d, r1);
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
@@ -5786,6 +5790,13 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    for (i = 63; i > 0; i--) {
+        if (t1[i + 64] != d[i])
+            break;
+    }
+    if (t1[i + 64] >= d[i]) {
+        sp_2048_sub_in_place_64(&t1[64], d);
+    }
    for (i = 63; i >= 0; i--) {
        if (t1[64 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -6637,9 +6648,13 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    r1 = sp_2048_cmp_64(&t1[64], d) >= 0;
+    sp_2048_cond_sub_64(&t1[64], &t1[64], d, (sp_digit)0 - r1);
    for (i = 63; i >= 0; i--) {
-        sp_digit hi = t1[64 + i] - (t1[64 + i] == div);
+        sp_digit mask = 0 - (t1[64 + i] == div);
+        sp_digit hi = t1[64 + i] + mask;
        r1 = div_2048_word_64(hi, t1[64 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_64(t2, d, r1);
        t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2);
@@ -14312,9 +14327,13 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[47];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
+    r1 = sp_3072_cmp_48(&t1[48], d) >= 0;
+    sp_3072_cond_sub_48(&t1[48], &t1[48], d, (sp_digit)0 - r1);
    for (i = 47; i >= 0; i--) {
-        sp_digit hi = t1[48 + i] - (t1[48 + i] == div);
+        sp_digit mask = 0 - (t1[48 + i] == div);
+        sp_digit hi = t1[48 + i] + mask;
        r1 = div_3072_word_48(hi, t1[48 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_48(t2, d, r1);
        t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2);
@@ -16301,6 +16320,13 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s

    div = d[95];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 96);
+    for (i = 95; i > 0; i--) {
+        if (t1[i + 96] != d[i])
+            break;
+    }
+    if (t1[i + 96] >= d[i]) {
+        sp_3072_sub_in_place_96(&t1[96], d);
+    }
    for (i = 95; i >= 0; i--) {
        if (t1[96 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -17504,9 +17530,13 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[95];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 96);
+    r1 = sp_3072_cmp_96(&t1[96], d) >= 0;
+    sp_3072_cond_sub_96(&t1[96], &t1[96], d, (sp_digit)0 - r1);
    for (i = 95; i >= 0; i--) {
-        sp_digit hi = t1[96 + i] - (t1[96 + i] == div);
+        sp_digit mask = 0 - (t1[96 + i] == div);
+        sp_digit hi = t1[96 + i] + mask;
        r1 = div_3072_word_96(hi, t1[96 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_96(t2, d, r1);
        t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2);
@@ -23356,6 +23386,13 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,

    div = d[127];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 128);
+    for (i = 127; i > 0; i--) {
+        if (t1[i + 128] != d[i])
+            break;
+    }
+    if (t1[i + 128] >= d[i]) {
+        sp_4096_sub_in_place_128(&t1[128], d);
+    }
    for (i = 127; i >= 0; i--) {
        if (t1[128 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -24911,9 +24948,13 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di

    div = d[127];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 128);
+    r1 = sp_4096_cmp_128(&t1[128], d) >= 0;
+    sp_4096_cond_sub_128(&t1[128], &t1[128], d, (sp_digit)0 - r1);
    for (i = 127; i >= 0; i--) {
-        sp_digit hi = t1[128 + i] - (t1[128 + i] == div);
+        sp_digit mask = 0 - (t1[128 + i] == div);
+        sp_digit hi = t1[128 + i] + mask;
        r1 = div_4096_word_128(hi, t1[128 + i - 1], div);
+        r1 |= mask;

        sp_4096_mul_d_128(t2, d, r1);
        t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2);
@@ -34575,9 +34616,13 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit

    div = d[7];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 8);
+    r1 = sp_256_cmp_8(&t1[8], d) >= 0;
+    sp_256_cond_sub_8(&t1[8], &t1[8], d, (sp_digit)0 - r1);
    for (i = 7; i >= 0; i--) {
-        sp_digit hi = t1[8 + i] - (t1[8 + i] == div);
+        sp_digit mask = 0 - (t1[8 + i] == div);
+        sp_digit hi = t1[8 + i] + mask;
        r1 = div_256_word_8(hi, t1[8 + i - 1], div);
+        r1 |= mask;

        sp_256_mul_d_8(t2, d, r1);
        t1[8 + i] += sp_256_sub_in_place_8(&t1[i], t2);
@@ -43678,9 +43723,13 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi

    div = d[11];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 12);
+    r1 = sp_384_cmp_12(&t1[12], d) >= 0;
+    sp_384_cond_sub_12(&t1[12], &t1[12], d, (sp_digit)0 - r1);
    for (i = 11; i >= 0; i--) {
-        sp_digit hi = t1[12 + i] - (t1[12 + i] == div);
+        sp_digit mask = 0 - (t1[12 + i] == div);
+        sp_digit hi = t1[12 + i] + mask;
        r1 = div_384_word_12(hi, t1[12 + i - 1], div);
+        r1 |= mask;

        sp_384_mul_d_12(t2, d, r1);
        t1[12 + i] += sp_384_sub_in_place_12(&t1[i], t2);
@@ -62498,9 +62547,13 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    r1 = sp_1024_cmp_32(&t1[32], d) >= 0;
+    sp_1024_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1);
    for (i = 31; i >= 0; i--) {
-        sp_digit hi = t1[32 + i] - (t1[32 + i] == div);
+        sp_digit mask = 0 - (t1[32 + i] == div);
+        sp_digit hi = t1[32 + i] + mask;
        r1 = div_1024_word_32(hi, t1[32 + i - 1], div);
+        r1 |= mask;

        sp_1024_mul_d_32(t2, d, r1);
        t1[32 + i] += sp_1024_sub_in_place_32(&t1[i], t2);
--- a/wolfcrypt/src/sp_arm64.c
+++ b/wolfcrypt/src/sp_arm64.c
@@ -3934,9 +3934,13 @@ static WC_INLINE int sp_2048_div_16(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[15];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 16);
+    r1 = sp_2048_cmp_16(&t1[16], d) >= 0;
+    sp_2048_cond_sub_16(&t1[16], &t1[16], d, (sp_digit)0 - r1);
    for (i = 15; i >= 0; i--) {
-        sp_digit hi = t1[16 + i] - (t1[16 + i] == div);
+        sp_digit mask = 0 - (t1[16 + i] == div);
+        sp_digit hi = t1[16 + i] + mask;
        r1 = div_2048_word_16(hi, t1[16 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_16(t2, d, r1);
        t1[16 + i] += sp_2048_sub_in_place_16(&t1[i], t2);
@@ -4970,6 +4974,13 @@ static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, s

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    for (i = 31; i > 0; i--) {
+        if (t1[i + 32] != d[i])
+            break;
+    }
+    if (t1[i + 32] >= d[i]) {
+        sp_2048_sub_in_place_32(&t1[32], d);
+    }
    for (i = 31; i >= 0; i--) {
        if (t1[32 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -5530,9 +5541,13 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    r1 = sp_2048_cmp_32(&t1[32], d) >= 0;
+    sp_2048_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1);
    for (i = 31; i >= 0; i--) {
-        sp_digit hi = t1[32 + i] - (t1[32 + i] == div);
+        sp_digit mask = 0 - (t1[32 + i] == div);
+        sp_digit hi = t1[32 + i] + mask;
        r1 = div_2048_word_32(hi, t1[32 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_32(t2, d, r1);
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
@@ -13071,9 +13086,13 @@ static WC_INLINE int sp_3072_div_24(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[23];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 24);
+    r1 = sp_3072_cmp_24(&t1[24], d) >= 0;
+    sp_3072_cond_sub_24(&t1[24], &t1[24], d, (sp_digit)0 - r1);
    for (i = 23; i >= 0; i--) {
-        sp_digit hi = t1[24 + i] - (t1[24 + i] == div);
+        sp_digit mask = 0 - (t1[24 + i] == div);
+        sp_digit hi = t1[24 + i] + mask;
        r1 = div_3072_word_24(hi, t1[24 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_24(t2, d, r1);
        t1[24 + i] += sp_3072_sub_in_place_24(&t1[i], t2);
@@ -14347,6 +14366,13 @@ static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, s

    div = d[47];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
+    for (i = 47; i > 0; i--) {
+        if (t1[i + 48] != d[i])
+            break;
+    }
+    if (t1[i + 48] >= d[i]) {
+        sp_3072_sub_in_place_48(&t1[48], d);
+    }
    for (i = 47; i >= 0; i--) {
        if (t1[48 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -15059,9 +15085,13 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[47];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
+    r1 = sp_3072_cmp_48(&t1[48], d) >= 0;
+    sp_3072_cond_sub_48(&t1[48], &t1[48], d, (sp_digit)0 - r1);
    for (i = 47; i >= 0; i--) {
-        sp_digit hi = t1[48 + i] - (t1[48 + i] == div);
+        sp_digit mask = 0 - (t1[48 + i] == div);
+        sp_digit hi = t1[48 + i] + mask;
        r1 = div_3072_word_48(hi, t1[48 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_48(t2, d, r1);
        t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2);
@@ -19256,6 +19286,13 @@ static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, s

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    for (i = 63; i > 0; i--) {
+        if (t1[i + 64] != d[i])
+            break;
+    }
+    if (t1[i + 64] >= d[i]) {
+        sp_4096_sub_in_place_64(&t1[64], d);
+    }
    for (i = 63; i >= 0; i--) {
        if (t1[64 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -20120,9 +20157,13 @@ static WC_INLINE int sp_4096_div_64(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    r1 = sp_4096_cmp_64(&t1[64], d) >= 0;
+    sp_4096_cond_sub_64(&t1[64], &t1[64], d, (sp_digit)0 - r1);
    for (i = 63; i >= 0; i--) {
-        sp_digit hi = t1[64 + i] - (t1[64 + i] == div);
+        sp_digit mask = 0 - (t1[64 + i] == div);
+        sp_digit hi = t1[64 + i] + mask;
        r1 = div_4096_word_64(hi, t1[64 + i - 1], div);
+        r1 |= mask;

        sp_4096_mul_d_64(t2, d, r1);
        t1[64 + i] += sp_4096_sub_in_place_64(&t1[i], t2);
@@ -22388,11 +22429,13 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const
        "adcs	x15, x15, xzr\n\t"
        "extr	x6, x6, x5, 32\n\t"
        "adc	x8, x8, xzr\n\t"
-        "adds	x11, x11, x6\n\t"
        "extr	x5, x5, x4, 32\n\t"
+        "lsl	x4, x4, 32\n\t"
+        "adds	x9, x9, x4\n\t"
+        "adcs	x10, x10, x5\n\t"
+        "adcs	x11, x11, x6\n\t"
        "adcs	x12, x12, x7\n\t"
        "adcs	x13, x13, x16\n\t"
-        "lsl	x4, x4, 32\n\t"
        "adcs	x14, x14, xzr\n\t"
        "adcs	x15, x15, xzr\n\t"
        "adc	x8, x8, xzr\n\t"
@@ -22400,12 +22443,11 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const
        "subs	x11, x11, x4\n\t"
        "sbcs	x12, x12, x5\n\t"
        "sbcs	x13, x13, x6\n\t"
-        "sub	x8, xzr, x8\n\t"
        "sbcs	x14, x14, x7\n\t"
-        "sub	x8, x8, #1\n\t"
        "sbcs	x15, x15, x16\n\t"
        "mov	x19, 0xffffffff00000001\n\t"
-        "adc	x8, x8, xzr\n\t"
+        "sbc	x8, x8, xzr\n\t"
+        "neg	x8, x8\n\t"
        "# mask m and sub from result if overflow\n\t"
        "#  m[0] = -1 & mask = mask\n\t"
        "subs	x12, x12, x8\n\t"
@@ -22535,11 +22577,13 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const
        "adcs	x15, x15, xzr\n\t"
        "extr	x5, x5, x4, 32\n\t"
        "adc	x8, x8, xzr\n\t"
-        "adds	x11, x11, x5\n\t"
        "extr	x4, x4, x3, 32\n\t"
+        "lsl	x3, x3, 32\n\t"
+        "adds	x9, x9, x3\n\t"
+        "adcs	x10, x10, x4\n\t"
+        "adcs	x11, x11, x5\n\t"
        "adcs	x12, x12, x6\n\t"
        "adcs	x13, x13, x7\n\t"
-        "lsl	x3, x3, 32\n\t"
        "adcs	x14, x14, xzr\n\t"
        "adcs	x15, x15, xzr\n\t"
        "adc	x8, x8, xzr\n\t"
@@ -22547,12 +22591,11 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const
        "subs	x11, x11, x3\n\t"
        "sbcs	x12, x12, x4\n\t"
        "sbcs	x13, x13, x5\n\t"
-        "sub	x8, xzr, x8\n\t"
        "sbcs	x14, x14, x6\n\t"
-        "sub	x8, x8, #1\n\t"
        "sbcs	x15, x15, x7\n\t"
        "mov	x17, 0xffffffff00000001\n\t"
-        "adc	x8, x8, xzr\n\t"
+        "sbc	x8, x8, xzr\n\t"
+        "neg	x8, x8\n\t"
        "# mask m and sub from result if overflow\n\t"
        "#  m[0] = -1 & mask = mask\n\t"
        "subs	x12, x12, x8\n\t"
@@ -22839,11 +22882,13 @@ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m,
        "adcs	x17, x17, xzr\n\t"
        "extr	x5, x5, x4, 32\n\t"
        "adc	x10, x10, xzr\n\t"
-        "adds	x13, x13, x5\n\t"
        "extr	x4, x4, x3, 32\n\t"
+        "lsl	x3, x3, 32\n\t"
+        "adds	x11, x11, x3\n\t"
+        "adcs	x12, x12, x4\n\t"
+        "adcs	x13, x13, x5\n\t"
        "adcs	x14, x14, x6\n\t"
        "adcs	x15, x15, x7\n\t"
-        "lsl	x3, x3, 32\n\t"
        "adcs	x16, x16, xzr\n\t"
        "adcs	x17, x17, xzr\n\t"
        "adc	x10, x10, xzr\n\t"
@@ -22851,12 +22896,11 @@ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m,
        "subs	x13, x13, x3\n\t"
        "sbcs	x14, x14, x4\n\t"
        "sbcs	x15, x15, x5\n\t"
-        "sub	x10, xzr, x10\n\t"
        "sbcs	x16, x16, x6\n\t"
-        "sub	x10, x10, #1\n\t"
        "sbcs	x17, x17, x7\n\t"
        "mov	x9, 0xffffffff00000001\n\t"
-        "adc	x10, x10, xzr\n\t"
+        "sbc	x10, x10, xzr\n\t"
+        "neg	x10, x10\n\t"
        "# mask m and sub from result if overflow\n\t"
        "#  m[0] = -1 & mask = mask\n\t"
        "subs	x14, x14, x10\n\t"
@@ -39854,9 +39898,13 @@ static WC_INLINE int sp_256_div_4(const sp_digit* a, const sp_digit* d, sp_digit

    div = d[3];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 4);
+    r1 = sp_256_cmp_4(&t1[4], d) >= 0;
+    sp_256_cond_sub_4(&t1[4], &t1[4], d, (sp_digit)0 - r1);
    for (i = 3; i >= 0; i--) {
-        sp_digit hi = t1[4 + i] - (t1[4 + i] == div);
+        sp_digit mask = 0 - (t1[4 + i] == div);
+        sp_digit hi = t1[4 + i] + mask;
        r1 = div_256_word_4(hi, t1[4 + i - 1], div);
+        r1 |= mask;

        sp_256_mul_d_4(t2, d, r1);
        t1[4 + i] += sp_256_sub_in_place_4(&t1[i], t2);
@@ -65802,9 +65850,13 @@ static WC_INLINE int sp_384_div_6(const sp_digit* a, const sp_digit* d, sp_digit

    div = d[5];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 6);
+    r1 = sp_384_cmp_6(&t1[6], d) >= 0;
+    sp_384_cond_sub_6(&t1[6], &t1[6], d, (sp_digit)0 - r1);
    for (i = 5; i >= 0; i--) {
-        sp_digit hi = t1[6 + i] - (t1[6 + i] == div);
+        sp_digit mask = 0 - (t1[6 + i] == div);
+        sp_digit hi = t1[6 + i] + mask;
        r1 = div_384_word_6(hi, t1[6 + i - 1], div);
+        r1 |= mask;

        sp_384_mul_d_6(t2, d, r1);
        t1[6 + i] += sp_384_sub_in_place_6(&t1[i], t2);
@@ -113907,9 +113959,13 @@ static WC_INLINE int sp_1024_div_16(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[15];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 16);
+    r1 = sp_1024_cmp_16(&t1[16], d) >= 0;
+    sp_1024_cond_sub_16(&t1[16], &t1[16], d, (sp_digit)0 - r1);
    for (i = 15; i >= 0; i--) {
-        sp_digit hi = t1[16 + i] - (t1[16 + i] == div);
+        sp_digit mask = 0 - (t1[16 + i] == div);
+        sp_digit hi = t1[16 + i] + mask;
        r1 = div_1024_word_16(hi, t1[16 + i - 1], div);
+        r1 |= mask;

        sp_1024_mul_d_16(t2, d, r1);
        t1[16 + i] += sp_1024_sub_in_place_16(&t1[i], t2);
--- a/wolfcrypt/src/sp_armthumb.c
+++ b/wolfcrypt/src/sp_armthumb.c
@@ -23937,9 +23937,13 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    r1 = sp_2048_cmp_32(&t1[32], d) >= 0;
+    sp_2048_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1);
    for (i = 31; i >= 0; i--) {
-        sp_digit hi = t1[32 + i] - (t1[32 + i] == div);
+        sp_digit mask = 0 - (t1[32 + i] == div);
+        sp_digit hi = t1[32 + i] + mask;
        r1 = div_2048_word_32(hi, t1[32 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_32(t2, d, r1);
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
@@ -27307,6 +27311,13 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    for (i = 63; i > 0; i--) {
+        if (t1[i + 64] != d[i])
+            break;
+    }
+    if (t1[i + 64] >= d[i]) {
+        sp_2048_sub_in_place_64(&t1[64], d);
+    }
    for (i = 63; i >= 0; i--) {
        if (t1[64 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -27513,9 +27524,13 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    r1 = sp_2048_cmp_64(&t1[64], d) >= 0;
+    sp_2048_cond_sub_64(&t1[64], &t1[64], d, (sp_digit)0 - r1);
    for (i = 63; i >= 0; i--) {
-        sp_digit hi = t1[64 + i] - (t1[64 + i] == div);
+        sp_digit mask = 0 - (t1[64 + i] == div);
+        sp_digit hi = t1[64 + i] + mask;
        r1 = div_2048_word_64(hi, t1[64 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_64(t2, d, r1);
        t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2);
@@ -75379,9 +75394,13 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[47];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
+    r1 = sp_3072_cmp_48(&t1[48], d) >= 0;
+    sp_3072_cond_sub_48(&t1[48], &t1[48], d, (sp_digit)0 - r1);
    for (i = 47; i >= 0; i--) {
-        sp_digit hi = t1[48 + i] - (t1[48 + i] == div);
+        sp_digit mask = 0 - (t1[48 + i] == div);
+        sp_digit hi = t1[48 + i] + mask;
        r1 = div_3072_word_48(hi, t1[48 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_48(t2, d, r1);
        t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2);
@@ -79575,6 +79594,13 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s

    div = d[95];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 96);
+    for (i = 95; i > 0; i--) {
+        if (t1[i + 96] != d[i])
+            break;
+    }
+    if (t1[i + 96] >= d[i]) {
+        sp_3072_sub_in_place_96(&t1[96], d);
+    }
    for (i = 95; i >= 0; i--) {
        if (t1[96 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -79786,9 +79812,13 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[95];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 96);
+    r1 = sp_3072_cmp_96(&t1[96], d) >= 0;
+    sp_3072_cond_sub_96(&t1[96], &t1[96], d, (sp_digit)0 - r1);
    for (i = 95; i >= 0; i--) {
-        sp_digit hi = t1[96 + i] - (t1[96 + i] == div);
+        sp_digit mask = 0 - (t1[96 + i] == div);
+        sp_digit hi = t1[96 + i] + mask;
        r1 = div_3072_word_96(hi, t1[96 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_96(t2, d, r1);
        t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2);
@@ -92108,6 +92138,13 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,

    div = d[127];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 128);
+    for (i = 127; i > 0; i--) {
+        if (t1[i + 128] != d[i])
+            break;
+    }
+    if (t1[i + 128] >= d[i]) {
+        sp_4096_sub_in_place_128(&t1[128], d);
+    }
    for (i = 127; i >= 0; i--) {
        if (t1[128 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -92320,9 +92357,13 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di

    div = d[127];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 128);
+    r1 = sp_4096_cmp_128(&t1[128], d) >= 0;
+    sp_4096_cond_sub_128(&t1[128], &t1[128], d, (sp_digit)0 - r1);
    for (i = 127; i >= 0; i--) {
-        sp_digit hi = t1[128 + i] - (t1[128 + i] == div);
+        sp_digit mask = 0 - (t1[128 + i] == div);
+        sp_digit hi = t1[128 + i] + mask;
        r1 = div_4096_word_128(hi, t1[128 + i - 1], div);
+        r1 |= mask;

        sp_4096_mul_d_128(t2, d, r1);
        t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2);
@@ -105211,9 +105252,13 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit

    div = d[7];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 8);
+    r1 = sp_256_cmp_8(&t1[8], d) >= 0;
+    sp_256_cond_sub_8(&t1[8], &t1[8], d, (sp_digit)0 - r1);
    for (i = 7; i >= 0; i--) {
-        sp_digit hi = t1[8 + i] - (t1[8 + i] == div);
+        sp_digit mask = 0 - (t1[8 + i] == div);
+        sp_digit hi = t1[8 + i] + mask;
        r1 = div_256_word_8(hi, t1[8 + i - 1], div);
+        r1 |= mask;

        sp_256_mul_d_8(t2, d, r1);
        t1[8 + i] += sp_256_sub_in_place_8(&t1[i], t2);
@@ -115844,9 +115889,13 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi

    div = d[11];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 12);
+    r1 = sp_384_cmp_12(&t1[12], d) >= 0;
+    sp_384_cond_sub_12(&t1[12], &t1[12], d, (sp_digit)0 - r1);
    for (i = 11; i >= 0; i--) {
-        sp_digit hi = t1[12 + i] - (t1[12 + i] == div);
+        sp_digit mask = 0 - (t1[12 + i] == div);
+        sp_digit hi = t1[12 + i] + mask;
        r1 = div_384_word_12(hi, t1[12 + i - 1], div);
+        r1 |= mask;

        sp_384_mul_d_12(t2, d, r1);
        t1[12 + i] += sp_384_sub_in_place_12(&t1[i], t2);
@@ -201817,9 +201866,13 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    r1 = sp_1024_cmp_32(&t1[32], d) >= 0;
+    sp_1024_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1);
    for (i = 31; i >= 0; i--) {
-        sp_digit hi = t1[32 + i] - (t1[32 + i] == div);
+        sp_digit mask = 0 - (t1[32 + i] == div);
+        sp_digit hi = t1[32 + i] + mask;
        r1 = div_1024_word_32(hi, t1[32 + i - 1], div);
+        r1 |= mask;

        sp_1024_mul_d_32(t2, d, r1);
        t1[32 + i] += sp_1024_sub_in_place_32(&t1[i], t2);
--- a/wolfcrypt/src/sp_cortexm.c
+++ b/wolfcrypt/src/sp_cortexm.c
@@ -3372,9 +3372,13 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    r1 = sp_2048_cmp_32(&t1[32], d) >= 0;
+    sp_2048_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1);
    for (i = 31; i >= 0; i--) {
-        sp_digit hi = t1[32 + i] - (t1[32 + i] == div);
+        sp_digit mask = 0 - (t1[32 + i] == div);
+        sp_digit hi = t1[32 + i] + mask;
        r1 = div_2048_word_32(hi, t1[32 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_32(t2, d, r1);
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
@@ -4215,6 +4219,13 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    for (i = 63; i > 0; i--) {
+        if (t1[i + 64] != d[i])
+            break;
+    }
+    if (t1[i + 64] >= d[i]) {
+        sp_2048_sub_in_place_64(&t1[64], d);
+    }
    for (i = 63; i >= 0; i--) {
        if (t1[64 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -4357,9 +4368,13 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[63];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
+    r1 = sp_2048_cmp_64(&t1[64], d) >= 0;
+    sp_2048_cond_sub_64(&t1[64], &t1[64], d, (sp_digit)0 - r1);
    for (i = 63; i >= 0; i--) {
-        sp_digit hi = t1[64 + i] - (t1[64 + i] == div);
+        sp_digit mask = 0 - (t1[64 + i] == div);
+        sp_digit hi = t1[64 + i] + mask;
        r1 = div_2048_word_64(hi, t1[64 + i - 1], div);
+        r1 |= mask;

        sp_2048_mul_d_64(t2, d, r1);
        t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2);
@@ -8884,9 +8899,13 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[47];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
+    r1 = sp_3072_cmp_48(&t1[48], d) >= 0;
+    sp_3072_cond_sub_48(&t1[48], &t1[48], d, (sp_digit)0 - r1);
    for (i = 47; i >= 0; i--) {
-        sp_digit hi = t1[48 + i] - (t1[48 + i] == div);
+        sp_digit mask = 0 - (t1[48 + i] == div);
+        sp_digit hi = t1[48 + i] + mask;
        r1 = div_3072_word_48(hi, t1[48 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_48(t2, d, r1);
        t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2);
@@ -9809,6 +9828,13 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s

    div = d[95];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 96);
+    for (i = 95; i > 0; i--) {
+        if (t1[i + 96] != d[i])
+            break;
+    }
+    if (t1[i + 96] >= d[i]) {
+        sp_3072_sub_in_place_96(&t1[96], d);
+    }
    for (i = 95; i >= 0; i--) {
        if (t1[96 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -9953,9 +9979,13 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[95];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 96);
+    r1 = sp_3072_cmp_96(&t1[96], d) >= 0;
+    sp_3072_cond_sub_96(&t1[96], &t1[96], d, (sp_digit)0 - r1);
    for (i = 95; i >= 0; i--) {
-        sp_digit hi = t1[96 + i] - (t1[96 + i] == div);
+        sp_digit mask = 0 - (t1[96 + i] == div);
+        sp_digit hi = t1[96 + i] + mask;
        r1 = div_3072_word_96(hi, t1[96 + i - 1], div);
+        r1 |= mask;

        sp_3072_mul_d_96(t2, d, r1);
        t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2);
@@ -13586,6 +13616,13 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,

    div = d[127];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 128);
+    for (i = 127; i > 0; i--) {
+        if (t1[i + 128] != d[i])
+            break;
+    }
+    if (t1[i + 128] >= d[i]) {
+        sp_4096_sub_in_place_128(&t1[128], d);
+    }
    for (i = 127; i >= 0; i--) {
        if (t1[128 + i] == div) {
            r1 = SP_DIGIT_MAX;
@@ -13730,9 +13767,13 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di

    div = d[127];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 128);
+    r1 = sp_4096_cmp_128(&t1[128], d) >= 0;
+    sp_4096_cond_sub_128(&t1[128], &t1[128], d, (sp_digit)0 - r1);
    for (i = 127; i >= 0; i--) {
-        sp_digit hi = t1[128 + i] - (t1[128 + i] == div);
+        sp_digit mask = 0 - (t1[128 + i] == div);
+        sp_digit hi = t1[128 + i] + mask;
        r1 = div_4096_word_128(hi, t1[128 + i - 1], div);
+        r1 |= mask;

        sp_4096_mul_d_128(t2, d, r1);
        t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2);
@@ -23094,9 +23135,13 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit

    div = d[7];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 8);
+    r1 = sp_256_cmp_8(&t1[8], d) >= 0;
+    sp_256_cond_sub_8(&t1[8], &t1[8], d, (sp_digit)0 - r1);
    for (i = 7; i >= 0; i--) {
-        sp_digit hi = t1[8 + i] - (t1[8 + i] == div);
+        sp_digit mask = 0 - (t1[8 + i] == div);
+        sp_digit hi = t1[8 + i] + mask;
        r1 = div_256_word_8(hi, t1[8 + i - 1], div);
+        r1 |= mask;

        sp_256_mul_d_8(t2, d, r1);
        t1[8 + i] += sp_256_sub_in_place_8(&t1[i], t2);
@@ -30283,9 +30328,13 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi

    div = d[11];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 12);
+    r1 = sp_384_cmp_12(&t1[12], d) >= 0;
+    sp_384_cond_sub_12(&t1[12], &t1[12], d, (sp_digit)0 - r1);
    for (i = 11; i >= 0; i--) {
-        sp_digit hi = t1[12 + i] - (t1[12 + i] == div);
+        sp_digit mask = 0 - (t1[12 + i] == div);
+        sp_digit hi = t1[12 + i] + mask;
        r1 = div_384_word_12(hi, t1[12 + i - 1], div);
+        r1 |= mask;

        sp_384_mul_d_12(t2, d, r1);
        t1[12 + i] += sp_384_sub_in_place_12(&t1[i], t2);
@@ -42243,9 +42292,13 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
+    r1 = sp_1024_cmp_32(&t1[32], d) >= 0;
+    sp_1024_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1);
    for (i = 31; i >= 0; i--) {
-        sp_digit hi = t1[32 + i] - (t1[32 + i] == div);
+        sp_digit mask = 0 - (t1[32 + i] == div);
+        sp_digit hi = t1[32 + i] + mask;
        r1 = div_1024_word_32(hi, t1[32 + i - 1], div);
+        r1 |= mask;

        sp_1024_mul_d_32(t2, d, r1);
        t1[32 + i] += sp_1024_sub_in_place_32(&t1[i], t2);
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -20,7 +20,7 @@
 */

 #ifdef WOLFSSL_USER_SETTINGS
-#include "wolfssl/wolfcrypt/settings.h"
+#include "wolfssl/wolfcrypt/settings.h"
 #endif

 #ifndef HAVE_INTEL_AVX1