Tweak the SP x86_64 ECC assembly

Put back fixes undone in previous commits: (1) fix the cast warning in SP when mp_digit is smaller than sp_digit; (2) fix the NULL check in SP EC point_new (check *p rather than p).
2026-01-28 18:59:58 +01:00 · 2020-04-06 11:02:30 +10:00
parent 9a1687d00e
commit 7dad0d3965
8 changed files with 251 additions and 248 deletions
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -7839,10 +7839,10 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -7850,7 +7850,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -19407,10 +19407,10 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 96; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -19418,7 +19418,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -72049,10 +72049,10 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 128; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -72060,7 +72060,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -73192,7 +73192,7 @@ static int sp_256_point_new_ex_8(void* heap, sp_point_256* sp, sp_point_256** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -73579,10 +73579,10 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 8; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -73590,7 +73590,7 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -81204,7 +81204,7 @@ static int sp_384_point_new_ex_12(void* heap, sp_point_384* sp, sp_point_384** p
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -81458,10 +81458,10 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 12; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -81469,7 +81469,7 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
--- a/wolfcrypt/src/sp_arm64.c
+++ b/wolfcrypt/src/sp_arm64.c
@@ -5235,10 +5235,10 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 32; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -5246,7 +5246,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -12907,10 +12907,10 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 48; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -12918,7 +12918,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -17806,10 +17806,10 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -17817,7 +17817,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -18568,7 +18568,7 @@ static int sp_256_point_new_ex_4(void* heap, sp_point_256* sp, sp_point_256** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -18787,10 +18787,10 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 4; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -18798,7 +18798,7 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -36352,7 +36352,7 @@ static int sp_384_point_new_ex_6(void* heap, sp_point_384* sp, sp_point_384** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -36624,10 +36624,10 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 6; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -36635,7 +36635,7 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
--- a/wolfcrypt/src/sp_armthumb.c
+++ b/wolfcrypt/src/sp_armthumb.c
@@ -4545,10 +4545,10 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -4556,7 +4556,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -10288,10 +10288,10 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 96; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -10299,7 +10299,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -14691,10 +14691,10 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 128; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -14702,7 +14702,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -15852,7 +15852,7 @@ static int sp_256_point_new_ex_8(void* heap, sp_point_256* sp, sp_point_256** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -16075,10 +16075,10 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 8; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -16086,7 +16086,7 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -21937,7 +21937,7 @@ static int sp_384_point_new_ex_12(void* heap, sp_point_384* sp, sp_point_384** p
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -22191,10 +22191,10 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 12; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -22202,7 +22202,7 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
--- a/wolfcrypt/src/sp_c32.c
+++ b/wolfcrypt/src/sp_c32.c
@@ -3652,10 +3652,10 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 90; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 23) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -3663,7 +3663,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 23 - s;
@@ -7489,10 +7489,10 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 134; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 23) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -7500,7 +7500,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 23 - s;
@@ -11490,10 +11490,10 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 196; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 21) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -11501,7 +11501,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 21 - s;
@@ -12454,7 +12454,7 @@ static int sp_256_point_new_ex_10(void* heap, sp_point_256* sp, sp_point_256** p
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -12742,10 +12742,10 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 10; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 26) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -12753,7 +12753,7 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 26 - s;
@@ -17779,7 +17779,7 @@ static int sp_384_point_new_ex_15(void* heap, sp_point_384* sp, sp_point_384** p
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -18108,10 +18108,10 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 15; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 26) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -18119,7 +18119,7 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 26 - s;
--- a/wolfcrypt/src/sp_c64.c
+++ b/wolfcrypt/src/sp_c64.c
@@ -3292,10 +3292,10 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 36; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 57) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -3303,7 +3303,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 57 - s;
@@ -7403,10 +7403,10 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 54; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 57) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -7414,7 +7414,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 57 - s;
@@ -11759,10 +11759,10 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 78; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 53) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -11770,7 +11770,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 53 - s;
@@ -12488,7 +12488,7 @@ static int sp_256_point_new_ex_5(void* heap, sp_point_256* sp, sp_point_256** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -12759,10 +12759,10 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 5; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 52) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -12770,7 +12770,7 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 52 - s;
@@ -17566,7 +17566,7 @@ static int sp_384_point_new_ex_7(void* heap, sp_point_384* sp, sp_point_384** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -17868,10 +17868,10 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 7; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 55) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -17879,7 +17879,7 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 55 - s;
--- a/wolfcrypt/src/sp_cortexm.c
+++ b/wolfcrypt/src/sp_cortexm.c
@@ -4297,10 +4297,10 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -4308,7 +4308,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -8889,10 +8889,10 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 96; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -8900,7 +8900,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -12428,10 +12428,10 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 128; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -12439,7 +12439,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -13587,7 +13587,7 @@ static int sp_256_point_new_ex_8(void* heap, sp_point_256* sp, sp_point_256** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -13810,10 +13810,10 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 8; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -13821,7 +13821,7 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
@@ -20048,7 +20048,7 @@ static int sp_384_point_new_ex_12(void* heap, sp_point_384* sp, sp_point_384** p
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -20302,10 +20302,10 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 12; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -20313,7 +20313,7 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
--- a/wolfcrypt/src/sp_x86_64.c
+++ b/wolfcrypt/src/sp_x86_64.c
@@ -1650,10 +1650,10 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 32; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -1661,7 +1661,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -3704,10 +3704,10 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 48; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -3715,7 +3715,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -5212,10 +5212,10 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -5223,7 +5223,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -5693,7 +5693,7 @@ static int sp_256_point_new_ex_4(void* heap, sp_point_256* sp, sp_point_256** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -5912,10 +5912,10 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 4; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -5923,7 +5923,7 @@ static int sp_256_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
@@ -23529,7 +23529,7 @@ static int sp_384_point_new_ex_6(void* heap, sp_point_384* sp, sp_point_384** p)
 #else
    *p = sp;
 #endif
-    if (p == NULL) {
+    if (*p == NULL) {
        ret = MEMORY_E;
    }
    return ret;
@@ -23801,10 +23801,10 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)

        r->dp[0] = 0;
        for (i = 0; i < 6; i++) {
-            r->dp[j] |= a[i] << s;
+            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= (1L << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
-            r->dp[++j] = a[i] >> s;
+            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j++] &= (1L << DIGIT_BIT) - 1;
@@ -23812,7 +23812,7 @@ static int sp_384_to_mp(const sp_digit* a, mp_int* r)
                    r->dp[j] = 0;
                }
                else {
-                    r->dp[j] = a[i] >> s;
+                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 64 - s;
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -37530,13 +37530,13 @@ _sp_256_mont_mul_4:
        sbbq	$0, %r9
        #   a -= (mu << 32) << 192
        subq	%rax, %r12
-        movq	$4294967295, %rax
        sbbq	%rsi, %r13
-        movq	$18446744069414584321, %rsi
        sbbq	%r8, %r14
        sbbq	%rdx, %r15
        sbbq	%rcx, %rbx
        adcq	$0, %r9
+        movq	$4294967295, %rax
+        movq	$18446744069414584321, %rsi
        # mask m and sub from result if overflow
        #  m[0] = -1 & mask = mask
        andq	%r9, %rax
@@ -37709,13 +37709,13 @@ _sp_256_mont_sqr_4:
        sbbq	$0, %r9
        #   a -= (mu << 32) << 192
        subq	%rax, %r12
-        movq	$4294967295, %rax
        sbbq	%rsi, %r13
-        movq	$18446744069414584321, %rsi
        sbbq	%r8, %r14
        sbbq	%rdx, %r15
        sbbq	%rcx, %rbx
        adcq	$0, %r9
+        movq	$4294967295, %rax
+        movq	$18446744069414584321, %rsi
        # mask m and sub from result if overflow
        #  m[0] = -1 & mask = mask
        andq	%r9, %rax
@@ -38009,17 +38009,17 @@ _sp_256_mont_add_4:
        movq	8(%rsi), %rcx
        movq	16(%rsi), %r8
        movq	24(%rsi), %r9
+        movq	$4294967295, %r10
+        movq	$18446744069414584321, %r11
        addq	(%rdx), %rax
        adcq	8(%rdx), %rcx
-        movq	$4294967295, %r10
        adcq	16(%rdx), %r8
-        movq	$18446744069414584321, %r11
+        movq	$0, %rsi
        adcq	24(%rdx), %r9
-        movq	$0, %rdx
-        sbbq	$0, %rdx
-        andq	%rdx, %r10
-        andq	%rdx, %r11
-        subq	%rdx, %rax
+        sbbq	$0, %rsi
+        andq	%rsi, %r10
+        andq	%rsi, %r11
+        subq	%rsi, %rax
        sbbq	%r10, %rcx
        movq	%rax, (%rdi)
        sbbq	$0, %r8
@@ -38051,13 +38051,13 @@ _sp_256_mont_dbl_4:
        movq	8(%rsi), %rax
        movq	16(%rsi), %rcx
        movq	24(%rsi), %r8
-        xorq	%r11, %r11
+        movq	$4294967295, %r9
+        movq	$18446744069414584321, %r10
        addq	%rdx, %rdx
        adcq	%rax, %rax
-        movq	$4294967295, %r9
        adcq	%rcx, %rcx
+        movq	$0, %r11
        adcq	%r8, %r8
-        movq	$18446744069414584321, %r10
        sbbq	$0, %r11
        andq	%r11, %r9
        andq	%r11, %r10
@@ -38093,13 +38093,13 @@ _sp_256_mont_tpl_4:
        movq	8(%rsi), %rax
        movq	16(%rsi), %rcx
        movq	24(%rsi), %r8
-        xorq	%r11, %r11
+        movq	$4294967295, %r9
+        movq	$18446744069414584321, %r10
        addq	%rdx, %rdx
        adcq	%rax, %rax
-        movq	$4294967295, %r9
        adcq	%rcx, %rcx
+        movq	$0, %r11
        adcq	%r8, %r8
-        movq	$18446744069414584321, %r10
        sbbq	$0, %r11
        andq	%r11, %r9
        andq	%r11, %r10
@@ -38107,13 +38107,13 @@ _sp_256_mont_tpl_4:
        sbbq	%r9, %rax
        sbbq	$0, %rcx
        sbbq	%r10, %r8
-        xorq	%r11, %r11
+        movq	$4294967295, %r9
+        movq	$18446744069414584321, %r10
        addq	(%rsi), %rdx
        adcq	8(%rsi), %rax
-        movq	$4294967295, %r9
        adcq	16(%rsi), %rcx
+        movq	$0, %r11
        adcq	24(%rsi), %r8
-        movq	$18446744069414584321, %r10
        sbbq	$0, %r11
        andq	%r11, %r9
        andq	%r11, %r10
@@ -38150,17 +38150,17 @@ _sp_256_mont_sub_4:
        movq	8(%rsi), %rcx
        movq	16(%rsi), %r8
        movq	24(%rsi), %r9
+        movq	$4294967295, %r10
+        movq	$18446744069414584321, %r11
        subq	(%rdx), %rax
        sbbq	8(%rdx), %rcx
-        movq	$4294967295, %r10
        sbbq	16(%rdx), %r8
-        movq	$18446744069414584321, %r11
+        movq	$0, %rsi
        sbbq	24(%rdx), %r9
-        movq	$0, %rdx
-        sbbq	$0, %rdx
-        andq	%rdx, %r10
-        andq	%rdx, %r11
-        addq	%rdx, %rax
+        sbbq	$0, %rsi
+        andq	%rsi, %r10
+        andq	%rsi, %r11
+        addq	%rsi, %rax
        adcq	%r10, %rcx
        movq	%rax, (%rdi)
        adcq	$0, %r8
@@ -38370,13 +38370,13 @@ _sp_256_mont_mul_avx2_4:
        sbbq	$0, %r8
        #   a -= (mu << 32) << 192
        subq	%rax, %r11
-        movq	$4294967295, %rax
        sbbq	%rsi, %r12
-        movq	$18446744069414584321, %rsi
        sbbq	%rbp, %r13
        sbbq	%rdx, %r14
        sbbq	%rcx, %r15
        adcq	$0, %r8
+        movq	$4294967295, %rax
+        movq	$18446744069414584321, %rsi
        # mask m and sub from result if overflow
        #  m[0] = -1 & mask = mask
        andq	%r8, %rax
@@ -38424,25 +38424,26 @@ _sp_256_mont_sqr_avx2_4:
        push	%rbx
        # A[0] * A[1]
        movq	(%rsi), %rdx
+        movq	16(%rsi), %r15
        mulxq	8(%rsi), %r9, %r10
        # A[0] * A[3]
        mulxq	24(%rsi), %r11, %r12
        # A[2] * A[1]
-        movq	16(%rsi), %rdx
+        movq	%r15, %rdx
        mulxq	8(%rsi), %rcx, %rbx
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
        # A[2] * A[3]
        mulxq	24(%rsi), %r13, %r14
+        xorq	%r15, %r15
+        adoxq	%rcx, %r11
        adoxq	%rbx, %r12
        # A[2] * A[0]
        mulxq	(%rsi), %rcx, %rbx
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
        # A[1] * A[3]
        movq	8(%rsi), %rdx
+        adoxq	%r15, %r13
        mulxq	24(%rsi), %rax, %r8
+        adcxq	%rcx, %r10
+        adoxq	%r15, %r14
        adcxq	%rbx, %r11
        adcxq	%rax, %r12
        adcxq	%r8, %r13
@@ -38497,7 +38498,7 @@ _sp_256_mont_sqr_avx2_4:
        adcq	%r10, %rdx
        # a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
        #   a += mu << 256
-        movq	$0, %r8
+        xorq	%r8, %r8
        addq	%rax, %r12
        adcq	%rsi, %r13
        adcq	%rcx, %r14
@@ -38526,13 +38527,13 @@ _sp_256_mont_sqr_avx2_4:
        sbbq	$0, %r8
        #   a -= (mu << 32) << 192
        subq	%rax, %r11
-        movq	$4294967295, %rax
        sbbq	%rsi, %r12
-        movq	$18446744069414584321, %rsi
        sbbq	%rcx, %r13
        sbbq	%rdx, %r14
        sbbq	%rbx, %r15
        adcq	$0, %r8
+        movq	$4294967295, %rax
+        movq	$18446744069414584321, %rsi
        # mask m and sub from result if overflow
        #  m[0] = -1 & mask = mask
        andq	%r8, %rax
@@ -38876,102 +38877,103 @@ sp_256_mul_avx2_4:
 .p2align	4
 _sp_256_mul_avx2_4:
 #endif /* __APPLE__ */
+        push	%rbx
+        push	%rbp
        push	%r12
        push	%r13
        push	%r14
        push	%r15
-        push	%rbx
-        movq	%rdx, %rax
-        # A[0] * B[0]
-        movq	(%rax), %rdx
-        mulxq	(%rsi), %r9, %r10
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r11, %r12
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %r8
-        xorq	%rbx, %rbx
+        movq	%rdx, %rbp
+        #  A[0] * B[0]
+        movq	(%rbp), %rdx
+        mulxq	(%rsi), %r8, %r9
+        #  A[2] * B[0]
+        mulxq	16(%rsi), %r10, %r11
+        #  A[1] * B[0]
+        mulxq	8(%rsi), %rax, %rcx
+        xorq	%r15, %r15
+        adcxq	%rax, %r9
+        #  A[1] * B[3]
+        movq	24(%rbp), %rdx
+        mulxq	8(%rsi), %r12, %r13
        adcxq	%rcx, %r10
-        # A[1] * B[3]
-        movq	24(%rax), %rdx
-        mulxq	8(%rsi), %r13, %r14
-        adcxq	%r8, %r11
-        # A[0] * B[1]
-        movq	8(%rax), %rdx
-        mulxq	(%rsi), %rcx, %r8
+        #  A[0] * B[1]
+        movq	8(%rbp), %rdx
+        mulxq	(%rsi), %rax, %rcx
+        adoxq	%rax, %r9
+        #  A[2] * B[1]
+        mulxq	16(%rsi), %rax, %r14
        adoxq	%rcx, %r10
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r15
-        adoxq	%r8, %r11
-        adcxq	%rcx, %r12
-        # A[1] * B[2]
-        movq	16(%rax), %rdx
-        mulxq	8(%rsi), %rcx, %r8
+        adcxq	%rax, %r11
+        #  A[1] * B[2]
+        movq	16(%rbp), %rdx
+        mulxq	8(%rsi), %rax, %rcx
+        adcxq	%r14, %r12
+        adoxq	%rax, %r11
        adcxq	%r15, %r13
        adoxq	%rcx, %r12
-        adcxq	%rbx, %r14
-        adoxq	%r8, %r13
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %r8
-        adoxq	%rbx, %r14
-        xorq	%r15, %r15
+        #  A[0] * B[2]
+        mulxq	(%rsi), %rax, %rcx
+        adoxq	%r15, %r13
+        xorq	%r14, %r14
+        adcxq	%rax, %r10
+        #  A[1] * B[1]
+        movq	8(%rbp), %rdx
+        mulxq	8(%rsi), %rdx, %rax
        adcxq	%rcx, %r11
-        # A[1] * B[1]
-        movq	8(%rax), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%r8, %r12
-        adoxq	%rdx, %r11
-        # A[3] * B[1]
-        movq	8(%rax), %rdx
-        adoxq	%rcx, %r12
-        mulxq	24(%rsi), %rcx, %r8
+        adoxq	%rdx, %r10
+        #  A[3] * B[1]
+        movq	8(%rbp), %rdx
+        adoxq	%rax, %r11
+        mulxq	24(%rsi), %rax, %rcx
+        adcxq	%rax, %r12
+        #  A[2] * B[2]
+        movq	16(%rbp), %rdx
+        mulxq	16(%rsi), %rdx, %rax
        adcxq	%rcx, %r13
-        # A[2] * B[2]
-        movq	16(%rax), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%r8, %r14
-        adoxq	%rdx, %r13
-        # A[3] * B[3]
-        movq	24(%rax), %rdx
-        adoxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %r8
-        adoxq	%rbx, %r15
-        adcxq	%rcx, %r15
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%r8, %rbx
-        xorq	%r8, %r8
-        adcxq	%rdx, %r12
-        # A[3] * B[0]
-        movq	(%rax), %rdx
-        adcxq	%rcx, %r13
-        mulxq	24(%rsi), %rdx, %rcx
        adoxq	%rdx, %r12
-        adoxq	%rcx, %r13
-        # A[2] * B[3]
-        movq	24(%rax), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r14
-        # A[3] * B[2]
-        movq	16(%rax), %rdx
+        #  A[3] * B[3]
+        movq	24(%rbp), %rdx
+        adoxq	%rax, %r13
+        mulxq	24(%rsi), %rax, %rcx
+        adoxq	%r15, %r14
+        adcxq	%rax, %r14
+        #  A[0] * B[3]
+        mulxq	(%rsi), %rdx, %rax
        adcxq	%rcx, %r15
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%r8, %rbx
-        adoxq	%rcx, %r14
-        adoxq	%rdx, %r15
-        adoxq	%r8, %rbx
-        movq	%r9, (%rdi)
-        movq	%r10, 8(%rdi)
-        movq	%r11, 16(%rdi)
-        movq	%r12, 24(%rdi)
-        movq	%r13, 32(%rdi)
-        movq	%r14, 40(%rdi)
-        movq	%r15, 48(%rdi)
-        movq	%rbx, 56(%rdi)
-        pop	%rbx
+        xorq	%rcx, %rcx
+        adcxq	%rdx, %r11
+        #  A[3] * B[0]
+        movq	24(%rsi), %rdx
+        adcxq	%rax, %r12
+        mulxq	(%rbp), %rbx, %rax
+        adoxq	%rbx, %r11
+        adoxq	%rax, %r12
+        #  A[3] * B[2]
+        mulxq	16(%rbp), %rdx, %rax
+        adcxq	%rdx, %r13
+        #  A[2] * B[3]
+        movq	24(%rbp), %rdx
+        adcxq	%rax, %r14
+        mulxq	16(%rsi), %rax, %rdx
+        adcxq	%rcx, %r15
+        adoxq	%rax, %r13
+        adoxq	%rdx, %r14
+        adoxq	%rcx, %r15
+        movq	%r8, (%rdi)
+        movq	%r9, 8(%rdi)
+        movq	%r10, 16(%rdi)
+        movq	%r11, 24(%rdi)
+        movq	%r12, 32(%rdi)
+        movq	%r13, 40(%rdi)
+        movq	%r14, 48(%rdi)
+        movq	%r15, 56(%rdi)
        pop	%r15
        pop	%r14
        pop	%r13
        pop	%r12
+        pop	%rbp
+        pop	%rbx
        repz retq
 #ifndef __APPLE__
 .size	sp_256_mul_avx2_4,.-sp_256_mul_avx2_4
@@ -39291,32 +39293,33 @@ sp_256_sqr_avx2_4:
 .p2align	4
 _sp_256_sqr_avx2_4:
 #endif /* __APPLE__ */
-        push	%rbx
        push	%r12
        push	%r13
        push	%r14
        push	%r15
+        push	%rbx
        # A[0] * A[1]
        movq	(%rsi), %rdx
+        movq	16(%rsi), %r15
        mulxq	8(%rsi), %r9, %r10
        # A[0] * A[3]
        mulxq	24(%rsi), %r11, %r12
        # A[2] * A[1]
-        movq	16(%rsi), %rdx
+        movq	%r15, %rdx
        mulxq	8(%rsi), %rcx, %rbx
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
        # A[2] * A[3]
        mulxq	24(%rsi), %r13, %r14
+        xorq	%r15, %r15
+        adoxq	%rcx, %r11
        adoxq	%rbx, %r12
        # A[2] * A[0]
        mulxq	(%rsi), %rcx, %rbx
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
        # A[1] * A[3]
        movq	8(%rsi), %rdx
+        adoxq	%r15, %r13
        mulxq	24(%rsi), %rax, %r8
+        adcxq	%rcx, %r10
+        adoxq	%r15, %r14
        adcxq	%rbx, %r11
        adcxq	%rax, %r12
        adcxq	%r8, %r13
@@ -39327,11 +39330,11 @@ _sp_256_sqr_avx2_4:
        movq	(%rsi), %rdx
        mulxq	%rdx, %r8, %rax
        adcxq	%r9, %r9
+        adcxq	%r10, %r10
+        adoxq	%rax, %r9
        # A[1] * A[1]
        movq	8(%rsi), %rdx
        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r10, %r10
-        adoxq	%rax, %r9
        adcxq	%r11, %r11
        adoxq	%rcx, %r10
        # A[2] * A[2]
@@ -39341,10 +39344,10 @@ _sp_256_sqr_avx2_4:
        adoxq	%rbx, %r11
        adcxq	%r13, %r13
        adoxq	%rax, %r12
+        adcxq	%r14, %r14
        # A[3] * A[3]
        movq	24(%rsi), %rdx
        mulxq	%rdx, %rax, %rbx
-        adcxq	%r14, %r14
        adoxq	%rcx, %r13
        adcxq	%r15, %r15
        adoxq	%rax, %r14
@@ -39357,11 +39360,11 @@ _sp_256_sqr_avx2_4:
        movq	%r13, 40(%rdi)
        movq	%r14, 48(%rdi)
        movq	%r15, 56(%rdi)
+        pop	%rbx
        pop	%r15
        pop	%r14
        pop	%r13
        pop	%r12
-        pop	%rbx
        repz retq
 #ifndef __APPLE__
 .size	sp_256_sqr_avx2_4,.-sp_256_sqr_avx2_4