Merge branch 'feature/enable_misalign_optimization_for_iperf' into 'master'

feat(newlib): enable libc misalign optimized functions Closes IDF-13820 See merge request espressif/esp-idf!41148
2025-10-03 10:30:58 +02:00 · 2025-08-22 22:14:53 +04:00
parent 139236741c b266d829dd
commit 177dee612f
31 changed files with 419 additions and 167 deletions
--- a/components/esp_rom/CMakeLists.txt
+++ b/components/esp_rom/CMakeLists.txt
@@ -149,6 +149,10 @@ if(ESP_TEE_BUILD)
    if(CONFIG_ESP_ROM_HAS_NEWLIB_NANO_FORMAT)
        rom_linker_script("newlib-nano")
    endif()
+    rom_linker_script("libc")
+    if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY) # TODO IDF-13852: use optimized memcpy for TEE ?
+        rom_linker_script("libc-suboptimal_for_misaligned_mem")
+    endif()
 endif()

 if(BOOTLOADER_BUILD)
@@ -173,8 +177,8 @@ if(BOOTLOADER_BUILD)
            else()
                rom_linker_script("libc")
            endif()
-            if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY
-                AND NOT CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
+            if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY)
+                # For bootloader use only ROM functions due to 'relocation truncated to fit' error
                rom_linker_script("libc-suboptimal_for_misaligned_mem")
            endif()
            if(CONFIG_LIBC_NEWLIB)
--- a/components/newlib/CMakeLists.txt
+++ b/components/newlib/CMakeLists.txt
@@ -38,23 +38,21 @@ if(CONFIG_STDATOMIC_S32C1I_SPIRAM_WORKAROUND)
 endif()

 if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
-    if(CMAKE_C_COMPILER_ID MATCHES "Clang")
-        # TODO IDF-13089: remove this block and modify src/newlib.lf when clang was upgraded
    list(APPEND srcs
-             "src/string/memcmp.c"
-             "src/string/memmove.c"
-             "src/string/strncmp.c"
-             "src/string/strncpy.c"
         "src/port/riscv/memcpy.c"
+         "src/port/riscv/memmove.c"
+         "src/string/memcmp.c"
         "src/port/riscv/strcpy.c"
+         "src/string/strncpy.c"
+         "src/string/strncmp.c"
         "src/port/riscv/strcmp.S")
-        list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcmp_impl")
+    list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcpy_impl")
    list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memmove_impl")
-        list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncmp_impl")
-        list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncpy_impl")
+    list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcmp_impl")
    list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcpy_impl")
+    list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncpy_impl")
+    list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncmp_impl")
    list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcmp_impl")
-    endif()
 endif()

 if(CONFIG_LIBC_NEWLIB)
@@ -96,6 +94,17 @@ if(CONFIG_STDATOMIC_S32C1I_SPIRAM_WORKAROUND)
                                PROPERTIES COMPILE_FLAGS "-mno-disable-hardware-atomics")
 endif()

+if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
+    # TODO GCC-419 and IDF-13089: cleanup files
+    set_source_files_properties("src/string/memcmp.c"
+                                "src/string/strncmp.c"
+                                "src/string/strncpy.c"
+                                "src/port/riscv/memcpy.c"
+                                "src/port/riscv/memmove.c"
+                                "src/port/riscv/strcpy.c"
+                                PROPERTIES COMPILE_FLAGS -O2)
+endif()
+
 # Forces the linker to include heap, syscall, pthread, assert, and retargetable locks from this component,
 # instead of the implementations provided by newlib.
 list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_heap_impl")
--- a/components/newlib/Kconfig
+++ b/components/newlib/Kconfig
@@ -145,13 +145,13 @@ menu "LibC"

    config LIBC_OPTIMIZED_MISALIGNED_ACCESS
        bool "Use performance-optimized memXXX/strXXX functions on misaligned memory access"
-        default n
+        default y
        depends on ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY
        help
            Enables performance-optimized implementations of memory and string functions
            when handling misaligned memory.

-            This increases the image size by ~1000 bytes.
+            Require approximately 800–1000 bytes of IRAM.

            Optimized functions include:
                - memcpy
--- a/components/newlib/priv_include/string/local.h
+++ b/components/newlib/priv_include/string/local.h
@@ -22,7 +22,9 @@
 * to avoid small performance penalties (if they are not zero).  */
 #define UNALIGNED_X(X) ((long)X & (sizeof (long) - 1))

+#ifndef _HAVE_HW_MISALIGNED_ACCESS
 #define _HAVE_HW_MISALIGNED_ACCESS (__riscv_misaligned_fast || __riscv_misaligned_slow)
+#endif

 #if _HAVE_HW_MISALIGNED_ACCESS
 /* Hardware performs unaligned operations with little
--- a/components/newlib/src/libc.lf
+++ b/components/newlib/src/libc.lf
@@ -11,8 +11,6 @@ archive:
  else:
    libc.a
 entries:
-  if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y:
-    libc_a-memcpy (noflash)
  if SPIRAM_CACHE_WORKAROUND = y:
    # The following libs are either used in a lot of places or in critical
    # code. (such as panic or abort)
--- a/components/newlib/src/newlib.lf
+++ b/components/newlib/src/newlib.lf
@@ -1,6 +1,14 @@
 [mapping:newlib]
 archive: libnewlib.a
 entries:
+  if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y:
+      memcpy (noflash)
+      memmove (noflash)
+      memcmp (noflash)
+      strcpy  (noflash)
+      strncpy (noflash)
+      strcmp  (noflash)
+      strncmp (noflash)
  if LIBC_MISC_IN_IRAM = y:
    if HEAP_PLACE_FUNCTION_INTO_FLASH = n:
      heap (noflash)
@@ -11,16 +19,3 @@ entries:
    stdatomic_s32c1i (noflash)
  if STDATOMIC_S32C1I_SPIRAM_WORKAROUND = y:
    stdatomic_s32c1i (noflash)
-  if CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TOOLCHAIN_GCC = y:
-    # memcpy() should be in IRAM due to early system use
-    # Others can stay in flash (performance difference is minimal)
-    #
-    # Performance Comparison:
-    # | func    | flash | iram  |
-    # |---------|-------|-------|
-    # | memcmp  | 15986 | 15170 |
-    # | strcpy  | 17499 | 16660 |
-    # | strcmp  | 13125 | 11147 |
-    # | strncpy | 17386 | 16668 |
-    # | strncmp | 22161 | 21782 |
-    libc_a-memcpy (noflash)
--- a/components/newlib/src/picolibc/libc.lf
+++ b/components/newlib/src/picolibc/libc.lf
@@ -8,10 +8,6 @@
 archive:
    libc.a
 entries:
-  if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TARGET_ARCH_RISCV = y:
-    memcpy-asm (noflash)
-  if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TARGET_ARCH_XTENSA = y:
-    libc_machine_xtensa_memcpy (noflash)
  if SPIRAM_CACHE_WORKAROUND = y:
    # The following libs are either used in a lot of places or in critical code. (such as panic or abort)
    # Thus, they shall always be placed in IRAM.
--- a/components/newlib/src/port/riscv/memcpy.c
+++ b/components/newlib/src/port/riscv/memcpy.c
@@ -18,14 +18,11 @@

 #include <string.h>
 #include <stdint.h>
-#include "esp_attr.h"
-#include "../../string/local.h"
+#include "string/local.h"

 #define unlikely(X) __builtin_expect (!!(X), 0)

-IRAM_ATTR
 void *
-__attribute__((optimize("-Os")))
 __inhibit_loop_to_libcall
 memcpy(void *__restrict aa, const void *__restrict bb, size_t n)
 {
@@ -65,29 +62,80 @@ small:

    if (unlikely(lend - la > 8)) {
        while (lend - la > 8) {
-            long b0 = *lb++;
-            long b1 = *lb++;
-            long b2 = *lb++;
-            long b3 = *lb++;
-            long b4 = *lb++;
-            long b5 = *lb++;
-            long b6 = *lb++;
-            long b7 = *lb++;
-            long b8 = *lb++;
-            *la++ = b0;
-            *la++ = b1;
-            *la++ = b2;
-            *la++ = b3;
-            *la++ = b4;
-            *la++ = b5;
-            *la++ = b6;
-            *la++ = b7;
-            *la++ = b8;
+            /*
+             *   long b0 = *lb++;
+             *   long b1 = *lb++;
+             *   long b2 = *lb++;
+             *   long b3 = *lb++;
+             *   long b4 = *lb++;
+             *   long b5 = *lb++;
+             *   long b6 = *lb++;
+             *   long b7 = *lb++;
+             *   long b8 = *lb++;
+             *   *la++ = b0;
+             *   *la++ = b1;
+             *   *la++ = b2;
+             *   *la++ = b3;
+             *   *la++ = b4;
+             *   *la++ = b5;
+             *   *la++ = b6;
+             *   *la++ = b7;
+             *   *la++ = b8;
+             */
+            long src0, src1, src2, src3;
+            long src4, src5, src6, src7;
+            long src8;
+            /* DIG-694: need at least 2 instructions between lw and sw */
+            asm volatile("lw   %0,  0(%10)\n"   // scr0 = lb[0];
+                         "lw   %1,  4(%10)\n"   // scr1 = lb[1];
+                         "lw   %2,  8(%10)\n"   // scr2 = lb[2];
+                         "lw   %3, 12(%10)\n"   // scr3 = lb[3];
+                         "lw   %4, 16(%10)\n"   // scr4 = lb[4];
+                         "lw   %5, 20(%10)\n"   // scr5 = lb[5];
+                         "lw   %6, 24(%10)\n"   // scr6 = lb[6];
+                         "lw   %7, 28(%10)\n"   // scr7 = lb[7];
+                         "lw   %8, 32(%10)\n"   // scr8 = lb[8];
+                         "addi %9,  %9,   36\n" // la += 8 * 9;
+                         "addi %10, %10,  36\n" // lb += 8 * 9;
+                         "sw   %0, -36(%9)\n"   // *(la - 9) = src;
+                         "sw   %1, -32(%9)\n"   // *(la - 8) = src;
+                         "sw   %2, -28(%9)\n"   // *(la - 7) = src;
+                         "sw   %3, -24(%9)\n"   // *(la - 6) = src;
+                         "sw   %4, -20(%9)\n"   // *(la - 5) = src;
+                         "sw   %5, -16(%9)\n"   // *(la - 4) = src;
+                         "sw   %6, -12(%9)\n"   // *(la - 3) = src;
+                         "sw   %7,  -8(%9)\n"   // *(la - 2) = src;
+                         "sw   %8,  -4(%9)\n"   // *(la - 1) = src;
+                         : "=r"(src0), "=r"(src1), "=r"(src2), "=r"(src3),
+                         "=r"(src4), "=r"(src5), "=r"(src6), "=r"(src7),
+                         "=r"(src8),
+                         "+r"(la), "+r"(lb)
+                         :: "memory");
        }
    }
-
    while (la < lend) {
-        BODY(la, lb, long);
+        /*
+         *   BODY(la, lb, long);
+         */
+        long src0;
+#ifdef __OPTIMIZE_SIZE__
+#error "Enabled Os optimization may not work properly for DIG-694"
+        /*
+         * Replacing the string:
+         *   *la++ = tt;
+         * To:
+         *   "addi %2, %4,  4\n" // la++;
+         *   "sw   %0, -4(%4)\n" // *(la-1) = src0;
+         * May break some optimizations and slightly reduce performance.
+         */
+#endif
+        /* DIG-694: need at least 2 instructions between lw and sw */
+        asm volatile("lw   %0,  0(%1)\n" // long src0 = *lb;
+                     "addi %1, %1,  4\n" // lb++;
+                     "addi %2, %2,  4\n" // la++;
+                     "sw   %0, -4(%2)\n" // *(la-1) = src0;
+                     : "=r"(src0), "+r"(lb), "+r"(la)
+                     :: "memory");
    }

    a = (char *)la;
--- a/components/newlib/src/port/riscv/memmove.c
+++ b/components/newlib/src/port/riscv/memmove.c
@@ -0,0 +1,150 @@
+/*
+ * SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
+ *
+ * SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
+ */
+#include <string.h>
+#include <_ansi.h>
+#include <stddef.h>
+#include <limits.h>
+#include "string/local.h"
+
+void *
+__inhibit_loop_to_libcall
+memmove(void *dst_void,
+        const void *src_void,
+        size_t length)
+{
+    char *dst = dst_void;
+    const char *src = src_void;
+    long *aligned_dst;
+    const long *aligned_src;
+
+    if (src < dst && dst < src + length) {
+        /* Destructive overlap...have to copy backwards */
+        src += length;
+        dst += length;
+
+        if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
+            aligned_dst = (long*)dst;
+            aligned_src = (long*)src;
+
+            /* Copy one long word at a time if possible.  */
+            while (!TOO_SMALL_LITTLE_BLOCK(length)) {
+                /*
+                 *   const long src0 = *--aligned_src;
+                 *   *--aligned_dst = src0;
+                 *   length -= LITTLE_BLOCK_SIZE;
+                 */
+                long src0;
+                /* DIG-694: need at least 2 instructions between lw and sw */
+                asm volatile("lw   %0, -4(%1)\n" // src0 = *(aligned_src - 1);
+                             "addi %1, %1, -4\n" // aligned_src--;
+                             "addi %2, %2, -4\n" // aligned_dst--;
+                             "addi %3, %3, -4\n" // length -= LITTLE_BLOCK_SIZE;
+                             "sw   %0,  0(%2)\n" // aligned_dst = src0;
+                             : "=r"(src0), "+r"(aligned_src), "+r"(aligned_dst), "+r"(length)
+                             :: "memory");
+            }
+
+            /* Pick up any residual with a byte copier.  */
+            dst = (char*)aligned_dst;
+            src = (char*)aligned_src;
+        }
+
+        while (length--) {
+            *--dst = *--src;
+        }
+    } else {
+        /* Use optimizing algorithm for a non-destructive copy to closely
+           match memcpy. If the size is small or either SRC or DST is unaligned,
+           then punt into the byte copy loop.  This should be rare.  */
+        if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
+            aligned_dst = (long*)dst;
+            aligned_src = (long*)src;
+
+            /* Copy 8X long words at a time if possible.  */
+            while (length >= BIG_BLOCK_SIZE * 2) {
+                /*
+                 *   const long src0 = *aligned_src++;
+                 *   const long src1 = *aligned_src++;
+                 *   const long src2 = *aligned_src++;
+                 *   const long src3 = *aligned_src++;
+                 *   const long src4 = *aligned_src++;
+                 *   const long src5 = *aligned_src++;
+                 *   const long src6 = *aligned_src++;
+                 *   const long src7 = *aligned_src++;
+                 *   *aligned_dst++ = src0;
+                 *   *aligned_dst++ = src1;
+                 *   *aligned_dst++ = src2;
+                 *   *aligned_dst++ = src3;
+                 *   *aligned_dst++ = src4;
+                 *   *aligned_dst++ = src5;
+                 *   *aligned_dst++ = src6;
+                 *   *aligned_dst++ = src7;
+                 */
+                long src0, src1, src2, src3;
+                long src4, src5, src6, src7;
+                /* DIG-694: need at least 2 instructions between lw and sw */
+                asm volatile("lw   %0,  0(%8)\n"    // src0 = aligned_src[0];
+                             "lw   %1,  4(%8)\n"    // src1 = aligned_src[1];
+                             "lw   %2,  8(%8)\n"    // src2 = aligned_src[2];
+                             "lw   %3, 12(%8)\n"    // src3 = aligned_src[3];
+                             "lw   %4, 16(%8)\n"    // src4 = aligned_src[4];
+                             "lw   %5, 20(%8)\n"    // src5 = aligned_src[5];
+                             "lw   %6, 24(%8)\n"    // src6 = aligned_src[6];
+                             "lw   %7, 28(%8)\n"    // src7 = aligned_src[7];
+                             "addi %8,  %8,  32\n"  // aligned_src += BIG_BLOCK_SIZE * 2;
+                             "addi %9,  %9,  32\n"  // aligned_dst += BIG_BLOCK_SIZE * 2;
+                             "addi %10, %10, -32\n" // length -= BIG_BLOCK_SIZE * 2;
+                             "sw   %0, -32(%9)\n"   // *(aligned_dst - 8) = src0;
+                             "sw   %1, -28(%9)\n"   // *(aligned_dst - 7) = src1;
+                             "sw   %2, -24(%9)\n"   // *(aligned_dst - 6) = src2;
+                             "sw   %3, -20(%9)\n"   // *(aligned_dst - 5) = src3;
+                             "sw   %4, -16(%9)\n"   // *(aligned_dst - 4) = src4;
+                             "sw   %5, -12(%9)\n"   // *(aligned_dst - 3) = src5;
+                             "sw   %6,  -8(%9)\n"   // *(aligned_dst - 2) = src6;
+                             "sw   %7,  -4(%9)\n"   // *(aligned_dst - 1) = src7;
+                             : "=r"(src0), "=r"(src1), "=r"(src2), "=r"(src3),
+                             "=r"(src4), "=r"(src5), "=r"(src6), "=r"(src7),
+                             "+r"(aligned_src), "+r"(aligned_dst), "+r"(length)
+                             :: "memory");
+            }
+
+            /* Copy one long word at a time if possible.  */
+            while (!TOO_SMALL_LITTLE_BLOCK(length)) {
+                /*
+                 *   const long src0 = *aligned_src++;
+                 *   *aligned_dst++ = src0;
+                 *   length -= LITTLE_BLOCK_SIZE;
+                 */
+                long src0;
+                /* DIG-694: need at least 2 instructions between lw and sw */
+                asm volatile("lw   %0,  0(%4)\n" // long src0 = *aligned_src;
+                             "addi %1, %4,  4\n" // aligned_src++;
+                             "addi %2, %5,  4\n" // aligned_dst++;
+                             "addi %3, %6, -4\n" // length -= LITTLE_BLOCK_SIZE;
+                             "sw   %0, -4(%5)\n" // *(aligned_dst-1) = src0;
+                             : "=r"(src0), "=r"(aligned_src), "=r"(aligned_dst), "=r"(length)
+                             : "r"(aligned_src), "r"(aligned_dst), "r"(length));
+            }
+
+            /* Pick up any residual with a byte copier.  */
+            dst = (char*)aligned_dst;
+            src = (char*)aligned_src;
+        }
+
+        while (length--) {
+            *dst++ = *src++;
+        }
+    }
+
+    return dst_void;
+}
+
+// Hook to force the linker to include this file
+void esp_libc_include_memmove_impl(void)
+{
+}
--- a/components/newlib/src/port/riscv/strcpy.c
+++ b/components/newlib/src/port/riscv/strcpy.c
@@ -30,7 +30,6 @@ unsigned long __newlib__libc_detect_null(unsigned long w)
    return ~(((w & mask) + mask) | w | mask);
 }

-__attribute__((optimize("-Os")))
 char *strcpy(char *dst, const char *src)
 {
    char *dst0 = dst;
@@ -44,6 +43,7 @@ char *strcpy(char *dst, const char *src)
        const long *lsrc = (const long *)src;

        while (!__newlib__libc_detect_null(*lsrc)) {
+            /* DIG-694: there are enough instructions between lw and sw after compiler unrolls the loop */
            *ldst++ = *lsrc++;
        }

--- a/components/newlib/src/string/memcmp.c
+++ b/components/newlib/src/string/memcmp.c
@@ -6,9 +6,8 @@
 * SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
 */
 #include <string.h>
-#include "local.h"
+#include "string/local.h"

-__attribute__((optimize("-Os")))
 int
 memcmp(const void *m1,
       const void *m2,
--- a/components/newlib/src/string/memmove.c
+++ b/components/newlib/src/string/memmove.c
@@ -1,88 +0,0 @@
-/*
- * SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
- *
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
- *
- * SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
- */
-#include <string.h>
-#include <_ansi.h>
-#include <stddef.h>
-#include <limits.h>
-#include "local.h"
-
-__attribute__((optimize("-Os")))
-void *
-__inhibit_loop_to_libcall
-memmove(void *dst_void,
-        const void *src_void,
-        size_t length)
-{
-    char *dst = dst_void;
-    const char *src = src_void;
-    long *aligned_dst;
-    const long *aligned_src;
-
-    if (src < dst && dst < src + length) {
-        /* Destructive overlap...have to copy backwards */
-        src += length;
-        dst += length;
-
-        if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
-            aligned_dst = (long*)dst;
-            aligned_src = (long*)src;
-
-            /* Copy one long word at a time if possible.  */
-            while (!TOO_SMALL_LITTLE_BLOCK(length)) {
-                *--aligned_dst = *--aligned_src;
-                length -= LITTLE_BLOCK_SIZE;
-            }
-
-            /* Pick up any residual with a byte copier.  */
-            dst = (char*)aligned_dst;
-            src = (char*)aligned_src;
-        }
-
-        while (length--) {
-            *--dst = *--src;
-        }
-    } else {
-        /* Use optimizing algorithm for a non-destructive copy to closely
-           match memcpy. If the size is small or either SRC or DST is unaligned,
-           then punt into the byte copy loop.  This should be rare.  */
-        if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
-            aligned_dst = (long*)dst;
-            aligned_src = (long*)src;
-
-            /* Copy 4X long words at a time if possible.  */
-            while (!TOO_SMALL_BIG_BLOCK(length)) {
-                *aligned_dst++ = *aligned_src++;
-                *aligned_dst++ = *aligned_src++;
-                *aligned_dst++ = *aligned_src++;
-                *aligned_dst++ = *aligned_src++;
-                length -= BIG_BLOCK_SIZE;
-            }
-
-            /* Copy one long word at a time if possible.  */
-            while (!TOO_SMALL_LITTLE_BLOCK(length)) {
-                *aligned_dst++ = *aligned_src++;
-                length -= LITTLE_BLOCK_SIZE;
-            }
-
-            /* Pick up any residual with a byte copier.  */
-            dst = (char*)aligned_dst;
-            src = (char*)aligned_src;
-        }
-
-        while (length--) {
-            *dst++ = *src++;
-        }
-    }
-
-    return dst_void;
-}
-
-// Hook to force the linker to include this file
-void esp_libc_include_memmove_impl(void)
-{
-}
--- a/components/newlib/src/string/strncmp.c
+++ b/components/newlib/src/string/strncmp.c
@@ -7,9 +7,8 @@
 */
 #include <string.h>
 #include <limits.h>
-#include "local.h"
+#include "string/local.h"

-__attribute__((optimize("-Os")))
 int
 strncmp(const char *s1,
        const char *s2,
--- a/components/newlib/src/string/strncpy.c
+++ b/components/newlib/src/string/strncpy.c
@@ -7,9 +7,8 @@
 */
 #include <string.h>
 #include <limits.h>
-#include "local.h"
+#include "string/local.h"

-__attribute__((optimize("-Os")))
 char *
 strncpy(char *__restrict dst0,
        const char *__restrict src0,
@@ -29,6 +28,7 @@ strncpy(char *__restrict dst0,
        sized copies.  */
        while (!TOO_SMALL_LITTLE_BLOCK(count) && !DETECT_NULL(*aligned_src)) {
            count -= sizeof(long int);
+            /* DIG-694: there are enough instructions between lw and sw after compiler unrolls the loop */
            *aligned_dst++ = *aligned_src++;
        }

--- a/components/newlib/test_apps/newlib/main/CMakeLists.txt
+++ b/components/newlib/test_apps/newlib/main/CMakeLists.txt
@@ -18,14 +18,9 @@ if(CONFIG_LIBC_NEWLIB)
 endif()

 if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
-    list(APPEND srcs "test_misaligned_mem_performance.c")
+    list(APPEND srcs "test_misaligned_access.c")
 endif()

 idf_component_register(SRCS "${srcs}"
                       PRIV_REQUIRES unity vfs cmock driver esp_timer spi_flash test_utils pthread esp_psram
                       WHOLE_ARCHIVE)
-
-if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
-    set_source_files_properties(test_misaligned_mem_performance.c
-                                PROPERTIES COMPILE_FLAGS "-Wno-incompatible-pointer-types -Wno-strict-prototypes")
-endif()
--- a/components/newlib/test_apps/newlib/main/test_misaligned_mem_performance.c
+++ b/components/newlib/test_apps/newlib/main/test_misaligned_mem_performance.c
@@ -6,11 +6,16 @@
 #include <stdint.h>
 #include <string.h>
 #include "esp_heap_caps.h"
+#include "soc/soc.h"
 #include "hal/cpu_ll.h"
 #include "unity.h"

 #define MAX_MEMTEST_SIZE 4096

+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wincompatible-pointer-types"
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+
 uint32_t test_function_dest_src_size(void (*foo)(...), bool pass_size)
 {
    uint32_t ccount1, ccount2;
@@ -106,3 +111,52 @@ TEST_CASE("strncmp", "[misaligned_mem]")
    /* esp32c2: 24369 cycles instead 49141.  */
    TEST_ASSERT_LESS_THAN(30000, ccount);
 }
+#pragma GCC diagnostic pop // "-Wincompatible-pointer-types" "-Wstrict-prototypes"
+
+static bool fn_in_ram(void *fn)
+{
+    const int fnaddr = (int)fn;
+    return (fnaddr >= SOC_IRAM_LOW && fnaddr < SOC_IRAM_HIGH);
+}
+
+TEST_CASE("mem functions in IRAM", "[misaligned_mem]")
+{
+    TEST_ASSERT_TRUE(fn_in_ram(memcpy));
+    TEST_ASSERT_TRUE(fn_in_ram(memcmp));
+    TEST_ASSERT_TRUE(fn_in_ram(memmove));
+    TEST_ASSERT_TRUE(fn_in_ram(strcpy));
+    TEST_ASSERT_TRUE(fn_in_ram(strncpy));
+    TEST_ASSERT_TRUE(fn_in_ram(strcmp));
+    TEST_ASSERT_TRUE(fn_in_ram(strncmp));
+}
+
+#if CONFIG_ESP_SYSTEM_MEMPROT_PMP
+TEST_CASE("access across different PMP regions", "[misaligned_mem]")
+{
+    /*
+     * PMP configurations for load and store addresses may
+     * have different permissions (e.g., "R" vs. "RW").
+     *
+     * Due to the timing alignment of internal signals, the address
+     * permission check may be incorrectly applied during the second
+     * part of a misaligned access transaction.
+     *
+     * As a workaround, insert two instructions (e.g. ADDI/NOP) between
+     * accessing to different memory regions. This spacing avoids the
+     * false permission check caused by signal timing overlap.
+     *
+     * This test may help identify the root issue in affected chips.
+     */
+
+    const void *src = (void *) SOC_DROM_MASK_LOW;
+    asm volatile("addi sp, sp, -16\n"
+                 "lw t0, 2(%0)\n"
+#if CONFIG_SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
+                 "nop\n"
+                 "nop\n"
+#endif
+                 "sw t0, 3(sp)\n"
+                 "addi sp, sp, 16"
+                 :: "r"(src) : "memory");
+}
+#endif
--- a/components/newlib/test_apps/newlib/sdkconfig.ci.misaligned_mem
+++ b/components/newlib/test_apps/newlib/sdkconfig.ci.misaligned_mem
@@ -1 +1,2 @@
 CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS=y
+CONFIG_ESP_SYSTEM_MEMPROT_PMP=y
--- a/components/soc/esp32c2/include/soc/Kconfig.soc_caps.in
+++ b/components/soc/esp32c2/include/soc/Kconfig.soc_caps.in
@@ -263,6 +263,10 @@ config SOC_CPU_IDRAM_SPLIT_USING_PMP
    bool
    default y

+config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
+    bool
+    default y
+
 config SOC_ECC_SUPPORT_POINT_VERIFY_QUIRK
    bool
    default y
--- a/components/soc/esp32c2/include/soc/soc_caps.h
+++ b/components/soc/esp32c2/include/soc/soc_caps.h
@@ -112,6 +112,8 @@
 #define SOC_CPU_WATCHPOINT_MAX_REGION_SIZE  0x80000000 // bytes

 #define SOC_CPU_IDRAM_SPLIT_USING_PMP   1
+// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
+#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1

 /*-------------------------- ECC CAPS --------------------------*/
 #define SOC_ECC_SUPPORT_POINT_VERIFY_QUIRK  1  // C2 ECC peripheral has a bug in ECC point verification, if value of K is zero the verification fails
--- a/components/soc/esp32c3/include/soc/Kconfig.soc_caps.in
+++ b/components/soc/esp32c3/include/soc/Kconfig.soc_caps.in
@@ -351,6 +351,10 @@ config SOC_CPU_WATCHPOINT_MAX_REGION_SIZE
    hex
    default 0x80000000

+config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
+    bool
+    default y
+
 config SOC_DS_SIGNATURE_MAX_BIT_LEN
    int
    default 3072
--- a/components/soc/esp32c3/include/soc/soc_caps.h
+++ b/components/soc/esp32c3/include/soc/soc_caps.h
@@ -143,6 +143,8 @@
 #define SOC_CPU_BREAKPOINTS_NUM             8
 #define SOC_CPU_WATCHPOINTS_NUM             8
 #define SOC_CPU_WATCHPOINT_MAX_REGION_SIZE  0x80000000 // bytes
+// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
+#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1

 /*-------------------------- DIGITAL SIGNATURE CAPS ----------------------------------------*/
 /** The maximum length of a Digital Signature in bits. */
--- a/components/soc/esp32c6/include/soc/Kconfig.soc_caps.in
+++ b/components/soc/esp32c6/include/soc/Kconfig.soc_caps.in
@@ -427,6 +427,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY
    int
    default 4

+config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
+    bool
+    default y
+
 config SOC_DS_SIGNATURE_MAX_BIT_LEN
    int
    default 3072
--- a/components/soc/esp32c6/include/soc/soc_caps.h
+++ b/components/soc/esp32c6/include/soc/soc_caps.h
@@ -163,6 +163,8 @@
 #define SOC_CPU_HAS_PMA                 1
 #define SOC_CPU_IDRAM_SPLIT_USING_PMP   1
 #define SOC_CPU_PMP_REGION_GRANULARITY  4
+// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
+#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1

 // TODO: IDF-5360 (Copy from esp32c3, need check)
 /*-------------------------- DIGITAL SIGNATURE CAPS ----------------------------------------*/
--- a/components/soc/esp32h2/include/soc/Kconfig.soc_caps.in
+++ b/components/soc/esp32h2/include/soc/Kconfig.soc_caps.in
@@ -427,6 +427,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY
    int
    default 4

+config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
+    bool
+    default y
+
 config SOC_MMU_PAGE_SIZE_CONFIGURABLE
    bool
    default y
--- a/components/soc/esp32h2/include/soc/soc_caps.h
+++ b/components/soc/esp32h2/include/soc/soc_caps.h
@@ -180,6 +180,8 @@
 #define SOC_CPU_HAS_PMA                 1
 #define SOC_CPU_IDRAM_SPLIT_USING_PMP   1
 #define SOC_CPU_PMP_REGION_GRANULARITY  4
+// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
+#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1

 /*-------------------------- MMU CAPS ----------------------------------------*/
 #define SOC_MMU_PAGE_SIZE_CONFIGURABLE        (1)
--- a/components/soc/esp32h21/include/soc/Kconfig.soc_caps.in
+++ b/components/soc/esp32h21/include/soc/Kconfig.soc_caps.in
@@ -283,6 +283,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY
    int
    default 4

+config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
+    bool
+    default y
+
 config SOC_MMU_PAGE_SIZE_CONFIGURABLE
    bool
    default y
--- a/components/soc/esp32h21/include/soc/soc_caps.h
+++ b/components/soc/esp32h21/include/soc/soc_caps.h
@@ -165,6 +165,8 @@
 #define SOC_CPU_HAS_PMA                 1
 #define SOC_CPU_IDRAM_SPLIT_USING_PMP   1
 #define SOC_CPU_PMP_REGION_GRANULARITY  4
+// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
+#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1

 /*-------------------------- MMU CAPS ----------------------------------------*/
 #define SOC_MMU_PAGE_SIZE_CONFIGURABLE        (1)
--- a/docs/en/api-guides/performance/ram-usage.rst
+++ b/docs/en/api-guides/performance/ram-usage.rst
@@ -194,6 +194,7 @@ The following options will reduce IRAM usage of some ESP-IDF features:
    :SOC_GPSPI_SUPPORTED: - Enable :ref:`CONFIG_HEAP_PLACE_FUNCTION_INTO_FLASH`. Provided that :ref:`CONFIG_SPI_MASTER_ISR_IN_IRAM` is not enabled and the heap functions are not incorrectly used from ISRs, this option is safe to enable in all configurations.
    :esp32c2: - Enable :ref:`CONFIG_BT_RELEASE_IRAM`. Release BT text section and merge BT data, bss & text into a large free heap region when ``esp_bt_mem_release`` is called. This makes Bluetooth unavailable until the next restart, but saving ~22 KB or more of IRAM.
    - Disable :ref:`CONFIG_LIBC_LOCKS_PLACE_IN_IRAM` if no ISRs that run while cache is disabled (i.e. IRAM ISRs) use libc lock APIs.
+    :CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - Disable :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS` to save approximately 1000 bytes of IRAM, at the cost of reduced performance.

 .. only:: esp32

--- a/docs/en/api-guides/performance/speed.rst
+++ b/docs/en/api-guides/performance/speed.rst
@@ -87,7 +87,6 @@ The following optimizations improve the execution of nearly all code, including
    :SOC_CPU_HAS_FPU: - Avoid using floating point arithmetic ``float``. Even though {IDF_TARGET_NAME} has a single precision hardware floating point unit, floating point calculations are always slower than integer calculations. If possible then use fixed point representations, a different method of integer representation, or convert part of the calculation to be integer only before switching to floating point.
    :not SOC_CPU_HAS_FPU: - Avoid using floating point arithmetic ``float``. On {IDF_TARGET_NAME} these calculations are emulated in software and are very slow. If possible, use fixed point representations, a different method of integer representation, or convert part of the calculation to be integer only before switching to floating point.
    - Avoid using double precision floating point arithmetic ``double``. These calculations are emulated in software and are very slow. If possible then use an integer-based representation, or single-precision floating point.
-    :CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - Avoid misaligned 4-byte memory accesses in performance-critical code sections. For potential performance improvements, consider enabling :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`, which requires approximately 190 bytes of IRAM and 870 bytes of flash memory. Note that properly aligned memory operations will always execute at full speed without performance penalties.


 .. only:: esp32s2 or esp32s3 or esp32p4
--- a/docs/en/migration-guides/release-6.x/6.0/toolchain.rst
+++ b/docs/en/migration-guides/release-6.x/6.0/toolchain.rst
@@ -107,3 +107,64 @@ The header ``<sys/signal.h>`` is no longer available in Picolibc. To ensure comp

    #include <sys/signal.h> /* fatal error: sys/signal.h: No such file or directory */
    #include <signal.h>     /* Ok: standard and portable */
+
+.. only:: CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY
+
+    RISC-V Chips and Misaligned Memory Access in LibC Functions
+    -----------------------------------------------------------
+
+    Espressif RISC-V chips can perform misaligned memory accesses with only a small
+    performance penalty compared to aligned accesses.
+
+    Previously, LibC functions that operate on memory (such as copy or comparison
+    functions) were implemented using byte-by-byte operations when a non-word-aligned
+    pointer was passed. Now, these functions use word (4-byte) load/store operations
+    whenever possible, resulting in a significant performance increase. These optimized
+    implementations are enabled by default via :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`,
+    which reduces the application’s memory budget (IRAM) by approximately 800–1000 bytes.
+
+    The table below shows benchmark results on the ESP32-C3 chip using 4096-byte buffers:
+
+    .. list-table:: Benchmark Results
+       :header-rows: 1
+       :widths: 20 20 20 20
+
+       * - Function
+         - Old (CPU cycles)
+         - Optimized (CPU cycles)
+         - Improvement (%)
+       * - memcpy
+         - 32873
+         - 4200
+         - 87.2
+       * - memcmp
+         - 57436
+         - 14722
+         - 74.4
+       * - memmove
+         - 49336
+         - 9237
+         - 81.3
+       * - strcpy
+         - 28678
+         - 16659
+         - 41.9
+       * - strcmp
+         - 36867
+         - 11146
+         - 69.8
+
+    .. note::
+       The results above apply to misaligned memory operations.
+       Performance for aligned memory operations remains unchanged.
+
+    Functions with Improved Performance
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    - ``memcpy``
+    - ``memcmp``
+    - ``memmove``
+    - ``strcpy``
+    - ``strncpy``
+    - ``strcmp``
+    - ``strncmp``
--- a/docs/zh_CN/api-guides/performance/speed.rst
+++ b/docs/zh_CN/api-guides/performance/speed.rst
@@ -87,7 +87,6 @@
    :SOC_CPU_HAS_FPU: - 避免使用浮点运算 ``float``。尽管 {IDF_TARGET_NAME} 具备单精度浮点运算器，但是浮点运算总是慢于整数运算。因此可以考虑使用不同的整数表示方法进行运算，如定点表示法，或者将部分计算用整数运算后再切换为浮点运算。
    :not SOC_CPU_HAS_FPU: - 避免使用浮点运算 ``float``。{IDF_TARGET_NAME} 通过软件模拟进行浮点运算，因此速度非常慢。可以考虑使用不同的整数表示方法进行运算，如定点表示法，或者将部分计算用整数运算后再切换为浮点运算。
    - 避免使用双精度浮点运算 ``double``。{IDF_TARGET_NAME} 通过软件模拟进行双精度浮点运算，因此速度非常慢。可以考虑使用基于整数的表示方法或单精度浮点数。
-    :CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - 在性能要求较高的代码段中，应避免执行未对齐的 4 字节内存访问。为提升性能，可以考虑启用 :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`。启用此选项将额外占用约 190 字节的 IRAM 和 870 字节的 flash 存储。请注意，正确对齐的内存操作始终能够以全速执行，且不会产生性能损耗。


 .. only:: esp32s2 or esp32s3 or esp32p4