From 913d38ba14add74aca1e7420deaeb917172a56b6 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Sun, 17 Aug 2025 11:52:05 +0700 Subject: [PATCH 1/2] fix(newlib): fix CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS for c2/c3/c6/h2/h21 PMP configurations for load and store addresses may have different permissions (e.g., "R" vs. "RW"). Due to the timing alignment of internal signals, the address permission check may be incorrectly applied during the second part of a misaligned access transaction. As a workaround, insert two instructions (e.g. ADDI/NOP) between accessing to different memory regions. This spacing avoids the false permission check caused by signal timing overlap. --- components/esp_rom/CMakeLists.txt | 8 +- components/newlib/CMakeLists.txt | 43 +++-- components/newlib/Kconfig | 2 +- .../{src => priv_include}/string/local.h | 2 + components/newlib/src/libc.lf | 2 - components/newlib/src/newlib.lf | 21 +-- components/newlib/src/picolibc/libc.lf | 4 - components/newlib/src/port/riscv/memcpy.c | 96 ++++++++--- components/newlib/src/port/riscv/memmove.c | 150 ++++++++++++++++++ components/newlib/src/port/riscv/strcpy.c | 2 +- components/newlib/src/string/memcmp.c | 3 +- components/newlib/src/string/memmove.c | 88 ---------- components/newlib/src/string/strncmp.c | 3 +- components/newlib/src/string/strncpy.c | 4 +- .../test_apps/newlib/main/CMakeLists.txt | 7 +- ...performance.c => test_misaligned_access.c} | 54 +++++++ .../newlib/sdkconfig.ci.misaligned_mem | 1 + .../esp32c2/include/soc/Kconfig.soc_caps.in | 4 + components/soc/esp32c2/include/soc/soc_caps.h | 2 + .../esp32c3/include/soc/Kconfig.soc_caps.in | 4 + components/soc/esp32c3/include/soc/soc_caps.h | 2 + .../esp32c6/include/soc/Kconfig.soc_caps.in | 4 + components/soc/esp32c6/include/soc/soc_caps.h | 2 + .../esp32h2/include/soc/Kconfig.soc_caps.in | 4 + components/soc/esp32h2/include/soc/soc_caps.h | 2 + .../esp32h21/include/soc/Kconfig.soc_caps.in | 4 + .../soc/esp32h21/include/soc/soc_caps.h | 2 + 27 files changed, 356 insertions(+), 164 deletions(-) rename components/newlib/{src => priv_include}/string/local.h (98%) create mode 100644 components/newlib/src/port/riscv/memmove.c delete mode 100644 components/newlib/src/string/memmove.c rename components/newlib/test_apps/newlib/main/{test_misaligned_mem_performance.c => test_misaligned_access.c} (63%) diff --git a/components/esp_rom/CMakeLists.txt b/components/esp_rom/CMakeLists.txt index 69e01d0abd..cb22d5d2ca 100644 --- a/components/esp_rom/CMakeLists.txt +++ b/components/esp_rom/CMakeLists.txt @@ -149,6 +149,10 @@ if(ESP_TEE_BUILD) if(CONFIG_ESP_ROM_HAS_NEWLIB_NANO_FORMAT) rom_linker_script("newlib-nano") endif() + rom_linker_script("libc") + if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY) # TODO IDF-13852: use optimized memcpy for TEE ? + rom_linker_script("libc-suboptimal_for_misaligned_mem") + endif() endif() if(BOOTLOADER_BUILD) @@ -173,8 +177,8 @@ if(BOOTLOADER_BUILD) else() rom_linker_script("libc") endif() - if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY - AND NOT CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS) + if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY) + # For bootloader use only ROM functions due to 'relocation truncated to fit' error rom_linker_script("libc-suboptimal_for_misaligned_mem") endif() if(CONFIG_LIBC_NEWLIB) diff --git a/components/newlib/CMakeLists.txt b/components/newlib/CMakeLists.txt index 1bb182f584..7c8a1faa92 100644 --- a/components/newlib/CMakeLists.txt +++ b/components/newlib/CMakeLists.txt @@ -38,23 +38,21 @@ if(CONFIG_STDATOMIC_S32C1I_SPIRAM_WORKAROUND) endif() if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS) - if(CMAKE_C_COMPILER_ID MATCHES "Clang") - # TODO IDF-13089: remove this block and modify src/newlib.lf when clang was upgraded - list(APPEND srcs - "src/string/memcmp.c" - "src/string/memmove.c" - "src/string/strncmp.c" - "src/string/strncpy.c" - "src/port/riscv/memcpy.c" - "src/port/riscv/strcpy.c" - "src/port/riscv/strcmp.S") - list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcmp_impl") - list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memmove_impl") - list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncmp_impl") - list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncpy_impl") - list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcpy_impl") - list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcmp_impl") - endif() + list(APPEND srcs + "src/port/riscv/memcpy.c" + "src/port/riscv/memmove.c" + "src/string/memcmp.c" + "src/port/riscv/strcpy.c" + "src/string/strncpy.c" + "src/string/strncmp.c" + "src/port/riscv/strcmp.S") + list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcpy_impl") + list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memmove_impl") + list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcmp_impl") + list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcpy_impl") + list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncpy_impl") + list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncmp_impl") + list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcmp_impl") endif() if(CONFIG_LIBC_NEWLIB) @@ -96,6 +94,17 @@ if(CONFIG_STDATOMIC_S32C1I_SPIRAM_WORKAROUND) PROPERTIES COMPILE_FLAGS "-mno-disable-hardware-atomics") endif() +if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS) + # TODO GCC-419 and IDF-13089: cleanup files + set_source_files_properties("src/string/memcmp.c" + "src/string/strncmp.c" + "src/string/strncpy.c" + "src/port/riscv/memcpy.c" + "src/port/riscv/memmove.c" + "src/port/riscv/strcpy.c" + PROPERTIES COMPILE_FLAGS -O2) +endif() + # Forces the linker to include heap, syscall, pthread, assert, and retargetable locks from this component, # instead of the implementations provided by newlib. list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_heap_impl") diff --git a/components/newlib/Kconfig b/components/newlib/Kconfig index 9a30784c21..92d9afc412 100644 --- a/components/newlib/Kconfig +++ b/components/newlib/Kconfig @@ -151,7 +151,7 @@ menu "LibC" Enables performance-optimized implementations of memory and string functions when handling misaligned memory. - This increases the image size by ~1000 bytes. + Require approximately 800–1000 bytes of IRAM. Optimized functions include: - memcpy diff --git a/components/newlib/src/string/local.h b/components/newlib/priv_include/string/local.h similarity index 98% rename from components/newlib/src/string/local.h rename to components/newlib/priv_include/string/local.h index bfc1260efc..d6df832838 100644 --- a/components/newlib/src/string/local.h +++ b/components/newlib/priv_include/string/local.h @@ -22,7 +22,9 @@ * to avoid small performance penalties (if they are not zero). */ #define UNALIGNED_X(X) ((long)X & (sizeof (long) - 1)) +#ifndef _HAVE_HW_MISALIGNED_ACCESS #define _HAVE_HW_MISALIGNED_ACCESS (__riscv_misaligned_fast || __riscv_misaligned_slow) +#endif #if _HAVE_HW_MISALIGNED_ACCESS /* Hardware performs unaligned operations with little diff --git a/components/newlib/src/libc.lf b/components/newlib/src/libc.lf index a8ebcf8930..05fdef2306 100644 --- a/components/newlib/src/libc.lf +++ b/components/newlib/src/libc.lf @@ -11,8 +11,6 @@ archive: else: libc.a entries: - if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y: - libc_a-memcpy (noflash) if SPIRAM_CACHE_WORKAROUND = y: # The following libs are either used in a lot of places or in critical # code. (such as panic or abort) diff --git a/components/newlib/src/newlib.lf b/components/newlib/src/newlib.lf index 359acd6345..e311ff5b17 100644 --- a/components/newlib/src/newlib.lf +++ b/components/newlib/src/newlib.lf @@ -1,6 +1,14 @@ [mapping:newlib] archive: libnewlib.a entries: + if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y: + memcpy (noflash) + memmove (noflash) + memcmp (noflash) + strcpy (noflash) + strncpy (noflash) + strcmp (noflash) + strncmp (noflash) if LIBC_MISC_IN_IRAM = y: if HEAP_PLACE_FUNCTION_INTO_FLASH = n: heap (noflash) @@ -11,16 +19,3 @@ entries: stdatomic_s32c1i (noflash) if STDATOMIC_S32C1I_SPIRAM_WORKAROUND = y: stdatomic_s32c1i (noflash) - if CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TOOLCHAIN_GCC = y: - # memcpy() should be in IRAM due to early system use - # Others can stay in flash (performance difference is minimal) - # - # Performance Comparison: - # | func | flash | iram | - # |---------|-------|-------| - # | memcmp | 15986 | 15170 | - # | strcpy | 17499 | 16660 | - # | strcmp | 13125 | 11147 | - # | strncpy | 17386 | 16668 | - # | strncmp | 22161 | 21782 | - libc_a-memcpy (noflash) diff --git a/components/newlib/src/picolibc/libc.lf b/components/newlib/src/picolibc/libc.lf index 0892557473..f2a97b4cc9 100644 --- a/components/newlib/src/picolibc/libc.lf +++ b/components/newlib/src/picolibc/libc.lf @@ -8,10 +8,6 @@ archive: libc.a entries: - if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TARGET_ARCH_RISCV = y: - memcpy-asm (noflash) - if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TARGET_ARCH_XTENSA = y: - libc_machine_xtensa_memcpy (noflash) if SPIRAM_CACHE_WORKAROUND = y: # The following libs are either used in a lot of places or in critical code. (such as panic or abort) # Thus, they shall always be placed in IRAM. diff --git a/components/newlib/src/port/riscv/memcpy.c b/components/newlib/src/port/riscv/memcpy.c index bbee1d3f02..6c84199d91 100644 --- a/components/newlib/src/port/riscv/memcpy.c +++ b/components/newlib/src/port/riscv/memcpy.c @@ -18,14 +18,11 @@ #include #include -#include "esp_attr.h" -#include "../../string/local.h" +#include "string/local.h" #define unlikely(X) __builtin_expect (!!(X), 0) -IRAM_ATTR void * -__attribute__((optimize("-Os"))) __inhibit_loop_to_libcall memcpy(void *__restrict aa, const void *__restrict bb, size_t n) { @@ -65,29 +62,80 @@ small: if (unlikely(lend - la > 8)) { while (lend - la > 8) { - long b0 = *lb++; - long b1 = *lb++; - long b2 = *lb++; - long b3 = *lb++; - long b4 = *lb++; - long b5 = *lb++; - long b6 = *lb++; - long b7 = *lb++; - long b8 = *lb++; - *la++ = b0; - *la++ = b1; - *la++ = b2; - *la++ = b3; - *la++ = b4; - *la++ = b5; - *la++ = b6; - *la++ = b7; - *la++ = b8; + /* + * long b0 = *lb++; + * long b1 = *lb++; + * long b2 = *lb++; + * long b3 = *lb++; + * long b4 = *lb++; + * long b5 = *lb++; + * long b6 = *lb++; + * long b7 = *lb++; + * long b8 = *lb++; + * *la++ = b0; + * *la++ = b1; + * *la++ = b2; + * *la++ = b3; + * *la++ = b4; + * *la++ = b5; + * *la++ = b6; + * *la++ = b7; + * *la++ = b8; + */ + long src0, src1, src2, src3; + long src4, src5, src6, src7; + long src8; + /* DIG-694: need at least 2 instructions between lw and sw */ + asm volatile("lw %0, 0(%10)\n" // scr0 = lb[0]; + "lw %1, 4(%10)\n" // scr1 = lb[1]; + "lw %2, 8(%10)\n" // scr2 = lb[2]; + "lw %3, 12(%10)\n" // scr3 = lb[3]; + "lw %4, 16(%10)\n" // scr4 = lb[4]; + "lw %5, 20(%10)\n" // scr5 = lb[5]; + "lw %6, 24(%10)\n" // scr6 = lb[6]; + "lw %7, 28(%10)\n" // scr7 = lb[7]; + "lw %8, 32(%10)\n" // scr8 = lb[8]; + "addi %9, %9, 36\n" // la += 8 * 9; + "addi %10, %10, 36\n" // lb += 8 * 9; + "sw %0, -36(%9)\n" // *(la - 9) = src; + "sw %1, -32(%9)\n" // *(la - 8) = src; + "sw %2, -28(%9)\n" // *(la - 7) = src; + "sw %3, -24(%9)\n" // *(la - 6) = src; + "sw %4, -20(%9)\n" // *(la - 5) = src; + "sw %5, -16(%9)\n" // *(la - 4) = src; + "sw %6, -12(%9)\n" // *(la - 3) = src; + "sw %7, -8(%9)\n" // *(la - 2) = src; + "sw %8, -4(%9)\n" // *(la - 1) = src; + : "=r"(src0), "=r"(src1), "=r"(src2), "=r"(src3), + "=r"(src4), "=r"(src5), "=r"(src6), "=r"(src7), + "=r"(src8), + "+r"(la), "+r"(lb) + :: "memory"); } } - while (la < lend) { - BODY(la, lb, long); + /* + * BODY(la, lb, long); + */ + long src0; +#ifdef __OPTIMIZE_SIZE__ +#error "Enabled Os optimization may not work properly for DIG-694" + /* + * Replacing the string: + * *la++ = tt; + * To: + * "addi %2, %4, 4\n" // la++; + * "sw %0, -4(%4)\n" // *(la-1) = src0; + * May break some optimizations and slightly reduce performance. + */ +#endif + /* DIG-694: need at least 2 instructions between lw and sw */ + asm volatile("lw %0, 0(%1)\n" // long src0 = *lb; + "addi %1, %1, 4\n" // lb++; + "addi %2, %2, 4\n" // la++; + "sw %0, -4(%2)\n" // *(la-1) = src0; + : "=r"(src0), "+r"(lb), "+r"(la) + :: "memory"); } a = (char *)la; diff --git a/components/newlib/src/port/riscv/memmove.c b/components/newlib/src/port/riscv/memmove.c new file mode 100644 index 0000000000..208e5953cc --- /dev/null +++ b/components/newlib/src/port/riscv/memmove.c @@ -0,0 +1,150 @@ +/* + * SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc. + * + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0 + * + * SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD + */ +#include +#include <_ansi.h> +#include +#include +#include "string/local.h" + +void * +__inhibit_loop_to_libcall +memmove(void *dst_void, + const void *src_void, + size_t length) +{ + char *dst = dst_void; + const char *src = src_void; + long *aligned_dst; + const long *aligned_src; + + if (src < dst && dst < src + length) { + /* Destructive overlap...have to copy backwards */ + src += length; + dst += length; + + if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) { + aligned_dst = (long*)dst; + aligned_src = (long*)src; + + /* Copy one long word at a time if possible. */ + while (!TOO_SMALL_LITTLE_BLOCK(length)) { + /* + * const long src0 = *--aligned_src; + * *--aligned_dst = src0; + * length -= LITTLE_BLOCK_SIZE; + */ + long src0; + /* DIG-694: need at least 2 instructions between lw and sw */ + asm volatile("lw %0, -4(%1)\n" // src0 = *(aligned_src - 1); + "addi %1, %1, -4\n" // aligned_src--; + "addi %2, %2, -4\n" // aligned_dst--; + "addi %3, %3, -4\n" // length -= LITTLE_BLOCK_SIZE; + "sw %0, 0(%2)\n" // aligned_dst = src0; + : "=r"(src0), "+r"(aligned_src), "+r"(aligned_dst), "+r"(length) + :: "memory"); + } + + /* Pick up any residual with a byte copier. */ + dst = (char*)aligned_dst; + src = (char*)aligned_src; + } + + while (length--) { + *--dst = *--src; + } + } else { + /* Use optimizing algorithm for a non-destructive copy to closely + match memcpy. If the size is small or either SRC or DST is unaligned, + then punt into the byte copy loop. This should be rare. */ + if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) { + aligned_dst = (long*)dst; + aligned_src = (long*)src; + + /* Copy 8X long words at a time if possible. */ + while (length >= BIG_BLOCK_SIZE * 2) { + /* + * const long src0 = *aligned_src++; + * const long src1 = *aligned_src++; + * const long src2 = *aligned_src++; + * const long src3 = *aligned_src++; + * const long src4 = *aligned_src++; + * const long src5 = *aligned_src++; + * const long src6 = *aligned_src++; + * const long src7 = *aligned_src++; + * *aligned_dst++ = src0; + * *aligned_dst++ = src1; + * *aligned_dst++ = src2; + * *aligned_dst++ = src3; + * *aligned_dst++ = src4; + * *aligned_dst++ = src5; + * *aligned_dst++ = src6; + * *aligned_dst++ = src7; + */ + long src0, src1, src2, src3; + long src4, src5, src6, src7; + /* DIG-694: need at least 2 instructions between lw and sw */ + asm volatile("lw %0, 0(%8)\n" // src0 = aligned_src[0]; + "lw %1, 4(%8)\n" // src1 = aligned_src[1]; + "lw %2, 8(%8)\n" // src2 = aligned_src[2]; + "lw %3, 12(%8)\n" // src3 = aligned_src[3]; + "lw %4, 16(%8)\n" // src4 = aligned_src[4]; + "lw %5, 20(%8)\n" // src5 = aligned_src[5]; + "lw %6, 24(%8)\n" // src6 = aligned_src[6]; + "lw %7, 28(%8)\n" // src7 = aligned_src[7]; + "addi %8, %8, 32\n" // aligned_src += BIG_BLOCK_SIZE * 2; + "addi %9, %9, 32\n" // aligned_dst += BIG_BLOCK_SIZE * 2; + "addi %10, %10, -32\n" // length -= BIG_BLOCK_SIZE * 2; + "sw %0, -32(%9)\n" // *(aligned_dst - 8) = src0; + "sw %1, -28(%9)\n" // *(aligned_dst - 7) = src1; + "sw %2, -24(%9)\n" // *(aligned_dst - 6) = src2; + "sw %3, -20(%9)\n" // *(aligned_dst - 5) = src3; + "sw %4, -16(%9)\n" // *(aligned_dst - 4) = src4; + "sw %5, -12(%9)\n" // *(aligned_dst - 3) = src5; + "sw %6, -8(%9)\n" // *(aligned_dst - 2) = src6; + "sw %7, -4(%9)\n" // *(aligned_dst - 1) = src7; + : "=r"(src0), "=r"(src1), "=r"(src2), "=r"(src3), + "=r"(src4), "=r"(src5), "=r"(src6), "=r"(src7), + "+r"(aligned_src), "+r"(aligned_dst), "+r"(length) + :: "memory"); + } + + /* Copy one long word at a time if possible. */ + while (!TOO_SMALL_LITTLE_BLOCK(length)) { + /* + * const long src0 = *aligned_src++; + * *aligned_dst++ = src0; + * length -= LITTLE_BLOCK_SIZE; + */ + long src0; + /* DIG-694: need at least 2 instructions between lw and sw */ + asm volatile("lw %0, 0(%4)\n" // long src0 = *aligned_src; + "addi %1, %4, 4\n" // aligned_src++; + "addi %2, %5, 4\n" // aligned_dst++; + "addi %3, %6, -4\n" // length -= LITTLE_BLOCK_SIZE; + "sw %0, -4(%5)\n" // *(aligned_dst-1) = src0; + : "=r"(src0), "=r"(aligned_src), "=r"(aligned_dst), "=r"(length) + : "r"(aligned_src), "r"(aligned_dst), "r"(length)); + } + + /* Pick up any residual with a byte copier. */ + dst = (char*)aligned_dst; + src = (char*)aligned_src; + } + + while (length--) { + *dst++ = *src++; + } + } + + return dst_void; +} + +// Hook to force the linker to include this file +void esp_libc_include_memmove_impl(void) +{ +} diff --git a/components/newlib/src/port/riscv/strcpy.c b/components/newlib/src/port/riscv/strcpy.c index 361a04baa1..1884218729 100644 --- a/components/newlib/src/port/riscv/strcpy.c +++ b/components/newlib/src/port/riscv/strcpy.c @@ -30,7 +30,6 @@ unsigned long __newlib__libc_detect_null(unsigned long w) return ~(((w & mask) + mask) | w | mask); } -__attribute__((optimize("-Os"))) char *strcpy(char *dst, const char *src) { char *dst0 = dst; @@ -44,6 +43,7 @@ char *strcpy(char *dst, const char *src) const long *lsrc = (const long *)src; while (!__newlib__libc_detect_null(*lsrc)) { + /* DIG-694: there are enough instructions between lw and sw after compiler unrolls the loop */ *ldst++ = *lsrc++; } diff --git a/components/newlib/src/string/memcmp.c b/components/newlib/src/string/memcmp.c index 0a26ca8cd4..4445d09da2 100644 --- a/components/newlib/src/string/memcmp.c +++ b/components/newlib/src/string/memcmp.c @@ -6,9 +6,8 @@ * SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD */ #include -#include "local.h" +#include "string/local.h" -__attribute__((optimize("-Os"))) int memcmp(const void *m1, const void *m2, diff --git a/components/newlib/src/string/memmove.c b/components/newlib/src/string/memmove.c deleted file mode 100644 index 57071ddc09..0000000000 --- a/components/newlib/src/string/memmove.c +++ /dev/null @@ -1,88 +0,0 @@ -/* - * SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc. - * - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0 - * - * SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD - */ -#include -#include <_ansi.h> -#include -#include -#include "local.h" - -__attribute__((optimize("-Os"))) -void * -__inhibit_loop_to_libcall -memmove(void *dst_void, - const void *src_void, - size_t length) -{ - char *dst = dst_void; - const char *src = src_void; - long *aligned_dst; - const long *aligned_src; - - if (src < dst && dst < src + length) { - /* Destructive overlap...have to copy backwards */ - src += length; - dst += length; - - if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) { - aligned_dst = (long*)dst; - aligned_src = (long*)src; - - /* Copy one long word at a time if possible. */ - while (!TOO_SMALL_LITTLE_BLOCK(length)) { - *--aligned_dst = *--aligned_src; - length -= LITTLE_BLOCK_SIZE; - } - - /* Pick up any residual with a byte copier. */ - dst = (char*)aligned_dst; - src = (char*)aligned_src; - } - - while (length--) { - *--dst = *--src; - } - } else { - /* Use optimizing algorithm for a non-destructive copy to closely - match memcpy. If the size is small or either SRC or DST is unaligned, - then punt into the byte copy loop. This should be rare. */ - if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) { - aligned_dst = (long*)dst; - aligned_src = (long*)src; - - /* Copy 4X long words at a time if possible. */ - while (!TOO_SMALL_BIG_BLOCK(length)) { - *aligned_dst++ = *aligned_src++; - *aligned_dst++ = *aligned_src++; - *aligned_dst++ = *aligned_src++; - *aligned_dst++ = *aligned_src++; - length -= BIG_BLOCK_SIZE; - } - - /* Copy one long word at a time if possible. */ - while (!TOO_SMALL_LITTLE_BLOCK(length)) { - *aligned_dst++ = *aligned_src++; - length -= LITTLE_BLOCK_SIZE; - } - - /* Pick up any residual with a byte copier. */ - dst = (char*)aligned_dst; - src = (char*)aligned_src; - } - - while (length--) { - *dst++ = *src++; - } - } - - return dst_void; -} - -// Hook to force the linker to include this file -void esp_libc_include_memmove_impl(void) -{ -} diff --git a/components/newlib/src/string/strncmp.c b/components/newlib/src/string/strncmp.c index f0efd62189..ed71b484e7 100644 --- a/components/newlib/src/string/strncmp.c +++ b/components/newlib/src/string/strncmp.c @@ -7,9 +7,8 @@ */ #include #include -#include "local.h" +#include "string/local.h" -__attribute__((optimize("-Os"))) int strncmp(const char *s1, const char *s2, diff --git a/components/newlib/src/string/strncpy.c b/components/newlib/src/string/strncpy.c index 5821e1997b..0916244d35 100644 --- a/components/newlib/src/string/strncpy.c +++ b/components/newlib/src/string/strncpy.c @@ -7,9 +7,8 @@ */ #include #include -#include "local.h" +#include "string/local.h" -__attribute__((optimize("-Os"))) char * strncpy(char *__restrict dst0, const char *__restrict src0, @@ -29,6 +28,7 @@ strncpy(char *__restrict dst0, sized copies. */ while (!TOO_SMALL_LITTLE_BLOCK(count) && !DETECT_NULL(*aligned_src)) { count -= sizeof(long int); + /* DIG-694: there are enough instructions between lw and sw after compiler unrolls the loop */ *aligned_dst++ = *aligned_src++; } diff --git a/components/newlib/test_apps/newlib/main/CMakeLists.txt b/components/newlib/test_apps/newlib/main/CMakeLists.txt index d91fc695dc..474b1050f9 100644 --- a/components/newlib/test_apps/newlib/main/CMakeLists.txt +++ b/components/newlib/test_apps/newlib/main/CMakeLists.txt @@ -18,14 +18,9 @@ if(CONFIG_LIBC_NEWLIB) endif() if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS) - list(APPEND srcs "test_misaligned_mem_performance.c") + list(APPEND srcs "test_misaligned_access.c") endif() idf_component_register(SRCS "${srcs}" PRIV_REQUIRES unity vfs cmock driver esp_timer spi_flash test_utils pthread esp_psram WHOLE_ARCHIVE) - -if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS) - set_source_files_properties(test_misaligned_mem_performance.c - PROPERTIES COMPILE_FLAGS "-Wno-incompatible-pointer-types -Wno-strict-prototypes") -endif() diff --git a/components/newlib/test_apps/newlib/main/test_misaligned_mem_performance.c b/components/newlib/test_apps/newlib/main/test_misaligned_access.c similarity index 63% rename from components/newlib/test_apps/newlib/main/test_misaligned_mem_performance.c rename to components/newlib/test_apps/newlib/main/test_misaligned_access.c index cc9ab315d6..0ee1f0f7b3 100644 --- a/components/newlib/test_apps/newlib/main/test_misaligned_mem_performance.c +++ b/components/newlib/test_apps/newlib/main/test_misaligned_access.c @@ -6,11 +6,16 @@ #include #include #include "esp_heap_caps.h" +#include "soc/soc.h" #include "hal/cpu_ll.h" #include "unity.h" #define MAX_MEMTEST_SIZE 4096 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wincompatible-pointer-types" +#pragma GCC diagnostic ignored "-Wstrict-prototypes" + uint32_t test_function_dest_src_size(void (*foo)(...), bool pass_size) { uint32_t ccount1, ccount2; @@ -106,3 +111,52 @@ TEST_CASE("strncmp", "[misaligned_mem]") /* esp32c2: 24369 cycles instead 49141. */ TEST_ASSERT_LESS_THAN(30000, ccount); } +#pragma GCC diagnostic pop // "-Wincompatible-pointer-types" "-Wstrict-prototypes" + +static bool fn_in_ram(void *fn) +{ + const int fnaddr = (int)fn; + return (fnaddr >= SOC_IRAM_LOW && fnaddr < SOC_IRAM_HIGH); +} + +TEST_CASE("mem functions in IRAM", "[misaligned_mem]") +{ + TEST_ASSERT_TRUE(fn_in_ram(memcpy)); + TEST_ASSERT_TRUE(fn_in_ram(memcmp)); + TEST_ASSERT_TRUE(fn_in_ram(memmove)); + TEST_ASSERT_TRUE(fn_in_ram(strcpy)); + TEST_ASSERT_TRUE(fn_in_ram(strncpy)); + TEST_ASSERT_TRUE(fn_in_ram(strcmp)); + TEST_ASSERT_TRUE(fn_in_ram(strncmp)); +} + +#if CONFIG_ESP_SYSTEM_MEMPROT_PMP +TEST_CASE("access across different PMP regions", "[misaligned_mem]") +{ + /* + * PMP configurations for load and store addresses may + * have different permissions (e.g., "R" vs. "RW"). + * + * Due to the timing alignment of internal signals, the address + * permission check may be incorrectly applied during the second + * part of a misaligned access transaction. + * + * As a workaround, insert two instructions (e.g. ADDI/NOP) between + * accessing to different memory regions. This spacing avoids the + * false permission check caused by signal timing overlap. + * + * This test may help identify the root issue in affected chips. + */ + + const void *src = (void *) SOC_DROM_MASK_LOW; + asm volatile("addi sp, sp, -16\n" + "lw t0, 2(%0)\n" +#if CONFIG_SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE + "nop\n" + "nop\n" +#endif + "sw t0, 3(sp)\n" + "addi sp, sp, 16" + :: "r"(src) : "memory"); +} +#endif diff --git a/components/newlib/test_apps/newlib/sdkconfig.ci.misaligned_mem b/components/newlib/test_apps/newlib/sdkconfig.ci.misaligned_mem index 17a61e401a..f5a15c7bc0 100644 --- a/components/newlib/test_apps/newlib/sdkconfig.ci.misaligned_mem +++ b/components/newlib/test_apps/newlib/sdkconfig.ci.misaligned_mem @@ -1 +1,2 @@ CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS=y +CONFIG_ESP_SYSTEM_MEMPROT_PMP=y diff --git a/components/soc/esp32c2/include/soc/Kconfig.soc_caps.in b/components/soc/esp32c2/include/soc/Kconfig.soc_caps.in index d0d74a5292..60b9c54c59 100644 --- a/components/soc/esp32c2/include/soc/Kconfig.soc_caps.in +++ b/components/soc/esp32c2/include/soc/Kconfig.soc_caps.in @@ -263,6 +263,10 @@ config SOC_CPU_IDRAM_SPLIT_USING_PMP bool default y +config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE + bool + default y + config SOC_ECC_SUPPORT_POINT_VERIFY_QUIRK bool default y diff --git a/components/soc/esp32c2/include/soc/soc_caps.h b/components/soc/esp32c2/include/soc/soc_caps.h index facf692980..af19a55d4d 100644 --- a/components/soc/esp32c2/include/soc/soc_caps.h +++ b/components/soc/esp32c2/include/soc/soc_caps.h @@ -112,6 +112,8 @@ #define SOC_CPU_WATCHPOINT_MAX_REGION_SIZE 0x80000000 // bytes #define SOC_CPU_IDRAM_SPLIT_USING_PMP 1 +// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions +#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1 /*-------------------------- ECC CAPS --------------------------*/ #define SOC_ECC_SUPPORT_POINT_VERIFY_QUIRK 1 // C2 ECC peripheral has a bug in ECC point verification, if value of K is zero the verification fails diff --git a/components/soc/esp32c3/include/soc/Kconfig.soc_caps.in b/components/soc/esp32c3/include/soc/Kconfig.soc_caps.in index 07d3891d96..b0ce79412c 100644 --- a/components/soc/esp32c3/include/soc/Kconfig.soc_caps.in +++ b/components/soc/esp32c3/include/soc/Kconfig.soc_caps.in @@ -351,6 +351,10 @@ config SOC_CPU_WATCHPOINT_MAX_REGION_SIZE hex default 0x80000000 +config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE + bool + default y + config SOC_DS_SIGNATURE_MAX_BIT_LEN int default 3072 diff --git a/components/soc/esp32c3/include/soc/soc_caps.h b/components/soc/esp32c3/include/soc/soc_caps.h index f56e816ff8..eaae1d8db9 100644 --- a/components/soc/esp32c3/include/soc/soc_caps.h +++ b/components/soc/esp32c3/include/soc/soc_caps.h @@ -143,6 +143,8 @@ #define SOC_CPU_BREAKPOINTS_NUM 8 #define SOC_CPU_WATCHPOINTS_NUM 8 #define SOC_CPU_WATCHPOINT_MAX_REGION_SIZE 0x80000000 // bytes +// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions +#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1 /*-------------------------- DIGITAL SIGNATURE CAPS ----------------------------------------*/ /** The maximum length of a Digital Signature in bits. */ diff --git a/components/soc/esp32c6/include/soc/Kconfig.soc_caps.in b/components/soc/esp32c6/include/soc/Kconfig.soc_caps.in index ac19a476db..fe2e6264f5 100644 --- a/components/soc/esp32c6/include/soc/Kconfig.soc_caps.in +++ b/components/soc/esp32c6/include/soc/Kconfig.soc_caps.in @@ -427,6 +427,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY int default 4 +config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE + bool + default y + config SOC_DS_SIGNATURE_MAX_BIT_LEN int default 3072 diff --git a/components/soc/esp32c6/include/soc/soc_caps.h b/components/soc/esp32c6/include/soc/soc_caps.h index c59db3e62d..2906c6a1c7 100644 --- a/components/soc/esp32c6/include/soc/soc_caps.h +++ b/components/soc/esp32c6/include/soc/soc_caps.h @@ -163,6 +163,8 @@ #define SOC_CPU_HAS_PMA 1 #define SOC_CPU_IDRAM_SPLIT_USING_PMP 1 #define SOC_CPU_PMP_REGION_GRANULARITY 4 +// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions +#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1 // TODO: IDF-5360 (Copy from esp32c3, need check) /*-------------------------- DIGITAL SIGNATURE CAPS ----------------------------------------*/ diff --git a/components/soc/esp32h2/include/soc/Kconfig.soc_caps.in b/components/soc/esp32h2/include/soc/Kconfig.soc_caps.in index b7a521cf58..0db2579197 100644 --- a/components/soc/esp32h2/include/soc/Kconfig.soc_caps.in +++ b/components/soc/esp32h2/include/soc/Kconfig.soc_caps.in @@ -427,6 +427,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY int default 4 +config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE + bool + default y + config SOC_MMU_PAGE_SIZE_CONFIGURABLE bool default y diff --git a/components/soc/esp32h2/include/soc/soc_caps.h b/components/soc/esp32h2/include/soc/soc_caps.h index 30a8fea465..feeb153f2c 100644 --- a/components/soc/esp32h2/include/soc/soc_caps.h +++ b/components/soc/esp32h2/include/soc/soc_caps.h @@ -180,6 +180,8 @@ #define SOC_CPU_HAS_PMA 1 #define SOC_CPU_IDRAM_SPLIT_USING_PMP 1 #define SOC_CPU_PMP_REGION_GRANULARITY 4 +// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions +#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1 /*-------------------------- MMU CAPS ----------------------------------------*/ #define SOC_MMU_PAGE_SIZE_CONFIGURABLE (1) diff --git a/components/soc/esp32h21/include/soc/Kconfig.soc_caps.in b/components/soc/esp32h21/include/soc/Kconfig.soc_caps.in index 6d7f798d0d..1508085055 100644 --- a/components/soc/esp32h21/include/soc/Kconfig.soc_caps.in +++ b/components/soc/esp32h21/include/soc/Kconfig.soc_caps.in @@ -283,6 +283,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY int default 4 +config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE + bool + default y + config SOC_MMU_PAGE_SIZE_CONFIGURABLE bool default y diff --git a/components/soc/esp32h21/include/soc/soc_caps.h b/components/soc/esp32h21/include/soc/soc_caps.h index ce0ac6b56f..a9a5402f90 100644 --- a/components/soc/esp32h21/include/soc/soc_caps.h +++ b/components/soc/esp32h21/include/soc/soc_caps.h @@ -165,6 +165,8 @@ #define SOC_CPU_HAS_PMA 1 #define SOC_CPU_IDRAM_SPLIT_USING_PMP 1 #define SOC_CPU_PMP_REGION_GRANULARITY 4 +// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions +#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1 /*-------------------------- MMU CAPS ----------------------------------------*/ #define SOC_MMU_PAGE_SIZE_CONFIGURABLE (1) From b266d829dd4cdca905cc7975b81b21b7cdc6c4d1 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Sun, 17 Aug 2025 11:52:54 +0700 Subject: [PATCH 2/2] change(newlib): enable LIBC_OPTIMIZED_MISALIGNED_ACCESS by default --- components/newlib/Kconfig | 2 +- docs/en/api-guides/performance/ram-usage.rst | 1 + docs/en/api-guides/performance/speed.rst | 1 - .../release-6.x/6.0/toolchain.rst | 61 +++++++++++++++++++ docs/zh_CN/api-guides/performance/speed.rst | 1 - 5 files changed, 63 insertions(+), 3 deletions(-) diff --git a/components/newlib/Kconfig b/components/newlib/Kconfig index 92d9afc412..2e849600bb 100644 --- a/components/newlib/Kconfig +++ b/components/newlib/Kconfig @@ -145,7 +145,7 @@ menu "LibC" config LIBC_OPTIMIZED_MISALIGNED_ACCESS bool "Use performance-optimized memXXX/strXXX functions on misaligned memory access" - default n + default y depends on ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY help Enables performance-optimized implementations of memory and string functions diff --git a/docs/en/api-guides/performance/ram-usage.rst b/docs/en/api-guides/performance/ram-usage.rst index 413296eb30..a72da893fa 100644 --- a/docs/en/api-guides/performance/ram-usage.rst +++ b/docs/en/api-guides/performance/ram-usage.rst @@ -194,6 +194,7 @@ The following options will reduce IRAM usage of some ESP-IDF features: :SOC_GPSPI_SUPPORTED: - Enable :ref:`CONFIG_HEAP_PLACE_FUNCTION_INTO_FLASH`. Provided that :ref:`CONFIG_SPI_MASTER_ISR_IN_IRAM` is not enabled and the heap functions are not incorrectly used from ISRs, this option is safe to enable in all configurations. :esp32c2: - Enable :ref:`CONFIG_BT_RELEASE_IRAM`. Release BT text section and merge BT data, bss & text into a large free heap region when ``esp_bt_mem_release`` is called. This makes Bluetooth unavailable until the next restart, but saving ~22 KB or more of IRAM. - Disable :ref:`CONFIG_LIBC_LOCKS_PLACE_IN_IRAM` if no ISRs that run while cache is disabled (i.e. IRAM ISRs) use libc lock APIs. + :CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - Disable :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS` to save approximately 1000 bytes of IRAM, at the cost of reduced performance. .. only:: esp32 diff --git a/docs/en/api-guides/performance/speed.rst b/docs/en/api-guides/performance/speed.rst index 7eede72b63..58e191a200 100644 --- a/docs/en/api-guides/performance/speed.rst +++ b/docs/en/api-guides/performance/speed.rst @@ -87,7 +87,6 @@ The following optimizations improve the execution of nearly all code, including :SOC_CPU_HAS_FPU: - Avoid using floating point arithmetic ``float``. Even though {IDF_TARGET_NAME} has a single precision hardware floating point unit, floating point calculations are always slower than integer calculations. If possible then use fixed point representations, a different method of integer representation, or convert part of the calculation to be integer only before switching to floating point. :not SOC_CPU_HAS_FPU: - Avoid using floating point arithmetic ``float``. On {IDF_TARGET_NAME} these calculations are emulated in software and are very slow. If possible, use fixed point representations, a different method of integer representation, or convert part of the calculation to be integer only before switching to floating point. - Avoid using double precision floating point arithmetic ``double``. These calculations are emulated in software and are very slow. If possible then use an integer-based representation, or single-precision floating point. - :CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - Avoid misaligned 4-byte memory accesses in performance-critical code sections. For potential performance improvements, consider enabling :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`, which requires approximately 190 bytes of IRAM and 870 bytes of flash memory. Note that properly aligned memory operations will always execute at full speed without performance penalties. .. only:: esp32s2 or esp32s3 or esp32p4 diff --git a/docs/en/migration-guides/release-6.x/6.0/toolchain.rst b/docs/en/migration-guides/release-6.x/6.0/toolchain.rst index 9fcf794b3c..65fd8f781a 100644 --- a/docs/en/migration-guides/release-6.x/6.0/toolchain.rst +++ b/docs/en/migration-guides/release-6.x/6.0/toolchain.rst @@ -107,3 +107,64 @@ The header ```` is no longer available in Picolibc. To ensure comp #include /* fatal error: sys/signal.h: No such file or directory */ #include /* Ok: standard and portable */ + +.. only:: CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY + + RISC-V Chips and Misaligned Memory Access in LibC Functions + ----------------------------------------------------------- + + Espressif RISC-V chips can perform misaligned memory accesses with only a small + performance penalty compared to aligned accesses. + + Previously, LibC functions that operate on memory (such as copy or comparison + functions) were implemented using byte-by-byte operations when a non-word-aligned + pointer was passed. Now, these functions use word (4-byte) load/store operations + whenever possible, resulting in a significant performance increase. These optimized + implementations are enabled by default via :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`, + which reduces the application’s memory budget (IRAM) by approximately 800–1000 bytes. + + The table below shows benchmark results on the ESP32-C3 chip using 4096-byte buffers: + + .. list-table:: Benchmark Results + :header-rows: 1 + :widths: 20 20 20 20 + + * - Function + - Old (CPU cycles) + - Optimized (CPU cycles) + - Improvement (%) + * - memcpy + - 32873 + - 4200 + - 87.2 + * - memcmp + - 57436 + - 14722 + - 74.4 + * - memmove + - 49336 + - 9237 + - 81.3 + * - strcpy + - 28678 + - 16659 + - 41.9 + * - strcmp + - 36867 + - 11146 + - 69.8 + + .. note:: + The results above apply to misaligned memory operations. + Performance for aligned memory operations remains unchanged. + + Functions with Improved Performance + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + - ``memcpy`` + - ``memcmp`` + - ``memmove`` + - ``strcpy`` + - ``strncpy`` + - ``strcmp`` + - ``strncmp`` diff --git a/docs/zh_CN/api-guides/performance/speed.rst b/docs/zh_CN/api-guides/performance/speed.rst index 61ae1e0511..f4b5343f50 100644 --- a/docs/zh_CN/api-guides/performance/speed.rst +++ b/docs/zh_CN/api-guides/performance/speed.rst @@ -87,7 +87,6 @@ :SOC_CPU_HAS_FPU: - 避免使用浮点运算 ``float``。尽管 {IDF_TARGET_NAME} 具备单精度浮点运算器,但是浮点运算总是慢于整数运算。因此可以考虑使用不同的整数表示方法进行运算,如定点表示法,或者将部分计算用整数运算后再切换为浮点运算。 :not SOC_CPU_HAS_FPU: - 避免使用浮点运算 ``float``。{IDF_TARGET_NAME} 通过软件模拟进行浮点运算,因此速度非常慢。可以考虑使用不同的整数表示方法进行运算,如定点表示法,或者将部分计算用整数运算后再切换为浮点运算。 - 避免使用双精度浮点运算 ``double``。{IDF_TARGET_NAME} 通过软件模拟进行双精度浮点运算,因此速度非常慢。可以考虑使用基于整数的表示方法或单精度浮点数。 - :CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - 在性能要求较高的代码段中,应避免执行未对齐的 4 字节内存访问。为提升性能,可以考虑启用 :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`。启用此选项将额外占用约 190 字节的 IRAM 和 870 字节的 flash 存储。请注意,正确对齐的内存操作始终能够以全速执行,且不会产生性能损耗。 .. only:: esp32s2 or esp32s3 or esp32p4