Merge branch 'feature/enable_misalign_optimization_for_iperf' into 'master'

feat(newlib): enable libc misalign optimized functions

Closes IDF-13820

See merge request espressif/esp-idf!41148
This commit is contained in:
Alexey Lapshin
2025-08-22 22:14:53 +04:00
31 changed files with 419 additions and 167 deletions

View File

@@ -149,6 +149,10 @@ if(ESP_TEE_BUILD)
if(CONFIG_ESP_ROM_HAS_NEWLIB_NANO_FORMAT)
rom_linker_script("newlib-nano")
endif()
rom_linker_script("libc")
if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY) # TODO IDF-13852: use optimized memcpy for TEE ?
rom_linker_script("libc-suboptimal_for_misaligned_mem")
endif()
endif()
if(BOOTLOADER_BUILD)
@@ -173,8 +177,8 @@ if(BOOTLOADER_BUILD)
else()
rom_linker_script("libc")
endif()
if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY
AND NOT CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
if(CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY)
# For bootloader use only ROM functions due to 'relocation truncated to fit' error
rom_linker_script("libc-suboptimal_for_misaligned_mem")
endif()
if(CONFIG_LIBC_NEWLIB)

View File

@@ -38,23 +38,21 @@ if(CONFIG_STDATOMIC_S32C1I_SPIRAM_WORKAROUND)
endif()
if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
# TODO IDF-13089: remove this block and modify src/newlib.lf when clang was upgraded
list(APPEND srcs
"src/string/memcmp.c"
"src/string/memmove.c"
"src/string/strncmp.c"
"src/string/strncpy.c"
"src/port/riscv/memcpy.c"
"src/port/riscv/memmove.c"
"src/string/memcmp.c"
"src/port/riscv/strcpy.c"
"src/string/strncpy.c"
"src/string/strncmp.c"
"src/port/riscv/strcmp.S")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcmp_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcpy_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memmove_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncmp_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncpy_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_memcmp_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcpy_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncpy_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strncmp_impl")
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_strcmp_impl")
endif()
endif()
if(CONFIG_LIBC_NEWLIB)
@@ -96,6 +94,17 @@ if(CONFIG_STDATOMIC_S32C1I_SPIRAM_WORKAROUND)
PROPERTIES COMPILE_FLAGS "-mno-disable-hardware-atomics")
endif()
if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
# TODO GCC-419 and IDF-13089: cleanup files
set_source_files_properties("src/string/memcmp.c"
"src/string/strncmp.c"
"src/string/strncpy.c"
"src/port/riscv/memcpy.c"
"src/port/riscv/memmove.c"
"src/port/riscv/strcpy.c"
PROPERTIES COMPILE_FLAGS -O2)
endif()
# Forces the linker to include heap, syscall, pthread, assert, and retargetable locks from this component,
# instead of the implementations provided by newlib.
list(APPEND EXTRA_LINK_FLAGS "-u esp_libc_include_heap_impl")

View File

@@ -145,13 +145,13 @@ menu "LibC"
config LIBC_OPTIMIZED_MISALIGNED_ACCESS
bool "Use performance-optimized memXXX/strXXX functions on misaligned memory access"
default n
default y
depends on ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY
help
Enables performance-optimized implementations of memory and string functions
when handling misaligned memory.
This increases the image size by ~1000 bytes.
Require approximately 8001000 bytes of IRAM.
Optimized functions include:
- memcpy

View File

@@ -22,7 +22,9 @@
* to avoid small performance penalties (if they are not zero). */
#define UNALIGNED_X(X) ((long)X & (sizeof (long) - 1))
#ifndef _HAVE_HW_MISALIGNED_ACCESS
#define _HAVE_HW_MISALIGNED_ACCESS (__riscv_misaligned_fast || __riscv_misaligned_slow)
#endif
#if _HAVE_HW_MISALIGNED_ACCESS
/* Hardware performs unaligned operations with little

View File

@@ -11,8 +11,6 @@ archive:
else:
libc.a
entries:
if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y:
libc_a-memcpy (noflash)
if SPIRAM_CACHE_WORKAROUND = y:
# The following libs are either used in a lot of places or in critical
# code. (such as panic or abort)

View File

@@ -1,6 +1,14 @@
[mapping:newlib]
archive: libnewlib.a
entries:
if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y:
memcpy (noflash)
memmove (noflash)
memcmp (noflash)
strcpy (noflash)
strncpy (noflash)
strcmp (noflash)
strncmp (noflash)
if LIBC_MISC_IN_IRAM = y:
if HEAP_PLACE_FUNCTION_INTO_FLASH = n:
heap (noflash)
@@ -11,16 +19,3 @@ entries:
stdatomic_s32c1i (noflash)
if STDATOMIC_S32C1I_SPIRAM_WORKAROUND = y:
stdatomic_s32c1i (noflash)
if CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TOOLCHAIN_GCC = y:
# memcpy() should be in IRAM due to early system use
# Others can stay in flash (performance difference is minimal)
#
# Performance Comparison:
# | func | flash | iram |
# |---------|-------|-------|
# | memcmp | 15986 | 15170 |
# | strcpy | 17499 | 16660 |
# | strcmp | 13125 | 11147 |
# | strncpy | 17386 | 16668 |
# | strncmp | 22161 | 21782 |
libc_a-memcpy (noflash)

View File

@@ -8,10 +8,6 @@
archive:
libc.a
entries:
if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TARGET_ARCH_RISCV = y:
memcpy-asm (noflash)
if LIBC_OPTIMIZED_MISALIGNED_ACCESS = y && IDF_TARGET_ARCH_XTENSA = y:
libc_machine_xtensa_memcpy (noflash)
if SPIRAM_CACHE_WORKAROUND = y:
# The following libs are either used in a lot of places or in critical code. (such as panic or abort)
# Thus, they shall always be placed in IRAM.

View File

@@ -18,14 +18,11 @@
#include <string.h>
#include <stdint.h>
#include "esp_attr.h"
#include "../../string/local.h"
#include "string/local.h"
#define unlikely(X) __builtin_expect (!!(X), 0)
IRAM_ATTR
void *
__attribute__((optimize("-Os")))
__inhibit_loop_to_libcall
memcpy(void *__restrict aa, const void *__restrict bb, size_t n)
{
@@ -65,29 +62,80 @@ small:
if (unlikely(lend - la > 8)) {
while (lend - la > 8) {
long b0 = *lb++;
long b1 = *lb++;
long b2 = *lb++;
long b3 = *lb++;
long b4 = *lb++;
long b5 = *lb++;
long b6 = *lb++;
long b7 = *lb++;
long b8 = *lb++;
*la++ = b0;
*la++ = b1;
*la++ = b2;
*la++ = b3;
*la++ = b4;
*la++ = b5;
*la++ = b6;
*la++ = b7;
*la++ = b8;
/*
* long b0 = *lb++;
* long b1 = *lb++;
* long b2 = *lb++;
* long b3 = *lb++;
* long b4 = *lb++;
* long b5 = *lb++;
* long b6 = *lb++;
* long b7 = *lb++;
* long b8 = *lb++;
* *la++ = b0;
* *la++ = b1;
* *la++ = b2;
* *la++ = b3;
* *la++ = b4;
* *la++ = b5;
* *la++ = b6;
* *la++ = b7;
* *la++ = b8;
*/
long src0, src1, src2, src3;
long src4, src5, src6, src7;
long src8;
/* DIG-694: need at least 2 instructions between lw and sw */
asm volatile("lw %0, 0(%10)\n" // scr0 = lb[0];
"lw %1, 4(%10)\n" // scr1 = lb[1];
"lw %2, 8(%10)\n" // scr2 = lb[2];
"lw %3, 12(%10)\n" // scr3 = lb[3];
"lw %4, 16(%10)\n" // scr4 = lb[4];
"lw %5, 20(%10)\n" // scr5 = lb[5];
"lw %6, 24(%10)\n" // scr6 = lb[6];
"lw %7, 28(%10)\n" // scr7 = lb[7];
"lw %8, 32(%10)\n" // scr8 = lb[8];
"addi %9, %9, 36\n" // la += 8 * 9;
"addi %10, %10, 36\n" // lb += 8 * 9;
"sw %0, -36(%9)\n" // *(la - 9) = src;
"sw %1, -32(%9)\n" // *(la - 8) = src;
"sw %2, -28(%9)\n" // *(la - 7) = src;
"sw %3, -24(%9)\n" // *(la - 6) = src;
"sw %4, -20(%9)\n" // *(la - 5) = src;
"sw %5, -16(%9)\n" // *(la - 4) = src;
"sw %6, -12(%9)\n" // *(la - 3) = src;
"sw %7, -8(%9)\n" // *(la - 2) = src;
"sw %8, -4(%9)\n" // *(la - 1) = src;
: "=r"(src0), "=r"(src1), "=r"(src2), "=r"(src3),
"=r"(src4), "=r"(src5), "=r"(src6), "=r"(src7),
"=r"(src8),
"+r"(la), "+r"(lb)
:: "memory");
}
}
while (la < lend) {
BODY(la, lb, long);
/*
* BODY(la, lb, long);
*/
long src0;
#ifdef __OPTIMIZE_SIZE__
#error "Enabled Os optimization may not work properly for DIG-694"
/*
* Replacing the string:
* *la++ = tt;
* To:
* "addi %2, %4, 4\n" // la++;
* "sw %0, -4(%4)\n" // *(la-1) = src0;
* May break some optimizations and slightly reduce performance.
*/
#endif
/* DIG-694: need at least 2 instructions between lw and sw */
asm volatile("lw %0, 0(%1)\n" // long src0 = *lb;
"addi %1, %1, 4\n" // lb++;
"addi %2, %2, 4\n" // la++;
"sw %0, -4(%2)\n" // *(la-1) = src0;
: "=r"(src0), "+r"(lb), "+r"(la)
:: "memory");
}
a = (char *)la;

View File

@@ -0,0 +1,150 @@
/*
* SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
*
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <string.h>
#include <_ansi.h>
#include <stddef.h>
#include <limits.h>
#include "string/local.h"
void *
__inhibit_loop_to_libcall
memmove(void *dst_void,
const void *src_void,
size_t length)
{
char *dst = dst_void;
const char *src = src_void;
long *aligned_dst;
const long *aligned_src;
if (src < dst && dst < src + length) {
/* Destructive overlap...have to copy backwards */
src += length;
dst += length;
if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
aligned_dst = (long*)dst;
aligned_src = (long*)src;
/* Copy one long word at a time if possible. */
while (!TOO_SMALL_LITTLE_BLOCK(length)) {
/*
* const long src0 = *--aligned_src;
* *--aligned_dst = src0;
* length -= LITTLE_BLOCK_SIZE;
*/
long src0;
/* DIG-694: need at least 2 instructions between lw and sw */
asm volatile("lw %0, -4(%1)\n" // src0 = *(aligned_src - 1);
"addi %1, %1, -4\n" // aligned_src--;
"addi %2, %2, -4\n" // aligned_dst--;
"addi %3, %3, -4\n" // length -= LITTLE_BLOCK_SIZE;
"sw %0, 0(%2)\n" // aligned_dst = src0;
: "=r"(src0), "+r"(aligned_src), "+r"(aligned_dst), "+r"(length)
:: "memory");
}
/* Pick up any residual with a byte copier. */
dst = (char*)aligned_dst;
src = (char*)aligned_src;
}
while (length--) {
*--dst = *--src;
}
} else {
/* Use optimizing algorithm for a non-destructive copy to closely
match memcpy. If the size is small or either SRC or DST is unaligned,
then punt into the byte copy loop. This should be rare. */
if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
aligned_dst = (long*)dst;
aligned_src = (long*)src;
/* Copy 8X long words at a time if possible. */
while (length >= BIG_BLOCK_SIZE * 2) {
/*
* const long src0 = *aligned_src++;
* const long src1 = *aligned_src++;
* const long src2 = *aligned_src++;
* const long src3 = *aligned_src++;
* const long src4 = *aligned_src++;
* const long src5 = *aligned_src++;
* const long src6 = *aligned_src++;
* const long src7 = *aligned_src++;
* *aligned_dst++ = src0;
* *aligned_dst++ = src1;
* *aligned_dst++ = src2;
* *aligned_dst++ = src3;
* *aligned_dst++ = src4;
* *aligned_dst++ = src5;
* *aligned_dst++ = src6;
* *aligned_dst++ = src7;
*/
long src0, src1, src2, src3;
long src4, src5, src6, src7;
/* DIG-694: need at least 2 instructions between lw and sw */
asm volatile("lw %0, 0(%8)\n" // src0 = aligned_src[0];
"lw %1, 4(%8)\n" // src1 = aligned_src[1];
"lw %2, 8(%8)\n" // src2 = aligned_src[2];
"lw %3, 12(%8)\n" // src3 = aligned_src[3];
"lw %4, 16(%8)\n" // src4 = aligned_src[4];
"lw %5, 20(%8)\n" // src5 = aligned_src[5];
"lw %6, 24(%8)\n" // src6 = aligned_src[6];
"lw %7, 28(%8)\n" // src7 = aligned_src[7];
"addi %8, %8, 32\n" // aligned_src += BIG_BLOCK_SIZE * 2;
"addi %9, %9, 32\n" // aligned_dst += BIG_BLOCK_SIZE * 2;
"addi %10, %10, -32\n" // length -= BIG_BLOCK_SIZE * 2;
"sw %0, -32(%9)\n" // *(aligned_dst - 8) = src0;
"sw %1, -28(%9)\n" // *(aligned_dst - 7) = src1;
"sw %2, -24(%9)\n" // *(aligned_dst - 6) = src2;
"sw %3, -20(%9)\n" // *(aligned_dst - 5) = src3;
"sw %4, -16(%9)\n" // *(aligned_dst - 4) = src4;
"sw %5, -12(%9)\n" // *(aligned_dst - 3) = src5;
"sw %6, -8(%9)\n" // *(aligned_dst - 2) = src6;
"sw %7, -4(%9)\n" // *(aligned_dst - 1) = src7;
: "=r"(src0), "=r"(src1), "=r"(src2), "=r"(src3),
"=r"(src4), "=r"(src5), "=r"(src6), "=r"(src7),
"+r"(aligned_src), "+r"(aligned_dst), "+r"(length)
:: "memory");
}
/* Copy one long word at a time if possible. */
while (!TOO_SMALL_LITTLE_BLOCK(length)) {
/*
* const long src0 = *aligned_src++;
* *aligned_dst++ = src0;
* length -= LITTLE_BLOCK_SIZE;
*/
long src0;
/* DIG-694: need at least 2 instructions between lw and sw */
asm volatile("lw %0, 0(%4)\n" // long src0 = *aligned_src;
"addi %1, %4, 4\n" // aligned_src++;
"addi %2, %5, 4\n" // aligned_dst++;
"addi %3, %6, -4\n" // length -= LITTLE_BLOCK_SIZE;
"sw %0, -4(%5)\n" // *(aligned_dst-1) = src0;
: "=r"(src0), "=r"(aligned_src), "=r"(aligned_dst), "=r"(length)
: "r"(aligned_src), "r"(aligned_dst), "r"(length));
}
/* Pick up any residual with a byte copier. */
dst = (char*)aligned_dst;
src = (char*)aligned_src;
}
while (length--) {
*dst++ = *src++;
}
}
return dst_void;
}
// Hook to force the linker to include this file
void esp_libc_include_memmove_impl(void)
{
}

View File

@@ -30,7 +30,6 @@ unsigned long __newlib__libc_detect_null(unsigned long w)
return ~(((w & mask) + mask) | w | mask);
}
__attribute__((optimize("-Os")))
char *strcpy(char *dst, const char *src)
{
char *dst0 = dst;
@@ -44,6 +43,7 @@ char *strcpy(char *dst, const char *src)
const long *lsrc = (const long *)src;
while (!__newlib__libc_detect_null(*lsrc)) {
/* DIG-694: there are enough instructions between lw and sw after compiler unrolls the loop */
*ldst++ = *lsrc++;
}

View File

@@ -6,9 +6,8 @@
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <string.h>
#include "local.h"
#include "string/local.h"
__attribute__((optimize("-Os")))
int
memcmp(const void *m1,
const void *m2,

View File

@@ -1,88 +0,0 @@
/*
* SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
*
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <string.h>
#include <_ansi.h>
#include <stddef.h>
#include <limits.h>
#include "local.h"
__attribute__((optimize("-Os")))
void *
__inhibit_loop_to_libcall
memmove(void *dst_void,
const void *src_void,
size_t length)
{
char *dst = dst_void;
const char *src = src_void;
long *aligned_dst;
const long *aligned_src;
if (src < dst && dst < src + length) {
/* Destructive overlap...have to copy backwards */
src += length;
dst += length;
if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
aligned_dst = (long*)dst;
aligned_src = (long*)src;
/* Copy one long word at a time if possible. */
while (!TOO_SMALL_LITTLE_BLOCK(length)) {
*--aligned_dst = *--aligned_src;
length -= LITTLE_BLOCK_SIZE;
}
/* Pick up any residual with a byte copier. */
dst = (char*)aligned_dst;
src = (char*)aligned_src;
}
while (length--) {
*--dst = *--src;
}
} else {
/* Use optimizing algorithm for a non-destructive copy to closely
match memcpy. If the size is small or either SRC or DST is unaligned,
then punt into the byte copy loop. This should be rare. */
if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(src, dst)) {
aligned_dst = (long*)dst;
aligned_src = (long*)src;
/* Copy 4X long words at a time if possible. */
while (!TOO_SMALL_BIG_BLOCK(length)) {
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
length -= BIG_BLOCK_SIZE;
}
/* Copy one long word at a time if possible. */
while (!TOO_SMALL_LITTLE_BLOCK(length)) {
*aligned_dst++ = *aligned_src++;
length -= LITTLE_BLOCK_SIZE;
}
/* Pick up any residual with a byte copier. */
dst = (char*)aligned_dst;
src = (char*)aligned_src;
}
while (length--) {
*dst++ = *src++;
}
}
return dst_void;
}
// Hook to force the linker to include this file
void esp_libc_include_memmove_impl(void)
{
}

View File

@@ -7,9 +7,8 @@
*/
#include <string.h>
#include <limits.h>
#include "local.h"
#include "string/local.h"
__attribute__((optimize("-Os")))
int
strncmp(const char *s1,
const char *s2,

View File

@@ -7,9 +7,8 @@
*/
#include <string.h>
#include <limits.h>
#include "local.h"
#include "string/local.h"
__attribute__((optimize("-Os")))
char *
strncpy(char *__restrict dst0,
const char *__restrict src0,
@@ -29,6 +28,7 @@ strncpy(char *__restrict dst0,
sized copies. */
while (!TOO_SMALL_LITTLE_BLOCK(count) && !DETECT_NULL(*aligned_src)) {
count -= sizeof(long int);
/* DIG-694: there are enough instructions between lw and sw after compiler unrolls the loop */
*aligned_dst++ = *aligned_src++;
}

View File

@@ -18,14 +18,9 @@ if(CONFIG_LIBC_NEWLIB)
endif()
if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
list(APPEND srcs "test_misaligned_mem_performance.c")
list(APPEND srcs "test_misaligned_access.c")
endif()
idf_component_register(SRCS "${srcs}"
PRIV_REQUIRES unity vfs cmock driver esp_timer spi_flash test_utils pthread esp_psram
WHOLE_ARCHIVE)
if(CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS)
set_source_files_properties(test_misaligned_mem_performance.c
PROPERTIES COMPILE_FLAGS "-Wno-incompatible-pointer-types -Wno-strict-prototypes")
endif()

View File

@@ -6,11 +6,16 @@
#include <stdint.h>
#include <string.h>
#include "esp_heap_caps.h"
#include "soc/soc.h"
#include "hal/cpu_ll.h"
#include "unity.h"
#define MAX_MEMTEST_SIZE 4096
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wincompatible-pointer-types"
#pragma GCC diagnostic ignored "-Wstrict-prototypes"
uint32_t test_function_dest_src_size(void (*foo)(...), bool pass_size)
{
uint32_t ccount1, ccount2;
@@ -106,3 +111,52 @@ TEST_CASE("strncmp", "[misaligned_mem]")
/* esp32c2: 24369 cycles instead 49141. */
TEST_ASSERT_LESS_THAN(30000, ccount);
}
#pragma GCC diagnostic pop // "-Wincompatible-pointer-types" "-Wstrict-prototypes"
static bool fn_in_ram(void *fn)
{
const int fnaddr = (int)fn;
return (fnaddr >= SOC_IRAM_LOW && fnaddr < SOC_IRAM_HIGH);
}
TEST_CASE("mem functions in IRAM", "[misaligned_mem]")
{
TEST_ASSERT_TRUE(fn_in_ram(memcpy));
TEST_ASSERT_TRUE(fn_in_ram(memcmp));
TEST_ASSERT_TRUE(fn_in_ram(memmove));
TEST_ASSERT_TRUE(fn_in_ram(strcpy));
TEST_ASSERT_TRUE(fn_in_ram(strncpy));
TEST_ASSERT_TRUE(fn_in_ram(strcmp));
TEST_ASSERT_TRUE(fn_in_ram(strncmp));
}
#if CONFIG_ESP_SYSTEM_MEMPROT_PMP
TEST_CASE("access across different PMP regions", "[misaligned_mem]")
{
/*
* PMP configurations for load and store addresses may
* have different permissions (e.g., "R" vs. "RW").
*
* Due to the timing alignment of internal signals, the address
* permission check may be incorrectly applied during the second
* part of a misaligned access transaction.
*
* As a workaround, insert two instructions (e.g. ADDI/NOP) between
* accessing to different memory regions. This spacing avoids the
* false permission check caused by signal timing overlap.
*
* This test may help identify the root issue in affected chips.
*/
const void *src = (void *) SOC_DROM_MASK_LOW;
asm volatile("addi sp, sp, -16\n"
"lw t0, 2(%0)\n"
#if CONFIG_SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
"nop\n"
"nop\n"
#endif
"sw t0, 3(sp)\n"
"addi sp, sp, 16"
:: "r"(src) : "memory");
}
#endif

View File

@@ -1 +1,2 @@
CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS=y
CONFIG_ESP_SYSTEM_MEMPROT_PMP=y

View File

@@ -263,6 +263,10 @@ config SOC_CPU_IDRAM_SPLIT_USING_PMP
bool
default y
config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
bool
default y
config SOC_ECC_SUPPORT_POINT_VERIFY_QUIRK
bool
default y

View File

@@ -112,6 +112,8 @@
#define SOC_CPU_WATCHPOINT_MAX_REGION_SIZE 0x80000000 // bytes
#define SOC_CPU_IDRAM_SPLIT_USING_PMP 1
// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1
/*-------------------------- ECC CAPS --------------------------*/
#define SOC_ECC_SUPPORT_POINT_VERIFY_QUIRK 1 // C2 ECC peripheral has a bug in ECC point verification, if value of K is zero the verification fails

View File

@@ -351,6 +351,10 @@ config SOC_CPU_WATCHPOINT_MAX_REGION_SIZE
hex
default 0x80000000
config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
bool
default y
config SOC_DS_SIGNATURE_MAX_BIT_LEN
int
default 3072

View File

@@ -143,6 +143,8 @@
#define SOC_CPU_BREAKPOINTS_NUM 8
#define SOC_CPU_WATCHPOINTS_NUM 8
#define SOC_CPU_WATCHPOINT_MAX_REGION_SIZE 0x80000000 // bytes
// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1
/*-------------------------- DIGITAL SIGNATURE CAPS ----------------------------------------*/
/** The maximum length of a Digital Signature in bits. */

View File

@@ -427,6 +427,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY
int
default 4
config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
bool
default y
config SOC_DS_SIGNATURE_MAX_BIT_LEN
int
default 3072

View File

@@ -163,6 +163,8 @@
#define SOC_CPU_HAS_PMA 1
#define SOC_CPU_IDRAM_SPLIT_USING_PMP 1
#define SOC_CPU_PMP_REGION_GRANULARITY 4
// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1
// TODO: IDF-5360 (Copy from esp32c3, need check)
/*-------------------------- DIGITAL SIGNATURE CAPS ----------------------------------------*/

View File

@@ -427,6 +427,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY
int
default 4
config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
bool
default y
config SOC_MMU_PAGE_SIZE_CONFIGURABLE
bool
default y

View File

@@ -180,6 +180,8 @@
#define SOC_CPU_HAS_PMA 1
#define SOC_CPU_IDRAM_SPLIT_USING_PMP 1
#define SOC_CPU_PMP_REGION_GRANULARITY 4
// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1
/*-------------------------- MMU CAPS ----------------------------------------*/
#define SOC_MMU_PAGE_SIZE_CONFIGURABLE (1)

View File

@@ -283,6 +283,10 @@ config SOC_CPU_PMP_REGION_GRANULARITY
int
default 4
config SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE
bool
default y
config SOC_MMU_PAGE_SIZE_CONFIGURABLE
bool
default y

View File

@@ -165,6 +165,8 @@
#define SOC_CPU_HAS_PMA 1
#define SOC_CPU_IDRAM_SPLIT_USING_PMP 1
#define SOC_CPU_PMP_REGION_GRANULARITY 4
// DIG-694: misaligned access across PMP regions must be spaced at least by two instructions
#define SOC_CPU_MISALIGNED_ACCESS_ON_PMP_MISMATCH_ISSUE 1
/*-------------------------- MMU CAPS ----------------------------------------*/
#define SOC_MMU_PAGE_SIZE_CONFIGURABLE (1)

View File

@@ -194,6 +194,7 @@ The following options will reduce IRAM usage of some ESP-IDF features:
:SOC_GPSPI_SUPPORTED: - Enable :ref:`CONFIG_HEAP_PLACE_FUNCTION_INTO_FLASH`. Provided that :ref:`CONFIG_SPI_MASTER_ISR_IN_IRAM` is not enabled and the heap functions are not incorrectly used from ISRs, this option is safe to enable in all configurations.
:esp32c2: - Enable :ref:`CONFIG_BT_RELEASE_IRAM`. Release BT text section and merge BT data, bss & text into a large free heap region when ``esp_bt_mem_release`` is called. This makes Bluetooth unavailable until the next restart, but saving ~22 KB or more of IRAM.
- Disable :ref:`CONFIG_LIBC_LOCKS_PLACE_IN_IRAM` if no ISRs that run while cache is disabled (i.e. IRAM ISRs) use libc lock APIs.
:CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - Disable :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS` to save approximately 1000 bytes of IRAM, at the cost of reduced performance.
.. only:: esp32

View File

@@ -87,7 +87,6 @@ The following optimizations improve the execution of nearly all code, including
:SOC_CPU_HAS_FPU: - Avoid using floating point arithmetic ``float``. Even though {IDF_TARGET_NAME} has a single precision hardware floating point unit, floating point calculations are always slower than integer calculations. If possible then use fixed point representations, a different method of integer representation, or convert part of the calculation to be integer only before switching to floating point.
:not SOC_CPU_HAS_FPU: - Avoid using floating point arithmetic ``float``. On {IDF_TARGET_NAME} these calculations are emulated in software and are very slow. If possible, use fixed point representations, a different method of integer representation, or convert part of the calculation to be integer only before switching to floating point.
- Avoid using double precision floating point arithmetic ``double``. These calculations are emulated in software and are very slow. If possible then use an integer-based representation, or single-precision floating point.
:CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - Avoid misaligned 4-byte memory accesses in performance-critical code sections. For potential performance improvements, consider enabling :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`, which requires approximately 190 bytes of IRAM and 870 bytes of flash memory. Note that properly aligned memory operations will always execute at full speed without performance penalties.
.. only:: esp32s2 or esp32s3 or esp32p4

View File

@@ -107,3 +107,64 @@ The header ``<sys/signal.h>`` is no longer available in Picolibc. To ensure comp
#include <sys/signal.h> /* fatal error: sys/signal.h: No such file or directory */
#include <signal.h> /* Ok: standard and portable */
.. only:: CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY
RISC-V Chips and Misaligned Memory Access in LibC Functions
-----------------------------------------------------------
Espressif RISC-V chips can perform misaligned memory accesses with only a small
performance penalty compared to aligned accesses.
Previously, LibC functions that operate on memory (such as copy or comparison
functions) were implemented using byte-by-byte operations when a non-word-aligned
pointer was passed. Now, these functions use word (4-byte) load/store operations
whenever possible, resulting in a significant performance increase. These optimized
implementations are enabled by default via :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`,
which reduces the applications memory budget (IRAM) by approximately 8001000 bytes.
The table below shows benchmark results on the ESP32-C3 chip using 4096-byte buffers:
.. list-table:: Benchmark Results
:header-rows: 1
:widths: 20 20 20 20
* - Function
- Old (CPU cycles)
- Optimized (CPU cycles)
- Improvement (%)
* - memcpy
- 32873
- 4200
- 87.2
* - memcmp
- 57436
- 14722
- 74.4
* - memmove
- 49336
- 9237
- 81.3
* - strcpy
- 28678
- 16659
- 41.9
* - strcmp
- 36867
- 11146
- 69.8
.. note::
The results above apply to misaligned memory operations.
Performance for aligned memory operations remains unchanged.
Functions with Improved Performance
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- ``memcpy``
- ``memcmp``
- ``memmove``
- ``strcpy``
- ``strncpy``
- ``strcmp``
- ``strncmp``

View File

@@ -87,7 +87,6 @@
:SOC_CPU_HAS_FPU: - 避免使用浮点运算 ``float``。尽管 {IDF_TARGET_NAME} 具备单精度浮点运算器,但是浮点运算总是慢于整数运算。因此可以考虑使用不同的整数表示方法进行运算,如定点表示法,或者将部分计算用整数运算后再切换为浮点运算。
:not SOC_CPU_HAS_FPU: - 避免使用浮点运算 ``float``。{IDF_TARGET_NAME} 通过软件模拟进行浮点运算,因此速度非常慢。可以考虑使用不同的整数表示方法进行运算,如定点表示法,或者将部分计算用整数运算后再切换为浮点运算。
- 避免使用双精度浮点运算 ``double``。{IDF_TARGET_NAME} 通过软件模拟进行双精度浮点运算,因此速度非常慢。可以考虑使用基于整数的表示方法或单精度浮点数。
:CONFIG_ESP_ROM_HAS_SUBOPTIMAL_NEWLIB_ON_MISALIGNED_MEMORY: - 在性能要求较高的代码段中,应避免执行未对齐的 4 字节内存访问。为提升性能,可以考虑启用 :ref:`CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS`。启用此选项将额外占用约 190 字节的 IRAM 和 870 字节的 flash 存储。请注意,正确对齐的内存操作始终能够以全速执行,且不会产生性能损耗。
.. only:: esp32s2 or esp32s3 or esp32p4