Merge branch 'fix/multi_core_race_cond_in_panic_handler_v5.0' into 'release/v5.0'

fix(panic_handler): Prevent race condition in panic handler (v5.0) See merge request espressif/esp-idf!38668
2025-05-07 10:09:57 +08:00
parent 54ec0a99f1 a896bc5b7d
commit ae32f091fc
2 changed files with 35 additions and 32 deletions
--- a/components/esp_system/panic.c
+++ b/components/esp_system/panic.c
@ -191,6 +191,7 @@ void esp_panic_handler_disable_timg_wdts(void)
 /* This function enables the RTC WDT with the given timeout in milliseconds */
 void esp_panic_handler_enable_rtc_wdt(uint32_t timeout_ms)
 {
+    wdt_hal_context_t rtc_wdt_ctx = {.inst = WDT_RWDT, .rwdt_dev = &RTCCNTL}; // Use a local context variable to avoid race conditions when both cores enter the panic handler
    wdt_hal_init(&rtc_wdt_ctx, WDT_RWDT, 0, false);
    uint32_t stage_timeout_ticks = (uint32_t)(timeout_ms * rtc_clk_slow_freq_get_hz() / 1000ULL);
    wdt_hal_write_protect_disable(&rtc_wdt_ctx);
--- a/components/esp_system/port/panic_handler.c
+++ b/components/esp_system/port/panic_handler.c
@ -125,41 +125,23 @@ static void frame_to_panic_info(void *frame, panic_info_t *info, bool pseudo_exc
 static void panic_handler(void *frame, bool pseudo_excause)
 {
    /* If watchdogs are enabled, the panic handler runs the risk of getting aborted pre-emptively because
-     * an overzealous watchdog decides to reset it. On the other hand, if we disable all watchdogs, we run
-     * the risk of somehow halting in the panic handler and not resetting.
+     * an overzealous watchdog decides to reset it. Hence, we feed the WDTs here.
     *
-     * We have to do this before we do anything that might cause issues in the WDT interrupt handlers,
-     * for example stalling the other core on ESP32 may cause the ESP32_ECO3_CACHE_LOCK_FIX
-     * handler to get stuck.
+     * However, we do not feed the WDTs in multi-core mode because we do not have a reliable way to handle
+     * concurrency issues when both cores enter the panic handler at the same time. Hence, we avoid performing
+     * any WDT configurations until one of the cores is put in to a busy_wait() state below. As a side note,
+     * it may so happen that neither of the cores end up in a busy_wait() state and still try to work with the
+     * WDTs simultaneously but chances of that happening are low. (TODO: IDF-12900)
     *
     * We do this before we increment the panic handler entry count to ensure that the WDTs are fed.
     */
+#if CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE
    esp_panic_handler_feed_wdts();
+#endif // CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE

    /* Increment the panic handler entry count */
    esp_panic_handler_increment_entry_count();

-    /* Configuring the RTC WDT as early as possible in the panic handler
-     * is critical for system safety.
-     *
-     * The RTC WDT is relied upon for a complete system reset, as it is the only
-     * watchdog timer capable of resetting both the main system and the RTC subsystem.
-     * In contrast, the Timer Group Watchdog Timers can only reset the main system
-     * but not the RTC module.
-     *
-     * The timeout value for the RTC WDT is set to 10 seconds. The primary reason for
-     * choosing a 10 second timeout is to allow the panic handler to run to completion
-     * which may include core dump collection and apptrace flushing.
-     *
-     * Explanation for why the core dump takes time:
-     * 64KB of core dump data (stacks of about 30 tasks) will produce ~85KB base64 data.
-     * @ 115200 UART speed it will take more than 6 sec to print them out.
-     *
-     * TODO: Make the timeout configurable or more intelligent based on the panic reason and the
-     * config options.
-     */
-    esp_panic_handler_enable_rtc_wdt(10000);
-
    panic_info_t info = { 0 };

    /*
@ -189,6 +171,31 @@ static void panic_handler(void *frame, bool pseudo_excause)
            }
        }
    }
+#endif // !CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE
+
+    /* Configuring the RTC WDT is critical for system safety.
+     *
+     * The RTC WDT is relied upon for a complete system reset, as it is the only
+     * watchdog timer capable of resetting both the main system and the RTC subsystem.
+     * In contrast, the Timer Group Watchdog Timers can only reset the main system
+     * but not the RTC module.
+     *
+     * We have to do this before we do anything that might cause issues in the WDT interrupt handlers,
+     * for example stalling the other core on ESP32 may cause the ESP32_ECO3_CACHE_LOCK_FIX
+     * handler to get stuck.
+     *
+     * The timeout value for the RTC WDT is set to 10 seconds. The primary reason for
+     * choosing a 10 second timeout is to allow the panic handler to run to completion
+     * which may include core dump collection and apptrace flushing.
+     *
+     * Explanation for why the core dump takes time:
+     * 64KB of core dump data (stacks of about 30 tasks) will produce ~85KB base64 data.
+     * @ 115200 UART speed it will take more than 6 sec to print them out.
+     *
+     * TODO: Make the timeout configurable or more intelligent based on the panic reason and the
+     * config options.
+     */
+    esp_panic_handler_enable_rtc_wdt(10000);

    /* Before we stall the other CPU, we need to disable all WDTs except the RTC WDT.
     * This is because the TIMG WDTs cannot reset the RTC subsystem, which stores the CPU stalling
@ -197,6 +204,7 @@ static void panic_handler(void *frame, bool pseudo_excause)
     */
    esp_panic_handler_disable_timg_wdts();

+#if !CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE
    esp_rom_delay_us(1);
    // Stall all other cores
    for (uint32_t i = 0; i < SOC_CPU_CORES_NUM; i++) {
@ -204,12 +212,6 @@ static void panic_handler(void *frame, bool pseudo_excause)
            esp_cpu_stall(i);
        }
    }
-#else
-    /* In single core mode, we don't need to disable the TIMG WDTs,
-     * but we do it anyway to keep the code consistent and to avoid
-     * managing the state of multiple WDTs.
-     */
-    esp_panic_handler_disable_timg_wdts();
 #endif // !CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE

    esp_ipc_isr_stall_abort();