fix(panic_handler): Prevent race condition in panic handler

This commit updates all RTC WDT contexts to be local instead of global to avoid race conditions when both cores enter the panic handler simultaneously.
2025-10-01 01:21:06 +02:00 · 2025-03-21 13:01:48 +01:00
parent 3d83751dcc
commit edf4234da9
2 changed files with 39 additions and 36 deletions
--- a/components/esp_system/panic.c
+++ b/components/esp_system/panic.c
@@ -203,6 +203,7 @@ void esp_panic_handler_disable_timg_wdts(void)
 /* This function enables the RTC WDT with the given timeout in milliseconds */
 void esp_panic_handler_enable_rtc_wdt(uint32_t timeout_ms)
 {
+    wdt_hal_context_t rtc_wdt_ctx = RWDT_HAL_CONTEXT_DEFAULT(); // Use a local context variable to avoid race conditions when both cores enter the panic handler
    wdt_hal_init(&rtc_wdt_ctx, WDT_RWDT, 0, false);
    uint32_t stage_timeout_ticks = (uint32_t)(timeout_ms * rtc_clk_slow_freq_get_hz() / 1000ULL);
    wdt_hal_write_protect_disable(&rtc_wdt_ctx);
--- a/components/esp_system/port/panic_handler.c
+++ b/components/esp_system/port/panic_handler.c
@@ -135,45 +135,23 @@ void busy_wait(void)
 static void panic_handler(void *frame, bool pseudo_excause)
 {
    /* If watchdogs are enabled, the panic handler runs the risk of getting aborted pre-emptively because
-     * an overzealous watchdog decides to reset it. On the other hand, if we disable all watchdogs, we run
-     * the risk of somehow halting in the panic handler and not resetting.
+     * an overzealous watchdog decides to reset it. Hence, we feed the WDTs here.
     *
-     * We have to do this before we do anything that might cause issues in the WDT interrupt handlers,
-     * for example stalling the other core on ESP32 may cause the ESP32_ECO3_CACHE_LOCK_FIX
-     * handler to get stuck.
+     * However, we do not feed the WDTs in multi-core mode because we do not have a reliable way to handle
+     * concurrency issues when both cores enter the panic handler at the same time. Hence, we avoid performing
+     * any WDT configurations until one of the cores is put in to a busy_wait() state below. As a side note,
+     * it may so happen that neither of the cores end up in a busy_wait() state and still try to work with the
+     * WDTs simultaneously but chances of that happening are low. (TODO: IDF-12900)
     *
     * We do this before we increment the panic handler entry count to ensure that the WDTs are fed.
     */
+#if CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE
    esp_panic_handler_feed_wdts();
+#endif // CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE

    /* Increment the panic handler entry count */
    esp_panic_handler_increment_entry_count();

-    /* Configuring the RTC WDT as early as possible in the panic handler
-     * is critical for system safety.
-     *
-     * The RTC WDT is relied upon for a complete system reset, as it is the only
-     * watchdog timer capable of resetting both the main system and the RTC subsystem.
-     * In contrast, the Timer Group Watchdog Timers can only reset the main system
-     * but not the RTC module.
-     *
-     * The timeout value for the RTC WDT is set to 10 seconds. The primary reason for
-     * choosing a 10 second timeout is to allow the panic handler to run to completion
-     * which may include core dump collection and apptrace flushing.
-     *
-     * Explanation for why the core dump takes time:
-     * 64KB of core dump data (stacks of about 30 tasks) will produce ~85KB base64 data.
-     * @ 115200 UART speed it will take more than 6 sec to print them out.
-     *
-     * TODO: Make the timeout configurable or more intelligent based on the panic reason and the
-     * config options.
-     */
-#if CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS
-    esp_panic_handler_enable_rtc_wdt((CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS + 10) * 1000);
-#else
-    esp_panic_handler_enable_rtc_wdt(10000);
-#endif /* CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS */
-
    panic_info_t info = { 0 };

    /*
@@ -219,6 +197,35 @@ static void panic_handler(void *frame, bool pseudo_excause)
        }
 #endif // CONFIG_ESP_SYSTEM_HW_STACK_GUARD
    }
+#endif // !CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE
+
+    /* Configuring the RTC WDT is critical for system safety.
+     *
+     * The RTC WDT is relied upon for a complete system reset, as it is the only
+     * watchdog timer capable of resetting both the main system and the RTC subsystem.
+     * In contrast, the Timer Group Watchdog Timers can only reset the main system
+     * but not the RTC module.
+     *
+     * We have to do this before we do anything that might cause issues in the WDT interrupt handlers,
+     * for example stalling the other core on ESP32 may cause the ESP32_ECO3_CACHE_LOCK_FIX
+     * handler to get stuck.
+     *
+     * The timeout value for the RTC WDT is set to 10 seconds. The primary reason for
+     * choosing a 10 second timeout is to allow the panic handler to run to completion
+     * which may include core dump collection and apptrace flushing.
+     *
+     * Explanation for why the core dump takes time:
+     * 64KB of core dump data (stacks of about 30 tasks) will produce ~85KB base64 data.
+     * @ 115200 UART speed it will take more than 6 sec to print them out.
+     *
+     * TODO: Make the timeout configurable or more intelligent based on the panic reason and the
+     * config options.
+     */
+#if CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS
+    esp_panic_handler_enable_rtc_wdt((CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS + 10) * 1000);
+#else
+    esp_panic_handler_enable_rtc_wdt(10000);
+#endif /* CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS */

    /* Before we stall the other CPU, we need to disable all WDTs except the RTC WDT.
     * This is because the TIMG WDTs cannot reset the RTC subsystem, which stores the CPU stalling
@@ -227,6 +234,7 @@ static void panic_handler(void *frame, bool pseudo_excause)
     */
    esp_panic_handler_disable_timg_wdts();

+#if !CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE
    esp_rom_delay_us(1);
    // Stall all other cores
    for (uint32_t i = 0; i < SOC_CPU_CORES_NUM; i++) {
@@ -234,12 +242,6 @@ static void panic_handler(void *frame, bool pseudo_excause)
            esp_cpu_stall(i);
        }
    }
-#else
-    /* In single core mode, we don't need to disable the TIMG WDTs,
-     * but we do it anyway to keep the code consistent and to avoid
-     * managing the state of multiple WDTs.
-     */
-    esp_panic_handler_disable_timg_wdts();
 #endif // !CONFIG_ESP_SYSTEM_SINGLE_CORE_MODE

    esp_ipc_isr_stall_abort();