From 5146926723276f1e0f88650d73eca084bc977bca Mon Sep 17 00:00:00 2001 From: Will Miles Date: Wed, 20 Aug 2025 10:22:42 -0400 Subject: [PATCH 1/5] Use direct references to RTC RAM on ESP8266 ESP8266 RTC RAM requires 32-bit accesses, but there's no need to jump through a bunch of functions for it. Use references to simplify access and harmonize the implementation with ESP32. --- wled00/util.cpp | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/wled00/util.cpp b/wled00/util.cpp index 84473ce71..47a2aa8ea 100644 --- a/wled00/util.cpp +++ b/wled00/util.cpp @@ -725,11 +725,15 @@ void *realloc_malloc(void *ptr, size_t size) { #define BOOTLOOP_ACTION_RESET 1 // if restore does not work, reset config (rename /cfg.json to /rst.cfg.json) #define BOOTLOOP_ACTION_OTA 2 // swap the boot partition #define BOOTLOOP_ACTION_DUMP 3 // nothing seems to help, dump files to serial and reboot (until hardware reset) + #ifdef ESP8266 #define BOOTLOOP_INTERVAL_TICKS (5 * 160000) // time limit between crashes: ~5 seconds in RTC ticks -#define BOOT_TIME_IDX 0 // index in RTC memory for boot time -#define CRASH_COUNTER_IDX 1 // index in RTC memory for crash counter -#define ACTIONT_TRACKER_IDX 2 // index in RTC memory for boot action +// Place variables in RTC memory via references, since RTC memory is not exposed via the linker in the Non-OS SDK +// Use an offset of 32 as there's some hints that the first 128 bytes of "user" memory are used by the OTA system +// Ref: https://github.com/esp8266/Arduino/blob/78d0d0aceacc1553f45ad8154592b0af22d1eede/cores/esp8266/Esp.cpp#L168 +static volatile uint32_t& bl_last_boottime = *(RTC_USER_MEM + 32); +static volatile uint32_t& bl_crashcounter = *(RTC_USER_MEM + 33); +static volatile uint32_t& bl_actiontracker = *(RTC_USER_MEM + 34); #else #define BOOTLOOP_INTERVAL_TICKS 5000 // time limit between crashes: ~5 seconds in milliseconds // variables in RTC_NOINIT memory persist between reboots (but not on hardware reset) @@ -776,33 +780,24 @@ static bool detectBootLoop() { } #else // ESP8266 rst_info* resetreason = system_get_rst_info(); - uint32_t bl_last_boottime; - uint32_t bl_crashcounter; - uint32_t bl_actiontracker; uint32_t rtctime = system_get_rtc_time(); if (!(resetreason->reason == REASON_EXCEPTION_RST || resetreason->reason == REASON_WDT_RST)) { // no crash detected, init variables bl_crashcounter = 0; - ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t)); - ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t)); + bl_last_boottime = rtctime; if(resetreason->reason != REASON_SOFT_RESTART) { bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler) - ESP.rtcUserMemoryWrite(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t)); } } else { // system has crashed - ESP.rtcUserMemoryRead(BOOT_TIME_IDX, &bl_last_boottime, sizeof(uint32_t)); - ESP.rtcUserMemoryRead(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t)); uint32_t rebootinterval = rtctime - bl_last_boottime; - ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t)); // store current ticks for next reboot + bl_last_boottime = rtctime; if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) { bl_crashcounter++; - ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t)); if (bl_crashcounter >= BOOTLOOP_THRESHOLD) { DEBUG_PRINTLN(F("BOOTLOOP DETECTED")); bl_crashcounter = 0; - ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t)); return true; } } @@ -812,12 +807,9 @@ static bool detectBootLoop() { } void handleBootLoop() { - DEBUG_PRINTLN(F("checking for bootloop")); + DEBUG_PRINTF_P(PSTR("checking for bootloop: time %d, counter %d, action %d\n"), bl_last_boottime, bl_crashcounter, bl_actiontracker); if (!detectBootLoop()) return; // no bootloop detected -#ifdef ESP8266 - uint32_t bl_actiontracker; - ESP.rtcUserMemoryRead(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t)); -#endif + if (bl_actiontracker == BOOTLOOP_ACTION_RESTORE) { restoreConfig(); // note: if this fails, could reset immediately. instead just let things play out and save a few lines of code bl_actiontracker = BOOTLOOP_ACTION_RESET; // reset config if it keeps bootlooping @@ -836,10 +828,8 @@ void handleBootLoop() { #endif else dumpFilesToSerial(); -#ifdef ESP8266 - ESP.rtcUserMemoryWrite(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t)); -#endif - ESP.restart(); // restart cleanly and don't wait for another crash + + ESP.restart(); // restart cleanly and don't wait for another crash } /* From 85d4db83ed3750dd2984cd54ef0de840c7daac81 Mon Sep 17 00:00:00 2001 From: Will Miles Date: Wed, 20 Aug 2025 11:32:32 -0400 Subject: [PATCH 2/5] Isolate platform differences in bootloop check Separate the platform-specific code from the logic, so any future changes can be made in fewer places. --- wled00/util.cpp | 170 ++++++++++++++++++++++++++---------------------- 1 file changed, 93 insertions(+), 77 deletions(-) diff --git a/wled00/util.cpp b/wled00/util.cpp index 47a2aa8ea..7f0c2201b 100644 --- a/wled00/util.cpp +++ b/wled00/util.cpp @@ -720,116 +720,132 @@ void *realloc_malloc(void *ptr, size_t size) { // checks if the ESP reboots multiple times due to a crash or watchdog timeout // if a bootloop is detected: restore settings from backup, then reset settings, then switch boot image (and repeat) -#define BOOTLOOP_THRESHOLD 5 // number of consecutive crashes to trigger bootloop detection -#define BOOTLOOP_ACTION_RESTORE 0 // default action: restore config from /bak.cfg.json -#define BOOTLOOP_ACTION_RESET 1 // if restore does not work, reset config (rename /cfg.json to /rst.cfg.json) -#define BOOTLOOP_ACTION_OTA 2 // swap the boot partition -#define BOOTLOOP_ACTION_DUMP 3 // nothing seems to help, dump files to serial and reboot (until hardware reset) +#define BOOTLOOP_INTERVAL_MILLIS 5000 // time limit between crashes: 5 seconds +#define BOOTLOOP_THRESHOLD 5 // number of consecutive crashes to trigger bootloop detection +#define BOOTLOOP_ACTION_RESTORE 0 // default action: restore config from /bkp.cfg.json +#define BOOTLOOP_ACTION_RESET 1 // if restore does not work, reset config (rename /cfg.json to /rst.cfg.json) +#define BOOTLOOP_ACTION_OTA 2 // swap the boot partition +#define BOOTLOOP_ACTION_DUMP 3 // nothing seems to help, dump files to serial and reboot (until hardware reset) + +// Platform-agnostic abstraction +enum class ResetReason { + Power, + Software, + Crash, + Brownout +}; #ifdef ESP8266 -#define BOOTLOOP_INTERVAL_TICKS (5 * 160000) // time limit between crashes: ~5 seconds in RTC ticks // Place variables in RTC memory via references, since RTC memory is not exposed via the linker in the Non-OS SDK // Use an offset of 32 as there's some hints that the first 128 bytes of "user" memory are used by the OTA system // Ref: https://github.com/esp8266/Arduino/blob/78d0d0aceacc1553f45ad8154592b0af22d1eede/cores/esp8266/Esp.cpp#L168 static volatile uint32_t& bl_last_boottime = *(RTC_USER_MEM + 32); static volatile uint32_t& bl_crashcounter = *(RTC_USER_MEM + 33); static volatile uint32_t& bl_actiontracker = *(RTC_USER_MEM + 34); + +static inline ResetReason rebootReason() { + rst_info* resetreason = system_get_rst_info(); + if (resetreason->reason == REASON_EXCEPTION_RST || resetreason->reason == REASON_WDT_RST) return ResetReason::Crash; + if (resetreason->reason == REASON_SOFT_RESTART) return ResetReason::Software; + return ResetReason::Power; +} + +static inline uint32_t getRtcMillis() { return system_get_rtc_time() / 160; }; // rtc ticks ~160000Hz + #else -#define BOOTLOOP_INTERVAL_TICKS 5000 // time limit between crashes: ~5 seconds in milliseconds // variables in RTC_NOINIT memory persist between reboots (but not on hardware reset) RTC_NOINIT_ATTR static uint32_t bl_last_boottime; RTC_NOINIT_ATTR static uint32_t bl_crashcounter; RTC_NOINIT_ATTR static uint32_t bl_actiontracker; + +static inline ResetReason rebootReason() { + esp_reset_reason_t reason = esp_reset_reason(); + if (reason == ESP_RST_BROWNOUT) return ResetReason::Brownout; + if (reason == ESP_RST_SW) return ResetReason::Software; + if (reason == ESP_RST_PANIC || reason == ESP_RST_WDT || reason == ESP_RST_INT_WDT || reason == ESP_RST_TASK_WDT) return ResetReason::Crash; + return ResetReason::Power; +} + +#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0) +static inline uint32_t getRtcMillis() { return esp_rtc_get_time_us() / 1000; } +#elif ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(3, 3, 0) +static inline uint32_t getRtcMillis() { return rtc_time_slowclk_to_us(rtc_time_get(), rtc_clk_slow_freq_get_hz()) / 1000; } +#endif + void bootloopCheckOTA() { bl_actiontracker = BOOTLOOP_ACTION_OTA; } // swap boot image if bootloop is detected instead of restoring config + #endif // detect bootloop by checking the reset reason and the time since last boot static bool detectBootLoop() { -#if !defined(ESP8266) - #if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0) - uint32_t rtctime = esp_rtc_get_time_us() / 1000; // convert to milliseconds - #elif ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(3, 3, 0) - uint64_t rtc_ticks = rtc_time_get(); - uint32_t rtctime = rtc_time_slowclk_to_us(rtc_ticks, rtc_clk_slow_freq_get_hz()) / 1000; // convert to milliseconds - #endif + uint32_t rtctime = getRtcMillis(); + bool result = false; - esp_reset_reason_t reason = esp_reset_reason(); - - if (!(reason == ESP_RST_PANIC || reason == ESP_RST_WDT || reason == ESP_RST_INT_WDT || reason == ESP_RST_TASK_WDT)) { - // no crash detected, init variables - bl_crashcounter = 0; - bl_last_boottime = rtctime; - if(reason != ESP_RST_SW) + switch(rebootReason()) { + case ResetReason::Power: bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler) - } else if (reason == ESP_RST_BROWNOUT) { - // crash due to brownout can't be detected unless using flash memory to store bootloop variables - // this is a simpler way to preemtively revert the config in case current brownout is caused by a bad choice of settings - DEBUG_PRINTLN(F("brownout detected")); - //restoreConfig(); // TODO: blindly restoring config if brownout detected is a bad idea, need a better way (if at all) - } else { - uint32_t rebootinterval = rtctime - bl_last_boottime; - bl_last_boottime = rtctime; // store current runtime for next reboot - if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) { - bl_crashcounter++; - if (bl_crashcounter >= BOOTLOOP_THRESHOLD) { - DEBUG_PRINTLN(F("!BOOTLOOP DETECTED!")); - bl_crashcounter = 0; - return true; - } - } - } -#else // ESP8266 - rst_info* resetreason = system_get_rst_info(); - uint32_t rtctime = system_get_rtc_time(); + // fall through + case ResetReason::Software: + // no crash detected, reset counter + bl_crashcounter = 0; + break; - if (!(resetreason->reason == REASON_EXCEPTION_RST || resetreason->reason == REASON_WDT_RST)) { - // no crash detected, init variables - bl_crashcounter = 0; - bl_last_boottime = rtctime; - if(resetreason->reason != REASON_SOFT_RESTART) { - bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler) - } - } else { - // system has crashed - uint32_t rebootinterval = rtctime - bl_last_boottime; - bl_last_boottime = rtctime; - if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) { - bl_crashcounter++; - if (bl_crashcounter >= BOOTLOOP_THRESHOLD) { - DEBUG_PRINTLN(F("BOOTLOOP DETECTED")); - bl_crashcounter = 0; - return true; + case ResetReason::Crash: + { + uint32_t rebootinterval = rtctime - bl_last_boottime; + if (rebootinterval < BOOTLOOP_INTERVAL_MILLIS) { + bl_crashcounter++; + if (bl_crashcounter >= BOOTLOOP_THRESHOLD) { + DEBUG_PRINTLN(F("!BOOTLOOP DETECTED!")); + bl_crashcounter = 0; + result = true; + } } + break; } + + case ResetReason::Brownout: + // crash due to brownout can't be detected unless using flash memory to store bootloop variables + DEBUG_PRINTLN(F("brownout detected")); + //restoreConfig(); // TODO: blindly restoring config if brownout detected is a bad idea, need a better way (if at all) + break; } -#endif - return false; // no bootloop detected + + bl_last_boottime = rtctime; // store current runtime for next reboot + + return result; } void handleBootLoop() { DEBUG_PRINTF_P(PSTR("checking for bootloop: time %d, counter %d, action %d\n"), bl_last_boottime, bl_crashcounter, bl_actiontracker); if (!detectBootLoop()) return; // no bootloop detected - if (bl_actiontracker == BOOTLOOP_ACTION_RESTORE) { - restoreConfig(); // note: if this fails, could reset immediately. instead just let things play out and save a few lines of code - bl_actiontracker = BOOTLOOP_ACTION_RESET; // reset config if it keeps bootlooping - } else if (bl_actiontracker == BOOTLOOP_ACTION_RESET) { - resetConfig(); - bl_actiontracker = BOOTLOOP_ACTION_OTA; // swap boot partition if it keeps bootlooping. On ESP8266 this is the same as BOOTLOOP_ACTION_NONE - } + switch(bl_actiontracker) { + case BOOTLOOP_ACTION_RESTORE: + restoreConfig(); + ++bl_actiontracker; + break; + case BOOTLOOP_ACTION_RESET: + resetConfig(); + ++bl_actiontracker; + break; + case BOOTLOOP_ACTION_OTA: #ifndef ESP8266 - else if (bl_actiontracker == BOOTLOOP_ACTION_OTA) { - if(Update.canRollBack()) { - DEBUG_PRINTLN(F("Swapping boot partition...")); - Update.rollBack(); // swap boot partition - } - bl_actiontracker = BOOTLOOP_ACTION_DUMP; // out of options + if(Update.canRollBack()) { + DEBUG_PRINTLN(F("Swapping boot partition...")); + Update.rollBack(); // swap boot partition + } + ++bl_actiontracker; + break; +#else + // fall through +#endif + case BOOTLOOP_ACTION_DUMP: + dumpFilesToSerial(); + break; } - #endif - else - dumpFilesToSerial(); - ESP.restart(); // restart cleanly and don't wait for another crash + ESP.restart(); // restart cleanly and don't wait for another crash } /* From 46f3bc0ced8351dbb2b4955e95292883d61bfeb6 Mon Sep 17 00:00:00 2001 From: Will Miles Date: Tue, 26 Aug 2025 21:00:54 -0400 Subject: [PATCH 3/5] Bootloop: Include soft wdt on ESP8266 --- wled00/util.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/wled00/util.cpp b/wled00/util.cpp index 7f0c2201b..7002df154 100644 --- a/wled00/util.cpp +++ b/wled00/util.cpp @@ -744,9 +744,13 @@ static volatile uint32_t& bl_crashcounter = *(RTC_USER_MEM + 33); static volatile uint32_t& bl_actiontracker = *(RTC_USER_MEM + 34); static inline ResetReason rebootReason() { - rst_info* resetreason = system_get_rst_info(); - if (resetreason->reason == REASON_EXCEPTION_RST || resetreason->reason == REASON_WDT_RST) return ResetReason::Crash; - if (resetreason->reason == REASON_SOFT_RESTART) return ResetReason::Software; + uint32_t resetReason = system_get_rst_info()->reason; + if (resetReason == REASON_EXCEPTION_RST + || resetReason == REASON_WDT_RST + || resetReason == REASON_SOFT_WDT_RST) + return ResetReason::Crash; + if (resetReason == REASON_SOFT_RESTART) + return ResetReason::Software; return ResetReason::Power; } From dd13c2df47f35faae0960bdfd1eeded6b2a0b1f8 Mon Sep 17 00:00:00 2001 From: Will Miles Date: Thu, 28 Aug 2025 21:10:20 -0400 Subject: [PATCH 4/5] Reset crash counter after long interval Don't treat consecutive but infrequent crashes as bootloops. The bootloop recovery actions only make sense when there is no opportunity for a user to reconfigure their system. Suggested by @coderabbitai --- wled00/util.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wled00/util.cpp b/wled00/util.cpp index 7002df154..0d5d7373b 100644 --- a/wled00/util.cpp +++ b/wled00/util.cpp @@ -804,7 +804,11 @@ static bool detectBootLoop() { bl_crashcounter = 0; result = true; } - } + } else { + // Reset counter on long intervals to track only consecutive short-interval crashes + bl_crashcounter = 0; + // TODO: crash reporting goes here + } break; } From 6f914d79b12fc5f6a81132a3efc630e11296f02c Mon Sep 17 00:00:00 2001 From: Will Miles Date: Thu, 28 Aug 2025 21:17:12 -0400 Subject: [PATCH 5/5] Increase boot loop timeout Any repeating crash that prevents a human from logging in and fixing the config should be treated as a boot loop. Increase the detection timeout, so anything that's fast enough to preclude a user fix will trigger the recovery behaviour. --- wled00/util.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wled00/util.cpp b/wled00/util.cpp index 0d5d7373b..e1b4830c3 100644 --- a/wled00/util.cpp +++ b/wled00/util.cpp @@ -720,7 +720,7 @@ void *realloc_malloc(void *ptr, size_t size) { // checks if the ESP reboots multiple times due to a crash or watchdog timeout // if a bootloop is detected: restore settings from backup, then reset settings, then switch boot image (and repeat) -#define BOOTLOOP_INTERVAL_MILLIS 5000 // time limit between crashes: 5 seconds +#define BOOTLOOP_INTERVAL_MILLIS 120000 // time limit between crashes: 120 seconds (2 minutes) #define BOOTLOOP_THRESHOLD 5 // number of consecutive crashes to trigger bootloop detection #define BOOTLOOP_ACTION_RESTORE 0 // default action: restore config from /bkp.cfg.json #define BOOTLOOP_ACTION_RESET 1 // if restore does not work, reset config (rename /cfg.json to /rst.cfg.json)