Bootloop detection & recovery (#4793)

* added boot loop detection and config backup
* automatic OTA rollback if loading backup does not fix it
* added new file handling functions
* adding verification of json files, added config restore at bootup if broken
* added function to compare contents of two files for future use (currently not used)
This commit is contained in:
Damian Schneider
2025-08-15 20:43:04 +02:00
committed by GitHub
parent b8b59b2bb1
commit c9c442a933
6 changed files with 327 additions and 2 deletions

View File

@@ -1,6 +1,12 @@
#include "wled.h"
#include "fcn_declare.h"
#include "const.h"
#ifdef ESP8266
#include "user_interface.h" // for bootloop detection
#elif ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
#include "esp32/rtc.h" // for bootloop detection
#include <Update.h>
#endif
//helper to get int value at a position in string
@@ -706,6 +712,125 @@ void *realloc_malloc(void *ptr, size_t size) {
}
#endif
// bootloop detection and handling
// checks if the ESP reboots multiple times due to a crash or watchdog timeout
// if a bootloop is detected: restore settings from backup, then reset settings, then switch boot image (and repeat)
#define BOOTLOOP_THRESHOLD 5 // number of consecutive crashes to trigger bootloop detection
#define BOOTLOOP_ACTION_RESTORE 0 // default action: restore config from /cfg.bak
#define BOOTLOOP_ACTION_RESET 1 // if restore does not work, reset config (rename /cfg.json to /cfg.fault)
#define BOOTLOOP_ACTION_OTA 2 // swap the boot partition
#define BOOTLOOP_ACTION_DUMP 3 // nothing seems to help, dump files to serial and reboot (until hardware reset)
#ifdef ESP8266
#define BOOTLOOP_INTERVAL_TICKS (5 * 160000) // time limit between crashes: ~5 seconds in RTC ticks
#define BOOT_TIME_IDX 0 // index in RTC memory for boot time
#define CRASH_COUNTER_IDX 1 // index in RTC memory for crash counter
#define ACTIONT_TRACKER_IDX 2 // index in RTC memory for boot action
#else
#define BOOTLOOP_INTERVAL_TICKS 5000 // time limit between crashes: ~5 seconds in milliseconds
// variables in RTC_NOINIT memory persist between reboots (but not on hardware reset)
RTC_NOINIT_ATTR static uint32_t bl_last_boottime;
RTC_NOINIT_ATTR static uint32_t bl_crashcounter;
RTC_NOINIT_ATTR static uint32_t bl_actiontracker;
void bootloopCheckOTA() { bl_actiontracker = BOOTLOOP_ACTION_OTA; } // swap boot image if bootloop is detected instead of restoring config
#endif
// detect bootloop by checking the reset reason and the time since last boot
static bool detectBootLoop() {
#if !defined(ESP8266)
#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
uint32_t rtctime = esp_rtc_get_time_us() / 1000; // convert to milliseconds
esp_reset_reason_t reason = esp_reset_reason();
if (!(reason == ESP_RST_PANIC || reason == ESP_RST_WDT || reason == ESP_RST_INT_WDT || reason == ESP_RST_TASK_WDT)) {
// no crash detected, init variables
bl_crashcounter = 0;
bl_last_boottime = rtctime;
if(reason != ESP_RST_SW)
bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler)
} else if (reason == ESP_RST_BROWNOUT) {
// crash due to brownout can't be detected unless using flash memory to store bootloop variables
// this is a simpler way to preemtively revert the config in case current brownout is caused by a bad choice of settings
DEBUG_PRINTLN(F("brownout detected"));
//restoreConfig(); // TODO: blindly restoring config if brownout detected is a bad idea, need a better way (if at all)
} else {
uint32_t rebootinterval = rtctime - bl_last_boottime;
bl_last_boottime = rtctime; // store current runtime for next reboot
if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) {
bl_crashcounter++;
if (bl_crashcounter >= BOOTLOOP_THRESHOLD) {
DEBUG_PRINTLN(F("!BOOTLOOP DETECTED!"));
bl_crashcounter = 0;
return true;
}
}
}
#endif
#else // ESP8266
rst_info* resetreason = system_get_rst_info();
uint32_t bl_last_boottime;
uint32_t bl_crashcounter;
uint32_t bl_actiontracker;
uint32_t rtctime = system_get_rtc_time();
if (!(resetreason->reason == REASON_EXCEPTION_RST || resetreason->reason == REASON_WDT_RST)) {
// no crash detected, init variables
bl_crashcounter = 0;
ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t));
ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
if(resetreason->reason != REASON_SOFT_RESTART) {
bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler)
ESP.rtcUserMemoryWrite(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t));
}
} else {
// system has crashed
ESP.rtcUserMemoryRead(BOOT_TIME_IDX, &bl_last_boottime, sizeof(uint32_t));
ESP.rtcUserMemoryRead(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
uint32_t rebootinterval = rtctime - bl_last_boottime;
ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t)); // store current ticks for next reboot
if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) {
bl_crashcounter++;
ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
if (bl_crashcounter >= BOOTLOOP_THRESHOLD) {
DEBUG_PRINTLN(F("BOOTLOOP DETECTED"));
bl_crashcounter = 0;
ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
return true;
}
}
}
#endif
return false; // no bootloop detected
}
void handleBootLoop() {
DEBUG_PRINTLN(F("checking for bootloop"));
if (!detectBootLoop()) return; // no bootloop detected
#ifdef ESP8266
uint32_t bl_actiontracker;
ESP.rtcUserMemoryRead(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t));
#endif
if (bl_actiontracker == BOOTLOOP_ACTION_RESTORE) {
restoreConfig(); // note: if this fails, could reset immediately. instead just let things play out and save a few lines of code
bl_actiontracker = BOOTLOOP_ACTION_RESET; // reset config if it keeps bootlooping
} else if (bl_actiontracker == BOOTLOOP_ACTION_RESET) {
resetConfig();
bl_actiontracker = BOOTLOOP_ACTION_OTA; // swap boot partition if it keeps bootlooping. On ESP8266 this is the same as BOOTLOOP_ACTION_NONE
}
#ifndef ESP8266
else if (bl_actiontracker == BOOTLOOP_ACTION_OTA) {
if(Update.canRollBack()) {
DEBUG_PRINTLN(F("Swapping boot partition..."));
Update.rollBack(); // swap boot partition
}
bl_actiontracker = BOOTLOOP_ACTION_DUMP; // out of options
}
#endif
else
dumpFilesToSerial();
ESP.restart(); // restart cleanly and don't wait for another crash
}
/*
* Fixed point integer based Perlin noise functions by @dedehai
* Note: optimized for speed and to mimic fastled inoise functions, not for accuracy or best randomness