Throttle reboot from native watchdog

Found out that increasing the threshold to 20 crashes also pushes back the
file-system-based rollbacks, so introduce a throttling behaviour instead.
The native watchdog now performs a reboot with ramdump at 5 restarts.
After that, PackageWatchdog/RescueParty takes over to perform other
mitigations. The ramdump + reboot will not be performed more than once
in 24 hours.

Test: manual
Bug: 291137901
Change-Id: Ia192411dad94e8e25c26f700d2fe7f94d41439b8
This commit is contained in:
Shrinidhi Hegde 2024-04-09 15:33:21 +00:00
parent ef2e67eec1
commit ab9b683840

View file

@ -355,20 +355,35 @@ void Service::Reap(const siginfo_t& siginfo) {
// If we crash > 4 times in 'fatal_crash_window_' minutes or before boot_completed,
// reboot into bootloader or set crashing property
boot_clock::time_point now = boot_clock::now();
constexpr const char native_watchdog_reboot_time[] = "persist.init.svc.last_fatal_reboot_epoch";
uint64_t throttle_window =
std::chrono::duration_cast<std::chrono::seconds>(std::chrono::hours(24)).count();
if (((flags_ & SVC_CRITICAL) || is_process_updatable) && !(flags_ & SVC_RESTART) &&
!was_last_exit_ok_) {
bool boot_completed = GetBoolProperty("sys.boot_completed", false);
if (now < time_crashed_ + fatal_crash_window_ || !boot_completed) {
if (++crash_count_ > 4) {
auto exit_reason = boot_completed ?
"in " + std::to_string(fatal_crash_window_.count()) + " minutes" :
"before boot completed";
auto exit_reason =
boot_completed
? "in " + std::to_string(fatal_crash_window_.count()) + " minutes"
: "before boot completed";
if (flags_ & SVC_CRITICAL) {
if (!GetBoolProperty("init.svc_debug.no_fatal." + name_, false)) {
// Aborts into `fatal_reboot_target_'.
SetFatalRebootTarget(fatal_reboot_target_);
LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
<< exit_reason;
uint64_t epoch_time =
std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
// Do not reboot again If it was already initiated in the last 24hrs
if (epoch_time - GetIntProperty(native_watchdog_reboot_time, 0) >
throttle_window) {
SetProperty(native_watchdog_reboot_time, std::to_string(epoch_time));
// Aborts into `fatal_reboot_target_'.
SetFatalRebootTarget(fatal_reboot_target_);
LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
<< exit_reason;
} else {
LOG(INFO) << "Reboot already performed in last 24hrs because of crash.";
}
}
} else {
LOG(ERROR) << "process with updatable components '" << name_