Throttle reboot from native watchdog
We found that increasing the threshold to 20 crashes also pushes back the file-system-based rollbacks, so this change introduces throttling behaviour instead. The native watchdog now performs a reboot with a RAM dump at 5 restarts; after that, PackageWatchdog/RescueParty takes over to perform other mitigations. The RAM dump + reboot will not be performed more than once in 24 hours. Test: manual Bug: 291137901 Change-Id: Ia192411dad94e8e25c26f700d2fe7f94d41439b8
This commit is contained in:
parent
ef2e67eec1
commit
ab9b683840
1 changed files with 22 additions and 7 deletions
|
@ -355,20 +355,35 @@ void Service::Reap(const siginfo_t& siginfo) {
|
|||
// If we crash > 4 times in 'fatal_crash_window_' minutes or before boot_completed,
|
||||
// reboot into bootloader or set crashing property
|
||||
boot_clock::time_point now = boot_clock::now();
|
||||
constexpr const char native_watchdog_reboot_time[] = "persist.init.svc.last_fatal_reboot_epoch";
|
||||
uint64_t throttle_window =
|
||||
std::chrono::duration_cast<std::chrono::seconds>(std::chrono::hours(24)).count();
|
||||
if (((flags_ & SVC_CRITICAL) || is_process_updatable) && !(flags_ & SVC_RESTART) &&
|
||||
!was_last_exit_ok_) {
|
||||
bool boot_completed = GetBoolProperty("sys.boot_completed", false);
|
||||
if (now < time_crashed_ + fatal_crash_window_ || !boot_completed) {
|
||||
if (++crash_count_ > 4) {
|
||||
auto exit_reason = boot_completed ?
|
||||
"in " + std::to_string(fatal_crash_window_.count()) + " minutes" :
|
||||
"before boot completed";
|
||||
auto exit_reason =
|
||||
boot_completed
|
||||
? "in " + std::to_string(fatal_crash_window_.count()) + " minutes"
|
||||
: "before boot completed";
|
||||
if (flags_ & SVC_CRITICAL) {
|
||||
if (!GetBoolProperty("init.svc_debug.no_fatal." + name_, false)) {
|
||||
// Aborts into `fatal_reboot_target_'.
|
||||
SetFatalRebootTarget(fatal_reboot_target_);
|
||||
LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
|
||||
<< exit_reason;
|
||||
uint64_t epoch_time =
|
||||
std::chrono::duration_cast<std::chrono::seconds>(
|
||||
std::chrono::system_clock::now().time_since_epoch())
|
||||
.count();
|
||||
// Do not reboot again if a reboot was already initiated in the last 24 hours
|
||||
if (epoch_time - GetIntProperty(native_watchdog_reboot_time, 0) >
|
||||
throttle_window) {
|
||||
SetProperty(native_watchdog_reboot_time, std::to_string(epoch_time));
|
||||
// Aborts into `fatal_reboot_target_'.
|
||||
SetFatalRebootTarget(fatal_reboot_target_);
|
||||
LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
|
||||
<< exit_reason;
|
||||
} else {
|
||||
LOG(INFO) << "Reboot already performed in last 24hrs because of crash.";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG(ERROR) << "process with updatable components '" << name_
|
||||
|
|
Loading…
Reference in a new issue