userspace reboot: stop post-data services and wait for them to be killed
* Refactor the code around stopping services a little bit so that it can be reused between full reboot and userspace reboot.
* Add a scope_guard to fall back to a full reboot in case userspace reboot fails.
* In case of userspace reboot, init will also wait for services to be terminated/killed and log the ones that didn't react to SIGTERM/SIGKILL in time.
* If some of the services didn't react to SIGKILL, fail the userspace reboot.

Test: adb reboot userspace
Bug: 135984674
Change-Id: I820c7bc406169333b0f929f0eea028d8384eb2ac
This commit is contained in:
parent
d11c6f7fd8
commit
3f4b0d6113
5 changed files with 157 additions and 63 deletions
170
init/reboot.cpp
170
init/reboot.cpp
|
@ -22,6 +22,7 @@
|
|||
#include <linux/loop.h>
|
||||
#include <mntent.h>
|
||||
#include <semaphore.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/cdefs.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mount.h>
|
||||
|
@ -31,6 +32,7 @@
|
|||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <thread>
|
||||
|
@ -41,6 +43,7 @@
|
|||
#include <android-base/logging.h>
|
||||
#include <android-base/macros.h>
|
||||
#include <android-base/properties.h>
|
||||
#include <android-base/scopeguard.h>
|
||||
#include <android-base/strings.h>
|
||||
#include <android-base/unique_fd.h>
|
||||
#include <bootloader_message/bootloader_message.h>
|
||||
|
@ -59,6 +62,7 @@
|
|||
#include "service.h"
|
||||
#include "service_list.h"
|
||||
#include "sigchld_handler.h"
|
||||
#include "util.h"
|
||||
|
||||
#define PROC_SYSRQ "/proc/sysrq-trigger"
|
||||
|
||||
|
@ -75,6 +79,19 @@ namespace init {
|
|||
|
||||
static bool shutting_down = false;
|
||||
|
||||
static const std::set<std::string> kDebuggingServices{"tombstoned", "logd", "adbd", "console"};
|
||||
|
||||
static std::vector<Service*> GetDebuggingServices(bool only_post_data) {
|
||||
std::vector<Service*> ret;
|
||||
ret.reserve(kDebuggingServices.size());
|
||||
for (const auto& s : ServiceList::GetInstance()) {
|
||||
if (kDebuggingServices.count(s->name()) && (!only_post_data || s->is_post_data())) {
|
||||
ret.push_back(s.get());
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// represents umount status during reboot / shutdown.
|
||||
enum UmountStat {
|
||||
/* umount succeeded. */
|
||||
|
@ -446,6 +463,49 @@ static void KillZramBackingDevice() {
|
|||
LOG(INFO) << "zram_backing_dev: `" << backing_dev << "` is cleared successfully.";
|
||||
}
|
||||
|
||||
// Stops given services, waits for them to be stopped for |timeout| ms.
|
||||
// If terminate is true, then SIGTERM is sent to services, otherwise SIGKILL is sent.
|
||||
static void StopServices(const std::vector<Service*>& services, std::chrono::milliseconds timeout,
|
||||
bool terminate) {
|
||||
LOG(INFO) << "Stopping " << services.size() << " services by sending "
|
||||
<< (terminate ? "SIGTERM" : "SIGKILL");
|
||||
std::vector<pid_t> pids;
|
||||
pids.reserve(services.size());
|
||||
for (const auto& s : services) {
|
||||
if (s->pid() > 0) {
|
||||
pids.push_back(s->pid());
|
||||
}
|
||||
if (terminate) {
|
||||
s->Terminate();
|
||||
} else {
|
||||
s->Stop();
|
||||
}
|
||||
}
|
||||
if (timeout > 0ms) {
|
||||
WaitToBeReaped(pids, timeout);
|
||||
} else {
|
||||
// Even if we don't to wait for services to stop, we still optimistically reap zombies.
|
||||
ReapAnyOutstandingChildren();
|
||||
}
|
||||
}
|
||||
|
||||
// Like StopServices, but also logs all the services that failed to stop after the provided timeout.
|
||||
// Returns number of violators.
|
||||
static int StopServicesAndLogViolations(const std::vector<Service*>& services,
|
||||
std::chrono::milliseconds timeout, bool terminate) {
|
||||
StopServices(services, timeout, terminate);
|
||||
int still_running = 0;
|
||||
for (const auto& s : services) {
|
||||
if (s->IsRunning()) {
|
||||
LOG(ERROR) << "[service-misbehaving] : service '" << s->name() << "' is still running "
|
||||
<< timeout.count() << "ms after receiving "
|
||||
<< (terminate ? "SIGTERM" : "SIGKILL");
|
||||
still_running++;
|
||||
}
|
||||
}
|
||||
return still_running;
|
||||
}
|
||||
|
||||
//* Reboot / shutdown the system.
|
||||
// cmd ANDROID_RB_* as defined in android_reboot.h
|
||||
// reason Reason string like "reboot", "shutdown,userrequested"
|
||||
|
@ -510,12 +570,13 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
|
|||
// Start reboot monitor thread
|
||||
sem_post(&reboot_semaphore);
|
||||
|
||||
// keep debugging tools until non critical ones are all gone.
|
||||
const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"};
|
||||
// watchdogd is a vendor specific component but should be alive to complete shutdown safely.
|
||||
const std::set<std::string> to_starts{"watchdogd"};
|
||||
std::vector<Service*> stop_first;
|
||||
stop_first.reserve(ServiceList::GetInstance().services().size());
|
||||
for (const auto& s : ServiceList::GetInstance()) {
|
||||
if (kill_after_apps.count(s->name())) {
|
||||
if (kDebuggingServices.count(s->name())) {
|
||||
// keep debugging tools until non critical ones are all gone.
|
||||
s->SetShutdownCritical();
|
||||
} else if (to_starts.count(s->name())) {
|
||||
if (auto result = s->Start(); !result) {
|
||||
|
@ -529,6 +590,8 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
|
|||
LOG(ERROR) << "Could not start shutdown critical service '" << s->name()
|
||||
<< "': " << result.error();
|
||||
}
|
||||
} else {
|
||||
stop_first.push_back(s.get());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -571,49 +634,12 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
|
|||
// optional shutdown step
|
||||
// 1. terminate all services except shutdown critical ones. wait for delay to finish
|
||||
if (shutdown_timeout > 0ms) {
|
||||
LOG(INFO) << "terminating init services";
|
||||
|
||||
// Ask all services to terminate except shutdown critical ones.
|
||||
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
|
||||
if (!s->IsShutdownCritical()) s->Terminate();
|
||||
}
|
||||
|
||||
int service_count = 0;
|
||||
// Only wait up to half of timeout here
|
||||
auto termination_wait_timeout = shutdown_timeout / 2;
|
||||
while (t.duration() < termination_wait_timeout) {
|
||||
ReapAnyOutstandingChildren();
|
||||
|
||||
service_count = 0;
|
||||
for (const auto& s : ServiceList::GetInstance()) {
|
||||
// Count the number of services running except shutdown critical.
|
||||
// Exclude the console as it will ignore the SIGTERM signal
|
||||
// and not exit.
|
||||
// Note: SVC_CONSOLE actually means "requires console" but
|
||||
// it is only used by the shell.
|
||||
if (!s->IsShutdownCritical() && s->pid() != 0 && (s->flags() & SVC_CONSOLE) == 0) {
|
||||
service_count++;
|
||||
}
|
||||
}
|
||||
|
||||
if (service_count == 0) {
|
||||
// All terminable services terminated. We can exit early.
|
||||
break;
|
||||
}
|
||||
|
||||
// Wait a bit before recounting the number or running services.
|
||||
std::this_thread::sleep_for(50ms);
|
||||
}
|
||||
LOG(INFO) << "Terminating running services took " << t
|
||||
<< " with remaining services:" << service_count;
|
||||
}
|
||||
|
||||
// minimum safety steps before restarting
|
||||
// 2. kill all services except ones that are necessary for the shutdown sequence.
|
||||
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
|
||||
if (!s->IsShutdownCritical()) s->Stop();
|
||||
StopServicesAndLogViolations(stop_first, shutdown_timeout / 2, true /* SIGTERM */);
|
||||
}
|
||||
// Send SIGKILL to ones that didn't terminate cleanly.
|
||||
StopServicesAndLogViolations(stop_first, 0ms, false /* SIGKILL */);
|
||||
SubcontextTerminate();
|
||||
// Reap subcontext pids.
|
||||
ReapAnyOutstandingChildren();
|
||||
|
||||
// 3. send volume shutdown to vold
|
||||
|
@ -625,9 +651,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
|
|||
LOG(INFO) << "vold not running, skipping vold shutdown";
|
||||
}
|
||||
// logcat stopped here
|
||||
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
|
||||
if (kill_after_apps.count(s->name())) s->Stop();
|
||||
}
|
||||
StopServices(GetDebuggingServices(false /* only_post_data */), 0ms, false /* SIGKILL */);
|
||||
// 4. sync, try umount, and optionally run fsck for user shutdown
|
||||
{
|
||||
Timer sync_timer;
|
||||
|
@ -660,6 +684,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
|
|||
}
|
||||
|
||||
static void EnterShutdown() {
|
||||
LOG(INFO) << "Entering shutdown mode";
|
||||
shutting_down = true;
|
||||
// Skip wait for prop if it is in progress
|
||||
ResetWaitForProp();
|
||||
|
@ -675,21 +700,61 @@ static void EnterShutdown() {
|
|||
}
|
||||
|
||||
// Logs that shutdown mode is being left, clears the shutting_down flag, and then calls
// SendStartSendingMessagesMessage().
static void LeaveShutdown() {
    LOG(INFO) << "Leaving shutdown mode";
    // Clear the flag before SendStartSendingMessagesMessage() is called.
    shutting_down = false;
    SendStartSendingMessagesMessage();
}
|
||||
|
||||
static void DoUserspaceReboot() {
|
||||
static Result<void> DoUserspaceReboot() {
|
||||
LOG(INFO) << "Userspace reboot initiated";
|
||||
auto guard = android::base::make_scope_guard([] {
|
||||
// Leave shutdown so that we can handle a full reboot.
|
||||
LeaveShutdown();
|
||||
property_set("sys.powerctl", "reboot,abort-userspace-reboot");
|
||||
});
|
||||
// Triggering userspace-reboot-requested will result in a bunch of set_prop
|
||||
// actions. We should make sure, that all of them are propagated before
|
||||
// proceeding with userspace reboot.
|
||||
// TODO(b/135984674): implement proper synchronization logic.
|
||||
std::this_thread::sleep_for(500ms);
|
||||
EnterShutdown();
|
||||
// TODO(b/135984674): tear down post-data services
|
||||
LeaveShutdown();
|
||||
std::vector<Service*> stop_first;
|
||||
// Remember the services that were enabled. We will need to manually enable them again otherwise
|
||||
// triggers like class_start won't restart them.
|
||||
std::vector<Service*> were_enabled;
|
||||
stop_first.reserve(ServiceList::GetInstance().services().size());
|
||||
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
|
||||
if (s->is_post_data() && !kDebuggingServices.count(s->name())) {
|
||||
stop_first.push_back(s);
|
||||
}
|
||||
if (s->is_post_data() && s->IsEnabled()) {
|
||||
were_enabled.push_back(s);
|
||||
}
|
||||
}
|
||||
// TODO(b/135984674): do we need shutdown animation for userspace reboot?
|
||||
// TODO(b/135984674): control userspace timeout via read-only property?
|
||||
StopServicesAndLogViolations(stop_first, 10s, true /* SIGTERM */);
|
||||
if (int r = StopServicesAndLogViolations(stop_first, 20s, false /* SIGKILL */); r > 0) {
|
||||
// TODO(b/135984674): store information about offending services for debugging.
|
||||
return Error() << r << " post-data services are still running";
|
||||
}
|
||||
// TODO(b/135984674): remount userdata
|
||||
if (int r = StopServicesAndLogViolations(GetDebuggingServices(true /* only_post_data */), 5s,
|
||||
false /* SIGKILL */);
|
||||
r > 0) {
|
||||
// TODO(b/135984674): store information about offending services for debugging.
|
||||
return Error() << r << " debugging services are still running";
|
||||
}
|
||||
// TODO(b/135984674): deactivate APEX modules and switch back to bootstrap namespace.
|
||||
// Re-enable services
|
||||
for (const auto& s : were_enabled) {
|
||||
LOG(INFO) << "Re-enabling service '" << s->name() << "'";
|
||||
s->Enable();
|
||||
}
|
||||
LeaveShutdown();
|
||||
ActionManager::GetInstance().QueueEventTrigger("userspace-reboot-resume");
|
||||
guard.Disable(); // Go on with userspace reboot.
|
||||
return {};
|
||||
}
|
||||
|
||||
static void HandleUserspaceReboot() {
|
||||
|
@ -697,10 +762,7 @@ static void HandleUserspaceReboot() {
|
|||
auto& am = ActionManager::GetInstance();
|
||||
am.ClearQueue();
|
||||
am.QueueEventTrigger("userspace-reboot-requested");
|
||||
auto handler = [](const BuiltinArguments&) {
|
||||
DoUserspaceReboot();
|
||||
return Result<void>{};
|
||||
};
|
||||
auto handler = [](const BuiltinArguments&) { return DoUserspaceReboot(); };
|
||||
am.QueueBuiltinAction(handler, "userspace-reboot");
|
||||
}
|
||||
|
||||
|
|
|
@ -75,6 +75,7 @@ class Service {
|
|||
const std::vector<std::string>& args);
|
||||
|
||||
bool IsRunning() { return (flags_ & SVC_RUNNING) != 0; }
|
||||
bool IsEnabled() { return (flags_ & SVC_DISABLED) == 0; }
|
||||
Result<void> ExecStart();
|
||||
Result<void> Start();
|
||||
Result<void> StartIfNotDisabled();
|
||||
|
|
|
@ -28,28 +28,31 @@
|
|||
#include <android-base/scopeguard.h>
|
||||
#include <android-base/stringprintf.h>
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "init.h"
|
||||
#include "service.h"
|
||||
#include "service_list.h"
|
||||
|
||||
using android::base::StringPrintf;
|
||||
using android::base::boot_clock;
|
||||
using android::base::make_scope_guard;
|
||||
using android::base::StringPrintf;
|
||||
using android::base::Timer;
|
||||
|
||||
namespace android {
|
||||
namespace init {
|
||||
|
||||
static bool ReapOneProcess() {
|
||||
static pid_t ReapOneProcess() {
|
||||
siginfo_t siginfo = {};
|
||||
// This returns a zombie pid or informs us that there are no zombies left to be reaped.
|
||||
// It does NOT reap the pid; that is done below.
|
||||
if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) {
|
||||
PLOG(ERROR) << "waitid failed";
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto pid = siginfo.si_pid;
|
||||
if (pid == 0) return false;
|
||||
if (pid == 0) return 0;
|
||||
|
||||
// At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
|
||||
// whenever the function returns from this point forward.
|
||||
|
@ -92,7 +95,7 @@ static bool ReapOneProcess() {
|
|||
LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;
|
||||
}
|
||||
|
||||
if (!service) return true;
|
||||
if (!service) return pid;
|
||||
|
||||
service->Reap(siginfo);
|
||||
|
||||
|
@ -100,13 +103,33 @@ static bool ReapOneProcess() {
|
|||
ServiceList::GetInstance().RemoveService(*service);
|
||||
}
|
||||
|
||||
return true;
|
||||
return pid;
|
||||
}
|
||||
|
||||
void ReapAnyOutstandingChildren() {
|
||||
while (ReapOneProcess()) {
|
||||
while (ReapOneProcess() != 0) {
|
||||
}
|
||||
}
|
||||
|
||||
void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout) {
|
||||
Timer t;
|
||||
std::vector<pid_t> alive_pids(pids.begin(), pids.end());
|
||||
while (!alive_pids.empty() && t.duration() < timeout) {
|
||||
pid_t pid;
|
||||
while ((pid = ReapOneProcess()) != 0) {
|
||||
auto it = std::find(alive_pids.begin(), alive_pids.end(), pid);
|
||||
if (it != alive_pids.end()) {
|
||||
alive_pids.erase(it);
|
||||
}
|
||||
}
|
||||
if (alive_pids.empty()) {
|
||||
break;
|
||||
}
|
||||
std::this_thread::sleep_for(50ms);
|
||||
}
|
||||
LOG(INFO) << "Waiting for " << pids.size() << " pids to be reaped took " << t << " with "
|
||||
<< alive_pids.size() << " of them still running";
|
||||
}
|
||||
|
||||
} // namespace init
|
||||
} // namespace android
|
||||
|
|
|
@ -17,11 +17,16 @@
|
|||
#ifndef _INIT_SIGCHLD_HANDLER_H_
|
||||
#define _INIT_SIGCHLD_HANDLER_H_
|
||||
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
namespace android {
|
||||
namespace init {
|
||||
|
||||
void ReapAnyOutstandingChildren();
|
||||
|
||||
void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout);
|
||||
|
||||
} // namespace init
|
||||
} // namespace android
|
||||
|
||||
|
|
|
@ -918,11 +918,14 @@ on property:ro.debuggable=1
|
|||
on init && property:ro.debuggable=1
|
||||
start console
|
||||
|
||||
on userspace-reboot:
|
||||
on userspace-reboot
|
||||
# TODO(b/135984674): reset all necessary properties here.
|
||||
setprop sys.init.userspace_reboot_in_progress 1
|
||||
setprop sys.boot_completed 0
|
||||
setprop sys.init.updatable_crashing 0
|
||||
setprop apexd.status 0
|
||||
|
||||
on userspace-reboot-resume:
|
||||
on userspace-reboot-resume
|
||||
# TODO(b/135984674): remount userdata and reset checkpointing
|
||||
trigger nonencrypted
|
||||
trigger post-fs-data
|
||||
|
|
Loading…
Reference in a new issue