userspace reboot: stop post-data services and wait for them to be killed

* Refactored code around stopping services a little bit to reuse it
  between full reboot and userspace reboot.
* Add a scope_guard to fall back to a full reboot in case userspace reboot
  fails.
* In case of a userspace reboot, init will also wait for services to be
  terminated/killed and log the ones that didn't react to
  SIGTERM/SIGKILL in time.
* If some of the services didn't react to SIGKILL, fail userspace reboot.

Test: adb reboot userspace
Bug: 135984674
Change-Id: I820c7bc406169333b0f929f0eea028d8384eb2ac
This commit is contained in:
Nikita Ioffe 2019-10-09 15:23:02 +01:00
parent d11c6f7fd8
commit 3f4b0d6113
5 changed files with 157 additions and 63 deletions

View file

@ -22,6 +22,7 @@
#include <linux/loop.h>
#include <mntent.h>
#include <semaphore.h>
#include <stdlib.h>
#include <sys/cdefs.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
@ -31,6 +32,7 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <chrono>
#include <memory>
#include <set>
#include <thread>
@ -41,6 +43,7 @@
#include <android-base/logging.h>
#include <android-base/macros.h>
#include <android-base/properties.h>
#include <android-base/scopeguard.h>
#include <android-base/strings.h>
#include <android-base/unique_fd.h>
#include <bootloader_message/bootloader_message.h>
@ -59,6 +62,7 @@
#include "service.h"
#include "service_list.h"
#include "sigchld_handler.h"
#include "util.h"
#define PROC_SYSRQ "/proc/sysrq-trigger"
@ -75,6 +79,19 @@ namespace init {
static bool shutting_down = false;
static const std::set<std::string> kDebuggingServices{"tombstoned", "logd", "adbd", "console"};
static std::vector<Service*> GetDebuggingServices(bool only_post_data) {
std::vector<Service*> ret;
ret.reserve(kDebuggingServices.size());
for (const auto& s : ServiceList::GetInstance()) {
if (kDebuggingServices.count(s->name()) && (!only_post_data || s->is_post_data())) {
ret.push_back(s.get());
}
}
return ret;
}
// represents umount status during reboot / shutdown.
enum UmountStat {
/* umount succeeded. */
@ -446,6 +463,49 @@ static void KillZramBackingDevice() {
LOG(INFO) << "zram_backing_dev: `" << backing_dev << "` is cleared successfully.";
}
// Signals every service in |services| and then reaps children.
//
// |terminate| selects the signal: true calls Service::Terminate() (logged as SIGTERM),
// false calls Service::Stop() (logged as SIGKILL).
// |timeout| bounds how long to wait for the signalled pids to be reaped. When it is
// not positive, no waiting happens and only already-exited children are reaped.
static void StopServices(const std::vector<Service*>& services, std::chrono::milliseconds timeout,
                         bool terminate) {
    const char* signal_name = terminate ? "SIGTERM" : "SIGKILL";
    LOG(INFO) << "Stopping " << services.size() << " services by sending " << signal_name;

    std::vector<pid_t> signalled_pids;
    signalled_pids.reserve(services.size());
    for (Service* service : services) {
        // The pid is captured before the signal is sent; only positive pids are tracked.
        if (const auto pid = service->pid(); pid > 0) {
            signalled_pids.push_back(pid);
        }
        if (terminate) {
            service->Terminate();
        } else {
            service->Stop();
        }
    }

    if (timeout <= 0ms) {
        // Even if we don't want to wait for services to stop, we still optimistically
        // reap zombies.
        ReapAnyOutstandingChildren();
        return;
    }
    WaitToBeReaped(signalled_pids, timeout);
}
// Calls StopServices() with the same arguments, then logs an error for every service in
// |services| that still reports IsRunning() afterwards.
// Returns the number of such still-running services.
static int StopServicesAndLogViolations(const std::vector<Service*>& services,
                                        std::chrono::milliseconds timeout, bool terminate) {
    StopServices(services, timeout, terminate);

    const char* signal_name = terminate ? "SIGTERM" : "SIGKILL";
    int violators = 0;
    for (Service* service : services) {
        if (!service->IsRunning()) continue;
        LOG(ERROR) << "[service-misbehaving] : service '" << service->name()
                   << "' is still running " << timeout.count() << "ms after receiving "
                   << signal_name;
        ++violators;
    }
    return violators;
}
//* Reboot / shutdown the system.
// cmd ANDROID_RB_* as defined in android_reboot.h
// reason Reason string like "reboot", "shutdown,userrequested"
@ -510,12 +570,13 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
// Start reboot monitor thread
sem_post(&reboot_semaphore);
// keep debugging tools until non critical ones are all gone.
const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"};
// watchdogd is a vendor specific component but should be alive to complete shutdown safely.
const std::set<std::string> to_starts{"watchdogd"};
std::vector<Service*> stop_first;
stop_first.reserve(ServiceList::GetInstance().services().size());
for (const auto& s : ServiceList::GetInstance()) {
if (kill_after_apps.count(s->name())) {
if (kDebuggingServices.count(s->name())) {
// keep debugging tools until non critical ones are all gone.
s->SetShutdownCritical();
} else if (to_starts.count(s->name())) {
if (auto result = s->Start(); !result) {
@ -529,6 +590,8 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
LOG(ERROR) << "Could not start shutdown critical service '" << s->name()
<< "': " << result.error();
}
} else {
stop_first.push_back(s.get());
}
}
@ -571,49 +634,12 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
// optional shutdown step
// 1. terminate all services except shutdown critical ones. wait for delay to finish
if (shutdown_timeout > 0ms) {
LOG(INFO) << "terminating init services";
// Ask all services to terminate except shutdown critical ones.
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
if (!s->IsShutdownCritical()) s->Terminate();
}
int service_count = 0;
// Only wait up to half of timeout here
auto termination_wait_timeout = shutdown_timeout / 2;
while (t.duration() < termination_wait_timeout) {
ReapAnyOutstandingChildren();
service_count = 0;
for (const auto& s : ServiceList::GetInstance()) {
// Count the number of services running except shutdown critical.
// Exclude the console as it will ignore the SIGTERM signal
// and not exit.
// Note: SVC_CONSOLE actually means "requires console" but
// it is only used by the shell.
if (!s->IsShutdownCritical() && s->pid() != 0 && (s->flags() & SVC_CONSOLE) == 0) {
service_count++;
}
}
if (service_count == 0) {
// All terminable services terminated. We can exit early.
break;
}
// Wait a bit before recounting the number or running services.
std::this_thread::sleep_for(50ms);
}
LOG(INFO) << "Terminating running services took " << t
<< " with remaining services:" << service_count;
}
// minimum safety steps before restarting
// 2. kill all services except ones that are necessary for the shutdown sequence.
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
if (!s->IsShutdownCritical()) s->Stop();
StopServicesAndLogViolations(stop_first, shutdown_timeout / 2, true /* SIGTERM */);
}
// Send SIGKILL to ones that didn't terminate cleanly.
StopServicesAndLogViolations(stop_first, 0ms, false /* SIGKILL */);
SubcontextTerminate();
// Reap subcontext pids.
ReapAnyOutstandingChildren();
// 3. send volume shutdown to vold
@ -625,9 +651,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
LOG(INFO) << "vold not running, skipping vold shutdown";
}
// logcat stopped here
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
if (kill_after_apps.count(s->name())) s->Stop();
}
StopServices(GetDebuggingServices(false /* only_post_data */), 0ms, false /* SIGKILL */);
// 4. sync, try umount, and optionally run fsck for user shutdown
{
Timer sync_timer;
@ -660,6 +684,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
}
static void EnterShutdown() {
LOG(INFO) << "Entering shutdown mode";
shutting_down = true;
// Skip wait for prop if it is in progress
ResetWaitForProp();
@ -675,21 +700,61 @@ static void EnterShutdown() {
}
// Leaves shutdown mode: logs the transition, clears the file-level |shutting_down| flag and
// calls SendStartSendingMessagesMessage().
// NOTE(review): presumably the counterpart of EnterShutdown(), which sets |shutting_down| to
// true; the rest of EnterShutdown() is not fully visible here - confirm the two stay symmetric.
static void LeaveShutdown() {
LOG(INFO) << "Leaving shutdown mode";
shutting_down = false;
// NOTE(review): presumably resumes message delivery that was paused on entering shutdown;
// the helper is defined elsewhere - confirm.
SendStartSendingMessagesMessage();
}
static void DoUserspaceReboot() {
static Result<void> DoUserspaceReboot() {
LOG(INFO) << "Userspace reboot initiated";
auto guard = android::base::make_scope_guard([] {
// Leave shutdown so that we can handle a full reboot.
LeaveShutdown();
property_set("sys.powerctl", "reboot,abort-userspace-reboot");
});
// Triggering userspace-reboot-requested will result in a bunch of set_prop
// actions. We should make sure, that all of them are propagated before
// proceeding with userspace reboot.
// TODO(b/135984674): implement proper synchronization logic.
std::this_thread::sleep_for(500ms);
EnterShutdown();
// TODO(b/135984674): tear down post-data services
LeaveShutdown();
std::vector<Service*> stop_first;
// Remember the services that were enabled. We will need to manually enable them again otherwise
// triggers like class_start won't restart them.
std::vector<Service*> were_enabled;
stop_first.reserve(ServiceList::GetInstance().services().size());
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
if (s->is_post_data() && !kDebuggingServices.count(s->name())) {
stop_first.push_back(s);
}
if (s->is_post_data() && s->IsEnabled()) {
were_enabled.push_back(s);
}
}
// TODO(b/135984674): do we need shutdown animation for userspace reboot?
// TODO(b/135984674): control userspace timeout via read-only property?
StopServicesAndLogViolations(stop_first, 10s, true /* SIGTERM */);
if (int r = StopServicesAndLogViolations(stop_first, 20s, false /* SIGKILL */); r > 0) {
// TODO(b/135984674): store information about offending services for debugging.
return Error() << r << " post-data services are still running";
}
// TODO(b/135984674): remount userdata
if (int r = StopServicesAndLogViolations(GetDebuggingServices(true /* only_post_data */), 5s,
false /* SIGKILL */);
r > 0) {
// TODO(b/135984674): store information about offending services for debugging.
return Error() << r << " debugging services are still running";
}
// TODO(b/135984674): deactivate APEX modules and switch back to bootstrap namespace.
// Re-enable services
for (const auto& s : were_enabled) {
LOG(INFO) << "Re-enabling service '" << s->name() << "'";
s->Enable();
}
LeaveShutdown();
ActionManager::GetInstance().QueueEventTrigger("userspace-reboot-resume");
guard.Disable(); // Go on with userspace reboot.
return {};
}
static void HandleUserspaceReboot() {
@ -697,10 +762,7 @@ static void HandleUserspaceReboot() {
auto& am = ActionManager::GetInstance();
am.ClearQueue();
am.QueueEventTrigger("userspace-reboot-requested");
auto handler = [](const BuiltinArguments&) {
DoUserspaceReboot();
return Result<void>{};
};
auto handler = [](const BuiltinArguments&) { return DoUserspaceReboot(); };
am.QueueBuiltinAction(handler, "userspace-reboot");
}

View file

@ -75,6 +75,7 @@ class Service {
const std::vector<std::string>& args);
bool IsRunning() { return (flags_ & SVC_RUNNING) != 0; }
bool IsEnabled() { return (flags_ & SVC_DISABLED) == 0; }
Result<void> ExecStart();
Result<void> Start();
Result<void> StartIfNotDisabled();

View file

@ -28,28 +28,31 @@
#include <android-base/scopeguard.h>
#include <android-base/stringprintf.h>
#include <thread>
#include "init.h"
#include "service.h"
#include "service_list.h"
using android::base::StringPrintf;
using android::base::boot_clock;
using android::base::make_scope_guard;
using android::base::StringPrintf;
using android::base::Timer;
namespace android {
namespace init {
static bool ReapOneProcess() {
static pid_t ReapOneProcess() {
siginfo_t siginfo = {};
// This returns a zombie pid or informs us that there are no zombies left to be reaped.
// It does NOT reap the pid; that is done below.
if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) {
PLOG(ERROR) << "waitid failed";
return false;
return 0;
}
auto pid = siginfo.si_pid;
if (pid == 0) return false;
if (pid == 0) return 0;
// At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
// whenever the function returns from this point forward.
@ -92,7 +95,7 @@ static bool ReapOneProcess() {
LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;
}
if (!service) return true;
if (!service) return pid;
service->Reap(siginfo);
@ -100,13 +103,33 @@ static bool ReapOneProcess() {
ServiceList::GetInstance().RemoveService(*service);
}
return true;
return pid;
}
void ReapAnyOutstandingChildren() {
while (ReapOneProcess()) {
while (ReapOneProcess() != 0) {
}
}
// Repeatedly reaps exited children until every pid in |pids| has been reaped or |timeout|
// has elapsed, polling every 50ms in between. Children that are not in |pids| are reaped
// as well; they simply do not count towards completion. Logs how long the wait took and
// how many of the requested pids were not reaped.
void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout) {
    Timer t;
    std::vector<pid_t> remaining = pids;
    while (!remaining.empty() && t.duration() < timeout) {
        // Drain everything that is currently reapable, ticking off the pids we wait for.
        for (pid_t reaped = ReapOneProcess(); reaped != 0; reaped = ReapOneProcess()) {
            const auto pos = std::find(remaining.begin(), remaining.end(), reaped);
            if (pos != remaining.end()) {
                remaining.erase(pos);
            }
        }
        // Only sleep when there is still something to wait for; otherwise the loop
        // condition ends the wait immediately.
        if (!remaining.empty()) {
            std::this_thread::sleep_for(50ms);
        }
    }
    LOG(INFO) << "Waiting for " << pids.size() << " pids to be reaped took " << t << " with "
              << remaining.size() << " of them still running";
}
} // namespace init
} // namespace android

View file

@ -17,11 +17,16 @@
#ifndef _INIT_SIGCHLD_HANDLER_H_
#define _INIT_SIGCHLD_HANDLER_H_
#include <chrono>
#include <vector>
namespace android {
namespace init {
void ReapAnyOutstandingChildren();
void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout);
} // namespace init
} // namespace android

View file

@ -918,11 +918,14 @@ on property:ro.debuggable=1
on init && property:ro.debuggable=1
start console
on userspace-reboot:
on userspace-reboot
# TODO(b/135984674): reset all necessary properties here.
setprop sys.init.userspace_reboot_in_progress 1
setprop sys.boot_completed 0
setprop sys.init.updatable_crashing 0
setprop apexd.status 0
on userspace-reboot-resume:
on userspace-reboot-resume
# TODO(b/135984674): remount userdata and reset checkpointing
trigger nonencrypted
trigger post-fs-data