init: Add more diagnostics for b/223076262.
This adds three more diagnostics for stuck exec services:
1. The contents of /proc/<pid>/fd are dumped.
2. /proc/<pid>/status is dumped.
3. HandleSignalFd is called to see whether a SIGCHLD got stuck somewhere.

Bug: 223076262
Test: while (1) in linkerconfig
Ignore-AOSP-First: diagnostics
Change-Id: Ida601d86e18be9d49b143fb88b418cbc171ecac6
This commit is contained in:
parent
9bfe3e784e
commit
d7f2bfba54
3 changed files with 49 additions and 4 deletions
|
@ -33,7 +33,10 @@
|
|||
#define _REALLY_INCLUDE_SYS__SYSTEM_PROPERTIES_H_
|
||||
#include <sys/_system_properties.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
@ -580,10 +583,10 @@ static void HandleSigtermSignal(const signalfd_siginfo& siginfo) {
|
|||
|
||||
static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s;
|
||||
|
||||
static void HandleSignalFd() {
|
||||
static void HandleSignalFd(bool one_off) {
|
||||
signalfd_siginfo siginfo;
|
||||
auto started = std::chrono::steady_clock::now();
|
||||
for (;;) {
|
||||
do {
|
||||
ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
|
||||
if (bytes_read < 0 && errno == EAGAIN) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
|
@ -601,7 +604,7 @@ static void HandleSignalFd() {
|
|||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
} while (!one_off);
|
||||
|
||||
switch (siginfo.ssi_signo) {
|
||||
case SIGCHLD:
|
||||
|
@ -662,7 +665,8 @@ static void InstallSignalFdHandler(Epoll* epoll) {
|
|||
}
|
||||
|
||||
constexpr int flags = EPOLLIN | EPOLLPRI;
|
||||
if (auto result = epoll->RegisterHandler(signal_fd, HandleSignalFd, flags); !result.ok()) {
|
||||
auto handler = std::bind(HandleSignalFd, false);
|
||||
if (auto result = epoll->RegisterHandler(signal_fd, handler, flags); !result.ok()) {
|
||||
LOG(FATAL) << result.error();
|
||||
}
|
||||
}
|
||||
|
@ -791,6 +795,32 @@ static Result<void> ConnectEarlyStageSnapuserdAction(const BuiltinArguments& arg
|
|||
return {};
|
||||
}
|
||||
|
||||
static void DumpPidFds(const std::string& prefix, pid_t pid) {
|
||||
std::error_code ec;
|
||||
std::string proc_dir = "/proc/" + std::to_string(pid) + "/fd";
|
||||
for (const auto& entry : std::filesystem::directory_iterator(proc_dir)) {
|
||||
std::string target;
|
||||
if (android::base::Readlink(entry.path(), &target)) {
|
||||
LOG(ERROR) << prefix << target;
|
||||
} else {
|
||||
LOG(ERROR) << prefix << entry.path();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void DumpFile(const std::string& prefix, const std::string& file) {
|
||||
std::ifstream fp(file);
|
||||
if (!fp) {
|
||||
LOG(ERROR) << "Could not open " << file;
|
||||
return;
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (std::getline(fp, line)) {
|
||||
LOG(ERROR) << prefix << line;
|
||||
}
|
||||
}
|
||||
|
||||
int SecondStageMain(int argc, char** argv) {
|
||||
if (REBOOT_BOOTLOADER_ON_PANIC) {
|
||||
InstallRebootSignalHandlers();
|
||||
|
@ -996,11 +1026,23 @@ int SecondStageMain(int argc, char** argv) {
|
|||
(*function)();
|
||||
}
|
||||
} else if (Service::is_exec_service_running()) {
|
||||
static bool dumped_diagnostics = false;
|
||||
std::chrono::duration<double> waited =
|
||||
std::chrono::steady_clock::now() - Service::exec_service_started();
|
||||
if (waited >= kDiagnosticTimeout) {
|
||||
LOG(ERROR) << "Exec service is hung? Waited " << waited.count()
|
||||
<< " without SIGCHLD";
|
||||
if (!dumped_diagnostics) {
|
||||
DumpPidFds("exec service opened: ", Service::exec_service_pid());
|
||||
|
||||
std::string status_file =
|
||||
"/proc/" + std::to_string(Service::exec_service_pid()) + "/status";
|
||||
DumpFile("exec service: ", status_file);
|
||||
dumped_diagnostics = true;
|
||||
|
||||
LOG(INFO) << "Attempting to handle any stuck SIGCHLDs...";
|
||||
HandleSignalFd(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!IsShuttingDown()) {
|
||||
|
|
|
@ -127,6 +127,7 @@ static bool ExpandArgsAndExecv(const std::vector<std::string>& args, bool sigsto
|
|||
|
||||
// Definitions of Service's static data members.
unsigned long Service::next_start_order_ = 1;
|
||||
bool Service::is_exec_service_running_ = false;
|
||||
// Pid recorded by Service::ExecStart(); -1 until an exec service has been started.
pid_t Service::exec_service_pid_ = -1;
|
||||
// steady_clock timestamp recorded by Service::ExecStart() when it starts an exec service.
std::chrono::time_point<std::chrono::steady_clock> Service::exec_service_started_;
|
||||
|
||||
Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands,
|
||||
|
@ -389,6 +390,7 @@ Result<void> Service::ExecStart() {
|
|||
|
||||
flags_ |= SVC_EXEC;
|
||||
is_exec_service_running_ = true;
|
||||
exec_service_pid_ = pid_;
|
||||
exec_service_started_ = std::chrono::steady_clock::now();
|
||||
|
||||
LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid
|
||||
|
|
|
@ -102,6 +102,7 @@ class Service {
|
|||
size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); }
|
||||
|
||||
// Returns the static is_exec_service_running_ flag.
static bool is_exec_service_running() { return is_exec_service_running_; }
|
||||
// Returns the static exec_service_pid_ value.
static pid_t exec_service_pid() { return exec_service_pid_; }
|
||||
// Returns the static exec_service_started_ time point (a steady_clock timestamp).
static std::chrono::time_point<std::chrono::steady_clock> exec_service_started() {
|
||||
return exec_service_started_;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue