init: Add more diagnostics for signalfd hangs.

This adds two new diagnostics. First, signalfd reads are now non-blocking. If the read takes more than 10 seconds, we log an error.

Second, init now wakes up from epoll() every 10 seconds. If it waits on an "exec" command for more than 10 seconds, it logs an error.

This change will be reverted as soon as we get feedback.

Bug: 223076262
Test: device boots
Change-Id: I7ee98d159599217a641b3de2564a92c2435f57ef
This commit is contained in:
David Anderson 2022-03-15 21:55:23 -07:00
parent a323e0233d
commit 14f9c15e05
3 changed files with 37 additions and 6 deletions

View file

@ -580,12 +580,29 @@ static void HandleSigtermSignal(const signalfd_siginfo& siginfo) {
HandlePowerctlMessage("shutdown,container");
}
static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s;
static void HandleSignalFd() {
signalfd_siginfo siginfo;
ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
if (bytes_read != sizeof(siginfo)) {
PLOG(ERROR) << "Failed to read siginfo from signal_fd";
return;
auto started = std::chrono::steady_clock::now();
for (;;) {
ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
if (bytes_read < 0 && errno == EAGAIN) {
auto now = std::chrono::steady_clock::now();
std::chrono::duration<double> waited = now - started;
if (waited >= kDiagnosticTimeout) {
LOG(ERROR) << "epoll() woke us up, but we waited with no SIGCHLD!";
started = now;
}
std::this_thread::sleep_for(100ms);
continue;
}
if (bytes_read != sizeof(siginfo)) {
PLOG(ERROR) << "Failed to read siginfo from signal_fd";
return;
}
break;
}
switch (siginfo.ssi_signo) {
@ -641,7 +658,7 @@ static void InstallSignalFdHandler(Epoll* epoll) {
LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
}
signal_fd = signalfd(-1, &mask, SFD_CLOEXEC);
signal_fd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK);
if (signal_fd == -1) {
PLOG(FATAL) << "failed to create signalfd";
}
@ -1020,7 +1037,7 @@ int SecondStageMain(int argc, char** argv) {
setpriority(PRIO_PROCESS, 0, 0);
while (true) {
// By default, sleep until something happens.
auto epoll_timeout = std::optional<std::chrono::milliseconds>{};
auto epoll_timeout = std::optional<std::chrono::milliseconds>{kDiagnosticTimeout};
auto shutdown_command = shutdown_state.CheckShutdown();
if (shutdown_command) {
@ -1060,6 +1077,13 @@ int SecondStageMain(int argc, char** argv) {
for (const auto& function : *pending_functions) {
(*function)();
}
} else if (Service::is_exec_service_running()) {
std::chrono::duration<double> waited =
std::chrono::steady_clock::now() - Service::exec_service_started();
if (waited >= kDiagnosticTimeout) {
LOG(ERROR) << "Exec service is hung? Waited " << waited.count()
<< " without SIGCHLD";
}
}
if (!IsShuttingDown()) {
HandleControlMessages();

View file

@ -127,6 +127,7 @@ static bool ExpandArgsAndExecv(const std::vector<std::string>& args, bool sigsto
unsigned long Service::next_start_order_ = 1;
bool Service::is_exec_service_running_ = false;
std::chrono::time_point<std::chrono::steady_clock> Service::exec_service_started_;
Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands,
const std::vector<std::string>& args, bool from_apex)
@ -388,6 +389,7 @@ Result<void> Service::ExecStart() {
flags_ |= SVC_EXEC;
is_exec_service_running_ = true;
exec_service_started_ = std::chrono::steady_clock::now();
LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid
<< " gid " << proc_attr_.gid << "+" << proc_attr_.supp_gids.size() << " context "

View file

@ -102,6 +102,9 @@ class Service {
size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); }
static bool is_exec_service_running() { return is_exec_service_running_; }
static std::chrono::time_point<std::chrono::steady_clock> exec_service_started() {
return exec_service_started_;
}
const std::string& name() const { return name_; }
const std::set<std::string>& classnames() const { return classnames_; }
@ -148,6 +151,8 @@ class Service {
static unsigned long next_start_order_;
static bool is_exec_service_running_;
static std::chrono::time_point<std::chrono::steady_clock> exec_service_started_;
static pid_t exec_service_pid_;
std::string name_;
std::set<std::string> classnames_;