diff --git a/libcutils/include/private/android_filesystem_config.h b/libcutils/include/private/android_filesystem_config.h index 8209167e3..3be8ad0ea 100644 --- a/libcutils/include/private/android_filesystem_config.h +++ b/libcutils/include/private/android_filesystem_config.h @@ -130,6 +130,7 @@ #define AID_INCIDENTD 1067 /* incidentd daemon */ #define AID_SECURE_ELEMENT 1068 /* secure element subsystem */ #define AID_LMKD 1069 /* low memory killer daemon */ +#define AID_LLKD 1070 /* live lock daemon */ /* Changes to this file must be made in AOSP, *not* in internal branches. */ #define AID_SHELL 2000 /* adb and debug shell user */ diff --git a/llkd/Android.bp b/llkd/Android.bp new file mode 100644 index 000000000..a6edd26ce --- /dev/null +++ b/llkd/Android.bp @@ -0,0 +1,42 @@ +cc_library_headers { + name: "llkd_headers", + + export_include_dirs: ["include"], +} + +cc_library_static { + name: "libllkd", + + srcs: [ + "libllkd.cpp", + ], + + shared_libs: [ + "libbase", + "libcutils", + "liblog", + ], + + export_include_dirs: ["include"], + + cflags: ["-Werror"], +} + +cc_binary { + name: "llkd", + + srcs: [ + "llkd.cpp", + ], + shared_libs: [ + "libbase", + "libcutils", + "liblog", + ], + static_libs: [ + "libllkd", + ], + cflags: ["-Werror"], + + init_rc: ["llkd.rc"], +} diff --git a/llkd/OWNERS b/llkd/OWNERS new file mode 100644 index 000000000..b6af537ef --- /dev/null +++ b/llkd/OWNERS @@ -0,0 +1,2 @@ +salyzyn@google.com +surenb@google.com diff --git a/llkd/README.md b/llkd/README.md new file mode 100644 index 000000000..146a99895 --- /dev/null +++ b/llkd/README.md @@ -0,0 +1,116 @@ +Android Live-LocK Daemon +======================== + +Introduction +------------ + +Android Live-LocK Daemon (llkd) is used to catch kernel deadlocks and mitigate them. + +Code is structured to allow integration into another service either as part +of the main loop, or spun off as a thread should that be necessary. A default +standalone implementation is provided by the llkd component. 
+ +The 'C' interface from the libllkd component is thus: + + #include "llkd.h" + bool llkInit(const char* threadname) /* return true if enabled */ + unsigned llkCheckMilliseconds(void) /* ms to sleep for next check */ + +If a threadname is provided, a thread will be automatically spawned, otherwise +caller must call llkCheckMilliseconds in its main loop. Function will return +the period of time before the next expected call to this handler. + +Operations +---------- + +If a thread is in D or Z state with no forward progress for longer than +ro.llk.timeout_ms, or ro.llk.[D|Z].timeout_ms, kill the process or parent +process respectively. If another scan shows the same process continues to +exist, then have a confirmed live-lock condition and need to panic. Panic +the kernel in a manner to provide the greatest bugreporting details as to the +condition. Add an alarm self watchdog should llkd ever get locked up that is +double the expected time to flow through the mainloop. Sampling is every +ro.llk.check_ms. + +Default will not monitor init, or [kthreadd] and all that [kthreadd] spawns. +This reduces the effectiveness of llkd by limiting its coverage. If there is +value in covering [kthreadd] spawned threads, the requirement will be that +the drivers not remain in a persistent 'D' state, or that they have mechanisms +to recover the thread should it be killed externally (this is good driver +coding hygiene, a common request to add such to publicly reviewed kernel.org +maintained drivers). For instance use wait_event_interruptible() instead of +wait_event(). The blacklists can be adjusted accordingly if these +conditions are met to cover kernel components. + +An accompanying gTest set has been added, and will set up a persistent D or Z +process, with and without forward progress, but not in a live-lock state +because that would require a buggy kernel, or a module or kernel modification +to stimulate. 
The test will check that llkd will mitigate first by killing +the appropriate process. D state is set up by vfork() waiting for exec() in +child process. Z state is set up by fork() and an un-waited for child process. +It should be noted that both of these conditions should never happen on Android +on purpose, and llkd effectively sweeps up processes that create these +conditions. If the test can, it will reconfigure llkd to expedite the test +duration by adjusting the ro.llk.* Android properties. Tests run the D state +with some scheduling progress to ensure that ABA checking prevents false +triggers. + +Android Properties +------------------ + +Android Properties llkd responds to (_ms parameters are in milliseconds): + +#### ro.config.low_ram +default false, if true do not sysrq t (dump all threads). + +#### ro.llk.enable +default false, allow live-lock daemon to be enabled. + +#### ro.khungtask.enable +default false, allow [khungtask] daemon to be enabled. + +#### ro.llk.mlockall +default true, enable call to mlockall(). + +#### ro.khungtask.timeout +default value 12 minutes, [khungtask] maximum timelimit. + +#### ro.llk.timeout_ms +default 10 minutes, D or Z maximum timelimit, double this value and it sets +the alarm watchdog for llkd. + +#### ro.llk.D.timeout_ms +default ro.llk.timeout_ms, D maximum timelimit. + +#### ro.llk.Z.timeout_ms +default ro.llk.timeout_ms, Z maximum timelimit. + +#### ro.llk.check_ms +default 2 minutes, sampling interval of threads for D or Z. + +#### ro.llk.blacklist.process +default 0,1,2 (kernel, init and [kthreadd]) plus process names +init,[kthreadd],[khungtaskd],lmkd,lmkd.llkd,llkd,watchdogd, +[watchdogd],[watchdogd/0],...,[watchdogd/<N>]. + +#### ro.llk.blacklist.parent +default 0,2 (kernel and [kthreadd]). + +#### ro.llk.blacklist.uid +default empty, comma separated list of uid numbers or names. + +Architectural Concerns +---------------------- + +- Figure out how to communicate the kernel panic better to bootstat canonical + boot reason determination. 
This may require an alteration to bootstat, or + some logging from llkd. Would like to see boot reason to be + watchdog,livelock as a minimum requirement. Or more specifically would want + watchdog,livelock,device or watchdog,livelock,zombie be reported. + Currently reports panic,sysrq (user requested panic) or panic depending on + system support of pstore. +- Create kernel module and associated gTest to actually test panic. +- Create gTest to test out blacklist (ro.llk.blacklist.* properties should generally + not be inputs). Could require more test-only interfaces to libllkd. +- Speed up gTest using something other than the ro.llk.* properties, which should + not be inputs. diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h new file mode 100644 index 000000000..2ae28ed77 --- /dev/null +++ b/llkd/include/llkd.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _LLKD_H_ +#define _LLKD_H_ + +#ifndef LOG_TAG +#define LOG_TAG "livelock" +#endif + +#include +#include + +__BEGIN_DECLS + +bool llkInit(const char* threadname); /* threadname NULL, not spawned */ +unsigned llkCheckMilliseconds(void); + +/* clang-format off */ +#define LLK_ENABLE_PROPERTY "ro.llk.enable" +#define LLK_ENABLE_DEFAULT false +#define KHT_ENABLE_PROPERTY "ro.khungtask.enable" +#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall" +#define LLK_MLOCKALL_DEFAULT true +#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms" +#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout" +#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms" +#define LLK_Z_TIMEOUT_MS_PROPERTY "ro.llk.Z.timeout_ms" +#define LLK_CHECK_MS_PROPERTY "ro.llk.check_ms" +/* LLK_CHECK_MS_DEFAULT = actual timeout_ms / LLK_CHECKS_PER_TIMEOUT_DEFAULT */ +#define LLK_CHECKS_PER_TIMEOUT_DEFAULT 5 +#define LLK_BLACKLIST_PROCESS_PROPERTY "ro.llk.blacklist.process" +#define LLK_BLACKLIST_PROCESS_DEFAULT \ + "0,1,2,init,[kthreadd],[khungtaskd],lmkd,lmkd.llkd,llkd,watchdogd,[watchdogd],[watchdogd/0]" +#define LLK_BLACKLIST_PARENT_PROPERTY "ro.llk.blacklist.parent" +#define LLK_BLACKLIST_PARENT_DEFAULT "0,2,[kthreadd]" +#define LLK_BLACKLIST_UID_PROPERTY "ro.llk.blacklist.uid" +#define LLK_BLACKLIST_UID_DEFAULT "" +/* clang-format on */ + +__END_DECLS + +#ifdef __cplusplus +extern "C++" { /* In case this included wrapped with __BEGIN_DECLS */ + +#include + +__BEGIN_DECLS +/* C++ code allowed to not specify threadname argument for this C linkage */ +bool llkInit(const char* threadname = nullptr); +__END_DECLS +std::chrono::milliseconds llkCheck(bool checkRunning = false); + +/* clang-format off */ +#define LLK_TIMEOUT_MS_DEFAULT std::chrono::duration_cast(std::chrono::minutes(10)) +#define LLK_TIMEOUT_MS_MINIMUM std::chrono::duration_cast(std::chrono::seconds(10)) +#define LLK_CHECK_MS_MINIMUM std::chrono::duration_cast(std::chrono::seconds(1)) +/* clang-format on */ + +} /* extern "C++" */ +#endif 
/* __cplusplus */ + +#endif /* _LLKD_H_ */ diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp new file mode 100644 index 000000000..b25eb06c0 --- /dev/null +++ b/llkd/libllkd.cpp @@ -0,0 +1,1159 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "llkd.h" + +#include +#include // opendir() and readdir() +#include +#include +#include +#include // getpwuid() +#include +#include +#include // ___STRING, __predict_true() and _predict_false() +#include // mlockall() +#include +#include // lstat() +#include // __NR_getdents64 +#include // get_nprocs_conf() +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +#define TASK_COMM_LEN 16 // internal kernel, not uapi, from .../linux/include/linux/sched.h + +using namespace std::chrono_literals; +using namespace std::chrono; + +namespace { + +constexpr pid_t kernelPid = 0; +constexpr pid_t initPid = 1; +constexpr pid_t kthreaddPid = 2; + +constexpr char procdir[] = "/proc/"; + +// Configuration +milliseconds llkUpdate; // last check ms signature +milliseconds llkCycle; // ms to next thread check +bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled +bool llkRunning = false; // thread is running +bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked +milliseconds llkTimeoutMs = 
LLK_TIMEOUT_MS_DEFAULT; // default timeout +enum { llkStateD, llkStateZ, llkNumStates }; // state indexes +milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state +milliseconds llkCheckMs; // checking interval to inspect any + // persistent live-locked states +bool llkLowRam; // ro.config.low_ram +bool khtEnable = LLK_ENABLE_DEFAULT; // [khungtaskd] panic +// [khungtaskd] should have a timeout beyond the granularity of llkTimeoutMs. +// Provides a wide angle of margin b/c khtTimeout is also its granularity. +seconds khtTimeout = duration_cast(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) / + LLK_CHECKS_PER_TIMEOUT_DEFAULT); + +// Blacklist variables, initialized with comma separated lists of high false +// positive and/or dangerous references, e.g. without self restart, for pid, +// ppid, name and uid: + +// list of pids, or tids or names to skip. kernel pid (0), init pid (1), +// [kthreadd] pid (2), ourselves, "init", "[kthreadd]", "lmkd", "llkd" or +// combinations of watchdogd in kernel and user space. +std::unordered_set llkBlacklistProcess; +// list of parent pids, comm or cmdline names to skip. 
default:
+// kernel pid (0), [kthreadd] (2), or ourselves, enforced and implied
+std::unordered_set<std::string> llkBlacklistParent;
+// list of uids, and uid names, to skip, default nothing
+std::unordered_set<std::string> llkBlacklistUid;
+
+// Directory reader built directly on the getdents64 syscall.  Entries are
+// read into a static buffer row selected by `level`, so the top-level scan
+// and the nested task scan each get their own row.
+class dir {
+  public:
+    // Selects the static buffer row used by fill()/read().
+    enum level { proc, task, numLevels };
+
+  private:
+    int fd;                  // open directory descriptor, or -1
+    size_t available_bytes;  // unread bytes left from the last getdents64
+    dirent* next;            // next unread entry inside the buffer row
+    // each directory level picked to be just north of 4K in size
+    static constexpr size_t buffEntries = 15;
+    static dirent buff[numLevels][buffEntries];
+
+    // Ensure there is at least one unread entry for `index`.  Returns false
+    // on an invalid level, a closed descriptor, end of directory or error.
+    bool fill(enum level index) {
+        if (index >= numLevels) return false;
+        if (available_bytes != 0) return true;
+        if (__predict_false(fd < 0)) return false;
+        // getdents64 has no libc wrapper
+        auto rc = TEMP_FAILURE_RETRY(syscall(__NR_getdents64, fd, buff[index], sizeof(buff[0]), 0));
+        if (rc <= 0) return false;
+        available_bytes = rc;
+        next = buff[index];
+        return true;
+    }
+
+  public:
+    dir() : fd(-1), available_bytes(0), next(nullptr) {}
+
+    explicit dir(const char* directory)
+        : fd(__predict_true(directory != nullptr)
+                 ? ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY)
+                 : -1),
+          available_bytes(0),
+          next(nullptr) {}
+
+    explicit dir(const std::string&& directory)
+        : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
+          available_bytes(0),
+          next(nullptr) {}
+
+    explicit dir(const std::string& directory)
+        : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
+          available_bytes(0),
+          next(nullptr) {}
+
+    // Don't need any copy or move constructors.
+    explicit dir(const dir& c) = delete;
+    explicit dir(dir& c) = delete;
+    explicit dir(dir&& c) = delete;
+
+    ~dir() {
+        if (fd >= 0) {
+            ::close(fd);
+        }
+    }
+
+    // True when a directory is currently open.
+    operator bool() const { return fd >= 0; }
+
+    // Close the directory (if open) and forget any buffered entries.
+    void reset(void) {
+        if (fd >= 0) {
+            ::close(fd);
+            fd = -1;
+            available_bytes = 0;
+            next = nullptr;
+        }
+    }
+
+    // Close the current directory and open `directory` in its place.
+    dir& reset(const char* directory) {
+        reset();
+        // available_bytes will _always_ be zero here as its value is
+        // intimately tied to fd < 0 or not.
+        fd = ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY);
+        return *this;
+    }
+
+    // Seek back to the first entry and drop any buffered entries.
+    void rewind(void) {
+        if (fd >= 0) {
+            ::lseek(fd, off_t(0), SEEK_SET);
+            available_bytes = 0;
+            next = nullptr;
+        }
+    }
+
+    // Return the next entry, or `def` when nothing more can be read.
+    dirent* read(enum level index = proc, dirent* def = nullptr) {
+        if (!fill(index)) return def;
+        auto ret = next;
+        available_bytes -= next->d_reclen;
+        // Records are variable length; step by d_reclen bytes, not by
+        // sizeof(dirent).
+        next = reinterpret_cast<dirent*>(reinterpret_cast<char*>(next) + next->d_reclen);
+        return ret;
+    }
+} llkTopDirectory;
+
+dirent dir::buff[dir::numLevels][dir::buffEntries];
+
+// helper functions
+
+// True when /proc/<tid>/exe does not exist (readlink fails with ENOENT).
+bool llkIsMissingExeLink(pid_t tid) {
+    char c;
+    // CAP_SYS_PTRACE is required to prevent ret == -1, but ENOENT is signal
+    auto ret = ::readlink((procdir + std::to_string(tid) + "/exe").c_str(), &c, sizeof(c));
+    return (ret == -1) && (errno == ENOENT);
+}
+
+// Common routine where caller accepts empty content as error/passthrough.
+// Reduces the churn of reporting read errors in the callers.
+std::string ReadFile(std::string&& path) {
+    std::string content;
+    if (!android::base::ReadFileToString(path, &content)) {
+        PLOG(DEBUG) << "Read " << path << " failed";
+        content = "";
+    }
+    return content;
+}
+
+// Read /proc/<tid><node> and return the text up to (not including) the first
+// blank, tab, CR, LF or NUL.  Empty when the file could not be read.
+std::string llkProcGetName(pid_t tid, const char* node = "/cmdline") {
+    std::string content = ReadFile(procdir + std::to_string(tid) + node);
+    static constexpr char needles[] = " \t\r\n";  // including trailing nul
+    auto pos = content.find_first_of(needles, 0, sizeof(needles));
+    if (pos != std::string::npos) {
+        content.erase(pos);
+    }
+    return content;
+}
+
+// Return the first number on the "Uid:" line of /proc/<tid>/status, or
+// uid_t(-1) when the file is unreadable or the line is malformed.
+uid_t llkProcGetUid(pid_t tid) {
+    // Get the process' uid.  The following read from /status is admittedly
+    // racy, prone to corruption due to shape-changes.  The consequences are
+    // not catastrophic as we sample a few times before taking action.
+    //
+    // If /loginuid worked on reliably, or on Android (all tasks report -1)...
+    // Android lmkd causes /cgroup to contain memory:/.../uid_<uid>/pid_<pid>
+    // which is tighter, but also not reliable.
+    std::string content = ReadFile(procdir + std::to_string(tid) + "/status");
+    static constexpr char Uid[] = "\nUid:";
+    auto pos = content.find(Uid);
+    if (pos == std::string::npos) {
+        return -1;
+    }
+    pos += ::strlen(Uid);
+    while ((pos < content.size()) && ::isblank(content[pos])) {
+        ++pos;
+    }
+    content.erase(0, pos);
+    for (pos = 0; (pos < content.size()) && ::isdigit(content[pos]); ++pos) {
+        ;
+    }
+    // Content of form 'Uid: 0 0 0 0', newline is error
+    if ((pos >= content.size()) || !::isblank(content[pos])) {
+        return -1;
+    }
+    content.erase(pos);
+    uid_t ret;
+    if (!android::base::ParseInt(content, &ret, uid_t(0))) {
+        return -1;
+    }
+    return ret;
+}
+
+// Per-thread record kept in `tids` between scans.
+struct proc {
+    pid_t tid;                     // monitored thread id (in Z or D state).
+    nanoseconds schedUpdate;       // /proc/<tid>/sched "se.avg.lastUpdateTime",
+    uint64_t nrSwitches;           // /proc/<tid>/sched "nr_switches" for
+                                   // refined ABA problem detection, determine
+                                   // forward scheduling progress.
+    milliseconds update;           // llkUpdate millisecond signature of last.
+    milliseconds count;            // duration in state.
+    pid_t pid;                     // /proc/<pid> before iterating through
+                                   // /proc/<pid>/task/<tid> for threads.
+    pid_t ppid;                    // /proc/<tid>/stat field 4 parent pid.
+    uid_t uid;                     // /proc/<tid>/status Uid: field.
+    unsigned time;                 // sum of /proc/<tid>/stat field 14 utime &
+                                   // 15 stime for coarse ABA problem detection.
+    std::string cmdline;           // cached /cmdline content
+    char state;                    // /proc/<tid>/stat field 3: Z or D
+                                   // (others we do not monitor: S, R, T or ?)
+    char comm[TASK_COMM_LEN + 3];  // space for adding '[' and ']'
+    bool exeMissingValid;          // exeMissing has been cached
+    bool cmdlineValid;             // cmdline has been cached
+    bool updated;                  // cleared before monitoring pass.
+    bool killed;                   // sent a kill to this thread, next panic...
+
+    // Store the name starting at comm[1]; comm[0] is reserved for '['.
+    void setComm(const char* _comm) { strncpy(comm + 1, _comm, sizeof(comm) - 2); }
+
+    proc(pid_t tid, pid_t pid, pid_t ppid, const char* _comm, int time, char state)
+        : tid(tid),
+          schedUpdate(0),
+          nrSwitches(0),
+          update(llkUpdate),
+          count(0),
+          pid(pid),
+          ppid(ppid),
+          uid(-1),
+          time(time),
+          state(state),
+          exeMissingValid(false),
+          cmdlineValid(false),
+          updated(true),
+          killed(false) {
+        memset(comm, '\0', sizeof(comm));
+        setComm(_comm);
+    }
+
+    // Return the cached comm, reading /proc/<tid>/comm if none is stored.
+    // When the exe link is missing the name is returned as "[comm]",
+    // otherwise as plain "comm".
+    const char* getComm(void) {
+        if (comm[1] == '\0') {  // comm Valid?
+            strncpy(comm + 1, llkProcGetName(tid, "/comm").c_str(), sizeof(comm) - 2);
+        }
+        if (!exeMissingValid) {
+            if (llkIsMissingExeLink(tid)) {
+                comm[0] = '[';
+            }
+            exeMissingValid = true;
+        }
+        size_t len = strlen(comm + 1);
+        if (__predict_true(len < (sizeof(comm) - 1))) {
+            if (comm[0] == '[') {
+                // Append the closing bracket once, if there is room.
+                if ((comm[len] != ']') && __predict_true(len < (sizeof(comm) - 2))) {
+                    comm[++len] = ']';
+                    comm[++len] = '\0';
+                }
+            } else {
+                // Not bracketed: strip a stale trailing ']'.
+                if (comm[len] == ']') {
+                    comm[len] = '\0';
+                }
+            }
+        }
+        // Skip comm[0] unless it carries the opening bracket.
+        return &comm[comm[0] != '['];
+    }
+
+    // Return the cached first word of /proc/<tid>/cmdline, reading it once.
+    const char* getCmdline(void) {
+        if (!cmdlineValid) {
+            cmdline = llkProcGetName(tid);
+            cmdlineValid = true;
+        }
+        return cmdline.c_str();
+    }
+
+    // Return the uid, re-reading it while it is still unknown or root.
+    uid_t getUid(void) {
+        // Where uid_t is unsigned (as on Linux) a plain `uid <= 0` only
+        // matches root and never the uid(-1) "unknown" sentinel set by the
+        // constructor and reset(), so test for the sentinel explicitly.
+        // Churn on root user, because most likely to setuid()
+        if ((uid == static_cast<uid_t>(-1)) || (uid == 0)) {
+            uid = llkProcGetUid(tid);
+        }
+        return uid;
+    }
+
+    void reset(void) {  // reset cache, if we detected pid rollover
+        uid = -1;
+        state = '?';
+        cmdline = "";
+        // NOTE(review): only the bracket marker is cleared; the name stored
+        // from comm[1] onward is kept - confirm callers follow with setComm().
+        comm[0] = '\0';
+        exeMissingValid = false;
+        cmdlineValid = false;
+    }
+};
+
+// Monitored threads, keyed by tid.
+std::unordered_map<pid_t, proc> tids;
+
+// Check range and setup defaults, in order of propagation:
+//     llkTimeoutMs
+//     llkCheckMs
+//     ...
+// KISS to keep it all self-contained, and called multiple times as parameters
+// are interpreted so that defaults, llkCheckMs and llkCycle make sense.
+void llkValidate() {
+    // Overall timeout: default when unset, never below the minimum.
+    if (llkTimeoutMs == 0ms) {
+        llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;
+    }
+    llkTimeoutMs = std::max(llkTimeoutMs, LLK_TIMEOUT_MS_MINIMUM);
+    // Check interval: default to a fraction of the timeout, never above it.
+    if (llkCheckMs == 0ms) {
+        llkCheckMs = llkTimeoutMs / LLK_CHECKS_PER_TIMEOUT_DEFAULT;
+    }
+    llkCheckMs = std::min(llkCheckMs, llkTimeoutMs);
+
+    // Per-state timeouts: default to the overall timeout, clamp into
+    // [LLK_TIMEOUT_MS_MINIMUM, llkTimeoutMs], and keep the check interval no
+    // longer than the shortest of them.
+    for (size_t state = 0; state < ARRAY_SIZE(llkStateTimeoutMs); ++state) {
+        if (llkStateTimeoutMs[state] == 0ms) {
+            llkStateTimeoutMs[state] = llkTimeoutMs;
+        }
+        llkStateTimeoutMs[state] =
+            std::min(std::max(llkStateTimeoutMs[state], LLK_TIMEOUT_MS_MINIMUM), llkTimeoutMs);
+        llkCheckMs = std::min(llkCheckMs, llkStateTimeoutMs[state]);
+    }
+
+    llkCheckMs = std::max(llkCheckMs, LLK_CHECK_MS_MINIMUM);
+    // Cycle: default to, and never longer than, the check interval.
+    if (llkCycle == 0ms) {
+        llkCycle = llkCheckMs;
+    }
+    llkCycle = std::min(llkCycle, llkCheckMs);
+}
+
+// Return (*to - *from) in milliseconds; the nanosecond part truncates.
+milliseconds llkGetTimespecDiffMs(timespec* from, timespec* to) {
+    return duration_cast<milliseconds>(seconds(to->tv_sec - from->tv_sec)) +
+           duration_cast<milliseconds>(nanoseconds(to->tv_nsec - from->tv_nsec));
+}
+
+// Pick a printable name for tid: the supplied cmdline if non-empty, else the
+// supplied comm if non-empty, else whatever /proc/<tid> yields right now.
+std::string llkProcGetName(pid_t tid, const char* comm, const char* cmdline) {
+    if ((cmdline != nullptr) && (*cmdline != '\0')) {
+        return cmdline;
+    }
+    if ((comm != nullptr) && (*comm != '\0')) {
+        return comm;
+    }
+
+    // UNLIKELY! Here because killed before we kill it?
+    // Assume change is afoot, do not call llkTidAlloc
+
+    // cmdline ?
+    std::string content = llkProcGetName(tid);
+    if (content.size() != 0) {
+        return content;
+    }
+    // Comm instead?
+    content = llkProcGetName(tid, "/comm");
+    if (llkIsMissingExeLink(tid) && (content.size() != 0)) {
+        return '[' + content + ']';
+    }
+    return content;
+}
+
+// Log and send SIGKILL to pid on behalf of thread tid found in `state`.
+// The optional names are only used for the log line.  Returns the result
+// of kill().
+int llkKillOneProcess(pid_t pid, char state, pid_t tid, const char* tcomm = nullptr,
+                      const char* tcmdline = nullptr, const char* pcomm = nullptr,
+                      const char* pcmdline = nullptr) {
+    std::string forTid;
+    if (tid != pid) {
+        forTid = " for '" + llkProcGetName(tid, tcomm, tcmdline) + "' (" + std::to_string(tid) + ")";
+    }
+    LOG(INFO) << "Killing '" << llkProcGetName(pid, pcomm, pcmdline) << "' (" << pid
+              << ") to check forward scheduling progress in " << state << " state" << forTid;
+    // CAP_KILL required
+    errno = 0;
+    auto r = ::kill(pid, SIGKILL);
+    if (r) {
+        PLOG(ERROR) << "kill(" << pid << ")=" << r << ' ';
+    }
+
+    return r;
+}
+
+// Kill one process
+int llkKillOneProcess(pid_t pid, proc* tprocp) {
+    return llkKillOneProcess(pid, tprocp->state, tprocp->tid, tprocp->getComm(),
+                             tprocp->getCmdline());
+}
+
+// Kill one process specified by kprocp
+// Returns -2 without killing anything when kprocp is null.
+int llkKillOneProcess(proc* kprocp, proc* tprocp) {
+    if (kprocp == nullptr) {
+        return -2;
+    }
+
+    return llkKillOneProcess(kprocp->tid, tprocp->state, tprocp->tid, tprocp->getComm(),
+                             tprocp->getCmdline(), kprocp->getComm(), kprocp->getCmdline());
+}
+
+// Acquire file descriptor from environment, or open and cache it.
+// NB: cache is unnecessary in our current context, pedantically
+//     required to prevent leakage of file descriptors in the future.
+// Returns a negative value when the file can not be opened.
+int llkFileToWriteFd(const std::string& file) {
+    static std::unordered_map<std::string, int> cache;
+    auto search = cache.find(file);
+    if (search != cache.end()) return search->second;
+    // A descriptor handed over by the environment is returned as is and is
+    // not added to the cache.
+    auto fd = android_get_control_file(file.c_str());
+    if (fd >= 0) return fd;
+    fd = TEMP_FAILURE_RETRY(::open(file.c_str(), O_WRONLY | O_CLOEXEC));
+    if (fd >= 0) cache.emplace(file, fd);
+    return fd;
+}
+
+// Wrap android::base::WriteStringToFile to use android_get_control_file.
+bool llkWriteStringToFile(const std::string& string, const std::string& file) {
+    auto fd = llkFileToWriteFd(file);
+    if (fd < 0) return false;
+    return android::base::WriteStringToFd(string, fd);
+}
+
+// Write `string` to `file`, then read the file back.  When the read-back
+// works the result is whether the trimmed content equals `string`;
+// otherwise the result of the write is returned.
+bool llkWriteStringToFileConfirm(const std::string& string, const std::string& file) {
+    auto fd = llkFileToWriteFd(file);
+    auto ret = (fd < 0) ? false : android::base::WriteStringToFd(string, fd);
+    std::string content;
+    if (!android::base::ReadFileToString(file, &content)) return ret;
+    return android::base::Trim(content) == string;
+}
+
+// Panic the kernel through /proc/sysrq-trigger.  When `dump` is set, first
+// request the held-locks dump and, unless llkLowRam, the all-tasks dump.
+// If the trigger can not be opened, or the panic request returns, kill init
+// instead and exit.  Never returns.
+void llkPanicKernel(bool dump, pid_t tid) __noreturn;
+void llkPanicKernel(bool dump, pid_t tid) {
+    auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
+    if (sysrqTriggerFd < 0) {
+        // DYB
+        llkKillOneProcess(initPid, 'R', tid);
+        // The answer to life, the universe and everything
+        ::exit(42);
+        // NOTREACHED
+    }
+    ::sync();
+    if (dump) {
+        // Show all locks that are held
+        android::base::WriteStringToFd("d", sysrqTriggerFd);
+        // This can trigger hardware watchdog, that is somewhat _ok_.
+        // But useless if pstore configured for <256KB, low ram devices ...
+        if (!llkLowRam) {
+            android::base::WriteStringToFd("t", sysrqTriggerFd);
+        }
+        ::usleep(200000);  // let everything settle
+    }
+    android::base::WriteStringToFd("c", sysrqTriggerFd);
+    // NOTREACHED
+    // DYB
+    llkKillOneProcess(initPid, 'R', tid);
+    // I sat at my desk, stared into the garden and thought '42 will do'.
+    // I typed it out. End of story
+    ::exit(42);
+    // NOTREACHED
+}
+
+// Signal handler: panic without the diagnostic dumps, blaming ourselves.
+void llkAlarmHandler(int) {
+    llkPanicKernel(false, ::getpid());
+}
+
+// Read an unsigned property as a duration, with `def` as the fallback and
+// the duration's own maximum as the upper bound.
+// NOTE(review): the cast target type was not legible in this copy of the
+// patch; uint64_t is assumed here and in the seconds overload - confirm.
+milliseconds GetUintProperty(const std::string& key, milliseconds def) {
+    return milliseconds(android::base::GetUintProperty(key, static_cast<uint64_t>(def.count()),
+                                                       static_cast<uint64_t>(def.max().count())));
+}
+
+seconds GetUintProperty(const std::string& key, seconds def) {
+    return seconds(android::base::GetUintProperty(key, static_cast<uint64_t>(def.count()),
+                                                  static_cast<uint64_t>(def.max().count())));
+}
+
+// Return the record for tid, or nullptr when it is not being tracked.
+proc* llkTidLookup(pid_t tid) {
+    auto search = tids.find(tid);
+    if (search == tids.end()) {
+        return nullptr;
+    }
+    return &search->second;
+}
+
+void llkTidRemove(pid_t tid) {
+    tids.erase(tid);
+}
+
+// Start tracking tid and return its record.  If tid is already tracked the
+// existing record is returned unchanged (emplace does not overwrite).
+proc* llkTidAlloc(pid_t tid, pid_t pid, pid_t ppid, const char* comm, int time, char state) {
+    auto it = tids.emplace(std::make_pair(tid, proc(tid, pid, ppid, comm, time, state)));
+    return &it.first->second;
+}
+
+// Format as "<seconds>.<3 digit milliseconds>s".
+std::string llkFormat(milliseconds ms) {
+    auto sec = duration_cast<seconds>(ms);
+    std::ostringstream s;
+    s << sec.count() << '.';
+    // Zero pad the fraction to three digits, then restore the stream state.
+    auto f = s.fill('0');
+    auto w = s.width(3);
+    s << std::right << (ms - sec).count();
+    s.width(w);
+    s.fill(f);
+    s << 's';
+    return s.str();
+}
+
+std::string llkFormat(seconds s) {
+    return std::to_string(s.count()) + 's';
+}
+
+std::string llkFormat(bool flag) {
+    return flag ? "true" : "false";
+}
+
+// Join the entries with commas, in the set's (unspecified) iteration order.
+std::string llkFormat(const std::unordered_set<std::string>& blacklist) {
+    std::string ret;
+    for (const auto& entry : blacklist) {
+        if (ret.size()) {
+            ret += ",";
+        }
+        ret += entry;
+    }
+    return ret;
+}
+
+// We only officially support comma separators, but wetware being what they
+// are will take some liberty and I do not believe they should be punished.
+std::unordered_set<std::string> llkSplit(const std::string& s,
+                                         const std::string& delimiters = ", \t:") {
+    std::unordered_set<std::string> result;
+
+    // Every delimiter ends a token, so adjacent delimiters (and an empty
+    // input) contribute an empty string to the set.
+    size_t base = 0;
+    size_t found;
+    while (true) {
+        found = s.find_first_of(delimiters, base);
+        result.emplace(s.substr(base, found - base));
+        if (found == s.npos) break;
+        base = found + 1;
+    }
+    return result;
+}
+
+// True when `name` is listed in `blacklist`; an empty name never matches.
+bool llkSkipName(const std::string& name,
+                 const std::unordered_set<std::string>& blacklist = llkBlacklistProcess) {
+    if ((name.size() == 0) || (blacklist.size() == 0)) {
+        return false;
+    }
+
+    return blacklist.find(name) != blacklist.end();
+}
+
+bool llkSkipPid(pid_t pid) {
+    return llkSkipName(std::to_string(pid), llkBlacklistProcess);
+}
+
+bool llkSkipPpid(pid_t ppid) {
+    return llkSkipName(std::to_string(ppid), llkBlacklistParent);
+}
+
+// True when uid is listed in llkBlacklistUid, by number or by user name.
+bool llkSkipUid(uid_t uid) {
+    // Match by number?
+    if (llkSkipName(std::to_string(uid), llkBlacklistUid)) {
+        return true;
+    }
+
+    // Match by name?
+    auto pwd = ::getpwuid(uid);
+    return (pwd != nullptr) && __predict_true(pwd->pw_name != nullptr) &&
+           __predict_true(pwd->pw_name[0] != '\0') && llkSkipName(pwd->pw_name, llkBlacklistUid);
+}
+
+// If dp names a numeric directory, store "/proc/<name>" in *piddir and
+// return true.
+bool getValidTidDir(dirent* dp, std::string* piddir) {
+    if (!::isdigit(dp->d_name[0])) {
+        return false;
+    }
+
+    // Corner case can not happen in reality b/c of above ::isdigit check
+    if (__predict_false(dp->d_type != DT_DIR)) {
+        if (__predict_false(dp->d_type == DT_UNKNOWN)) {  // can't b/c procfs
+            struct stat st;
+            *piddir = procdir;
+            *piddir += dp->d_name;
+            // S_ISDIR, not a bit test: the S_IFDIR bit is also set in the
+            // block device and socket file type values.
+            return (lstat(piddir->c_str(), &st) == 0) && S_ISDIR(st.st_mode);
+        }
+        return false;
+    }
+
+    *piddir = procdir;
+    *piddir += dp->d_name;
+    return true;
+}
+
+bool llkIsMonitorState(char state) {
+    return (state == 'Z') || (state == 'D');
+}
+
+// Find `key` in schedString and parse the unsigned number that follows the
+// next ':'.
+// returns -1 if not found
+long long getSchedValue(const std::string& schedString, const char* key) {
+    auto pos = schedString.find(key);
+    if (pos == std::string::npos) {
+        return -1;
+    }
+    pos = schedString.find(':', pos);
+    if (__predict_false(pos == std::string::npos)) {
+        return -1;
+    }
+    while ((++pos < schedString.size()) && ::isblank(schedString[pos])) {
+        ;
+    }
+    // Hand ParseInt only this field's digits rather than the remainder of
+    // the file; libbase ParseInt is expected to reject trailing text such
+    // as the newline and the fields that follow (TODO confirm).
+    auto end = pos;
+    while ((end < schedString.size()) && ::isdigit(schedString[end])) {
+        ++end;
+    }
+    long long ret;
+    if (!android::base::ParseInt(schedString.substr(pos, end - pos), &ret,
+                                 static_cast<long long>(0))) {
+        return -1;
+    }
+    return ret;
+}
+
+// Primary ABA mitigation watching last time schedule activity happened
+// Any change in the sampled values restarts procp's time-in-state count and
+// clears its killed flag.
+void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
+    // Audit finds /proc/<tid>/sched is just over 1K, and
+    // is rarely larger than 2K, even less on Android.
+    // For example, the "se.avg.lastUpdateTime" field we are
+    // interested in typically within the primary set in
+    // the first 1K.
+    //
+    // Proc entries can not be read >1K atomically via libbase,
+    // but if there are problems we assume at least a few
+    // samples of reads occur before we take any real action.
+    std::string schedString = ReadFile(piddir + "/sched");
+    if (schedString.size() == 0) {
+        // /schedstat is not as standardized, but in 3.1+
+        // Android devices, the third field is nr_switches
+        // from /sched:
+        schedString = ReadFile(piddir + "/schedstat");
+        if (schedString.size() == 0) {
+            return;
+        }
+        auto val = static_cast<unsigned long long>(-1);
+        if (((::sscanf(schedString.c_str(), "%*d %*d %llu", &val)) == 1) &&
+            (val != static_cast<unsigned long long>(-1)) && (val != 0) &&
+            (val != procp->nrSwitches)) {
+            procp->nrSwitches = val;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+        return;
+    }
+
+    auto val = getSchedValue(schedString, "\nse.avg.lastUpdateTime");
+    if (val == -1) {
+        // Alternate spelling of the same field; the sched entity member is
+        // "avg" (the key previously read "se.svg" and could never match).
+        val = getSchedValue(schedString, "\nse.avg.last_update_time");
+    }
+    if (val != -1) {
+        auto schedUpdate = nanoseconds(val);
+        if (schedUpdate != procp->schedUpdate) {
+            procp->schedUpdate = schedUpdate;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+    }
+
+    val = getSchedValue(schedString, "\nnr_switches");
+    if (val != -1) {
+        if (static_cast<uint64_t>(val) != procp->nrSwitches) {
+            procp->nrSwitches = val;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+    }
+}
+
+void llkLogConfig(void) {
+ LOG(INFO) << "ro.config.low_ram=" << llkFormat(llkLowRam) << "\n" + << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n" + << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n" + << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n" + << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n" + << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n" + << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n" + << LLK_Z_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateZ]) << "\n" + << LLK_CHECK_MS_PROPERTY "=" << llkFormat(llkCheckMs) << "\n" + << LLK_BLACKLIST_PROCESS_PROPERTY "=" << llkFormat(llkBlacklistProcess) << "\n" + << LLK_BLACKLIST_PARENT_PROPERTY "=" << llkFormat(llkBlacklistParent) << "\n" + << LLK_BLACKLIST_UID_PROPERTY "=" << llkFormat(llkBlacklistUid); +} + +void* llkThread(void* obj) { + LOG(INFO) << "started"; + + std::string name = std::to_string(::gettid()); + if (!llkSkipName(name)) { + llkBlacklistProcess.emplace(name); + } + name = static_cast(obj); + prctl(PR_SET_NAME, name.c_str()); + if (__predict_false(!llkSkipName(name))) { + llkBlacklistProcess.insert(name); + } + // No longer modifying llkBlacklistProcess. + llkRunning = true; + llkLogConfig(); + while (llkRunning) { + ::usleep(duration_cast(llkCheck(true)).count()); + } + // NOTREACHED + LOG(INFO) << "exiting"; + return nullptr; +} + +} // namespace + +milliseconds llkCheck(bool checkRunning) { + if (!llkEnable || (checkRunning != llkRunning)) { + return milliseconds::max(); + } + + // Reset internal watchdog, which is a healthy engineering margin of + // double the maximum wait or cycle time for the mainloop that calls us. + // + // This alarm is effectively the live lock detection of llkd, as + // we understandably can not monitor ourselves otherwise. 
+ ::alarm(duration_cast(llkTimeoutMs * 2).count()); + + // kernel jiffy precision fastest acquisition + static timespec last; + timespec now; + ::clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + auto ms = llkGetTimespecDiffMs(&last, &now); + if (ms < llkCycle) { + return llkCycle - ms; + } + last = now; + + LOG(VERBOSE) << "opendir(\"" << procdir << "\")"; + if (__predict_false(!llkTopDirectory)) { + // gid containing AID_READPROC required + llkTopDirectory.reset(procdir); + if (__predict_false(!llkTopDirectory)) { + // Most likely reason we could be here is a resource limit. + // Keep our processing down to a minimum, but not so low that + // we do not recover in a timely manner should the issue be + // transitory. + LOG(DEBUG) << "opendir(\"" << procdir << "\") failed"; + return llkTimeoutMs; + } + } + + for (auto& it : tids) { + it.second.updated = false; + } + + auto prevUpdate = llkUpdate; + llkUpdate += ms; + ms -= llkCycle; + auto myPid = ::getpid(); + auto myTid = ::gettid(); + for (auto dp = llkTopDirectory.read(); dp != nullptr; dp = llkTopDirectory.read()) { + std::string piddir; + + if (!getValidTidDir(dp, &piddir)) { + continue; + } + + // Get the process tasks + std::string taskdir = piddir + "/task/"; + int pid = -1; + LOG(VERBOSE) << "+opendir(\"" << taskdir << "\")"; + dir taskDirectory(taskdir); + if (__predict_false(!taskDirectory)) { + LOG(DEBUG) << "+opendir(\"" << taskdir << "\") failed"; + } + for (auto tp = taskDirectory.read(dir::task, dp); tp != nullptr; + tp = taskDirectory.read(dir::task)) { + if (!getValidTidDir(tp, &piddir)) { + continue; + } + + // Get the process stat + std::string stat = ReadFile(piddir + "/stat"); + if (stat.size() == 0) { + continue; + } + unsigned tid = -1; + char pdir[TASK_COMM_LEN + 1]; + char state = '?'; + unsigned ppid = -1; + unsigned utime = -1; + unsigned stime = -1; + int dummy; + pdir[0] = '\0'; + // tid should not change value + auto match = ::sscanf( + stat.c_str(), + "%u (%" ___STRING( + TASK_COMM_LEN) 
"[^)]) %c %u %*d %*d %*d %*d %*d %*d %*d %*d %*d %u %u %d", + &tid, pdir, &state, &ppid, &utime, &stime, &dummy); + if (pid == -1) { + pid = tid; + } + LOG(VERBOSE) << "match " << match << ' ' << tid << " (" << pdir << ") " << state << ' ' + << ppid << " ... " << utime << ' ' << stime << ' ' << dummy; + if (match != 7) { + continue; + } + + auto procp = llkTidLookup(tid); + if (procp == nullptr) { + procp = llkTidAlloc(tid, pid, ppid, pdir, utime + stime, state); + } else { + // comm can change ... + procp->setComm(pdir); + procp->updated = true; + // pid/ppid/tid wrap? + if (((procp->update != prevUpdate) && (procp->update != llkUpdate)) || + (procp->ppid != ppid) || (procp->pid != pid)) { + procp->reset(); + } else if (procp->time != (utime + stime)) { // secondary ABA. + // watching utime+stime granularity jiffy + procp->state = '?'; + } + procp->update = llkUpdate; + procp->pid = pid; + procp->ppid = ppid; + procp->time = utime + stime; + if (procp->state != state) { + procp->count = 0ms; + procp->killed = false; + procp->state = state; + } else { + procp->count += llkCycle; + } + } + + // Filter checks in intuitive order of CPU cost to evaluate + // If tid unique continue, if ppid or pid unique break + + if (pid == myPid) { + break; + } + if (!llkIsMonitorState(state)) { + continue; + } + if ((tid == myTid) || llkSkipPid(tid)) { + continue; + } + if (llkSkipPpid(ppid)) { + break; + } + + if (llkSkipName(procp->getComm())) { + continue; + } + if (llkSkipName(procp->getCmdline())) { + break; + } + + auto pprocp = llkTidLookup(ppid); + if (pprocp == nullptr) { + pprocp = llkTidAlloc(ppid, ppid, 0, "", 0, '?'); + } + if ((pprocp != nullptr) && (llkSkipName(pprocp->getComm(), llkBlacklistParent) || + llkSkipName(pprocp->getCmdline(), llkBlacklistParent))) { + break; + } + + if ((llkBlacklistUid.size() != 0) && llkSkipUid(procp->getUid())) { + continue; + } + + // ABA mitigation watching last time schedule activity happened + llkCheckSchedUpdate(procp, piddir); + + 
// Can only fall through to here if registered D or Z state !!! + if (procp->count < llkStateTimeoutMs[(state == 'Z') ? llkStateZ : llkStateD]) { + LOG(VERBOSE) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" + << pid << "->" << tid << ' ' << procp->getComm(); + continue; + } + + // We have to kill it to determine difference between live lock + // and persistent state blocked on a resource. Is there something + // wrong with a process that has no forward scheduling progress in + // Z or D? Yes, generally means improper accounting in the + // process, but not always ... + // + // Whomever we hit with a test kill must accept the Android + // Aphorism that everything can be burned to the ground and + // must survive. + if (procp->killed == false) { + procp->killed = true; + // confirm: re-read uid before committing to a panic. + procp->uid = -1; + switch (state) { + case 'Z': // kill ppid to free up a Zombie + // Killing init will kernel panic without diagnostics + // so skip right to controlled kernel panic with + // diagnostics. + if (ppid == initPid) { + break; + } + LOG(WARNING) << "Z " << llkFormat(procp->count) << ' ' << ppid << "->" + << pid << "->" << tid << ' ' << procp->getComm() << " [kill]"; + if ((llkKillOneProcess(pprocp, procp) >= 0) || + (llkKillOneProcess(ppid, procp) >= 0)) { + continue; + } + break; + + case 'D': // kill tid to free up an uninterruptible D + // If ABA is doing its job, we would not need or + // want the following. Test kill is a Hail Mary + // to make absolutely sure there is no forward + // scheduling progress. The cost when ABA is + // not working is we kill a process that likes to + // stay in 'D' state, instead of panicing the + // kernel (worse). 
+ LOG(WARNING) << "D " << llkFormat(procp->count) << ' ' << pid << "->" << tid + << ' ' << procp->getComm() << " [kill]"; + if ((llkKillOneProcess(llkTidLookup(pid), procp) >= 0) || + (llkKillOneProcess(pid, 'D', tid) >= 0) || + (llkKillOneProcess(procp, procp) >= 0) || + (llkKillOneProcess(tid, 'D', tid) >= 0)) { + continue; + } + break; + } + } + // We are here because we have confirmed kernel live-lock + LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid + << "->" << tid << ' ' << procp->getComm() << " [panic]"; + llkPanicKernel(true, tid); + } + LOG(VERBOSE) << "+closedir()"; + } + llkTopDirectory.rewind(); + LOG(VERBOSE) << "closedir()"; + + // garbage collection of old process references + for (auto p = tids.begin(); p != tids.end();) { + if (!p->second.updated) { + IF_ALOG(LOG_VERBOSE, LOG_TAG) { + std::string ppidCmdline = llkProcGetName(p->second.ppid, nullptr, nullptr); + if (ppidCmdline.size()) { + ppidCmdline = "(" + ppidCmdline + ")"; + } + std::string pidCmdline; + if (p->second.pid != p->second.tid) { + pidCmdline = llkProcGetName(p->second.pid, nullptr, p->second.getCmdline()); + if (pidCmdline.size()) { + pidCmdline = "(" + pidCmdline + ")"; + } + } + std::string tidCmdline = + llkProcGetName(p->second.tid, p->second.getComm(), p->second.getCmdline()); + if (tidCmdline.size()) { + tidCmdline = "(" + tidCmdline + ")"; + } + LOG(VERBOSE) << "thread " << p->second.ppid << ppidCmdline << "->" << p->second.pid + << pidCmdline << "->" << p->second.tid << tidCmdline << " removed"; + } + p = tids.erase(p); + } else { + ++p; + } + } + if (__predict_false(tids.empty())) { + llkTopDirectory.reset(); + } + + llkCycle = llkCheckMs; + + timespec end; + ::clock_gettime(CLOCK_MONOTONIC_COARSE, &end); + auto milli = llkGetTimespecDiffMs(&now, &end); + LOG((milli > 10s) ? ERROR : (milli > 1s) ? 
WARNING : VERBOSE) << "sample " << llkFormat(milli); + + // cap to minimum sleep for 1 second since last cycle + if (llkCycle < (ms + 1s)) { + return 1s; + } + return llkCycle - ms; +} + +unsigned llkCheckMilliseconds() { + return duration_cast(llkCheck()).count(); +} + +bool llkInit(const char* threadname) { + llkLowRam = android::base::GetBoolProperty("ro.config.low_ram", false); + llkEnable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, llkEnable); + if (llkEnable && !llkTopDirectory.reset(procdir)) { + // Most likely reason we could be here is llkd was started + // incorrectly without the readproc permissions. Keep our + // processing down to a minimum. + llkEnable = false; + } + khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable); + llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall); + // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set + // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value. + khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout); + if (khtTimeout == 0s) { + khtTimeout = duration_cast(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) / + LLK_CHECKS_PER_TIMEOUT_DEFAULT); + } + llkTimeoutMs = + khtTimeout * LLK_CHECKS_PER_TIMEOUT_DEFAULT / (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT); + llkTimeoutMs = GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, llkTimeoutMs); + llkValidate(); // validate llkTimeoutMs, llkCheckMs and llkCycle + llkStateTimeoutMs[llkStateD] = GetUintProperty(LLK_D_TIMEOUT_MS_PROPERTY, llkTimeoutMs); + llkStateTimeoutMs[llkStateZ] = GetUintProperty(LLK_Z_TIMEOUT_MS_PROPERTY, llkTimeoutMs); + llkCheckMs = GetUintProperty(LLK_CHECK_MS_PROPERTY, llkCheckMs); + llkValidate(); // validate all (effectively minus llkTimeoutMs) + std::string defaultBlacklistProcess( + std::to_string(kernelPid) + "," + std::to_string(initPid) + "," + + std::to_string(kthreaddPid) + "," + std::to_string(::getpid()) + "," + + std::to_string(::gettid()) + "," 
LLK_BLACKLIST_PROCESS_DEFAULT); + if (threadname) { + defaultBlacklistProcess += std::string(",") + threadname; + } + for (int cpu = 1; cpu < get_nprocs_conf(); ++cpu) { + defaultBlacklistProcess += ",[watchdog/" + std::to_string(cpu) + "]"; + } + defaultBlacklistProcess = + android::base::GetProperty(LLK_BLACKLIST_PROCESS_PROPERTY, defaultBlacklistProcess); + llkBlacklistProcess = llkSplit(defaultBlacklistProcess); + if (!llkSkipName("[khungtaskd]")) { // ALWAYS ignore as special + llkBlacklistProcess.emplace("[khungtaskd]"); + } + llkBlacklistParent = llkSplit(android::base::GetProperty( + LLK_BLACKLIST_PARENT_PROPERTY, std::to_string(kernelPid) + "," + std::to_string(kthreaddPid) + + "," LLK_BLACKLIST_PARENT_DEFAULT)); + llkBlacklistUid = + llkSplit(android::base::GetProperty(LLK_BLACKLIST_UID_PROPERTY, LLK_BLACKLIST_UID_DEFAULT)); + + // internal watchdog + ::signal(SIGALRM, llkAlarmHandler); + + // kernel hung task configuration? Otherwise leave it as-is + if (khtEnable) { + // EUID must be AID_ROOT to write to /proc/sys/kernel/ nodes, there + // are no capability overrides. For security reasons we do not want + // to run as AID_ROOT. We may not be able to write them successfully, + // we will try, but the least we can do is read the values back to + // confirm expectations and report whether configured or not. 
+ auto configured = llkWriteStringToFileConfirm(std::to_string(khtTimeout.count()), + "/proc/sys/kernel/hung_task_timeout_secs"); + if (configured) { + llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_warnings"); + llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_check_count"); + configured = llkWriteStringToFileConfirm("1", "/proc/sys/kernel/hung_task_panic"); + } + if (configured) { + LOG(INFO) << "[khungtaskd] configured"; + } else { + LOG(WARNING) << "[khungtaskd] not configurable"; + } + } + + bool logConfig = true; + if (llkEnable) { + if (llkMlockall && + // MCL_ONFAULT pins pages as they fault instead of loading + // everything immediately all at once. (Which would be bad, + // because as of this writing, we have a lot of mapped pages we + // never use.) Old kernels will see MCL_ONFAULT and fail with + // EINVAL; we ignore this failure. + // + // N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT + // pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault + // in pages. + + // CAP_IPC_LOCK required + mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) { + PLOG(WARNING) << "mlockall failed "; + } + + if (threadname) { + pthread_attr_t attr; + + if (!pthread_attr_init(&attr)) { + sched_param param; + + memset(¶m, 0, sizeof(param)); + pthread_attr_setschedparam(&attr, ¶m); + pthread_attr_setschedpolicy(&attr, SCHED_BATCH); + if (!pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) { + pthread_t thread; + if (!pthread_create(&thread, &attr, llkThread, const_cast(threadname))) { + // wait a second for thread to start + for (auto retry = 50; retry && !llkRunning; --retry) { + ::usleep(20000); + } + logConfig = !llkRunning; // printed in llkd context? 
+ } else { + LOG(ERROR) << "failed to spawn llkd thread"; + } + } else { + LOG(ERROR) << "failed to detach llkd thread"; + } + pthread_attr_destroy(&attr); + } else { + LOG(ERROR) << "failed to allocate attibutes for llkd thread"; + } + } + } else { + LOG(DEBUG) << "[khungtaskd] left unconfigured"; + } + if (logConfig) { + llkLogConfig(); + } + + return llkEnable; +} diff --git a/llkd/llkd.cpp b/llkd/llkd.cpp new file mode 100644 index 000000000..f10253d45 --- /dev/null +++ b/llkd/llkd.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "llkd.h" + +#include +#include + +#include + +#include + +using namespace std::chrono; + +int main(int, char**) { + LOG(INFO) << "started"; + + bool enabled = llkInit(); + + // Would like this policy to be automatic as part of libllkd, + // but that would be presumptuous and bad side-effect. 
+ struct sched_param param; + memset(¶m, 0, sizeof(param)); + sched_setscheduler(0, SCHED_BATCH, ¶m); + + while (true) { + if (enabled) { + ::usleep(duration_cast(llkCheck()).count()); + } else { + ::pause(); + } + } + // NOTREACHED + + LOG(INFO) << "exiting"; + return 0; +} diff --git a/llkd/llkd.rc b/llkd/llkd.rc new file mode 100644 index 000000000..a257e76e1 --- /dev/null +++ b/llkd/llkd.rc @@ -0,0 +1,18 @@ +# Configure [khungtaskd] +on property:ro.khungtask.enable=true + write /proc/sys/kernel/hung_task_timeout_secs ${ro.khungtask.timeout:-720} + write /proc/sys/kernel/hung_task_warnings 65535 + write /proc/sys/kernel/hung_task_check_count 65535 + write /proc/sys/kernel/hung_task_panic 1 + +on property:ro.llk.enable=true + start llkd + +service llkd /system/bin/llkd + class late_start + disabled + user llkd + group llkd readproc + capabilities KILL IPC_LOCK + file /proc/sysrq-trigger w + writepid /dev/cpuset/system-background/tasks diff --git a/llkd/tests/Android.bp b/llkd/tests/Android.bp new file mode 100644 index 000000000..6dd5938e3 --- /dev/null +++ b/llkd/tests/Android.bp @@ -0,0 +1,41 @@ +// Copyright (C) 2018 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +cc_test { + name: "llkd_unit_test", + + shared_libs: [ + "libbase", + "liblog", + ], + header_libs: [ + "llkd_headers", + ], + + target: { + android: { + srcs: [ + "llkd_test.cpp", + ], + }, + }, + + cflags: [ + "-Wall", + "-Wextra", + "-Werror", + ], + + compile_multilib: "first", +} diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp new file mode 100644 index 000000000..e3c95eba3 --- /dev/null +++ b/llkd/tests/llkd_test.cpp @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include // for MS_PER_SEC and US_PER_SEC + +#include "llkd.h" + +using namespace std::chrono; +using namespace std::chrono_literals; + +namespace { + +milliseconds GetUintProperty(const std::string& key, milliseconds def) { + return milliseconds(android::base::GetUintProperty(key, static_cast(def.count()), + static_cast(def.max().count()))); +} + +seconds GetUintProperty(const std::string& key, seconds def) { + return seconds(android::base::GetUintProperty(key, static_cast(def.count()), + static_cast(def.max().count()))); +} + +// GTEST_LOG_(WARNING) output is fugly, this has much less noise +// ToDo: look into fixing googletest to produce output that matches style of +// all the other status messages, and can switch off __line__ and +// __function__ noise +#define GTEST_LOG_WARNING std::cerr << "[ WARNING ] " +#define GTEST_LOG_INFO std::cerr << "[ INFO ] " + +// Properties is _not_ a high performance ABI! +void rest() { + usleep(200000); +} + +void execute(const char* command) { + if (getuid() || system(command)) { + system((std::string("su root ") + command).c_str()); + } +} + +seconds llkdSleepPeriod(char state) { + auto default_enable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, LLK_ENABLE_DEFAULT); + if (android::base::GetProperty(LLK_ENABLE_PROPERTY, "nothing") == "nothing") { + GTEST_LOG_INFO << LLK_ENABLE_PROPERTY " defaults to " << (default_enable ? "true" : "false") + << "\n"; + } + // Hail Mary hope is unconfigured. 
+ if ((GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, LLK_TIMEOUT_MS_DEFAULT) != + duration_cast(120s)) || + (GetUintProperty(LLK_CHECK_MS_PROPERTY, + LLK_TIMEOUT_MS_DEFAULT / LLK_CHECKS_PER_TIMEOUT_DEFAULT) != + duration_cast(10s))) { + execute("stop llkd"); + rest(); + std::string setprop("setprop "); + execute((setprop + LLK_TIMEOUT_MS_PROPERTY + " 120000").c_str()); + rest(); + execute((setprop + KHT_TIMEOUT_PROPERTY + " 130").c_str()); + rest(); + execute((setprop + LLK_CHECK_MS_PROPERTY + " 10000").c_str()); + rest(); + execute((setprop + LLK_ENABLE_PROPERTY + " true").c_str()); + rest(); + } + default_enable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, false); + if (default_enable) { + execute("start llkd"); + rest(); + GTEST_LOG_INFO << "llkd enabled\n"; + } else { + GTEST_LOG_WARNING << "llkd disabled\n"; + } + + /* KISS follows llk_init() */ + milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; + seconds khtTimeout = duration_cast( + llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) / LLK_CHECKS_PER_TIMEOUT_DEFAULT); + khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout); + llkTimeoutMs = + khtTimeout * LLK_CHECKS_PER_TIMEOUT_DEFAULT / (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT); + llkTimeoutMs = GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, llkTimeoutMs); + if (llkTimeoutMs < LLK_TIMEOUT_MS_MINIMUM) { + llkTimeoutMs = LLK_TIMEOUT_MS_MINIMUM; + } + milliseconds llkCheckMs = llkTimeoutMs / LLK_CHECKS_PER_TIMEOUT_DEFAULT; + auto timeout = GetUintProperty( + (state == 'Z') ? 
LLK_Z_TIMEOUT_MS_PROPERTY : LLK_D_TIMEOUT_MS_PROPERTY, llkTimeoutMs); + if (timeout < LLK_TIMEOUT_MS_MINIMUM) { + timeout = LLK_TIMEOUT_MS_MINIMUM; + } + + if (llkCheckMs > timeout) { + llkCheckMs = timeout; + } + llkCheckMs = GetUintProperty(LLK_CHECK_MS_PROPERTY, llkCheckMs); + timeout += llkCheckMs; + auto sec = duration_cast(timeout); + if (sec == 0s) { + ++sec; + } else if (sec > 59s) { + GTEST_LOG_WARNING << "llkd is configured for about " << duration_cast(sec).count() + << " minutes to react\n"; + } + + // 33% margin for the test to naturally timeout waiting for llkd to respond + return (sec * 4 + 2s) / 3; +} + +inline void waitForPid(pid_t child_pid) { + int wstatus; + ASSERT_LE(0, waitpid(child_pid, &wstatus, 0)); + EXPECT_FALSE(WIFEXITED(wstatus)) << "[ INFO ] exit=" << WEXITSTATUS(wstatus); + ASSERT_TRUE(WIFSIGNALED(wstatus)); + ASSERT_EQ(WTERMSIG(wstatus), SIGKILL); +} + +} // namespace + +// The tests that use this helper are to simulate processes stuck in 'D' +// state that are experiencing forward scheduled progress. As such the +// expectation is that llkd will _not_ perform any mitigations. The sleepfor +// argument helps us set the amount of forward scheduler progress. 
+static void llkd_driver_ABA(const microseconds sleepfor) { + const auto period = llkdSleepPeriod('D'); + if (period <= sleepfor) { + GTEST_LOG_WARNING << "llkd configuration too short for " + << duration_cast(sleepfor).count() << "ms work cycle\n"; + return; + } + + auto child_pid = fork(); + ASSERT_LE(0, child_pid); + int wstatus; + if (!child_pid) { + auto ratio = period / sleepfor; + ASSERT_LT(0, ratio); + // vfork() parent is uninterruptable D state waiting for child to exec() + while (--ratio > 0) { + auto driver_pid = vfork(); + ASSERT_LE(0, driver_pid); + if (driver_pid) { // parent + waitpid(driver_pid, &wstatus, 0); + if (!WIFEXITED(wstatus)) { + exit(42); + } + if (WEXITSTATUS(wstatus) != 42) { + exit(42); + } + } else { + usleep(sleepfor.count()); + exit(42); + } + } + exit(0); + } + ASSERT_LE(0, waitpid(child_pid, &wstatus, 0)); + EXPECT_TRUE(WIFEXITED(wstatus)); + if (WIFEXITED(wstatus)) { + EXPECT_EQ(0, WEXITSTATUS(wstatus)); + } + ASSERT_FALSE(WIFSIGNALED(wstatus)) << "[ INFO ] signo=" << WTERMSIG(wstatus); +} + +TEST(llkd, driver_ABA_fast) { + llkd_driver_ABA(5ms); +} + +TEST(llkd, driver_ABA_slow) { + llkd_driver_ABA(1s); +} + +TEST(llkd, driver_ABA_glacial) { + llkd_driver_ABA(1min); +} + +// Following tests must be last in this file to capture possible errant +// kernel_panic mitigation failure. + +// The following tests simulate processes stick in 'Z' or 'D' state with +// no forward scheduling progress, but interruptible. As such the expectation +// is that llkd will perform kill mitigation and not progress to kernel_panic. 
+ +TEST(llkd, zombie) { + const auto period = llkdSleepPeriod('Z'); + + /* Create a Persistent Zombie Process */ + pid_t child_pid = fork(); + ASSERT_LE(0, child_pid); + if (!child_pid) { + auto zombie_pid = fork(); + ASSERT_LE(0, zombie_pid); + if (!zombie_pid) { + sleep(1); + exit(0); + } + sleep(period.count()); + exit(42); + } + + waitForPid(child_pid); +} + +TEST(llkd, driver) { + const auto period = llkdSleepPeriod('D'); + + /* Create a Persistent Device Process */ + auto child_pid = fork(); + ASSERT_LE(0, child_pid); + if (!child_pid) { + // vfork() parent is uninterruptable D state waiting for child to exec() + auto driver_pid = vfork(); + ASSERT_LE(0, driver_pid); + sleep(period.count()); + exit(driver_pid ? 42 : 0); + } + + waitForPid(child_pid); +}