Merge "llkd: bootstat: propagate detailed livelock canonical boot reason"
This commit is contained in:
commit
481a8125a6
6 changed files with 52 additions and 17 deletions
|
@ -303,6 +303,9 @@ const std::map<std::string, int32_t> kBootReasonMap = {
|
|||
{"kernel_panic,init", 158},
|
||||
{"kernel_panic,oom", 159},
|
||||
{"kernel_panic,stack", 160},
|
||||
{"kernel_panic,sysrq,livelock,alarm", 161}, // llkd
|
||||
{"kernel_panic,sysrq,livelock,driver", 162}, // llkd
|
||||
{"kernel_panic,sysrq,livelock,zombie", 163}, // llkd
|
||||
};
|
||||
|
||||
// Converts a string value representing the reason the system booted to an
|
||||
|
|
|
@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these
|
|||
conditions. If the test can, it will reconfigure llkd to expedite the test
|
||||
duration by adjusting the ro.llk.* Android properties. Tests run the D state
|
||||
with some scheduling progress to ensure that ABA checking prevents false
|
||||
triggers.
|
||||
triggers. If ABA detection is 100% reliable on the platform, then
|
||||
ro.llk.killtest can be set to false; however, this will cause some of the unit
|
||||
tests to panic the kernel instead of dealing with the more graceful kill operation.
|
||||
|
||||
Android Properties
|
||||
------------------
|
||||
|
@ -108,13 +110,6 @@ default <empty>, comma separated list of uid numbers or names.
|
|||
Architectural Concerns
|
||||
----------------------
|
||||
|
||||
- Figure out how to communicate the kernel panic better to bootstat canonical
|
||||
boot reason determination. This may require an alteration to bootstat, or
|
||||
some logging from llkd. Would like to see boot reason to be
|
||||
watchdog,livelock as a minimum requirement. Or more specifically would want
|
||||
watchdog,livelock,device or watchdog,livelock,zombie be reported.
|
||||
Currently reports panic,sysrq (user requested panic) or panic depending on
|
||||
system support of pstore.
|
||||
- Create kernel module and associated gTest to actually test panic.
|
||||
- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
|
||||
not be inputs). Could require more test-only interfaces to libllkd.
|
||||
|
|
|
@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void);
|
|||
#define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY
|
||||
#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall"
|
||||
#define LLK_MLOCKALL_DEFAULT true
|
||||
#define LLK_KILLTEST_PROPERTY "ro.llk.killtest"
|
||||
#define LLK_KILLTEST_DEFAULT true
|
||||
#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms"
|
||||
#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout"
|
||||
#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms"
|
||||
|
|
|
@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check
|
|||
bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled
|
||||
bool llkRunning = false; // thread is running
|
||||
bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked
|
||||
bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills
|
||||
milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout
|
||||
enum { llkStateD, llkStateZ, llkNumStates }; // state indexes
|
||||
milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state
|
||||
|
@ -292,7 +293,7 @@ struct proc {
|
|||
exeMissingValid(false),
|
||||
cmdlineValid(false),
|
||||
updated(true),
|
||||
killed(false) {
|
||||
killed(!llkTestWithKill) {
|
||||
memset(comm, '\0', sizeof(comm));
|
||||
setComm(_comm);
|
||||
}
|
||||
|
@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f
|
|||
return android::base::Trim(content) == string;
|
||||
}
|
||||
|
||||
void llkPanicKernel(bool dump, pid_t tid) __noreturn;
|
||||
void llkPanicKernel(bool dump, pid_t tid) {
|
||||
void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn;
|
||||
void llkPanicKernel(bool dump, pid_t tid, const char* state) {
|
||||
auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
|
||||
if (sysrqTriggerFd < 0) {
|
||||
// DYB
|
||||
|
@ -496,6 +497,8 @@ void llkPanicKernel(bool dump, pid_t tid) {
|
|||
}
|
||||
::usleep(200000); // let everything settle
|
||||
}
|
||||
llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n",
|
||||
"/dev/kmsg");
|
||||
android::base::WriteStringToFd("c", sysrqTriggerFd);
|
||||
// NOTREACHED
|
||||
// DYB
|
||||
|
@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) {
|
|||
}
|
||||
|
||||
void llkAlarmHandler(int) {
|
||||
llkPanicKernel(false, ::getpid());
|
||||
llkPanicKernel(false, ::getpid(), "alarm");
|
||||
}
|
||||
|
||||
milliseconds GetUintProperty(const std::string& key, milliseconds def) {
|
||||
|
@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
|
|||
(val != procp->nrSwitches)) {
|
||||
procp->nrSwitches = val;
|
||||
procp->count = 0ms;
|
||||
procp->killed = false;
|
||||
procp->killed = !llkTestWithKill;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
|
|||
if (schedUpdate != procp->schedUpdate) {
|
||||
procp->schedUpdate = schedUpdate;
|
||||
procp->count = 0ms;
|
||||
procp->killed = false;
|
||||
procp->killed = !llkTestWithKill;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
|
|||
if (static_cast<uint64_t>(val) != procp->nrSwitches) {
|
||||
procp->nrSwitches = val;
|
||||
procp->count = 0ms;
|
||||
procp->killed = false;
|
||||
procp->killed = !llkTestWithKill;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -719,6 +722,7 @@ void llkLogConfig(void) {
|
|||
<< LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
|
||||
<< KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
|
||||
<< LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
|
||||
<< LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n"
|
||||
<< KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
|
||||
<< LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
|
||||
<< LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
|
||||
|
@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) {
|
|||
procp->time = utime + stime;
|
||||
if (procp->state != state) {
|
||||
procp->count = 0ms;
|
||||
procp->killed = false;
|
||||
procp->killed = !llkTestWithKill;
|
||||
procp->state = state;
|
||||
} else {
|
||||
procp->count += llkCycle;
|
||||
|
@ -973,7 +977,7 @@ milliseconds llkCheck(bool checkRunning) {
|
|||
// We are here because we have confirmed kernel live-lock
|
||||
LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
|
||||
<< "->" << tid << ' ' << procp->getComm() << " [panic]";
|
||||
llkPanicKernel(true, tid);
|
||||
llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver");
|
||||
}
|
||||
LOG(VERBOSE) << "+closedir()";
|
||||
}
|
||||
|
@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) {
|
|||
}
|
||||
khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
|
||||
llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
|
||||
llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill);
|
||||
// if LLK_TIMEOUT_MS_PROPERTY was not set, we will use a set
|
||||
// KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
|
||||
khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
|
||||
|
|
|
@ -44,5 +44,6 @@ service llkd /system/bin/llkd
|
|||
user llkd
|
||||
group llkd readproc
|
||||
capabilities KILL IPC_LOCK
|
||||
file /dev/kmsg w
|
||||
file /proc/sysrq-trigger w
|
||||
writepid /dev/cpuset/system-background/tasks
|
||||
|
|
|
@ -154,6 +154,27 @@ inline void waitForPid(pid_t child_pid) {
|
|||
ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
|
||||
}
|
||||
|
||||
bool checkKill(const char* reason) {
|
||||
if (android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT)) {
|
||||
return false;
|
||||
}
|
||||
auto bootreason = android::base::GetProperty("sys.boot.reason", "nothing");
|
||||
if (bootreason == reason) {
|
||||
GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n";
|
||||
return true;
|
||||
}
|
||||
GTEST_LOG_WARNING << "Expected test result is " << reason << "\n";
|
||||
|
||||
// apct adjustment if needed (set LLK_KILLTEST_PROPERTY to "off" to allow test)
|
||||
//
|
||||
// if (android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") {
|
||||
// GTEST_LOG_WARNING << "Bypassing test\n";
|
||||
// return true;
|
||||
// }
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// The tests that use this helper are to simulate processes stuck in 'D'
|
||||
|
@ -221,6 +242,10 @@ TEST(llkd, driver_ABA_glacial) {
|
|||
// is that llkd will perform kill mitigation and not progress to kernel_panic.
|
||||
|
||||
TEST(llkd, zombie) {
|
||||
if (checkKill("kernel_panic,sysrq,livelock,zombie")) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto period = llkdSleepPeriod('Z');
|
||||
|
||||
/* Create a Persistent Zombie Process */
|
||||
|
@ -241,6 +266,10 @@ TEST(llkd, zombie) {
|
|||
}
|
||||
|
||||
TEST(llkd, driver) {
|
||||
if (checkKill("kernel_panic,sysrq,livelock,driver")) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto period = llkdSleepPeriod('D');
|
||||
|
||||
/* Create a Persistent Device Process */
|
||||
|
|
Loading…
Reference in a new issue