Merge "llkd: bootstat: propagate detailed livelock canonical boot reason"

This commit is contained in:
Treehugger Robot 2018-04-20 21:09:04 +00:00 committed by Gerrit Code Review
commit 481a8125a6
6 changed files with 52 additions and 17 deletions

View file

@ -303,6 +303,9 @@ const std::map<std::string, int32_t> kBootReasonMap = {
{"kernel_panic,init", 158},
{"kernel_panic,oom", 159},
{"kernel_panic,stack", 160},
{"kernel_panic,sysrq,livelock,alarm", 161}, // llkd
{"kernel_panic,sysrq,livelock,driver", 162}, // llkd
{"kernel_panic,sysrq,livelock,zombie", 163}, // llkd
};
// Converts a string value representing the reason the system booted to an

View file

@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these
conditions. If the test can, it will reconfigure llkd to expedite the test
duration by adjusting the ro.llk.* Android properties. Tests run the D state
with some scheduling progress to ensure that ABA checking prevents false
triggers.
triggers. If ABA detection is 100% reliable on the platform, then
ro.llk.killtest can be set to false; however, this will cause some of the unit
tests to panic the kernel instead of exercising the more graceful kill operation.
Android Properties
------------------
@ -108,13 +110,6 @@ default <empty>, comma separated list of uid numbers or names.
Architectural Concerns
----------------------
- Figure out how to communicate the kernel panic better to bootstat canonical
boot reason determination. This may require an alteration to bootstat, or
some logging from llkd. Would like to see boot reason to be
watchdog,livelock as a minimum requirement. Or more specifically would want
watchdog,livelock,device or watchdog,livelock,zombie be reported.
Currently reports panic,sysrq (user requested panic) or panic depending on
system support of pstore.
- Create kernel module and associated gTest to actually test panic.
- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
not be inputs). Could require more test-only interfaces to libllkd.

View file

@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void);
#define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY
#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall"
#define LLK_MLOCKALL_DEFAULT true
#define LLK_KILLTEST_PROPERTY "ro.llk.killtest"
#define LLK_KILLTEST_DEFAULT true
#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms"
#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout"
#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms"

View file

@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check
bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled
bool llkRunning = false; // thread is running
bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked
bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills
milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout
enum { llkStateD, llkStateZ, llkNumStates }; // state indexes
milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state
@ -292,7 +293,7 @@ struct proc {
exeMissingValid(false),
cmdlineValid(false),
updated(true),
killed(false) {
killed(!llkTestWithKill) {
memset(comm, '\0', sizeof(comm));
setComm(_comm);
}
@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f
return android::base::Trim(content) == string;
}
void llkPanicKernel(bool dump, pid_t tid) __noreturn;
void llkPanicKernel(bool dump, pid_t tid) {
void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn;
void llkPanicKernel(bool dump, pid_t tid, const char* state) {
auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
if (sysrqTriggerFd < 0) {
// DYB
@ -496,6 +497,8 @@ void llkPanicKernel(bool dump, pid_t tid) {
}
::usleep(200000); // let everything settle
}
llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n",
"/dev/kmsg");
android::base::WriteStringToFd("c", sysrqTriggerFd);
// NOTREACHED
// DYB
@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) {
}
void llkAlarmHandler(int) {
llkPanicKernel(false, ::getpid());
llkPanicKernel(false, ::getpid(), "alarm");
}
milliseconds GetUintProperty(const std::string& key, milliseconds def) {
@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
(val != procp->nrSwitches)) {
procp->nrSwitches = val;
procp->count = 0ms;
procp->killed = false;
procp->killed = !llkTestWithKill;
}
return;
}
@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
if (schedUpdate != procp->schedUpdate) {
procp->schedUpdate = schedUpdate;
procp->count = 0ms;
procp->killed = false;
procp->killed = !llkTestWithKill;
}
}
@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
if (static_cast<uint64_t>(val) != procp->nrSwitches) {
procp->nrSwitches = val;
procp->count = 0ms;
procp->killed = false;
procp->killed = !llkTestWithKill;
}
}
}
@ -719,6 +722,7 @@ void llkLogConfig(void) {
<< LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
<< KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
<< LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
<< LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n"
<< KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
<< LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
<< LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) {
procp->time = utime + stime;
if (procp->state != state) {
procp->count = 0ms;
procp->killed = false;
procp->killed = !llkTestWithKill;
procp->state = state;
} else {
procp->count += llkCycle;
@ -973,7 +977,7 @@ milliseconds llkCheck(bool checkRunning) {
// We are here because we have confirmed kernel live-lock
LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
<< "->" << tid << ' ' << procp->getComm() << " [panic]";
llkPanicKernel(true, tid);
llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver");
}
LOG(VERBOSE) << "+closedir()";
}
@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) {
}
khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill);
// if LLK_TIMEOUT_MS_PROPERTY was not set, we will use a set
// KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);

View file

@ -44,5 +44,6 @@ service llkd /system/bin/llkd
user llkd
group llkd readproc
capabilities KILL IPC_LOCK
file /dev/kmsg w
file /proc/sysrq-trigger w
writepid /dev/cpuset/system-background/tasks

View file

@ -154,6 +154,27 @@ inline void waitForPid(pid_t child_pid) {
ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
}
// Reports whether a test whose expected outcome is a kernel panic has already
// produced that outcome on the previous boot.
//
// reason: the canonical boot reason string the caller expects to find in the
//         sys.boot.reason property.
//
// Returns true only when LLK_KILLTEST_PROPERTY reads as false AND
// sys.boot.reason matches reason exactly; callers then return early from the
// test. Returns false in every other case, meaning the test body should run.
bool checkKill(const char* reason) {
    const bool killTestEnabled =
        android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT);
    if (killTestEnabled) {
        // Kill mitigation is active, so there is no panic result to look for.
        return false;
    }

    const auto lastBootReason = android::base::GetProperty("sys.boot.reason", "nothing");
    if (lastBootReason != reason) {
        GTEST_LOG_WARNING << "Expected test result is " << reason << "\n";
        // apct adjustment if needed (the disabled snippet below would bypass the
        // test when LLK_KILLTEST_PROPERTY is literally "false")
        //
        // if (android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") {
        //     GTEST_LOG_WARNING << "Bypassing test\n";
        //     return true;
        // }
        return false;
    }

    GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n";
    return true;
}
} // namespace
// The tests that use this helper are to simulate processes stuck in 'D'
@ -221,6 +242,10 @@ TEST(llkd, driver_ABA_glacial) {
// is that llkd will perform kill mitigation and not progress to kernel_panic.
TEST(llkd, zombie) {
if (checkKill("kernel_panic,sysrq,livelock,zombie")) {
return;
}
const auto period = llkdSleepPeriod('Z');
/* Create a Persistent Zombie Process */
@ -241,6 +266,10 @@ TEST(llkd, zombie) {
}
TEST(llkd, driver) {
if (checkKill("kernel_panic,sysrq,livelock,driver")) {
return;
}
const auto period = llkdSleepPeriod('D');
/* Create a Persistent Device Process */