From 3b5bb3a364f34b91296f324d1917b36cbe8fd7cc Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Tue, 31 Oct 2023 16:14:20 +0000 Subject: [PATCH 1/2] libprocessgroup: Poll on cgroup.events In killProcessGroup we currently read cgroup.procs to find processes to kill, send them kill signals until cgroup.procs is empty, then remove the cgroup directory. The cgroup cannot be removed until all processes are dead, otherwise we'll get an EBUSY error from the kernel. There is a race in the kernel where cgroup.procs can read empty even though the cgroup is pinned by processes which are still exiting, and can't be removed yet. [1] Let's use the populated field of cgroup.events instead of an empty cgroup.procs file to determine when the cgroup is removable. In addition to functioning like we expect, this is more efficient because we can poll on cgroup.events instead of retrying kills and rereading cgroup.procs every 5ms which should help reduce CPU contention and cgroup lock contention. It's still possible that it takes longer for a cgroup to become unpopulated than our timeout allows, in which case we will fail to remove the cgroup and leak kernel memory. But this change should help reduce the probability of that happening. [1] https://lore.kernel.org/all/CABdmKX3SOXpcK85a7cx3iXrwUj=i1yXqEz9i9zNkx8mB=ZXQ8A@mail.gmail.com/ Bug: 301871933 Change-Id: If7dcfb331f47e06994c9ac85ed08bbcce18cdad7 --- .../include/processgroup/processgroup.h | 10 +- libprocessgroup/processgroup.cpp | 293 +++++++++++------- 2 files changed, 183 insertions(+), 120 deletions(-) diff --git a/libprocessgroup/include/processgroup/processgroup.h b/libprocessgroup/include/processgroup/processgroup.h index 9107838b8..ca6868c1b 100644 --- a/libprocessgroup/include/processgroup/processgroup.h +++ b/libprocessgroup/include/processgroup/processgroup.h @@ -65,9 +65,8 @@ bool UsePerAppMemcg(); // should be active again. E.g. Zygote specialization for child process. void DropTaskProfilesResourceCaching(); -// Return 0 and removes the cgroup if there are no longer any processes in it. -// Returns -1 in the case of an error occurring or if there are processes still running -// even after retrying for up to 200ms. +// Return 0 if all processes were killed and the cgroup was successfully removed. +// Returns -1 in the case of an error occurring or if there are processes still running. int killProcessGroup(uid_t uid, int initialPid, int signal); // Returns the same as killProcessGroup(), however it does not retry, which means @@ -76,8 +75,9 @@ int killProcessGroupOnce(uid_t uid, int initialPid, int signal); // Sends the provided signal to all members of a process group, but does not wait for processes to // exit, or for the cgroup to be removed. Callers should also ensure that killProcessGroup is called -// later to ensure the cgroup is fully removed, otherwise system resources may leak. -int sendSignalToProcessGroup(uid_t uid, int initialPid, int signal); +// later to ensure the cgroup is fully removed, otherwise system resources will leak. +// Returns true if no errors are encountered sending signals, otherwise false. +bool sendSignalToProcessGroup(uid_t uid, int initialPid, int signal); int createProcessGroup(uid_t uid, int initialPid, bool memControl = false); diff --git a/libprocessgroup/processgroup.cpp b/libprocessgroup/processgroup.cpp index f594f7f66..7e27d7544 100644 --- a/libprocessgroup/processgroup.cpp +++ b/libprocessgroup/processgroup.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,7 @@ #include #include +#include #include #include #include @@ -53,7 +55,8 @@ using android::base::WriteStringToFile; using namespace std::chrono_literals; -#define PROCESSGROUP_CGROUP_PROCS_FILE "/cgroup.procs" +#define PROCESSGROUP_CGROUP_PROCS_FILE "cgroup.procs" +#define PROCESSGROUP_CGROUP_EVENTS_FILE "cgroup.events" bool CgroupsAvailable() { static bool cgroups_available = access("/proc/cgroups", F_OK) == 0; @@ -213,30 +216,21 @@ static std::string ConvertUidPidToPath(const char* cgroup, uid_t uid, int pid) { return StringPrintf("%s/uid_%u/pid_%d", cgroup, uid, pid); } -static int RemoveCgroup(const char* cgroup, uid_t uid, int pid, unsigned int retries) { - int ret = 0; - auto uid_pid_path = ConvertUidPidToPath(cgroup, uid, pid); - - while (retries--) { - ret = rmdir(uid_pid_path.c_str()); - // If we get an error 2 'No such file or directory' , that means the - // cgroup is already removed, treat it as success and return 0 for - // idempotency. - if (ret < 0 && errno == ENOENT) { - ret = 0; - } - if (!ret || errno != EBUSY || !retries) break; - std::this_thread::sleep_for(5ms); - } +static int RemoveCgroup(const char* cgroup, uid_t uid, int pid) { + auto path = ConvertUidPidToPath(cgroup, uid, pid); + int ret = TEMP_FAILURE_RETRY(rmdir(path.c_str())); if (!ret && uid >= AID_ISOLATED_START && uid <= AID_ISOLATED_END) { // Isolated UIDs are unlikely to be reused soon after removal, // so free up the kernel resources for the UID level cgroup. - const auto uid_path = ConvertUidToPath(cgroup, uid); - ret = rmdir(uid_path.c_str()); - if (ret < 0 && errno == ENOENT) { - ret = 0; - } + path = ConvertUidToPath(cgroup, uid); + ret = TEMP_FAILURE_RETRY(rmdir(path.c_str())); + } + + if (ret < 0 && errno == ENOENT) { + // This function is idempoetent, but still warn here. + LOG(WARNING) << "RemoveCgroup: " << path << " does not exist."; + ret = 0; } return ret; @@ -360,38 +354,33 @@ err: return false; } -// Returns number of processes killed on success -// Returns 0 if there are no processes in the process cgroup left to kill -// Returns -1 on error -static int DoKillProcessGroupOnce(const char* cgroup, uid_t uid, int initialPid, int signal) { - // We separate all of the pids in the cgroup into those pids that are also the leaders of - // process groups (stored in the pgids set) and those that are not (stored in the pids set). - std::set pgids; - pgids.emplace(initialPid); - std::set pids; - int processes = 0; - - std::unique_ptr fd(nullptr, fclose); +bool sendSignalToProcessGroup(uid_t uid, int initialPid, int signal) { + std::set pgids, pids; if (CgroupsAvailable()) { - auto path = ConvertUidPidToPath(cgroup, uid, initialPid) + PROCESSGROUP_CGROUP_PROCS_FILE; - fd.reset(fopen(path.c_str(), "re")); - if (!fd) { - if (errno == ENOENT) { - // This happens when the process is already dead or if, as the result of a bug, it - // has been migrated to another cgroup. An example of a bug that can cause migration - // to another cgroup is using the JoinCgroup action with a cgroup controller that - // has been activated in the v2 cgroup hierarchy. - goto kill; - } - PLOG(WARNING) << __func__ << " failed to open process cgroup uid " << uid << " pid " - << initialPid; - return -1; + std::string hierarchy_root_path, cgroup_v2_path; + CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path); + cgroup_v2_path = ConvertUidPidToPath(hierarchy_root_path.c_str(), uid, initialPid); + + LOG(VERBOSE) << "Using " << PROCESSGROUP_CGROUP_PROCS_FILE << " to signal (" << signal + << ") " << cgroup_v2_path; + + // We separate all of the pids in the cgroup into those pids that are also the leaders of + // process groups (stored in the pgids set) and those that are not (stored in the pids set). + const auto procsfilepath = cgroup_v2_path + '/' + PROCESSGROUP_CGROUP_PROCS_FILE; + std::unique_ptr fp(fopen(procsfilepath.c_str(), "re"), fclose); + if (!fp) { + // This should only happen if the cgroup has already been removed with a successful call + // to killProcessGroup. Callers should only retry sendSignalToProcessGroup or + // killProcessGroup calls if they fail without ENOENT. + PLOG(ERROR) << "Failed to open " << procsfilepath; + kill(-initialPid, signal); + return false; } + pid_t pid; bool file_is_empty = true; - while (fscanf(fd.get(), "%d\n", &pid) == 1 && pid >= 0) { - processes++; + while (fscanf(fp.get(), "%d\n", &pid) == 1 && pid >= 0) { file_is_empty = false; if (pid == 0) { // Should never happen... but if it does, trying to kill this @@ -421,7 +410,8 @@ static int DoKillProcessGroupOnce(const char* cgroup, uid_t uid, int initialPid, } } -kill: + pgids.emplace(initialPid); + // Kill all process groups. for (const auto pgid : pgids) { LOG(VERBOSE) << "Killing process group " << -pgid << " in uid " << uid @@ -442,101 +432,174 @@ kill: } } - return (!fd || feof(fd.get())) ? processes : -1; + return true; } -static int KillProcessGroup(uid_t uid, int initialPid, int signal, int retries) { +template +static std::chrono::milliseconds toMillisec(T&& duration) { + return std::chrono::duration_cast(duration); +} + +enum class populated_status +{ + populated, + not_populated, + error +}; + +static populated_status cgroupIsPopulated(int events_fd) { + const std::string POPULATED_KEY("populated "); + const std::string::size_type MAX_EVENTS_FILE_SIZE = 32; + + std::string buf; + buf.resize(MAX_EVENTS_FILE_SIZE); + ssize_t len = TEMP_FAILURE_RETRY(pread(events_fd, buf.data(), buf.size(), 0)); + if (len == -1) { + PLOG(ERROR) << "Could not read cgroup.events: "; + // Potentially ENODEV if the cgroup has been removed since we opened this file, but that + // shouldn't have happened yet. + return populated_status::error; + } + + if (len == 0) { + LOG(ERROR) << "cgroup.events EOF"; + return populated_status::error; + } + + buf.resize(len); + + const std::string::size_type pos = buf.find(POPULATED_KEY); + if (pos == std::string::npos) { + LOG(ERROR) << "Could not find populated key in cgroup.events"; + return populated_status::error; + } + + if (pos + POPULATED_KEY.size() + 1 > len) { + LOG(ERROR) << "Partial read of cgroup.events"; + return populated_status::error; + } + + return buf[pos + POPULATED_KEY.size()] == '1' ? + populated_status::populated : populated_status::not_populated; +} + +// The default timeout of 2200ms comes from the default number of retries in a previous +// implementation of this function. The default retry value was 40 for killing and 400 for cgroup +// removal with 5ms sleeps between each retry. +static int KillProcessGroup( + uid_t uid, int initialPid, int signal, bool once = false, + std::chrono::steady_clock::time_point until = std::chrono::steady_clock::now() + 2200ms) { CHECK_GE(uid, 0); CHECK_GT(initialPid, 0); + // Always attempt to send a kill signal to at least the initialPid, at least once, regardless of + // whether its cgroup exists or not. This should only be necessary if a bug results in the + // migration of the targeted process out of its cgroup, which we will also attempt to kill. + const bool signal_ret = sendSignalToProcessGroup(uid, initialPid, signal); + + if (!CgroupsAvailable() || !signal_ret) return signal_ret ? 0 : -1; + std::string hierarchy_root_path; - if (CgroupsAvailable()) { - CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path); - } - const char* cgroup = hierarchy_root_path.c_str(); + CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path); - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + const std::string cgroup_v2_path = + ConvertUidPidToPath(hierarchy_root_path.c_str(), uid, initialPid); - int retry = retries; - int processes; - while ((processes = DoKillProcessGroupOnce(cgroup, uid, initialPid, signal)) > 0) { - LOG(VERBOSE) << "Killed " << processes << " processes for processgroup " << initialPid; - if (!CgroupsAvailable()) { - // makes no sense to retry, because there are no cgroup_procs file - processes = 0; // no remaining processes - break; - } - if (retry > 0) { - std::this_thread::sleep_for(5ms); - --retry; - } else { - break; - } - } - - if (processes < 0) { - PLOG(ERROR) << "Error encountered killing process cgroup uid " << uid << " pid " - << initialPid; + const std::string eventsfile = cgroup_v2_path + '/' + PROCESSGROUP_CGROUP_EVENTS_FILE; + android::base::unique_fd events_fd(open(eventsfile.c_str(), O_RDONLY)); + if (events_fd.get() == -1) { + PLOG(WARNING) << "Error opening " << eventsfile << " for KillProcessGroup"; return -1; } - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - auto ms = std::chrono::duration_cast(end - start).count(); + struct pollfd fds = { + .fd = events_fd, + .events = POLLPRI, + }; - // We only calculate the number of 'processes' when killing the processes. - // In the retries == 0 case, we only kill the processes once and therefore - // will not have waited then recalculated how many processes are remaining - // after the first signals have been sent. - // Logging anything regarding the number of 'processes' here does not make sense. + const std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - if (processes == 0) { - if (retries > 0) { - LOG(INFO) << "Successfully killed process cgroup uid " << uid << " pid " << initialPid - << " in " << static_cast(ms) << "ms"; + // The primary reason to loop here is to capture any new forks or migrations that could occur + // after we send signals to the original set of processes, but before all of those processes + // exit and the cgroup becomes unpopulated, or before we remove the cgroup. We try hard to + // ensure this completes successfully to avoid permanent memory leaks, but we still place a + // large default upper bound on the amount of time we spend in this loop. The amount of CPU + // contention, and the amount of work that needs to be done in do_exit for each process + // determines how long this will take. + int ret; + do { + populated_status populated; + while ((populated = cgroupIsPopulated(events_fd.get())) == populated_status::populated && + std::chrono::steady_clock::now() < until) { + + sendSignalToProcessGroup(uid, initialPid, signal); + if (once) { + populated = cgroupIsPopulated(events_fd.get()); + break; + } + + const std::chrono::steady_clock::time_point poll_start = + std::chrono::steady_clock::now(); + + if (poll_start < until) + ret = TEMP_FAILURE_RETRY(poll(&fds, 1, toMillisec(until - poll_start).count())); + + if (ret == -1) { + // Fallback to 5ms sleeps if poll fails + PLOG(ERROR) << "Poll on " << eventsfile << "failed"; + const std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); + if (now < until) + std::this_thread::sleep_for(std::min(5ms, toMillisec(until - now))); + } + + LOG(VERBOSE) << "Waited " + << toMillisec(std::chrono::steady_clock::now() - poll_start).count() + << " ms for " << eventsfile << " poll"; } - if (!CgroupsAvailable()) { - // nothing to do here, if cgroups isn't available - return 0; + const std::chrono::milliseconds kill_duration = + toMillisec(std::chrono::steady_clock::now() - start); + + if (populated == populated_status::populated) { + LOG(WARNING) << "Still waiting on process(es) to exit for cgroup " << cgroup_v2_path + << " after " << kill_duration.count() << " ms"; + // We'll still try the cgroup removal below which we expect to log an error. + } else if (populated == populated_status::not_populated) { + LOG(VERBOSE) << "Killed all processes under cgroup " << cgroup_v2_path + << " after " << kill_duration.count() << " ms"; } - // 400 retries correspond to 2 secs max timeout - int err = RemoveCgroup(cgroup, uid, initialPid, 400); + ret = RemoveCgroup(hierarchy_root_path.c_str(), uid, initialPid); + if (ret) + PLOG(ERROR) << "Unable to remove cgroup " << cgroup_v2_path; + else + LOG(INFO) << "Removed cgroup " << cgroup_v2_path; if (isMemoryCgroupSupported() && UsePerAppMemcg()) { + // This per-application memcg v1 case should eventually be removed after migration to + // memcg v2. std::string memcg_apps_path; if (CgroupGetMemcgAppsPath(&memcg_apps_path) && - RemoveCgroup(memcg_apps_path.c_str(), uid, initialPid, 400) < 0) { - return -1; + (ret = RemoveCgroup(memcg_apps_path.c_str(), uid, initialPid)) < 0) { + const auto memcg_v1_cgroup_path = + ConvertUidPidToPath(memcg_apps_path.c_str(), uid, initialPid); + PLOG(ERROR) << "Unable to remove memcg v1 cgroup " << memcg_v1_cgroup_path; } } - return err; - } else { - if (retries > 0) { - LOG(ERROR) << "Failed to kill process cgroup uid " << uid << " pid " << initialPid - << " in " << static_cast(ms) << "ms, " << processes - << " processes remain"; - } - return -1; - } + if (once) break; + if (std::chrono::steady_clock::now() >= until) break; + } while (ret && errno == EBUSY); + + return ret; } int killProcessGroup(uid_t uid, int initialPid, int signal) { - return KillProcessGroup(uid, initialPid, signal, 40 /*retries*/); + return KillProcessGroup(uid, initialPid, signal); } int killProcessGroupOnce(uid_t uid, int initialPid, int signal) { - return KillProcessGroup(uid, initialPid, signal, 0 /*retries*/); -} - -int sendSignalToProcessGroup(uid_t uid, int initialPid, int signal) { - std::string hierarchy_root_path; - if (CgroupsAvailable()) { - CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path); - } - const char* cgroup = hierarchy_root_path.c_str(); - return DoKillProcessGroupOnce(cgroup, uid, initialPid, signal); + return KillProcessGroup(uid, initialPid, signal, true); } static int createProcessGroupInternal(uid_t uid, int initialPid, std::string cgroup, @@ -576,7 +639,7 @@ static int createProcessGroupInternal(uid_t uid, int initialPid, std::string cgr return -errno; } - auto uid_pid_procs_file = uid_pid_path + PROCESSGROUP_CGROUP_PROCS_FILE; + auto uid_pid_procs_file = uid_pid_path + '/' + PROCESSGROUP_CGROUP_PROCS_FILE; if (!WriteStringToFile(std::to_string(initialPid), uid_pid_procs_file)) { ret = -errno; From 38b8bb1e4a245de240f0761975e1d156030501fb Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Tue, 7 Nov 2023 14:36:46 +0000 Subject: [PATCH 2/2] libprocessgroup: Use cgroup.kill By using cgroup.kill we don't need to read cgroup.procs at all for SIGKILLs, which is more efficient and should help reduce CPU contention and cgroup lock contention. Fallback to cgroup.procs if we encounter an error trying to use cgroup.kill, but if cgroup.kill fails it's likely that cgroup.procs will too. Bug: 239829790 Change-Id: I44706faccfb7c4611b512a3642b913f06d30c1dc --- libprocessgroup/processgroup.cpp | 46 ++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/libprocessgroup/processgroup.cpp b/libprocessgroup/processgroup.cpp index 7e27d7544..4b49d09af 100644 --- a/libprocessgroup/processgroup.cpp +++ b/libprocessgroup/processgroup.cpp @@ -56,6 +56,7 @@ using android::base::WriteStringToFile; using namespace std::chrono_literals; #define PROCESSGROUP_CGROUP_PROCS_FILE "cgroup.procs" +#define PROCESSGROUP_CGROUP_KILL_FILE "cgroup.kill" #define PROCESSGROUP_CGROUP_EVENTS_FILE "cgroup.events" bool CgroupsAvailable() { @@ -77,6 +78,29 @@ bool CgroupGetControllerPath(const std::string& cgroup_name, std::string* path) return true; } +static std::string ConvertUidToPath(const char* cgroup, uid_t uid) { + return StringPrintf("%s/uid_%u", cgroup, uid); +} + +static std::string ConvertUidPidToPath(const char* cgroup, uid_t uid, int pid) { + return StringPrintf("%s/uid_%u/pid_%d", cgroup, uid, pid); +} + +static bool CgroupKillAvailable() { + static std::once_flag f; + static bool cgroup_kill_available = false; + std::call_once(f, []() { + std::string cg_kill; + CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &cg_kill); + // cgroup.kill is not on the root cgroup, so check a non-root cgroup that should always + // exist + cg_kill = ConvertUidToPath(cg_kill.c_str(), AID_ROOT) + '/' + PROCESSGROUP_CGROUP_KILL_FILE; + cgroup_kill_available = access(cg_kill.c_str(), F_OK) == 0; + }); + + return cgroup_kill_available; +} + static bool CgroupGetMemcgAppsPath(std::string* path) { CgroupController controller = CgroupMap::GetInstance().FindController("memory"); @@ -208,14 +232,6 @@ bool SetUserProfiles(uid_t uid, const std::vector& profiles) { false); } -static std::string ConvertUidToPath(const char* cgroup, uid_t uid) { - return StringPrintf("%s/uid_%u", cgroup, uid); -} - -static std::string ConvertUidPidToPath(const char* cgroup, uid_t uid, int pid) { - return StringPrintf("%s/uid_%u/pid_%d", cgroup, uid, pid); -} - static int RemoveCgroup(const char* cgroup, uid_t uid, int pid) { auto path = ConvertUidPidToPath(cgroup, uid, pid); int ret = TEMP_FAILURE_RETRY(rmdir(path.c_str())); @@ -362,6 +378,20 @@ bool sendSignalToProcessGroup(uid_t uid, int initialPid, int signal) { CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path); cgroup_v2_path = ConvertUidPidToPath(hierarchy_root_path.c_str(), uid, initialPid); + if (signal == SIGKILL && CgroupKillAvailable()) { + LOG(VERBOSE) << "Using " << PROCESSGROUP_CGROUP_KILL_FILE << " to SIGKILL " + << cgroup_v2_path; + const std::string killfilepath = cgroup_v2_path + '/' + PROCESSGROUP_CGROUP_KILL_FILE; + if (WriteStringToFile("1", killfilepath)) { + return true; + } else { + PLOG(ERROR) << "Failed to write 1 to " << killfilepath; + // Fallback to cgroup.procs below + } + } + + // Since cgroup.kill only sends SIGKILLs, we read cgroup.procs to find each process to + // signal individually. This is more costly than using cgroup.kill for SIGKILLs. LOG(VERBOSE) << "Using " << PROCESSGROUP_CGROUP_PROCS_FILE << " to signal (" << signal << ") " << cgroup_v2_path;