Merge "lmkd: Add support for process death notifications"

Suren Baghdasaryan 2019-10-09 16:18:40 +00:00 committed by Gerrit Code Review
commit 530098cebc


@@ -31,6 +31,7 @@
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/time.h>
#include <sys/types.h>
@@ -139,6 +140,10 @@
/* ro.lmk.psi_complete_stall_ms property defaults */
#define DEF_COMPLETE_STALL 700
static inline int sys_pidfd_open(pid_t pid, unsigned int flags) {
return syscall(__NR_pidfd_open, pid, flags);
}
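
Editor's note: the wrapper above assumes the build headers already define __NR_pidfd_open (the syscall landed in Linux 5.3). When compiling against older headers, a guard along these lines is one option; pinning the number is an assumption of this sketch, not part of the change (434 is the asm-generic value):

#include <sys/syscall.h>

/* Hypothetical fallback, not in this commit: supply the syscall number
 * when old kernel headers do not provide it. */
#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434    /* asm-generic table, Linux 5.3+ */
#endif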
/* default to old in-kernel interface if no memory pressure events */
static bool use_inkernel_interface = true;
static bool has_inkernel_module;
@@ -169,6 +174,11 @@ struct psi_threshold {
static int level_oomadj[VMPRESS_LEVEL_COUNT];
static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
static bool pidfd_supported;
static int last_kill_pid_or_fd = -1;
static struct timespec last_kill_tm;
/* lmkd configurable parameters */
static bool debug_process_killing;
static bool enable_pressure_upgrade;
static int64_t upgrade_pressure;
@@ -197,6 +207,8 @@ enum polling_update {
POLLING_DO_NOT_CHANGE,
POLLING_START,
POLLING_STOP,
POLLING_PAUSE,
POLLING_RESUME,
};
/*
@@ -207,6 +219,7 @@ enum polling_update {
*/
struct polling_params {
struct event_handler_info* poll_handler;
struct event_handler_info* paused_handler;
struct timespec poll_start_tm;
struct timespec last_poll_tm;
int polling_interval_ms;
@@ -235,8 +248,11 @@ static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
/* vmpressure event handler data */
static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
/* 3 memory pressure levels, 1 ctrl listen socket, 2 ctrl data sockets, 1 lmk events fd */
#define MAX_EPOLL_EVENTS (2 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT)
/*
* 1 ctrl listen socket, 2 ctrl data sockets, 3 memory pressure levels,
* 1 lmk events fd + 1 fd to wait for process death
*/
#define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
static int epollfd;
static int maxevents;
@@ -1647,11 +1663,112 @@ static void set_process_group_and_prio(int pid, SchedPolicy sp, int prio) {
closedir(d);
}
static int last_killed_pid = -1;
static bool is_kill_pending(void) {
char buf[24];
if (last_kill_pid_or_fd < 0) {
return false;
}
if (pidfd_supported) {
return true;
}
/* When pidfd is not supported, base the decision on /proc/<pid> existence */
snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
if (access(buf, F_OK) == 0) {
return true;
}
return false;
}
static bool is_waiting_for_kill(void) {
return pidfd_supported && last_kill_pid_or_fd >= 0;
}
static void stop_wait_for_proc_kill(bool finished) {
struct epoll_event epev;
if (last_kill_pid_or_fd < 0) {
return;
}
if (debug_process_killing) {
struct timespec curr_tm;
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
/*
* curr_tm is used here merely to report kill duration, so this failure is not fatal.
* Log an error and continue.
*/
ALOGE("Failed to get current time");
}
if (finished) {
ALOGI("Process got killed in %ldms",
get_time_diff_ms(&last_kill_tm, &curr_tm));
} else {
ALOGI("Stop waiting for process kill after %ldms",
get_time_diff_ms(&last_kill_tm, &curr_tm));
}
}
if (pidfd_supported) {
/* unregister fd */
if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev) != 0) {
ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
return;
}
maxevents--;
close(last_kill_pid_or_fd);
}
last_kill_pid_or_fd = -1;
}
static void kill_done_handler(int data __unused, uint32_t events __unused,
struct polling_params *poll_params) {
stop_wait_for_proc_kill(true);
poll_params->update = POLLING_RESUME;
}
static void start_wait_for_proc_kill(int pid) {
static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
struct epoll_event epev;
if (last_kill_pid_or_fd >= 0) {
/* Should not happen, but if it does, stop the previous wait */
ALOGE("Attempt to wait for a kill while another wait is in progress");
stop_wait_for_proc_kill(false);
}
if (!pidfd_supported) {
/* If pidfd is not supported, store the PID of the process being killed */
last_kill_pid_or_fd = pid;
return;
}
last_kill_pid_or_fd = TEMP_FAILURE_RETRY(sys_pidfd_open(pid, 0));
if (last_kill_pid_or_fd < 0) {
ALOGE("pidfd_open for process pid %d failed; errno=%d", pid, errno);
return;
}
epev.events = EPOLLIN;
epev.data.ptr = (void *)&kill_done_hinfo;
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
close(last_kill_pid_or_fd);
last_kill_pid_or_fd = -1;
return;
}
maxevents++;
}
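
The mechanism this function relies on, in isolation: a pidfd becomes readable (EPOLLIN) once its process exits, so a single epoll_wait() replaces repeated /proc/<pid> checks. A minimal standalone sketch (editor's illustration, error handling elided, assumes Linux 5.3+):

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/syscall.h>
#include <sys/types.h>

int main(void) {
    pid_t pid = fork();
    if (pid == 0) {               /* child: linger briefly, then exit */
        sleep(1);
        _exit(0);
    }

    /* Open a pidfd for the child and register it with epoll */
    int pidfd = (int)syscall(__NR_pidfd_open, pid, 0);
    int epfd = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN };
    epoll_ctl(epfd, EPOLL_CTL_ADD, pidfd, &ev);

    /* Blocks until the child dies; no /proc polling needed */
    struct epoll_event out;
    if (epoll_wait(epfd, &out, 1, -1) == 1)
        printf("child %d exited\n", (int)pid);

    close(pidfd);
    close(epfd);
    return 0;
}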
/* Kill one process specified by procp. Returns the size of the process killed */
static int kill_one_process(struct proc* procp, int min_oom_score, int kill_reason,
const char *kill_desc, union meminfo *mi) {
const char *kill_desc, union meminfo *mi, struct timespec *tm) {
int pid = procp->pid;
uid_t uid = procp->uid;
int tgid;
@@ -1682,12 +1799,16 @@ static int kill_one_process(struct proc* procp, int min_oom_score, int kill_reas
TRACE_KILL_START(pid);
/* Have to start waiting before sending SIGKILL to make sure pid is valid */
start_wait_for_proc_kill(pid);
/* CAP_KILL required */
r = kill(pid, SIGKILL);
TRACE_KILL_END();
if (r) {
stop_wait_for_proc_kill(false);
ALOGE("kill(%d): errno=%d", pid, errno);
/* Delete process record even when we fail to kill so that we don't get stuck on it */
goto out;
@@ -1695,6 +1816,8 @@ static int kill_one_process(struct proc* procp, int min_oom_score, int kill_reas
set_process_group_and_prio(pid, SP_FOREGROUND, ANDROID_PRIORITY_HIGHEST);
last_kill_tm = *tm;
inc_killcnt(procp->oomadj);
killinfo_log(procp, min_oom_score, tasksize, kill_reason, mi);
@@ -1707,8 +1830,6 @@ static int kill_one_process(struct proc* procp, int min_oom_score, int kill_reas
uid, procp->oomadj, tasksize * page_k);
}
last_killed_pid = pid;
stats_write_lmk_kill_occurred(LMK_KILL_OCCURRED, uid, taskname,
procp->oomadj, min_oom_score, tasksize, mem_st);
@@ -1728,7 +1849,7 @@ out:
* Returns size of the killed process.
*/
static int find_and_kill_process(int min_score_adj, int kill_reason, const char *kill_desc,
union meminfo *mi) {
union meminfo *mi, struct timespec *tm) {
int i;
int killed_size = 0;
bool lmk_state_change_start = false;
@@ -1743,7 +1864,7 @@ static int find_and_kill_process(int min_score_adj, int kill_reason, const char
if (!procp)
break;
killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc, mi);
killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc, mi, tm);
if (killed_size >= 0) {
if (!lmk_state_change_start) {
lmk_state_change_start = true;
@@ -1822,23 +1943,6 @@ enum vmpressure_level downgrade_level(enum vmpressure_level level) {
level - 1 : level);
}
static bool is_kill_pending(void) {
char buf[24];
if (last_killed_pid < 0) {
return false;
}
snprintf(buf, sizeof(buf), "/proc/%d/", last_killed_pid);
if (access(buf, F_OK) == 0) {
return true;
}
// reset last killed PID because there's nothing pending
last_killed_pid = -1;
return false;
}
enum zone_watermark {
WMARK_MIN = 0,
WMARK_LOW,
@@ -1934,9 +2038,13 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
/* Skip while still killing a process */
if (is_kill_pending()) {
/* TODO: replace this quick polling with pidfd polling if kernel supports */
goto no_kill;
}
/*
* Process is dead, stop waiting. This has no effect if pidfds are supported and
* death notification already caused waiting to stop.
*/
stop_wait_for_proc_kill(true);
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
ALOGE("Failed to get current time");
@@ -2067,7 +2175,8 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
/* Kill a process if necessary */
if (kill_reason != NONE) {
int pages_freed = find_and_kill_process(min_score_adj, kill_reason, kill_desc, &mi);
int pages_freed = find_and_kill_process(min_score_adj, kill_reason, kill_desc, &mi,
&curr_tm);
if (pages_freed > 0) {
killing = true;
if (cut_thrashing_limit) {
@@ -2081,6 +2190,13 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
}
no_kill:
/* Do not poll if kernel supports pidfd waiting */
if (is_waiting_for_kill()) {
/* Pause polling if we are waiting for process death notification */
poll_params->update = POLLING_PAUSE;
return;
}
/*
* Start polling after initial PSI event;
* extend polling while device is in direct reclaim or process is being killed;
@@ -2110,7 +2226,6 @@ static void mp_event_common(int data, uint32_t events, struct polling_params *po
union meminfo mi;
struct zoneinfo zi;
struct timespec curr_tm;
static struct timespec last_kill_tm;
static unsigned long kill_skip_count = 0;
enum vmpressure_level level = (enum vmpressure_level)data;
long other_free = 0, other_file = 0;
@@ -2159,15 +2274,26 @@ static void mp_event_common(int data, uint32_t events, struct polling_params *po
return;
}
if (kill_timeout_ms) {
// If we're within the timeout, see if there's pending reclaim work
// from the last killed process. If there is (as evidenced by
// /proc/<pid> continuing to exist), skip killing for now.
if ((get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) &&
(low_ram_device || is_kill_pending())) {
if (kill_timeout_ms && get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) {
/*
* If we're within the no-kill timeout, see if there's pending reclaim work
* from the last killed process. If so, skip killing for now.
*/
if (is_kill_pending()) {
kill_skip_count++;
return;
}
/*
* Process is dead, stop waiting. This has no effect if pidfds are supported and
* death notification already caused waiting to stop.
*/
stop_wait_for_proc_kill(true);
} else {
/*
* Killing took longer than no-kill timeout. Stop waiting for the last process
* to die because we are ready to kill again.
*/
stop_wait_for_proc_kill(false);
}
if (kill_skip_count > 0) {
@@ -2266,7 +2392,7 @@ static void mp_event_common(int data, uint32_t events, struct polling_params *po
do_kill:
if (low_ram_device) {
/* For Go devices kill only one task */
if (find_and_kill_process(level_oomadj[level], -1, NULL, &mi) == 0) {
if (find_and_kill_process(level_oomadj[level], -1, NULL, &mi, &curr_tm) == 0) {
if (debug_process_killing) {
ALOGI("Nothing to kill");
}
@@ -2289,7 +2415,7 @@ do_kill:
min_score_adj = level_oomadj[level];
}
pages_freed = find_and_kill_process(min_score_adj, -1, NULL, &mi);
pages_freed = find_and_kill_process(min_score_adj, -1, NULL, &mi, &curr_tm);
if (pages_freed == 0) {
/* Rate limit kill reports when nothing was reclaimed */
@@ -2297,9 +2423,6 @@ do_kill:
report_skip_count++;
return;
}
} else {
/* If we killed anything, update the last killed timestamp. */
last_kill_tm = curr_tm;
}
/* Log whenever we kill or when report rate limit allows */
@@ -2322,6 +2445,10 @@ do_kill:
last_report_tm = curr_tm;
}
if (is_waiting_for_kill()) {
/* Pause polling if we are waiting for process death notification */
poll_params->update = POLLING_PAUSE;
}
}
static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
@@ -2473,6 +2600,7 @@ static int init(void) {
.fd = -1,
};
struct epoll_event epev;
int pidfd;
int i;
int ret;
@@ -2563,9 +2691,61 @@ static int init(void) {
ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
}
/* check if kernel supports pidfd_open syscall */
pidfd = TEMP_FAILURE_RETRY(sys_pidfd_open(getpid(), 0));
if (pidfd < 0) {
pidfd_supported = (errno != ENOSYS);
} else {
pidfd_supported = true;
close(pidfd);
}
ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
return 0;
}
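
Note on the probe above: any errno other than ENOSYS still proves the syscall is wired into the kernel, which is why the check is errno != ENOSYS rather than a plain success test. Factored into a helper, the same pattern looks roughly like this (editor's sketch, not part of the change):

#include <errno.h>
#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Sketch: run-time detection of pidfd_open support. Only ENOSYS
 * means the kernel lacks the syscall entirely. */
static bool kernel_supports_pidfd_open(void) {
    int fd = (int)syscall(__NR_pidfd_open, getpid(), 0);
    if (fd >= 0) {
        close(fd);
        return true;
    }
    return errno != ENOSYS;
}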
static void call_handler(struct event_handler_info* handler_info,
struct polling_params *poll_params, uint32_t events) {
struct timespec curr_tm;
handler_info->handler(handler_info->data, events, poll_params);
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
poll_params->last_poll_tm = curr_tm;
switch (poll_params->update) {
case POLLING_START:
/*
* Poll for the duration of PSI_WINDOW_SIZE_MS after the
* initial PSI event because psi events are rate-limited
* at one per sec.
*/
poll_params->poll_start_tm = curr_tm;
if (poll_params->poll_handler != handler_info) {
poll_params->poll_handler = handler_info;
}
break;
case POLLING_STOP:
poll_params->poll_handler = NULL;
break;
case POLLING_PAUSE:
poll_params->paused_handler = handler_info;
poll_params->poll_handler = NULL;
break;
case POLLING_RESUME:
poll_params->poll_start_tm = curr_tm;
poll_params->poll_handler = poll_params->paused_handler;
break;
case POLLING_DO_NOT_CHANGE:
if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
/* Polled for the duration of PSI window, time to stop */
poll_params->poll_handler = NULL;
}
/* WARNING: skipping the rest of the function */
return;
}
poll_params->update = POLLING_DO_NOT_CHANGE;
}
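
Taken together, the update field drives a small state machine over poll_handler and paused_handler. A toy model of the PAUSE/RESUME bookkeeping (editor's illustration with invented names; clearing paused_handler actually happens in mainloop(), shown in the next hunk):

#include <assert.h>
#include <stddef.h>

struct handler { int id; };

struct poll_state {
    struct handler *active;   /* stands in for poll_handler   */
    struct handler *paused;   /* stands in for paused_handler */
};

static void on_pause(struct poll_state *s, struct handler *h) {
    s->paused = h;            /* POLLING_PAUSE parks the handler...   */
    s->active = NULL;         /* ...and turns polling off             */
}

static void on_resume(struct poll_state *s) {
    s->active = s->paused;    /* POLLING_RESUME restores it; mainloop
                               * sees active == paused, clears paused
                               * and polls again immediately          */
}

int main(void) {
    struct handler h = { 1 };
    struct poll_state s = { NULL, NULL };
    on_pause(&s, &h);
    assert(s.active == NULL && s.paused == &h);
    on_resume(&s);
    assert(s.active == &h);   /* mainloop then resets s.paused */
    return 0;
}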
static void mainloop(void) {
struct event_handler_info* handler_info;
struct polling_params poll_params;
@@ -2582,41 +2762,33 @@ static void mainloop(void) {
int i;
if (poll_params.poll_handler) {
/* Calculate next timeout */
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
delay = (delay < poll_params.polling_interval_ms) ?
poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
/* Wait for events until the next polling timeout */
nevents = epoll_wait(epollfd, events, maxevents, delay);
bool poll_now;
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
if (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
poll_params.polling_interval_ms) {
/* Set input params for the call */
poll_params.poll_handler->handler(poll_params.poll_handler->data, 0, &poll_params);
poll_params.last_poll_tm = curr_tm;
if (poll_params.poll_handler == poll_params.paused_handler) {
/*
* Just transitioned into POLLING_RESUME. Reset paused_handler
* and poll immediately
*/
poll_params.paused_handler = NULL;
poll_now = true;
nevents = 0;
} else {
/* Calculate next timeout */
delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
delay = (delay < poll_params.polling_interval_ms) ?
poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
if (poll_params.update != POLLING_DO_NOT_CHANGE) {
switch (poll_params.update) {
case POLLING_START:
poll_params.poll_start_tm = curr_tm;
break;
case POLLING_STOP:
poll_params.poll_handler = NULL;
break;
default:
break;
}
poll_params.update = POLLING_DO_NOT_CHANGE;
} else {
if (get_time_diff_ms(&poll_params.poll_start_tm, &curr_tm) >
PSI_WINDOW_SIZE_MS) {
/* Polled for the duration of PSI window, time to stop */
poll_params.poll_handler = NULL;
}
}
/* Wait for events until the next polling timeout */
nevents = epoll_wait(epollfd, events, maxevents, delay);
/* Update current time after wait */
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
poll_params.polling_interval_ms);
}
if (poll_now) {
call_handler(poll_params.poll_handler, &poll_params, 0);
}
} else {
/* Wait for events with no timeout */
@@ -2656,29 +2828,7 @@ static void mainloop(void) {
}
if (evt->data.ptr) {
handler_info = (struct event_handler_info*)evt->data.ptr;
/* Set input params for the call */
handler_info->handler(handler_info->data, evt->events, &poll_params);
if (poll_params.update != POLLING_DO_NOT_CHANGE) {
switch (poll_params.update) {
case POLLING_START:
/*
* Poll for the duration of PSI_WINDOW_SIZE_MS after the
* initial PSI event because psi events are rate-limited
* at one per sec.
*/
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
poll_params.poll_start_tm = poll_params.last_poll_tm = curr_tm;
poll_params.poll_handler = handler_info;
break;
case POLLING_STOP:
poll_params.poll_handler = NULL;
break;
default:
break;
}
poll_params.update = POLLING_DO_NOT_CHANGE;
}
call_handler(handler_info, &poll_params, evt->events);
}
}
}