From c531ed6648b99e88061cd41e7948f1f4e2dcef03 Mon Sep 17 00:00:00 2001 From: Josh Gao Date: Wed, 24 Jan 2018 14:23:42 -0800 Subject: [PATCH] debuggerd_fallback: fix race. A race condition occurs when one thread takes more than a second to get scheduled to handle the signal we send to ask it to dump its stack. When this happens, the main thread will continue on, close the fd, and then ask the next thread to dump, but the slow thread will then wake up and try to write to the new thread's fd, or trigger an assertion in __linker_enable_fallback_allocator. Do a few things to make this less bad: - encode both target tid and fd in the shared atomic, so that we know who each fd is for - switch __linker_enable_fallback_allocator to return success instead of aborting, and bail out if it's already in use - write to the output fd right when we get to it, instead of doing it whenever the dumping code decides to, to reduce the likelihood that the timeout expires Test: debuggerd_test Change-Id: Ife0f6dae388b601e7f991605f14d7a0274013f6b --- debuggerd/handler/debuggerd_fallback.cpp | 105 +++++++++++++++++++---- 1 file changed, 89 insertions(+), 16 deletions(-) diff --git a/debuggerd/handler/debuggerd_fallback.cpp b/debuggerd/handler/debuggerd_fallback.cpp index 5fddddcf2..364fca5e8 100644 --- a/debuggerd/handler/debuggerd_fallback.cpp +++ b/debuggerd/handler/debuggerd_fallback.cpp @@ -55,7 +55,7 @@ using android::base::unique_fd; using unwindstack::Regs; -extern "C" void __linker_enable_fallback_allocator(); +extern "C" bool __linker_enable_fallback_allocator(); extern "C" void __linker_disable_fallback_allocator(); // This is incredibly sketchy to do inside of a signal handler, especially when libbacktrace @@ -65,7 +65,11 @@ extern "C" void __linker_disable_fallback_allocator(); // This isn't the default method of dumping because it can fail in cases such as address space // exhaustion. static void debuggerd_fallback_trace(int output_fd, ucontext_t* ucontext) { - __linker_enable_fallback_allocator(); + if (!__linker_enable_fallback_allocator()) { + async_safe_format_log(ANDROID_LOG_ERROR, "libc", "fallback allocator already in use"); + return; + } + { std::unique_ptr regs; @@ -84,7 +88,11 @@ static void debuggerd_fallback_trace(int output_fd, ucontext_t* ucontext) { static void debuggerd_fallback_tombstone(int output_fd, ucontext_t* ucontext, siginfo_t* siginfo, void* abort_message) { - __linker_enable_fallback_allocator(); + if (!__linker_enable_fallback_allocator()) { + async_safe_format_log(ANDROID_LOG_ERROR, "libc", "fallback allocator already in use"); + return; + } + engrave_tombstone_ucontext(output_fd, reinterpret_cast(abort_message), siginfo, ucontext); __linker_disable_fallback_allocator(); @@ -116,7 +124,7 @@ static void iterate_siblings(bool (*callback)(pid_t, int), int output_fd) { closedir(dir); } -static bool forward_output(int src_fd, int dst_fd) { +static bool forward_output(int src_fd, int dst_fd, pid_t expected_tid) { // Make sure the thread actually got the signal. struct pollfd pfd = { .fd = src_fd, .events = POLLIN, @@ -127,6 +135,18 @@ static bool forward_output(int src_fd, int dst_fd) { return false; } + pid_t tid; + if (TEMP_FAILURE_RETRY(read(src_fd, &tid, sizeof(tid))) != sizeof(tid)) { + async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to read tid"); + return false; + } + + if (tid != expected_tid) { + async_safe_format_log(ANDROID_LOG_ERROR, "libc", "received tid %d, expected %d", tid, + expected_tid); + return false; + } + while (true) { char buf[512]; ssize_t rc = TEMP_FAILURE_RETRY(read(src_fd, buf, sizeof(buf))); @@ -144,16 +164,54 @@ static bool forward_output(int src_fd, int dst_fd) { } } +struct __attribute__((__packed__)) packed_thread_output { + int32_t tid; + int32_t fd; +}; + +static uint64_t pack_thread_fd(pid_t tid, int fd) { + packed_thread_output packed = {.tid = tid, .fd = fd}; + uint64_t result; + static_assert(sizeof(packed) == sizeof(result)); + memcpy(&result, &packed, sizeof(packed)); + return result; +} + +static std::pair unpack_thread_fd(uint64_t value) { + packed_thread_output result; + memcpy(&result, &value, sizeof(value)); + return std::make_pair(result.tid, result.fd); +} + static void trace_handler(siginfo_t* info, ucontext_t* ucontext) { - static std::atomic trace_output_fd(-1); + static std::atomic trace_output(pack_thread_fd(-1, -1)); if (info->si_value.sival_int == ~0) { // Asked to dump by the original signal recipient. - debuggerd_fallback_trace(trace_output_fd, ucontext); + uint64_t val = trace_output.load(); + auto [tid, fd] = unpack_thread_fd(val); + if (tid != gettid()) { + // We received some other thread's info request? + async_safe_format_log(ANDROID_LOG_ERROR, "libc", + "thread %d received output fd for thread %d?", gettid(), tid); + return; + } - int tmp = trace_output_fd.load(); - trace_output_fd.store(-1); - close(tmp); + if (!trace_output.compare_exchange_strong(val, pack_thread_fd(-1, -1))) { + // Presumably, the timeout in forward_output expired, and the main thread moved on. + // If this happened, the main thread closed our fd for us, so just return. + async_safe_format_log(ANDROID_LOG_ERROR, "libc", "cmpxchg for thread %d failed", gettid()); + return; + } + + // Write our tid to the output fd to let the main thread know that we're working. + if (TEMP_FAILURE_RETRY(write(fd, &tid, sizeof(tid))) == sizeof(tid)) { + debuggerd_fallback_trace(fd, ucontext); + } else { + async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to write to output fd"); + } + + close(fd); return; } @@ -189,7 +247,14 @@ static void trace_handler(siginfo_t* info, ucontext_t* ucontext) { return false; } - trace_output_fd.store(pipe_write.get()); + uint64_t expected = pack_thread_fd(-1, -1); + if (!trace_output.compare_exchange_strong(expected, + pack_thread_fd(tid, pipe_write.release()))) { + auto [tid, fd] = unpack_thread_fd(expected); + async_safe_format_log(ANDROID_LOG_ERROR, "libc", + "thread %d is already outputting to fd %d?", tid, fd); + return false; + } siginfo_t siginfo = {}; siginfo.si_code = SI_QUEUE; @@ -203,12 +268,20 @@ static void trace_handler(siginfo_t* info, ucontext_t* ucontext) { return false; } - bool success = forward_output(pipe_read.get(), output_fd); - if (success) { - // The signaled thread has closed trace_output_fd already. - (void)pipe_write.release(); - } else { - trace_output_fd.store(-1); + bool success = forward_output(pipe_read.get(), output_fd, tid); + if (!success) { + async_safe_format_log(ANDROID_LOG_ERROR, "libc", + "timeout expired while waiting for thread %d to dump", tid); + } + + // Regardless of whether the poll succeeds, check to see if the thread took fd ownership. + uint64_t post_wait = trace_output.exchange(pack_thread_fd(-1, -1)); + if (post_wait != pack_thread_fd(-1, -1)) { + auto [tid, fd] = unpack_thread_fd(post_wait); + if (fd != -1) { + async_safe_format_log(ANDROID_LOG_ERROR, "libc", "closing fd %d for thread %d", fd, tid); + close(fd); + } } return true;