platform_bionic/libc/bionic/bionic_elf_tls.cpp
/*
* Copyright (C) 2019 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "private/bionic_elf_tls.h"
#include <async_safe/CHECK.h>
#include <async_safe/log.h>
#include <string.h>
#include <sys/param.h>
#include <unistd.h>
#include "platform/bionic/macros.h"
#include "platform/bionic/page.h"
#include "private/ScopedRWLock.h"
#include "private/ScopedSignalBlocker.h"
#include "private/bionic_globals.h"
#include "private/bionic_tls.h"
#include "pthread_internal.h"
// Every call to __tls_get_addr needs to check the generation counter, so
// accesses to the counter need to be as fast as possible. Keep a copy of it in
// a hidden variable, which can be accessed without using the GOT. The linker
// will update this variable when it updates its counter.
//
// To allow the linker to update this variable, libc.so's constructor passes its
// address to the linker. To accommodate a possible __tls_get_addr call before
// libc.so's constructor, this local copy is initialized to SIZE_MAX, forcing
// __tls_get_addr to initially use the slow path.
__LIBC_HIDDEN__ _Atomic(size_t) __libc_tls_generation_copy = SIZE_MAX;
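// A sketch of the handshake described above, using a hypothetical registration
// function (the exact plumbing lives in libc's startup code): libc.so's
// constructor would do something like
//
//   __linker_register_tls_generation_copy(&__libc_tls_generation_copy);
//
// after which the linker stores each new generation to the registered address
// (as well as to its own counter) whenever a module with TLS is loaded or
// unloaded.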
// Search for a TLS segment in the given phdr table. Returns true if it has a
// TLS segment and false otherwise.
bool __bionic_get_tls_segment(const ElfW(Phdr)* phdr_table, size_t phdr_count,
ElfW(Addr) load_bias, TlsSegment* out) {
for (size_t i = 0; i < phdr_count; ++i) {
const ElfW(Phdr)& phdr = phdr_table[i];
if (phdr.p_type == PT_TLS) {
*out = TlsSegment{
.aligned_size =
TlsAlignedSize{
.size = phdr.p_memsz,
.align =
TlsAlign{
.value = phdr.p_align ?: 1, // 0 means "no alignment requirement"
.skew = phdr.p_vaddr % MAX(1, phdr.p_align),
},
},
.init_ptr = reinterpret_cast<void*>(load_bias + phdr.p_vaddr),
.init_size = phdr.p_filesz,
};
return true;
}
}
return false;
}
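// Illustrative only, not part of libc: a caller such as the loader would scan
// the program headers it already has and validate the alignment, roughly like
// this (`phdr_table`, `phdr_count`, `load_bias`, and `name` are assumed to be
// in scope):
//
//   TlsSegment seg;
//   if (__bionic_get_tls_segment(phdr_table, phdr_count, load_bias, &seg)) {
//     if (!__bionic_check_tls_align(seg.aligned_size.align.value)) {
//       async_safe_fatal("TLS segment alignment in \"%s\" is not a power of 2: %zu",
//                        name, seg.aligned_size.align.value);
//     }
//     // ... record the segment for the static TLS layout ...
//   }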
// Return true if the alignment of a TLS segment is a valid power-of-two.
bool __bionic_check_tls_align(size_t align) {
// Note: The size does not need to be a multiple of the alignment. With ld.bfd
// (or after using binutils' strip), the TLS segment's size isn't rounded up.
return powerof2(align);
}
static void static_tls_layout_overflow() {
async_safe_fatal("error: TLS segments in static TLS overflowed");
}
static size_t align_checked(size_t value, TlsAlign tls_align) {
const size_t align = tls_align.value;
const size_t skew = tls_align.skew;
CHECK(align != 0 && powerof2(align) && skew < align);
const size_t result = ((value - skew + align - 1) & ~(align - 1)) + skew;
if (result < value) static_tls_layout_overflow();
return result;
}
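// Worked example of the skewed-alignment rounding above:
// align_checked(13, TlsAlign{.value = 8, .skew = 2}) returns 18, because
// ((13 - 2 + 7) & ~7) + 2 == (18 & ~7) + 2 == 16 + 2 == 18, the smallest
// value >= 13 that is congruent to 2 (mod 8).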
size_t StaticTlsLayout::offset_thread_pointer() const {
return offset_bionic_tcb_ + (-MIN_TLS_SLOT * sizeof(void*));
}
// Allocates the Bionic TCB and the executable's TLS segment in the static TLS
// layout, satisfying alignment requirements for both.
//
// For an executable's TLS accesses (using the LocalExec model), the static
// linker bakes TLS offsets directly into the .text section, so the loader must
// place the executable segment at the same offset relative to the TP.
// Similarly, the Bionic TLS slots (bionic_tcb) must also be allocated at the
// correct offset relative to the TP.
//
// Returns the offset of the executable's TLS segment.
//
// Note: This function has unit tests, but they are in bionic-unit-tests-static,
// not bionic-unit-tests.
size_t StaticTlsLayout::reserve_exe_segment_and_tcb(const TlsSegment* seg,
const char* progname __attribute__((unused))) {
// Special case: if the executable has no TLS segment, then just allocate a
// TCB and skip the minimum alignment check on ARM.
if (seg == nullptr) {
offset_bionic_tcb_ = reserve_type<bionic_tcb>();
return 0;
}
#if defined(__arm__) || defined(__aarch64__)
// ARM uses a "variant 1" TLS layout. The ABI specifies that the TP points at
// a 2-word TCB, followed by the executable's segment. In practice, libc
// implementations actually allocate a larger TCB at negative offsets from the
// TP.
//
// Historically, Bionic allocated an 8-word TCB starting at TP+0, so to keep
// the executable's TLS segment from overlapping the last 6 slots, Bionic
// requires that executables have an 8-word PT_TLS alignment to ensure that
// the TCB fits in the alignment padding, which it accomplishes using
// crtbegin.c. Bionic uses negative offsets for new TLS slots to avoid this
// problem.
static_assert(MIN_TLS_SLOT <= 0 && MAX_TLS_SLOT >= 1);
static_assert(sizeof(bionic_tcb) == (MAX_TLS_SLOT - MIN_TLS_SLOT + 1) * sizeof(void*));
static_assert(alignof(bionic_tcb) == sizeof(void*));
const size_t max_align = MAX(alignof(bionic_tcb), seg->aligned_size.align.value);
// Allocate the TCB first. Split it into negative and non-negative slots and
// ensure that TP (i.e. the first non-negative slot) is aligned to max_align.
const size_t tcb_size_pre = -MIN_TLS_SLOT * sizeof(void*);
const size_t tcb_size_post = (MAX_TLS_SLOT + 1) * sizeof(void*);
const auto pair =
reserve_tp_pair(TlsAlignedSize{.size = tcb_size_pre},
TlsAlignedSize{.size = tcb_size_post, .align = TlsAlign{.value = max_align}});
offset_bionic_tcb_ = pair.before;
const size_t offset_tp = pair.tp;
// Allocate the segment.
offset_exe_ = reserve(seg->aligned_size);
// Verify that the ABI and Bionic tpoff values are equal, which is equivalent
// to checking whether the segment is sufficiently aligned.
const size_t abi_tpoff = align_checked(2 * sizeof(void*), seg->aligned_size.align);
const size_t actual_tpoff = align_checked(tcb_size_post, seg->aligned_size.align);
CHECK(actual_tpoff == offset_exe_ - offset_tp);
if (abi_tpoff != actual_tpoff) {
async_safe_fatal(
"error: \"%s\": executable's TLS segment is underaligned: "
"alignment is %zu (skew %zu), needs to be at least %zu for %s Bionic",
progname, seg->aligned_size.align.value, seg->aligned_size.align.skew, tcb_size_post,
(sizeof(void*) == 4 ? "ARM" : "ARM64"));
}
#elif defined(__i386__) || defined(__x86_64__)
auto pair = reserve_tp_pair(seg->aligned_size, TlsAlignedSize::of_type<bionic_tcb>());
offset_exe_ = pair.before;
offset_bionic_tcb_ = pair.after;
#elif defined(__riscv)
static_assert(MAX_TLS_SLOT == -1, "Last slot of bionic_tcb must be slot #(-1) on riscv");
auto pair = reserve_tp_pair(TlsAlignedSize::of_type<bionic_tcb>(), seg->aligned_size);
offset_bionic_tcb_ = pair.before;
offset_exe_ = pair.after;
#else
#error "Unrecognized architecture"
#endif
return offset_exe_;
}
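// To summarize the layouts computed above (lower addresses on the left, "TP"
// is the thread pointer; a rough sketch, not to scale):
//
//   arm/arm64:  ... | bionic_tcb slots < 0 | TP: slots >= 0 | exe TLS | ...
//   x86/x86-64: ... | exe TLS | TP: bionic_tcb | ...
//   riscv:      ... | bionic_tcb (ends at slot -1) | TP: exe TLS | ...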
size_t StaticTlsLayout::reserve_bionic_tls() {
offset_bionic_tls_ = reserve_type<bionic_tls>();
return offset_bionic_tls_;
}
void StaticTlsLayout::finish_layout() {
// Round the offset up to the alignment.
cursor_ = align_checked(cursor_, TlsAlign{.value = align_});
}
size_t StaticTlsLayout::align_cursor(TlsAlign align) {
cursor_ = align_checked(cursor_, align);
align_ = MAX(align_, align.value);
return cursor_;
}
size_t StaticTlsLayout::align_cursor_unskewed(size_t align) {
return align_cursor(TlsAlign{.value = align});
}
// Reserve the requested number of bytes at the requested alignment. The
// requested size is not required to be a multiple of the alignment, nor is the
// cursor aligned after the allocation.
size_t StaticTlsLayout::reserve(TlsAlignedSize aligned_size) {
align_cursor(aligned_size.align);
const size_t result = cursor_;
if (__builtin_add_overflow(cursor_, aligned_size.size, &cursor_)) static_tls_layout_overflow();
return result;
}
// Calculate the TP offset and allocate something before it and something after
// it. The TP will be aligned to:
//
// MAX(before.align.value, after.align.value)
//
// The `before` and `after` allocations are each allocated as closely as
// possible to the TP.
StaticTlsLayout::TpAllocations StaticTlsLayout::reserve_tp_pair(TlsAlignedSize before,
TlsAlignedSize after) {
// Tentative `before` allocation.
const size_t tentative_before = reserve(before);
const size_t tentative_before_end = align_cursor_unskewed(before.align.value);
const size_t offset_tp = align_cursor_unskewed(MAX(before.align.value, after.align.value));
const size_t offset_after = reserve(after);
// If the `after` allocation has higher alignment than `before`, then there
// may be alignment padding to remove between `before` and the TP. Shift
// `before` forward to remove this padding.
CHECK(((offset_tp - tentative_before_end) & (before.align.value - 1)) == 0);
const size_t offset_before = tentative_before + (offset_tp - tentative_before_end);
return TpAllocations{offset_before, offset_tp, offset_after};
}
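// Worked example, starting from cursor_ == 0: for before = {size 8, align 1}
// and after = {size 16, align 16}, the tentative `before` is placed at offset 0
// (tentative end 8), the TP lands at offset 16 (aligned to MAX(1, 16)), and
// `after` is placed at offset 16. The 8 bytes of padding in [8, 16) are then
// removed by shifting `before` forward to offset 8, so it ends flush against
// the TP.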
// Copy each TLS module's initialization image into a newly-allocated block of
// static TLS memory. To reduce dirty pages, this function only writes to pages
// within the static TLS that need initialization. The memory should already be
// zero-initialized on entry.
void __init_static_tls(void* static_tls) {
// The part of the table we care about (i.e. static TLS modules) never changes
// after startup, but we still need the mutex because the table could grow,
// moving the initial part. If this locking is too slow, we can duplicate the
// static part of the table.
TlsModules& modules = __libc_shared_globals()->tls_modules;
ScopedSignalBlocker ssb;
ScopedReadLock locker(&modules.rwlock);
for (size_t i = 0; i < modules.module_count; ++i) {
TlsModule& module = modules.module_table[i];
if (module.static_offset == SIZE_MAX) {
// All of the static modules come before all of the dynamic modules, so
// once we see the first dynamic module, we're done.
break;
}
if (module.segment.init_size == 0) {
// Skip the memcpy call for TLS segments with no initializer, which is
// common.
continue;
}
memcpy(static_cast<char*>(static_tls) + module.static_offset,
module.segment.init_ptr,
module.segment.init_size);
}
}
static inline size_t dtv_size_in_bytes(size_t module_count) {
return sizeof(TlsDtv) + module_count * sizeof(void*);
}
// Calculates the number of module slots to allocate in a new DTV. For small
// objects (up to 1KiB), the TLS allocator allocates memory in power-of-2 sizes,
// so for better space usage, ensure that the DTV size (header + slots) is a
// power of 2.
//
// The lock on TlsModules must be held.
static size_t calculate_new_dtv_count() {
size_t loaded_cnt = __libc_shared_globals()->tls_modules.module_count;
size_t bytes = dtv_size_in_bytes(MAX(1, loaded_cnt));
if (!powerof2(bytes)) {
bytes = BIONIC_ROUND_UP_POWER_OF_2(bytes);
}
return (bytes - sizeof(TlsDtv)) / sizeof(void*);
}
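// Worked example, assuming a 24-byte TlsDtv header (three word-sized fields
// before the flexible `modules` array) on a 64-bit target: 6 loaded modules
// need 24 + 6*8 == 72 bytes, which rounds up to 128, so the new DTV gets
// (128 - 24) / 8 == 13 module slots.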
// This function must be called with signals blocked and a write lock on
// TlsModules held.
static void update_tls_dtv(bionic_tcb* tcb) {
const TlsModules& modules = __libc_shared_globals()->tls_modules;
BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
// Use the generation counter from the shared globals instead of the local
// copy, which won't be initialized yet if __tls_get_addr is called before
// libc.so's constructor.
if (__get_tcb_dtv(tcb)->generation == atomic_load(&modules.generation)) {
return;
}
const size_t old_cnt = __get_tcb_dtv(tcb)->count;
// If the DTV isn't large enough, allocate a larger one. Because a signal
// handler could interrupt the fast path of __tls_get_addr, we don't free the
// old DTV. Instead, we add the old DTV to a list, then free all of a thread's
// DTVs at thread-exit. Each time the DTV is reallocated, its size at least
// doubles.
if (modules.module_count > old_cnt) {
size_t new_cnt = calculate_new_dtv_count();
TlsDtv* const old_dtv = __get_tcb_dtv(tcb);
TlsDtv* const new_dtv = static_cast<TlsDtv*>(allocator.alloc(dtv_size_in_bytes(new_cnt)));
memcpy(new_dtv, old_dtv, dtv_size_in_bytes(old_cnt));
new_dtv->count = new_cnt;
new_dtv->next = old_dtv;
__set_tcb_dtv(tcb, new_dtv);
}
TlsDtv* const dtv = __get_tcb_dtv(tcb);
const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
char* static_tls = reinterpret_cast<char*>(tcb) - layout.offset_bionic_tcb();
// Initialize static TLS modules and free unloaded modules.
for (size_t i = 0; i < dtv->count; ++i) {
if (i < modules.module_count) {
const TlsModule& mod = modules.module_table[i];
if (mod.static_offset != SIZE_MAX) {
dtv->modules[i] = static_tls + mod.static_offset;
continue;
}
if (mod.first_generation != kTlsGenerationNone &&
mod.first_generation <= dtv->generation) {
continue;
}
}
if (modules.on_destruction_cb != nullptr) {
void* dtls_begin = dtv->modules[i];
void* dtls_end =
static_cast<void*>(static_cast<char*>(dtls_begin) + allocator.get_chunk_size(dtls_begin));
modules.on_destruction_cb(dtls_begin, dtls_end);
}
allocator.free(dtv->modules[i]);
dtv->modules[i] = nullptr;
}
dtv->generation = atomic_load(&modules.generation);
}
__attribute__((noinline)) static void* tls_get_addr_slow_path(const TlsIndex* ti) {
TlsModules& modules = __libc_shared_globals()->tls_modules;
bionic_tcb* tcb = __get_bionic_tcb();
// Block signals and lock TlsModules. We may need the allocator, so take
// a write lock.
ScopedSignalBlocker ssb;
ScopedWriteLock locker(&modules.rwlock);
update_tls_dtv(tcb);
TlsDtv* dtv = __get_tcb_dtv(tcb);
const size_t module_idx = __tls_module_id_to_idx(ti->module_id);
void* mod_ptr = dtv->modules[module_idx];
if (mod_ptr == nullptr) {
const TlsSegment& segment = modules.module_table[module_idx].segment;
// TODO: Currently the aligned_size.align.skew property is ignored.
// That is, for a dynamic TLS block at addr A, (A % p_align) will be 0, not
// (p_vaddr % p_align).
mod_ptr = __libc_shared_globals()->tls_allocator.memalign(segment.aligned_size.align.value,
segment.aligned_size.size);
if (segment.init_size > 0) {
memcpy(mod_ptr, segment.init_ptr, segment.init_size);
}
dtv->modules[module_idx] = mod_ptr;
// Reports the allocation to the listener, if any.
if (modules.on_creation_cb != nullptr) {
modules.on_creation_cb(
mod_ptr, static_cast<void*>(static_cast<char*>(mod_ptr) + segment.aligned_size.size));
}
}
return static_cast<char*>(mod_ptr) + ti->offset + TLS_DTV_OFFSET;
}
// Returns the address of a thread's TLS memory given a module ID and an offset
// into that module's TLS segment. This function is called on every access to a
// dynamic TLS variable on targets that don't use TLSDESC. arm64 uses TLSDESC,
// so it only calls this function on a thread's first access to a module's TLS
// segment.
//
// On most targets, this accessor function is __tls_get_addr and
// TLS_GET_ADDR_CALLING_CONVENTION is unset, but 32-bit x86 uses
// ___tls_get_addr (note: three underscores) instead, with a regparm
// calling convention.
extern "C" void* TLS_GET_ADDR(const TlsIndex* ti) TLS_GET_ADDR_CALLING_CONVENTION {
TlsDtv* dtv = __get_tcb_dtv(__get_bionic_tcb());
// TODO: See if we can use a relaxed memory ordering here instead.
size_t generation = atomic_load(&__libc_tls_generation_copy);
if (__predict_true(generation == dtv->generation)) {
void* mod_ptr = dtv->modules[__tls_module_id_to_idx(ti->module_id)];
if (__predict_true(mod_ptr != nullptr)) {
return static_cast<char*>(mod_ptr) + ti->offset + TLS_DTV_OFFSET;
}
}
return tls_get_addr_slow_path(ti);
}
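// The two macros used above are defined in private/bionic_elf_tls.h along
// these lines (a sketch of the idea, not a verbatim copy):
//
//   #if defined(__i386__)
//   #define TLS_GET_ADDR_CALLING_CONVENTION __attribute__((regparm(1)))
//   #define TLS_GET_ADDR ___tls_get_addr
//   #else
//   #define TLS_GET_ADDR_CALLING_CONVENTION
//   #define TLS_GET_ADDR __tls_get_addr
//   #endif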
// This function frees:
// - TLS modules referenced by the current DTV.
// - The list of DTV objects associated with the current thread.
//
// The caller must have already blocked signals.
void __free_dynamic_tls(bionic_tcb* tcb) {
TlsModules& modules = __libc_shared_globals()->tls_modules;
BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
// If we didn't allocate any dynamic memory, skip out early without taking
// the lock.
TlsDtv* dtv = __get_tcb_dtv(tcb);
if (dtv->generation == kTlsGenerationNone) {
return;
}
// We need the write lock to use the allocator.
ScopedWriteLock locker(&modules.rwlock);
// First free everything in the current DTV.
for (size_t i = 0; i < dtv->count; ++i) {
if (i < modules.module_count && modules.module_table[i].static_offset != SIZE_MAX) {
// This module's TLS memory is allocated statically, so don't free it here.
continue;
}
if (modules.on_destruction_cb != nullptr) {
void* dtls_begin = dtv->modules[i];
void* dtls_end =
static_cast<void*>(static_cast<char*>(dtls_begin) + allocator.get_chunk_size(dtls_begin));
modules.on_destruction_cb(dtls_begin, dtls_end);
}
allocator.free(dtv->modules[i]);
}
// Now free the thread's list of DTVs.
while (dtv->generation != kTlsGenerationNone) {
TlsDtv* next = dtv->next;
allocator.free(dtv);
dtv = next;
}
// Clear the DTV slot. The DTV must not be used again with this thread.
tcb->tls_slot(TLS_SLOT_DTV) = nullptr;
}
// Invokes all the registered thread_exit callbacks, if any.
void __notify_thread_exit_callbacks() {
TlsModules& modules = __libc_shared_globals()->tls_modules;
if (modules.first_thread_exit_callback == nullptr) {
// If there is no first_thread_exit_callback, there shouldn't be a tail.
CHECK(modules.thread_exit_callback_tail_node == nullptr);
return;
}
// Callbacks are supposed to be invoked in the reverse order
// in which they were registered.
CallbackHolder* node = modules.thread_exit_callback_tail_node;
while (node != nullptr) {
node->cb();
node = node->prev;
}
modules.first_thread_exit_callback();
}