From 7dd3896fe1ec5160169b17507962fc699abea39f Mon Sep 17 00:00:00 2001 From: Elliott Hughes Date: Thu, 6 Apr 2023 14:50:31 -0700 Subject: [PATCH] riscv64: switch from x18 to gp for shadow call stack. We want to give back a useful callee-saved general purpose register (x18) that was only "chosen" because it was what llvm allowed for historical reasons. gp is a better choice because it's effectively unused otherwise anyway. Unfortunately, that means we need extra space in jmp_buf (which I've reserved in an earlier change, e7b3b8b467bad2cd32470b5edd5cb9938b934316), so let's rearrange the entries in jmp_buf to match their order in the register file. Bug: https://github.com/google/android-riscv64/issues/72 Bug: http://b/277909695 Test: treehugger Change-Id: Ia629409a894c1a83d2052885702bbdd895c758e1 --- libc/arch-riscv64/bionic/setjmp.S | 87 ++++++++++++++++--------------- libc/bionic/pthread_create.cpp | 6 +-- libc/bionic/pthread_internal.h | 27 +++++----- 3 files changed, 64 insertions(+), 56 deletions(-) diff --git a/libc/arch-riscv64/bionic/setjmp.S b/libc/arch-riscv64/bionic/setjmp.S index ba3cacf44..26f7ec9d3 100644 --- a/libc/arch-riscv64/bionic/setjmp.S +++ b/libc/arch-riscv64/bionic/setjmp.S @@ -36,50 +36,52 @@ // 0 sigflag/cookie setjmp cookie in top 31 bits, signal mask flag in low bit // 1 sigmask 64-bit signal mask // 2 ra -// 3 s0 +// 3 sp +// 4 gp +// 5 s0 // ...... -// 14 s11 -// 15 sp -// 16 fs0 +// 16 s11 +// 17 fs0 // ...... -// 27 fs11 -// 28 checksum +// 28 fs11 +// 29 checksum // _JBLEN: defined in bionic/libc/include/setjmp.h #define _JB_SIGFLAG 0 #define _JB_SIGMASK 1 * 8 #define _JB_RA 2 * 8 -#define _JB_S0 3 * 8 -#define _JB_S1 4 * 8 -#define _JB_S2 5 * 8 -#define _JB_S3 6 * 8 -#define _JB_S4 7 * 8 -#define _JB_S5 8 * 8 -#define _JB_S6 9 * 8 -#define _JB_S7 10 * 8 -#define _JB_S8 11 * 8 -#define _JB_S9 12 * 8 -#define _JB_S10 13 * 8 -#define _JB_S11 14 * 8 -#define _JB_SP 15 * 8 -#define _JB_FS0 16 * 8 -#define _JB_FS1 17 * 8 -#define _JB_FS2 18 * 8 -#define _JB_FS3 19 * 8 -#define _JB_FS4 20 * 8 -#define _JB_FS5 21 * 8 -#define _JB_FS6 22 * 8 -#define _JB_FS7 23 * 8 -#define _JB_FS8 24 * 8 -#define _JB_FS9 25 * 8 -#define _JB_FS10 26 * 8 -#define _JB_FS11 27 * 8 -#define _JB_CHECKSUM 28 * 8 +#define _JB_SP 3 * 8 +#define _JB_GP 4 * 8 +#define _JB_S0 5 * 8 +#define _JB_S1 6 * 8 +#define _JB_S2 7 * 8 +#define _JB_S3 8 * 8 +#define _JB_S4 9 * 8 +#define _JB_S5 10 * 8 +#define _JB_S6 11 * 8 +#define _JB_S7 12 * 8 +#define _JB_S8 13 * 8 +#define _JB_S9 14 * 8 +#define _JB_S10 15 * 8 +#define _JB_S11 16 * 8 +#define _JB_FS0 17 * 8 +#define _JB_FS1 18 * 8 +#define _JB_FS2 19 * 8 +#define _JB_FS3 20 * 8 +#define _JB_FS4 21 * 8 +#define _JB_FS5 22 * 8 +#define _JB_FS6 23 * 8 +#define _JB_FS7 24 * 8 +#define _JB_FS8 25 * 8 +#define _JB_FS9 26 * 8 +#define _JB_FS10 27 * 8 +#define _JB_FS11 28 * 8 +#define _JB_CHECKSUM 29 * 8 .macro m_mangle_registers reg, sp_reg xor s0, s0, \reg xor s1, s1, \reg - xor a4, a4, \reg // a4 is the masked s2 (x18) for SCS. + xor s2, s2, \reg xor s3, s3, \reg xor s4, s4, \reg xor s5, s5, \reg @@ -89,12 +91,13 @@ xor s9, s9, \reg xor s10, s10, \reg xor s11, s11, \reg + xor a4, a4, \reg // a4 is the masked gp (x3) for SCS. xor \sp_reg, \sp_reg, \reg .endm .macro m_calculate_checksum dst, src, scratch li \dst, 0 - .irp i,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27 + .irp i,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 ld \scratch, (\i * 8)(\src) xor \dst, \dst, \scratch .endr @@ -152,19 +155,21 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(sigsetjmp) andi a1, a1, -2 // Mask off the high bits of the shadow call stack pointer. - // We only store the low bits of x18 to avoid leaking the + // We only store the low bits of gp to avoid leaking the // shadow call stack address into memory. // See the SCS commentary in pthread_internal.h for more detail. li a4, SCS_MASK - and a4, a4, x18 + and a4, a4, gp // Save core registers. mv a2, sp m_mangle_registers a1, sp_reg=a2 sd ra, _JB_RA(a0) + sd a4, _JB_GP(a0) // a4 is the masked gp (x3) for SCS. + sd a2, _JB_SP(a0) sd s0, _JB_S0(a0) sd s1, _JB_S1(a0) - sd a4, _JB_S2(a0) // a4 is the masked s2 (x18) for SCS. + sd s2, _JB_S2(a0) sd s3, _JB_S3(a0) sd s4, _JB_S4(a0) sd s5, _JB_S5(a0) @@ -174,7 +179,6 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(sigsetjmp) sd s9, _JB_S9(a0) sd s10, _JB_S10(a0) sd s11, _JB_S11(a0) - sd a2, _JB_SP(a0) m_unmangle_registers a1, sp_reg=a2 // Save floating point registers. @@ -236,9 +240,10 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(siglongjmp) // Restore core registers. andi a2, a2, -2 ld ra, _JB_RA(a0) + ld a4, _JB_GP(a0) // Don't clobber the upper bits of gp (x3) used for SCS yet. ld s0, _JB_S0(a0) ld s1, _JB_S1(a0) - ld a4, _JB_S2(a0) // Don't clobber s2 (x18) used for SCS yet. + ld s2, _JB_S2(a0) ld s3, _JB_S3(a0) ld s4, _JB_S4(a0) ld s5, _JB_S5(a0) @@ -254,8 +259,8 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(siglongjmp) // Restore the low bits of the shadow call stack pointer. li a5, ~SCS_MASK - and x18, x18, a5 - or x18, a4, x18 + and gp, gp, a5 + or gp, gp, a4 addi sp, sp, -24 sd ra, 0(sp) diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp index 15d6d6dd9..7bf9b409a 100644 --- a/libc/bionic/pthread_create.cpp +++ b/libc/bionic/pthread_create.cpp @@ -133,14 +133,14 @@ static void __init_shadow_call_stack(pthread_internal_t* thread __unused) { size_t scs_offset = (getpid() == 1) ? 0 : (arc4random_uniform(SCS_GUARD_REGION_SIZE / SCS_SIZE - 1) * SCS_SIZE); - // Make the stack readable and writable and store its address in x18. - // This is deliberately the only place where the address is stored. + // Make the stack read-write, and store its address in the register we're using as the shadow + // stack pointer. This is deliberately the only place where the address is stored. char* scs = scs_aligned_guard_region + scs_offset; mprotect(scs, SCS_SIZE, PROT_READ | PROT_WRITE); #if defined(__aarch64__) __asm__ __volatile__("mov x18, %0" ::"r"(scs)); #elif defined(__riscv) - __asm__ __volatile__("mv x18, %0" ::"r"(scs)); + __asm__ __volatile__("mv gp, %0" ::"r"(scs)); #endif #endif } diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h index 083c2ed8c..a3a4ccda2 100644 --- a/libc/bionic/pthread_internal.h +++ b/libc/bionic/pthread_internal.h @@ -110,7 +110,8 @@ class pthread_internal_t { // are actually used. // // This address is only used to deallocate the shadow call stack on thread - // exit; the address of the stack itself is stored only in the x18 register. + // exit; the address of the stack itself is stored only in the register used + // as the shadow stack pointer (x18 on arm64, gp on riscv64). // // Because the protection offered by SCS relies on the secrecy of the stack // address, storing the address here weakens the protection, but only @@ -119,22 +120,24 @@ class pthread_internal_t { // to other allocations), but not the stack itself, which is <0.1% of the size // of the guard region. // - // longjmp()/setjmp() don't store all the bits of x18, only the bottom bits - // covered by SCS_MASK. Since longjmp()/setjmp() between different threads is - // undefined behavior (and unsupported on Android), we can retrieve the high - // bits of x18 from the current value in x18 --- all the jmp_buf needs to store - // is where exactly the shadow stack pointer is in the thread's shadow stack: - // the bottom bits of x18. + // longjmp()/setjmp() don't store all the bits of the shadow stack pointer, + // only the bottom bits covered by SCS_MASK. Since longjmp()/setjmp() between + // different threads is undefined behavior (and unsupported on Android), we + // can retrieve the high bits of the shadow stack pointer from the current + // value in the register --- all the jmp_buf needs to store is where exactly + // the shadow stack pointer is *within* the thread's shadow stack: the bottom + // bits of the register. // // There are at least two other options for discovering the start address of // the guard region on thread exit, but they are not as simple as storing in // TLS. // - // 1) Derive it from the value of the x18 register. This is only possible in - // processes that do not contain legacy code that might clobber x18, - // therefore each process must declare early during process startup whether - // it might load legacy code. - // TODO: riscv64 has no legacy code, so we can actually go this route there! + // 1) Derive it from the current value of the shadow stack pointer. This is + // only possible in processes that do not contain legacy code that might + // clobber x18 on arm64, therefore each process must declare early during + // process startup whether it might load legacy code. + // TODO: riscv64 has no legacy code, so we can actually go this route + // there, but hopefully we'll actually get the Zsslpcfi extension instead. // 2) Mark the guard region as such using prctl(PR_SET_VMA_ANON_NAME) and // discover its address by reading /proc/self/maps. One issue with this is // that reading /proc/self/maps can race with allocations, so we may need