riscv64: switch from x18 to gp for shadow call stack.

We want to give back a useful callee-saved general-purpose
register (x18) that was only "chosen" because it was what LLVM
allowed for historical reasons. gp is a better choice because it's
effectively unused otherwise.

Unfortunately, that means we need extra space in jmp_buf (which I
reserved in an earlier change, e7b3b8b467), so let's rearrange the
entries in jmp_buf to match their order in the register file.

Bug: https://github.com/google/android-riscv64/issues/72
Bug: http://b/277909695
Test: treehugger
Change-Id: Ia629409a894c1a83d2052885702bbdd895c758e1
commit 7dd3896fe1 (parent 97950cb186)
Author: Elliott Hughes
Date: 2023-04-06 14:50:31 -07:00

3 changed files with 64 additions and 56 deletions
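To make the new layout easier to see at a glance, here is an illustrative C mirror of the rearranged jmp_buf. The struct name and field grouping are invented for this sketch; only the slot order and offsets come from the setjmp.S diff below:

    #include <stddef.h>
    #include <stdint.h>

    // Hypothetical mirror of the new riscv64 jmp_buf: one 64-bit slot per
    // entry, now in register-file order (ra=x1, sp=x2, gp=x3, then s0-s11
    // and fs0-fs11).
    struct riscv64_jmp_buf_layout {
      uint64_t sigflag_cookie;  // slot 0: setjmp cookie + signal mask flag
      uint64_t sigmask;         // slot 1
      uint64_t ra;              // slot 2
      uint64_t sp;              // slot 3
      uint64_t gp;              // slot 4: masked shadow call stack pointer
      uint64_t s[12];           // slots 5-16: s0-s11
      uint64_t fs[12];          // slots 17-28: fs0-fs11
      uint64_t checksum;        // slot 29
    };

    // These mirror the _JB_* offsets in the diff below.
    _Static_assert(offsetof(struct riscv64_jmp_buf_layout, sp) == 3 * 8, "sp slot");
    _Static_assert(offsetof(struct riscv64_jmp_buf_layout, gp) == 4 * 8, "gp slot");
    _Static_assert(offsetof(struct riscv64_jmp_buf_layout, checksum) == 29 * 8, "checksum slot");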

libc/arch-riscv64/bionic/setjmp.S

@@ -36,50 +36,52 @@
 // 0 sigflag/cookie setjmp cookie in top 31 bits, signal mask flag in low bit
 // 1 sigmask 64-bit signal mask
 // 2 ra
-// 3 s0
+// 3 sp
+// 4 gp
+// 5 s0
 // ......
-// 14 s11
-// 15 sp
-// 16 fs0
+// 16 s11
+// 17 fs0
 // ......
-// 27 fs11
-// 28 checksum
+// 28 fs11
+// 29 checksum
 // _JBLEN: defined in bionic/libc/include/setjmp.h

 #define _JB_SIGFLAG 0
 #define _JB_SIGMASK 1 * 8
 #define _JB_RA 2 * 8
-#define _JB_S0 3 * 8
-#define _JB_S1 4 * 8
-#define _JB_S2 5 * 8
-#define _JB_S3 6 * 8
-#define _JB_S4 7 * 8
-#define _JB_S5 8 * 8
-#define _JB_S6 9 * 8
-#define _JB_S7 10 * 8
-#define _JB_S8 11 * 8
-#define _JB_S9 12 * 8
-#define _JB_S10 13 * 8
-#define _JB_S11 14 * 8
-#define _JB_SP 15 * 8
-#define _JB_FS0 16 * 8
-#define _JB_FS1 17 * 8
-#define _JB_FS2 18 * 8
-#define _JB_FS3 19 * 8
-#define _JB_FS4 20 * 8
-#define _JB_FS5 21 * 8
-#define _JB_FS6 22 * 8
-#define _JB_FS7 23 * 8
-#define _JB_FS8 24 * 8
-#define _JB_FS9 25 * 8
-#define _JB_FS10 26 * 8
-#define _JB_FS11 27 * 8
-#define _JB_CHECKSUM 28 * 8
+#define _JB_SP 3 * 8
+#define _JB_GP 4 * 8
+#define _JB_S0 5 * 8
+#define _JB_S1 6 * 8
+#define _JB_S2 7 * 8
+#define _JB_S3 8 * 8
+#define _JB_S4 9 * 8
+#define _JB_S5 10 * 8
+#define _JB_S6 11 * 8
+#define _JB_S7 12 * 8
+#define _JB_S8 13 * 8
+#define _JB_S9 14 * 8
+#define _JB_S10 15 * 8
+#define _JB_S11 16 * 8
+#define _JB_FS0 17 * 8
+#define _JB_FS1 18 * 8
+#define _JB_FS2 19 * 8
+#define _JB_FS3 20 * 8
+#define _JB_FS4 21 * 8
+#define _JB_FS5 22 * 8
+#define _JB_FS6 23 * 8
+#define _JB_FS7 24 * 8
+#define _JB_FS8 25 * 8
+#define _JB_FS9 26 * 8
+#define _JB_FS10 27 * 8
+#define _JB_FS11 28 * 8
+#define _JB_CHECKSUM 29 * 8

 .macro m_mangle_registers reg, sp_reg
   xor s0, s0, \reg
   xor s1, s1, \reg
-  xor a4, a4, \reg // a4 is the masked s2 (x18) for SCS.
+  xor s2, s2, \reg
   xor s3, s3, \reg
   xor s4, s4, \reg
   xor s5, s5, \reg
@@ -89,12 +91,13 @@
   xor s9, s9, \reg
   xor s10, s10, \reg
   xor s11, s11, \reg
+  xor a4, a4, \reg // a4 is the masked gp (x3) for SCS.
   xor \sp_reg, \sp_reg, \reg
 .endm

 .macro m_calculate_checksum dst, src, scratch
   li \dst, 0
-  .irp i,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
+  .irp i,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
     ld \scratch, (\i * 8)(\src)
     xor \dst, \dst, \scratch
   .endr
@@ -152,19 +155,21 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(sigsetjmp)
   andi a1, a1, -2

   // Mask off the high bits of the shadow call stack pointer.
-  // We only store the low bits of x18 to avoid leaking the
+  // We only store the low bits of gp to avoid leaking the
   // shadow call stack address into memory.
   // See the SCS commentary in pthread_internal.h for more detail.
   li a4, SCS_MASK
-  and a4, a4, x18
+  and a4, a4, gp

   // Save core registers.
   mv a2, sp
   m_mangle_registers a1, sp_reg=a2
   sd ra, _JB_RA(a0)
+  sd a4, _JB_GP(a0) // a4 is the masked gp (x3) for SCS.
+  sd a2, _JB_SP(a0)
   sd s0, _JB_S0(a0)
   sd s1, _JB_S1(a0)
-  sd a4, _JB_S2(a0) // a4 is the masked s2 (x18) for SCS.
+  sd s2, _JB_S2(a0)
   sd s3, _JB_S3(a0)
   sd s4, _JB_S4(a0)
   sd s5, _JB_S5(a0)
@@ -174,7 +179,6 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(sigsetjmp)
   sd s9, _JB_S9(a0)
   sd s10, _JB_S10(a0)
   sd s11, _JB_S11(a0)
-  sd a2, _JB_SP(a0)
   m_unmangle_registers a1, sp_reg=a2

   // Save floating point registers.
@@ -236,9 +240,10 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(siglongjmp)
   // Restore core registers.
   andi a2, a2, -2
   ld ra, _JB_RA(a0)
+  ld a4, _JB_GP(a0) // Don't clobber the upper bits of gp (x3) used for SCS yet.
   ld s0, _JB_S0(a0)
   ld s1, _JB_S1(a0)
-  ld a4, _JB_S2(a0) // Don't clobber s2 (x18) used for SCS yet.
+  ld s2, _JB_S2(a0)
   ld s3, _JB_S3(a0)
   ld s4, _JB_S4(a0)
   ld s5, _JB_S5(a0)
@@ -254,8 +259,8 @@ __BIONIC_WEAK_ASM_FOR_NATIVE_BRIDGE(siglongjmp)

   // Restore the low bits of the shadow call stack pointer.
   li a5, ~SCS_MASK
-  and x18, x18, a5
-  or x18, a4, x18
+  and gp, gp, a5
+  or gp, gp, a4

   addi sp, sp, -24
   sd ra, 0(sp)
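In C terms, the save/restore pair above amounts to the following sketch. The helper names are invented for illustration, and SCS_MASK is taken as an opaque constant; the real logic is the assembly just shown:

    #include <stdint.h>

    // setjmp path: only the low bits of the shadow call stack pointer reach
    // the jmp_buf, so its randomized base address never leaks into memory.
    //   li a4, SCS_MASK; and a4, a4, gp; sd a4, _JB_GP(a0)
    static inline uint64_t scs_bits_to_save(uint64_t gp, uint64_t scs_mask) {
      return gp & scs_mask;
    }

    // longjmp path: splice the saved low bits into the live register's high
    // bits. Valid only because longjmp() across threads is undefined, so the
    // high bits in gp already identify the current thread's shadow stack.
    //   li a5, ~SCS_MASK; and gp, gp, a5; or gp, gp, a4
    static inline uint64_t scs_bits_to_restore(uint64_t gp, uint64_t saved,
                                               uint64_t scs_mask) {
      return (gp & ~scs_mask) | saved;
    }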

libc/bionic/pthread_create.cpp

@@ -133,14 +133,14 @@ static void __init_shadow_call_stack(pthread_internal_t* thread __unused) {
   size_t scs_offset =
       (getpid() == 1) ? 0 : (arc4random_uniform(SCS_GUARD_REGION_SIZE / SCS_SIZE - 1) * SCS_SIZE);

-  // Make the stack readable and writable and store its address in x18.
-  // This is deliberately the only place where the address is stored.
+  // Make the stack read-write, and store its address in the register we're using as the shadow
+  // stack pointer. This is deliberately the only place where the address is stored.
   char* scs = scs_aligned_guard_region + scs_offset;
   mprotect(scs, SCS_SIZE, PROT_READ | PROT_WRITE);
 #if defined(__aarch64__)
   __asm__ __volatile__("mov x18, %0" ::"r"(scs));
 #elif defined(__riscv)
-  __asm__ __volatile__("mv x18, %0" ::"r"(scs));
+  __asm__ __volatile__("mv gp, %0" ::"r"(scs));
 #endif
 #endif
 }

libc/bionic/pthread_internal.h

@@ -110,7 +110,8 @@ class pthread_internal_t {
   // are actually used.
   //
   // This address is only used to deallocate the shadow call stack on thread
-  // exit; the address of the stack itself is stored only in the x18 register.
+  // exit; the address of the stack itself is stored only in the register used
+  // as the shadow stack pointer (x18 on arm64, gp on riscv64).
   //
   // Because the protection offered by SCS relies on the secrecy of the stack
   // address, storing the address here weakens the protection, but only
@@ -119,22 +120,24 @@ class pthread_internal_t {
   // to other allocations), but not the stack itself, which is <0.1% of the size
   // of the guard region.
   //
-  // longjmp()/setjmp() don't store all the bits of x18, only the bottom bits
-  // covered by SCS_MASK. Since longjmp()/setjmp() between different threads is
-  // undefined behavior (and unsupported on Android), we can retrieve the high
-  // bits of x18 from the current value in x18 --- all the jmp_buf needs to store
-  // is where exactly the shadow stack pointer is in the thread's shadow stack:
-  // the bottom bits of x18.
+  // longjmp()/setjmp() don't store all the bits of the shadow stack pointer,
+  // only the bottom bits covered by SCS_MASK. Since longjmp()/setjmp() between
+  // different threads is undefined behavior (and unsupported on Android), we
+  // can retrieve the high bits of the shadow stack pointer from the current
+  // value in the register --- all the jmp_buf needs to store is where exactly
+  // the shadow stack pointer is *within* the thread's shadow stack: the bottom
+  // bits of the register.
   //
   // There are at least two other options for discovering the start address of
   // the guard region on thread exit, but they are not as simple as storing in
   // TLS.
   //
-  // 1) Derive it from the value of the x18 register. This is only possible in
-  //    processes that do not contain legacy code that might clobber x18,
-  //    therefore each process must declare early during process startup whether
-  //    it might load legacy code.
-  //    TODO: riscv64 has no legacy code, so we can actually go this route there!
+  // 1) Derive it from the current value of the shadow stack pointer. This is
+  //    only possible in processes that do not contain legacy code that might
+  //    clobber x18 on arm64, therefore each process must declare early during
+  //    process startup whether it might load legacy code.
+  //    TODO: riscv64 has no legacy code, so we can actually go this route
+  //    there, but hopefully we'll actually get the Zsslpcfi extension instead.
   // 2) Mark the guard region as such using prctl(PR_SET_VMA_ANON_NAME) and
   //    discover its address by reading /proc/self/maps. One issue with this is
   //    that reading /proc/self/maps can race with allocations, so we may need
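A worked example of the "bottom bits" arithmetic described in that comment, using assumed sizes (16 KiB stacks, so SCS_MASK would be 0x3fff) and made-up addresses:

    #include <assert.h>
    #include <stdint.h>

    #define SCS_SIZE ((uint64_t)16 * 1024)  // assumed per-thread SCS size
    #define SCS_MASK (SCS_SIZE - 1)

    int main(void) {
      // Hypothetical shadow call stack pointer at setjmp() time.
      uint64_t gp_at_setjmp = 0x7f1234560080;
      uint64_t saved = gp_at_setjmp & SCS_MASK;  // 0x080: all the jmp_buf learns

      // Later, same thread, deeper in the same 16 KiB shadow stack.
      uint64_t gp_at_longjmp = 0x7f1234560100;

      // longjmp() keeps the live high bits and splices in the saved low bits.
      uint64_t restored = (gp_at_longjmp & ~SCS_MASK) | saved;
      assert(restored == gp_at_setjmp);  // high bits matched, so this round-trips
      return 0;
    }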