mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
Previously different architectures were using random sources of
differing strength and cost to decide the random kstack offset. A number
of architectures (loongarch, powerpc, s390, x86) were using their
timestamp counter, at whatever the frequency happened to be. Other
arches (arm64, riscv) were using entropy from the crng via
get_random_u16().
There have been concerns that in some cases the timestamp counters may
be too weak, because they can be easily guessed or influenced by user
space. And get_random_u16() has been shown to be too costly for the
level of protection kstack offset randomization provides.
So let's use a common, architecture-agnostic source of entropy; a
per-cpu prng, seeded at boot-time from the crng. This has a few
benefits:
- We can remove choose_random_kstack_offset(); That was only there to
try to make the timestamp counter value a bit harder to influence
from user space [*].
- The architecture code is simplified. All it has to do now is call
add_random_kstack_offset() in the syscall path.
- The strength of the randomness can be reasoned about independently
of the architecture.
- Arches previously using get_random_u16() now have much faster
syscall paths, see below results.
[*] Additionally, this gets rid of some redundant work on s390 and x86.
Before this patch, those architectures called
choose_random_kstack_offset() under arch_exit_to_user_mode_prepare(),
which is also called for exception returns to userspace which were *not*
syscalls (e.g. regular interrupts). Getting rid of
choose_random_kstack_offset() avoids a small amount of redundant work
for the non-syscall cases.
In some configurations, add_random_kstack_offset() will now call
instrumentable code, so for a couple of arches, I have moved the call a
bit later to the first point where instrumentation is allowed. This
doesn't impact the efficacy of the mechanism.
There have been some claims that a prng may be less strong than the
timestamp counter if not regularly reseeded. But the prng has a period
of about 2^113. So as long as the prng state remains secret, it should
not be possible to guess. If the prng state can be accessed, we have
bigger problems.
Additionally, we are only consuming 6 bits to randomize the stack, so
there are only 64 possible random offsets. I assert that it would be
trivial for an attacker to brute force by repeating their attack and
waiting for the random stack offset to be the desired one. The prng
approach seems entirely proportional to this level of protection.
Performance data are provided below. The baseline is v6.18 with rndstack
on for each respective arch. (I)/(R) indicate statistically significant
improvement/regression. arm64 platform is AWS Graviton3 (m7g.metal).
x86_64 platform is AWS Sapphire Rapids (m7i.24xlarge):
+-----------------+--------------+---------------+---------------+
| Benchmark | Result Class | per-cpu-prng | per-cpu-prng |
| | | arm64 (metal) | x86_64 (VM) |
+=================+==============+===============+===============+
| syscall/getpid | mean (ns) | (I) -9.50% | (I) -17.65% |
| | p99 (ns) | (I) -59.24% | (I) -24.41% |
| | p99.9 (ns) | (I) -59.52% | (I) -28.52% |
+-----------------+--------------+---------------+---------------+
| syscall/getppid | mean (ns) | (I) -9.52% | (I) -19.24% |
| | p99 (ns) | (I) -59.25% | (I) -25.03% |
| | p99.9 (ns) | (I) -59.50% | (I) -28.17% |
+-----------------+--------------+---------------+---------------+
| syscall/invalid | mean (ns) | (I) -10.31% | (I) -18.56% |
| | p99 (ns) | (I) -60.79% | (I) -20.06% |
| | p99.9 (ns) | (I) -61.04% | (I) -25.04% |
+-----------------+--------------+---------------+---------------+
I tested an earlier version of this change on x86 bare metal and it
showed a smaller but still significant improvement. The bare metal
system wasn't available this time around so testing was done in a VM
instance. I'm guessing the cost of rdtsc is higher for VMs.
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Link: https://patch.msgid.link/20260303150840.3789438-3-ryan.roberts@arm.com
Signed-off-by: Kees Cook <kees@kernel.org>
150 lines
4.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/compiler.h>
|
|
#include <linux/context_tracking.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/nospec.h>
|
|
#include <linux/ptrace.h>
|
|
#include <linux/randomize_kstack.h>
|
|
#include <linux/syscalls.h>
|
|
|
|
#include <asm/debug-monitors.h>
|
|
#include <asm/exception.h>
|
|
#include <asm/fpsimd.h>
|
|
#include <asm/syscall.h>
|
|
#include <asm/thread_info.h>
|
|
#include <asm/unistd.h>
|
|
#include <asm/unistd_compat_32.h>
|
|
|
|
long compat_arm_syscall(struct pt_regs *regs, int scno);
|
|
long sys_ni_syscall(void);
|
|
|
|
static long do_ni_syscall(struct pt_regs *regs, int scno)
|
|
{
|
|
if (is_compat_task()) {
|
|
long ret = compat_arm_syscall(regs, scno);
|
|
if (ret != -ENOSYS)
|
|
return ret;
|
|
}
|
|
|
|
return sys_ni_syscall();
|
|
}
|
|
|
|
/*
 * Indirect call to the resolved syscall handler; the user register
 * state in @regs carries the syscall arguments.
 */
static long __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)
{
	long ret = syscall_fn(regs);

	return ret;
}
|
|
|
|
static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
|
|
unsigned int sc_nr,
|
|
const syscall_fn_t syscall_table[])
|
|
{
|
|
long ret;
|
|
|
|
add_random_kstack_offset();
|
|
|
|
if (likely(scno < sc_nr)) {
|
|
syscall_fn_t syscall_fn;
|
|
syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
|
|
ret = __invoke_syscall(regs, syscall_fn);
|
|
} else {
|
|
ret = do_ni_syscall(regs, scno);
|
|
}
|
|
|
|
syscall_set_return_value(current, regs, 0, ret);
|
|
}
|
|
|
|
static inline bool has_syscall_work(unsigned long flags)
|
|
{
|
|
return unlikely(flags & _TIF_SYSCALL_WORK);
|
|
}
|
|
|
|
/*
 * Common EL0 SVC (syscall) entry path for native and compat tasks.
 *
 * Saves the original x0 and the syscall number into @regs, short-circuits
 * if an asynchronous MTE tag-check fault is pending, runs the ptrace/
 * tracing entry hooks when required, dispatches the syscall, and decides
 * whether the tracing exit hook must run.
 */
static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
			   const syscall_fn_t syscall_table[])
{
	unsigned long flags = read_thread_flags();

	/* Preserve x0 so a restarted syscall sees its original argument. */
	regs->orig_x0 = regs->regs[0];
	regs->syscallno = scno;

	/*
	 * BTI note:
	 * The architecture does not guarantee that SPSR.BTYPE is zero
	 * on taking an SVC, so we could return to userspace with a
	 * non-zero BTYPE after the syscall.
	 *
	 * This shouldn't matter except when userspace is explicitly
	 * doing something stupid, such as setting PROT_BTI on a page
	 * that lacks conforming BTI/PACIxSP instructions, falling
	 * through from one executable page to another with differing
	 * PROT_BTI, or messing with BTYPE via ptrace: in such cases,
	 * userspace should not be surprised if a SIGILL occurs on
	 * syscall return.
	 *
	 * So, don't touch regs->pstate & PSR_BTYPE_MASK here.
	 * (Similarly for HVC and SMC elsewhere.)
	 */

	if (unlikely(flags & _TIF_MTE_ASYNC_FAULT)) {
		/*
		 * Process the asynchronous tag check fault before the actual
		 * syscall. do_notify_resume() will send a signal to userspace
		 * before the syscall is restarted.
		 */
		syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0);
		return;
	}

	if (has_syscall_work(flags)) {
		/*
		 * The de-facto standard way to skip a system call using ptrace
		 * is to set the system call to -1 (NO_SYSCALL) and set x0 to a
		 * suitable error code for consumption by userspace. However,
		 * this cannot be distinguished from a user-issued syscall(-1)
		 * and so we must set x0 to -ENOSYS here in case the tracer doesn't
		 * issue the skip and we fall into trace_exit with x0 preserved.
		 *
		 * This is slightly odd because it also means that if a tracer
		 * sets the system call number to -1 but does not initialise x0,
		 * then x0 will be preserved for all system calls apart from a
		 * user-issued syscall(-1). However, requesting a skip and not
		 * setting the return value is unlikely to do anything sensible
		 * anyway.
		 */
		if (scno == NO_SYSCALL)
			syscall_set_return_value(current, regs, -ENOSYS, 0);
		/* The tracer may have rewritten the syscall number. */
		scno = syscall_trace_enter(regs);
		if (scno == NO_SYSCALL)
			goto trace_exit;
	}

	invoke_syscall(regs, scno, sc_nr, syscall_table);

	/*
	 * The tracing status may have changed under our feet, so we have to
	 * check again. However, if we were tracing entry, then we always trace
	 * exit regardless, as the old entry assembly did.
	 */
	if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
		flags = read_thread_flags();
		if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP))
			return;
	}

trace_exit:
	syscall_trace_exit(regs);
}
|
|
|
|
void do_el0_svc(struct pt_regs *regs)
|
|
{
|
|
el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table);
|
|
}
|
|
|
|
#ifdef CONFIG_COMPAT
/* AArch32 compat SVC entry: the syscall number arrives in r7. */
void do_el0_svc_compat(struct pt_regs *regs)
{
	int scno = regs->regs[7];

	el0_svc_common(regs, scno, __NR_compat32_syscalls,
		       compat_sys_call_table);
}
#endif
|