From e30e8d46cf605d216a799a28c77b8a41c328613a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 2 Aug 2021 11:42:00 +0100 Subject: arm64: fix compat syscall return truncation Due to inconsistencies in the way we manipulate compat GPRs, we have a few issues today: * For audit and tracing, where error codes are handled as a (native) long, negative error codes are expected to be sign-extended to the native 64-bits, or they may fail to be matched correctly. Thus a syscall which fails with an error may erroneously be identified as failing. * For ptrace, *all* compat return values should be sign-extended for consistency with 32-bit arm, but we currently only do this for negative return codes. * As we may transiently set the upper 32 bits of some compat GPRs while in the kernel, these can be sampled by perf, which is somewhat confusing. This means that where a syscall returns a pointer above 2G, this will be sign-extended, but will not be mistaken for an error as error codes are constrained to the inclusive range [-4096, -1] where no user pointer can exist. To fix all of these, we must consistently use helpers to get/set the compat GPRs, ensuring that we never write the upper 32 bits of the return code, and always sign-extend when reading the return code. This patch does so, with the following changes: * We re-organise syscall_get_return_value() to always sign-extend for compat tasks, and reimplement syscall_get_error() atop. We update syscall_trace_exit() to use syscall_get_return_value(). * We consistently use syscall_set_return_value() to set the return value, ensureing the upper 32 bits are never set unexpectedly. * As the core audit code currently uses regs_return_value() rather than syscall_get_return_value(), we special-case this for compat_user_mode(regs) such that this will do the right thing. Going forward, we should try to move the core audit code over to syscall_get_return_value(). Cc: Reported-by: He Zhe Reported-by: weiyuchen Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210802104200.21390-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/ptrace.h | 12 +++++++++++- arch/arm64/include/asm/syscall.h | 19 ++++++++++--------- arch/arm64/kernel/ptrace.c | 2 +- arch/arm64/kernel/signal.c | 3 ++- arch/arm64/kernel/syscall.c | 9 +++------ 5 files changed, 27 insertions(+), 18 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index e58bca832dff..41b332c054ab 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -320,7 +320,17 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) static inline unsigned long regs_return_value(struct pt_regs *regs) { - return regs->regs[0]; + unsigned long val = regs->regs[0]; + + /* + * Audit currently uses regs_return_value() instead of + * syscall_get_return_value(). Apply the same sign-extension here until + * audit is updated to use syscall_get_return_value(). + */ + if (compat_user_mode(regs)) + val = sign_extend64(val, 31); + + return val; } static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index cfc0672013f6..03e20895453a 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -29,22 +29,23 @@ static inline void syscall_rollback(struct task_struct *task, regs->regs[0] = regs->orig_x0; } - -static inline long syscall_get_error(struct task_struct *task, - struct pt_regs *regs) +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) { - unsigned long error = regs->regs[0]; + unsigned long val = regs->regs[0]; if (is_compat_thread(task_thread_info(task))) - error = sign_extend64(error, 31); + val = sign_extend64(val, 31); - return IS_ERR_VALUE(error) ? error : 0; + return val; } -static inline long syscall_get_return_value(struct task_struct *task, - struct pt_regs *regs) +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) { - return regs->regs[0]; + unsigned long error = syscall_get_return_value(task, regs); + + return IS_ERR_VALUE(error) ? error : 0; } static inline void syscall_set_return_value(struct task_struct *task, diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 499b6b2f9757..b381a1ee9ea7 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1862,7 +1862,7 @@ void syscall_trace_exit(struct pt_regs *regs) audit_syscall_exit(regs); if (flags & _TIF_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, regs_return_value(regs)); + trace_sys_exit(regs, syscall_get_return_value(current, regs)); if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index f8192f4ae0b8..23036334f4dc 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -890,7 +891,7 @@ static void do_signal(struct pt_regs *regs) retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { - regs->regs[0] = -EINTR; + syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 263d6c1a525f..50a0f1a38e84 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -54,10 +54,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, ret = do_ni_syscall(regs, scno); } - if (is_compat_task()) - ret = lower_32_bits(ret); - - regs->regs[0] = ret; + syscall_set_return_value(current, regs, 0, ret); /* * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), @@ -115,7 +112,7 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * syscall. do_notify_resume() will send a signal to userspace * before the syscall is restarted. */ - regs->regs[0] = -ERESTARTNOINTR; + syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0); return; } @@ -136,7 +133,7 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * anyway. */ if (scno == NO_SYSCALL) - regs->regs[0] = -ENOSYS; + syscall_set_return_value(current, regs, -ENOSYS, 0); scno = syscall_trace_enter(regs); if (scno == NO_SYSCALL) goto trace_exit; -- cgit v1.2.3 From 64ee84c75b5f75132eec97f2c7a201a056d53698 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 1 Aug 2021 14:35:25 +0900 Subject: arm64: move warning about toolchains to archprepare Commit 987fdfec2410 ("arm64: move --fix-cortex-a53-843419 linker test to Kconfig") fixed the false-positive warning in the installation step. Yet, there are some cases where this false-positive is shown. For example, you can see it when you cross 987fdfec2410 during git-bisect. $ git checkout 987fdfec2410^ [ snip ] $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig all [ snip ] $ git checkout v5.13 [ snip] $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig all [ snip ] arch/arm64/Makefile:25: ld does not support --fix-cortex-a53-843419; kernel may be susceptible to erratum In the stale include/config/auto.config, CONFIG_ARM64_ERRATUM_843419=y is set without CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419, so the warning is displayed while parsing the Makefiles. Make will restart with the updated include/config/auto.config, hence CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419 will be set eventually, but this warning is a surprise for users. Commit 25896d073d8a ("x86/build: Fix compiler support check for CONFIG_RETPOLINE") addressed a similar issue. Move $(warning ...) out of the parse stage of Makefiles. The same applies to CONFIG_ARM64_USE_LSE_ATOMICS. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20210801053525.105235-1-masahiroy@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Makefile | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 7bc37d0a1b68..7b668db43261 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -21,19 +21,11 @@ LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \ endif ifeq ($(CONFIG_ARM64_ERRATUM_843419),y) - ifneq ($(CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419),y) -$(warning ld does not support --fix-cortex-a53-843419; kernel may be susceptible to erratum) - else + ifeq ($(CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419),y) LDFLAGS_vmlinux += --fix-cortex-a53-843419 endif endif -ifeq ($(CONFIG_ARM64_USE_LSE_ATOMICS), y) - ifneq ($(CONFIG_ARM64_LSE_ATOMICS), y) -$(warning LSE atomics not supported by binutils) - endif -endif - cc_has_k_constraint := $(call try-run,echo \ 'int main(void) { \ asm volatile("and w0, w0, %w0" :: "K" (4294967295)); \ @@ -176,6 +168,17 @@ vdso_install: archprepare: $(Q)$(MAKE) $(build)=arch/arm64/tools kapi +ifeq ($(CONFIG_ARM64_ERRATUM_843419),y) + ifneq ($(CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419),y) + @echo "warning: ld does not support --fix-cortex-a53-843419; kernel may be susceptible to erratum" >&2 + endif +endif +ifeq ($(CONFIG_ARM64_USE_LSE_ATOMICS),y) + ifneq ($(CONFIG_ARM64_LSE_ATOMICS),y) + @echo "warning: LSE atomics not supported by binutils" >&2 + endif +endif + # We use MRPROPER_FILES and CLEAN_FILES now archclean: -- cgit v1.2.3 From f9c4ff2ab9fe433d44ebbc2e3c2368a49df44798 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 31 Jul 2021 00:51:31 +1200 Subject: arm64: fix the doc of RANDOMIZE_MODULE_REGION_FULL Obviously kaslr is setting the module region to 2GB rather than 4GB since commit b2eed9b588112 ("arm64/kernel: kaslr: reduce module randomization range to 2 GB"). So fix the size of region in Kconfig. On the other hand, even though RANDOMIZE_MODULE_REGION_FULL is not set, module_alloc() can fall back to a 2GB window if ARM64_MODULE_PLTS is set. In this case, veneers are still needed. !RANDOMIZE_MODULE_REGION_FULL doesn't necessarily mean veneers are not needed. So fix the doc to be more precise to avoid any confusion to the readers of the code. Cc: Masami Hiramatsu Cc: Ard Biesheuvel Cc: Qi Liu Signed-off-by: Barry Song Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20210730125131.13724-1-song.bao.hua@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 9 ++++++--- arch/arm64/kernel/kaslr.c | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b5b13a932561..fdcd54d39c1e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1800,11 +1800,11 @@ config RANDOMIZE_BASE If unsure, say N. config RANDOMIZE_MODULE_REGION_FULL - bool "Randomize the module region over a 4 GB range" + bool "Randomize the module region over a 2 GB range" depends on RANDOMIZE_BASE default y help - Randomizes the location of the module region inside a 4 GB window + Randomizes the location of the module region inside a 2 GB window covering the core kernel. This way, it is less likely for modules to leak information about the location of core kernel data structures but it does imply that function calls between modules and the core @@ -1812,7 +1812,10 @@ config RANDOMIZE_MODULE_REGION_FULL When this option is not set, the module region will be randomized over a limited range that contains the [_stext, _etext] interval of the - core kernel, so branch relocations are always in range. + core kernel, so branch relocations are almost always in range unless + ARM64_MODULE_PLTS is enabled and the region is exhausted. In this + particular case of region exhaustion, modules might be able to fall + back to a larger 2GB area. config CC_HAVE_STACKPROTECTOR_SYSREG def_bool $(cc-option,-mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=0) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index cfa2cfde3019..418b2bba1521 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -162,7 +162,9 @@ u64 __init kaslr_early_init(void) * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, * _stext) . This guarantees that the resulting region still * covers [_stext, _etext], and that all relative branches can - * be resolved without veneers. + * be resolved without veneers unless this region is exhausted + * and we fall back to a larger 2GB window in module_alloc() + * when ARM64_MODULE_PLTS is enabled. */ module_range = MODULES_VSIZE - (u64)(_etext - _stext); module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; -- cgit v1.2.3 From 8d5903f457145e3fcd858578b065d667822d99ac Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 2 Aug 2021 17:48:44 +0100 Subject: arm64: stacktrace: fix comment Due to a copy-paste error, we describe struct stackframe::pc as a snapshot of the `fp` field rather than the `lr` field. Fix the comment. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Madhavan T. Venkataraman Cc: Mark Brown Cc: Will Deacon Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210802164845.45506-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/stacktrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 1801399204d7..8aebc00c1718 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -35,7 +35,7 @@ struct stack_info { * accounting information necessary for robust unwinding. * * @fp: The fp value in the frame record (or the real fp) - * @pc: The fp value in the frame record (or the real lr) + * @pc: The lr value in the frame record (or the real lr) * * @stacks_done: Stacks which have been entirely unwound, for which it is no * longer valid to unwind to. -- cgit v1.2.3 From 0c32706dac1b0a72713184246952ab0f54327c21 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 2 Aug 2021 17:48:45 +0100 Subject: arm64: stacktrace: avoid tracing arch_stack_walk() When the function_graph tracer is in use, arch_stack_walk() may unwind the stack incorrectly, erroneously reporting itself, missing the final entry which is being traced, and reporting all traced entries between these off-by-one from where they should be. When ftrace hooks a function return, the original return address is saved to the fgraph ret_stack, and the return address in the LR (or the function's frame record) is replaced with `return_to_handler`. When arm64's unwinder encounter frames returning to `return_to_handler`, it finds the associated original return address from the fgraph ret stack, assuming the most recent `ret_to_hander` entry on the stack corresponds to the most recent entry in the fgraph ret stack, and so on. When arch_stack_walk() is used to dump the current task's stack, it starts from the caller of arch_stack_walk(). However, arch_stack_walk() can be traced, and so may push an entry on to the fgraph ret stack, leaving the fgraph ret stack offset by one from the expected position. This can be seen when dumping the stack via /proc/self/stack, where enabling the graph tracer results in an unexpected `stack_trace_save_tsk` entry at the start of the trace, and `el0_svc` missing form the end of the trace. This patch fixes this by marking arch_stack_walk() as notrace, as we do for all other functions on the path to ftrace_graph_get_ret_stack(). While a few helper functions are not marked notrace, their calls/returns are balanced, and will have no observable effect when examining the fgraph ret stack. It is possible for an exeption boundary to cause a similar offset if the return address of the interrupted context was in the LR. Fixing those cases will require some more substantial rework, and is left for subsequent patches. Before: | # cat /proc/self/stack | [<0>] proc_pid_stack+0xc4/0x140 | [<0>] proc_single_show+0x6c/0x120 | [<0>] seq_read_iter+0x240/0x4e0 | [<0>] seq_read+0xe8/0x140 | [<0>] vfs_read+0xb8/0x1e4 | [<0>] ksys_read+0x74/0x100 | [<0>] __arm64_sys_read+0x28/0x3c | [<0>] invoke_syscall+0x50/0x120 | [<0>] el0_svc_common.constprop.0+0xc4/0xd4 | [<0>] do_el0_svc+0x30/0x9c | [<0>] el0_svc+0x2c/0x54 | [<0>] el0t_64_sync_handler+0x1a8/0x1b0 | [<0>] el0t_64_sync+0x198/0x19c | # echo function_graph > /sys/kernel/tracing/current_tracer | # cat /proc/self/stack | [<0>] stack_trace_save_tsk+0xa4/0x110 | [<0>] proc_pid_stack+0xc4/0x140 | [<0>] proc_single_show+0x6c/0x120 | [<0>] seq_read_iter+0x240/0x4e0 | [<0>] seq_read+0xe8/0x140 | [<0>] vfs_read+0xb8/0x1e4 | [<0>] ksys_read+0x74/0x100 | [<0>] __arm64_sys_read+0x28/0x3c | [<0>] invoke_syscall+0x50/0x120 | [<0>] el0_svc_common.constprop.0+0xc4/0xd4 | [<0>] do_el0_svc+0x30/0x9c | [<0>] el0t_64_sync_handler+0x1a8/0x1b0 | [<0>] el0t_64_sync+0x198/0x19c After: | # cat /proc/self/stack | [<0>] proc_pid_stack+0xc4/0x140 | [<0>] proc_single_show+0x6c/0x120 | [<0>] seq_read_iter+0x240/0x4e0 | [<0>] seq_read+0xe8/0x140 | [<0>] vfs_read+0xb8/0x1e4 | [<0>] ksys_read+0x74/0x100 | [<0>] __arm64_sys_read+0x28/0x3c | [<0>] invoke_syscall+0x50/0x120 | [<0>] el0_svc_common.constprop.0+0xc4/0xd4 | [<0>] do_el0_svc+0x30/0x9c | [<0>] el0_svc+0x2c/0x54 | [<0>] el0t_64_sync_handler+0x1a8/0x1b0 | [<0>] el0t_64_sync+0x198/0x19c | # echo function_graph > /sys/kernel/tracing/current_tracer | # cat /proc/self/stack | [<0>] proc_pid_stack+0xc4/0x140 | [<0>] proc_single_show+0x6c/0x120 | [<0>] seq_read_iter+0x240/0x4e0 | [<0>] seq_read+0xe8/0x140 | [<0>] vfs_read+0xb8/0x1e4 | [<0>] ksys_read+0x74/0x100 | [<0>] __arm64_sys_read+0x28/0x3c | [<0>] invoke_syscall+0x50/0x120 | [<0>] el0_svc_common.constprop.0+0xc4/0xd4 | [<0>] do_el0_svc+0x30/0x9c | [<0>] el0_svc+0x2c/0x54 | [<0>] el0t_64_sync_handler+0x1a8/0x1b0 | [<0>] el0t_64_sync+0x198/0x19c Cc: Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Madhavan T. Venkataraman Cc: Mark Brown Cc: Will Deacon Reviwed-by: Mark Brown Link: https://lore.kernel.org/r/20210802164845.45506-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b83c8d911930..8982a2b78acf 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -218,7 +218,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) #ifdef CONFIG_STACKTRACE -noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, +noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { -- cgit v1.2.3