diff mbox series

[PATCHv3,bpf,2/2] bpf: Disable preemption in bpf_event_output

Message ID 20230725084206.580930-3-jolsa@kernel.org (mailing list archive)
State Accepted
Delegated to: BPF
Headers show
Series bpf: Disable preemption in perf_event_output helpers code | expand

Checks

Context Check Description
bpf/vmtest-bpf-PR success PR summary
bpf/vmtest-bpf-VM_Test-3 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-8 success Logs for veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1357 this patch: 1357
netdev/cc_maintainers fail 2 blamed authors not CCed: tglx@linutronix.de davem@davemloft.net; 8 maintainers not CCed: tglx@linutronix.de linux-trace-kernel@vger.kernel.org kpsingh@kernel.org martin.lau@linux.dev mhiramat@kernel.org song@kernel.org davem@davemloft.net rostedt@goodmis.org
netdev/build_clang success Errors and warnings before: 1365 this patch: 1365
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1380 this patch: 1380
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 27 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-2 fail Logs for build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-5 success Logs for build for x86_64 with llvm-16

Commit Message

Jiri Olsa July 25, 2023, 8:42 a.m. UTC
We received report [1] of kernel crash, which is caused by
using nesting protection without disabled preemption.

The bpf_event_output can be called by programs executed by
bpf_prog_run_array_cg function that disabled migration but
keeps preemption enabled.

This can cause task to be preempted by another one inside the
nesting protection and lead eventually to two tasks using same
perf_sample_data buffer and cause crashes like:

  BUG: kernel NULL pointer dereference, address: 0000000000000001
  #PF: supervisor instruction fetch in kernel mode
  #PF: error_code(0x0010) - not-present page
  ...
  ? perf_output_sample+0x12a/0x9a0
  ? finish_task_switch.isra.0+0x81/0x280
  ? perf_event_output+0x66/0xa0
  ? bpf_event_output+0x13a/0x190
  ? bpf_event_output_data+0x22/0x40
  ? bpf_prog_dfc84bbde731b257_cil_sock4_connect+0x40a/0xacb
  ? xa_load+0x87/0xe0
  ? __cgroup_bpf_run_filter_sock_addr+0xc1/0x1a0
  ? release_sock+0x3e/0x90
  ? sk_setsockopt+0x1a1/0x12f0
  ? udp_pre_connect+0x36/0x50
  ? inet_dgram_connect+0x93/0xa0
  ? __sys_connect+0xb4/0xe0
  ? udp_setsockopt+0x27/0x40
  ? __pfx_udp_push_pending_frames+0x10/0x10
  ? __sys_setsockopt+0xdf/0x1a0
  ? __x64_sys_connect+0xf/0x20
  ? do_syscall_64+0x3a/0x90
  ? entry_SYSCALL_64_after_hwframe+0x72/0xdc

Fixing this by disabling preemption in bpf_event_output.

[1] https://github.com/cilium/cilium/issues/26756
Cc: stable@vger.kernel.org
Reported-by: Oleg "livelace" Popov <o.popov@livelace.ru>
Closes: https://github.com/cilium/cilium/issues/26756
Fixes: 2a916f2f546c ("bpf: Use migrate_disable/enable in array macros and cgroup/lirc code.")
Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 kernel/trace/bpf_trace.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

Comments

Alexei Starovoitov July 26, 2023, 12:13 a.m. UTC | #1
On Tue, Jul 25, 2023 at 1:42 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> We received report [1] of kernel crash, which is caused by
> using nesting protection without disabled preemption.
>
> The bpf_event_output can be called by programs executed by
> bpf_prog_run_array_cg function that disabled migration but
> keeps preemption enabled.
>
> This can cause task to be preempted by another one inside the
> nesting protection and lead eventually to two tasks using same
> perf_sample_data buffer and cause crashes like:
>
>   BUG: kernel NULL pointer dereference, address: 0000000000000001
>   #PF: supervisor instruction fetch in kernel mode
>   #PF: error_code(0x0010) - not-present page
>   ...
>   ? perf_output_sample+0x12a/0x9a0
>   ? finish_task_switch.isra.0+0x81/0x280
>   ? perf_event_output+0x66/0xa0
>   ? bpf_event_output+0x13a/0x190
>   ? bpf_event_output_data+0x22/0x40
>   ? bpf_prog_dfc84bbde731b257_cil_sock4_connect+0x40a/0xacb
>   ? xa_load+0x87/0xe0
>   ? __cgroup_bpf_run_filter_sock_addr+0xc1/0x1a0
>   ? release_sock+0x3e/0x90
>   ? sk_setsockopt+0x1a1/0x12f0
>   ? udp_pre_connect+0x36/0x50
>   ? inet_dgram_connect+0x93/0xa0
>   ? __sys_connect+0xb4/0xe0
>   ? udp_setsockopt+0x27/0x40
>   ? __pfx_udp_push_pending_frames+0x10/0x10
>   ? __sys_setsockopt+0xdf/0x1a0
>   ? __x64_sys_connect+0xf/0x20
>   ? do_syscall_64+0x3a/0x90
>   ? entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> Fixing this by disabling preemption in bpf_event_output.
>
> [1] https://github.com/cilium/cilium/issues/26756
> Cc: stable@vger.kernel.org
> Reported-by: Oleg "livelace" Popov <o.popov@livelace.ru>
> Closes: https://github.com/cilium/cilium/issues/26756
> Fixes: 2a916f2f546c ("bpf: Use migrate_disable/enable in array macros and cgroup/lirc code.")
> Acked-by: Hou Tao <houtao1@huawei.com>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  kernel/trace/bpf_trace.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 14c9a1a548c9..6826ebf750b0 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -720,7 +720,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
>  u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>                      void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
>  {
> -       int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
>         struct perf_raw_frag frag = {
>                 .copy           = ctx_copy,
>                 .size           = ctx_size,
> @@ -737,8 +736,13 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>         };
>         struct perf_sample_data *sd;
>         struct pt_regs *regs;
> +       int nest_level;
>         u64 ret;
>
> +       preempt_disable();
> +

Removed extra empty line here and in patch 1 and applied.

Thanks!

> +       nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
> +
>         if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
>                 ret = -EBUSY;
>                 goto out;
> @@ -753,6 +757,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>         ret = __bpf_perf_event_output(regs, map, flags, sd);
>  out:
>         this_cpu_dec(bpf_event_output_nest_level);
> +       preempt_enable();
>         return ret;
>  }
>
> --
> 2.41.0
>
diff mbox series

Patch

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 14c9a1a548c9..6826ebf750b0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -720,7 +720,6 @@  static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
 	struct perf_raw_frag frag = {
 		.copy		= ctx_copy,
 		.size		= ctx_size,
@@ -737,8 +736,13 @@  u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	};
 	struct perf_sample_data *sd;
 	struct pt_regs *regs;
+	int nest_level;
 	u64 ret;
 
+	preempt_disable();
+
+	nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
+
 	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
 		ret = -EBUSY;
 		goto out;
@@ -753,6 +757,7 @@  u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	ret = __bpf_perf_event_output(regs, map, flags, sd);
 out:
 	this_cpu_dec(bpf_event_output_nest_level);
+	preempt_enable();
 	return ret;
 }