[bpf-next,1/2] bpf/perf: Call perf_prepare_sample() before bpf_prog_run()

Message ID: 20221220220144.4016213-2-namhyung@kernel.org (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: bpf: Allow access to perf sample data (v2)

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1395 this patch: 1395
netdev/cc_maintainers warning 6 maintainers not CCed: mark.rutland@arm.com mingo@redhat.com alexander.shishkin@linux.intel.com song@kernel.org martin.lau@linux.dev linux-perf-users@vger.kernel.org
netdev/build_clang success Errors and warnings before: 150 this patch: 150
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1390 this patch: 1390
netdev/checkpatch warning WARNING: line length of 89 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc

Commit Message

Namhyung Kim Dec. 20, 2022, 10:01 p.m. UTC
When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
access perf sample data directly and call perf_prepare_sample() to make sure
the sample data is populated.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/bpf.h   | 1 +
 kernel/bpf/verifier.c | 1 +
 kernel/events/core.c  | 3 +++
 3 files changed, 5 insertions(+)

Comments

Peter Zijlstra Dec. 22, 2022, 12:55 p.m. UTC | #1
On Tue, Dec 20, 2022 at 02:01:43PM -0800, Namhyung Kim wrote:
> When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
> access perf sample data directly and call perf_prepare_sample() to make sure
> the sample data is populated.

I don't understand a word of this :/ What are you doing and why?

> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  include/linux/bpf.h   | 1 +
>  kernel/bpf/verifier.c | 1 +
>  kernel/events/core.c  | 3 +++
>  3 files changed, 5 insertions(+)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5fec2d1be6d7..6bd4c21a6dd4 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1341,6 +1341,7 @@ struct bpf_prog {
>  				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
>  				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
>  				call_get_func_ip:1, /* Do we call get_func_ip() */
> +				call_cast_kctx:1, /* Do we call bpf_cast_to_kern_ctx() */
>  				tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
>  	enum bpf_prog_type	type;		/* Type of BPF program */
>  	enum bpf_attach_type	expected_attach_type; /* For some prog types */
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index faa358b3d5d7..23a9dc187292 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9236,6 +9236,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
>  				regs[BPF_REG_0].btf = desc_btf;
>  				regs[BPF_REG_0].btf_id = meta.ret_btf_id;
> +				env->prog->call_cast_kctx = 1;
>  			} else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
>  				ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
>  				if (!ret_t || !btf_type_is_struct(ret_t)) {
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index e47914ac8732..a654a0cb6842 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -10332,6 +10332,7 @@ static void bpf_overflow_handler(struct perf_event *event,
>  		.event = event,
>  	};
>  	struct bpf_prog *prog;
> +	struct perf_event_header dummy;
>  	int ret = 0;
>  
>  	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
> @@ -10346,6 +10347,8 @@ static void bpf_overflow_handler(struct perf_event *event,
>  			data->callchain = perf_callchain(event, regs);
>  			data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
>  		}
> +		if (prog->call_cast_kctx)
> +			perf_prepare_sample(&dummy, data, event, regs);
>  
>  		ret = bpf_prog_run(prog, &ctx);
>  	}
> -- 
> 2.39.0.314.g84b9a713c41-goog
>
Daniel Borkmann Dec. 22, 2022, 1:17 p.m. UTC | #2
On 12/22/22 1:55 PM, Peter Zijlstra wrote:
> On Tue, Dec 20, 2022 at 02:01:43PM -0800, Namhyung Kim wrote:
>> When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
>> access perf sample data directly and call perf_prepare_sample() to make sure
>> the sample data is populated.
> 
> I don't understand a word of this :/ What are you doing and why?

Yeah, above commit message is too terse and unclear. Also, not following where
this assumption comes from; bpf_cast_to_kern_ctx() can be used elsewhere, too,
not just tracing. Recent example from CI side can be found [0].

Thanks,
Daniel

   [0] bpf tree, 70a00e2f1dba ("selftests/bpf: Test bpf_skb_adjust_room on CHECKSUM_PARTIAL")
Namhyung Kim Dec. 22, 2022, 5:34 p.m. UTC | #3
Hello,

On Thu, Dec 22, 2022 at 5:17 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 12/22/22 1:55 PM, Peter Zijlstra wrote:
> > On Tue, Dec 20, 2022 at 02:01:43PM -0800, Namhyung Kim wrote:
> >> When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
> >> access perf sample data directly and call perf_prepare_sample() to make sure
> >> the sample data is populated.
> >
> > I don't understand a word of this :/ What are you doing and why?
>
> Yeah, above commit message is too terse and unclear. Also, not following where
> this assumption comes from; bpf_cast_to_kern_ctx() can be used elsewhere, too,
> not just tracing. Recent example from CI side can be found [0].

Sorry about that.  Let me rephrase it like below:

With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
can access perf sample data directly from the ctx.  But the perf sample
data is not fully prepared at this point, and some fields can have invalid
uninitialized values.  So it needs to call perf_prepare_sample() before
calling the BPF overflow handler.

But just calling perf_prepare_sample() can be costly when the BPF
doesn't access the sample data.  It's needed only if the BPF program
uses the sample data.  But it seems hard for the BPF verifier to detect
if it'd access perf sample data.  So I just checked if it calls the
bpf_cast_to_kern_ctx() and assumed it does.
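
For illustration, a minimal sketch of the kind of perf_event program this
enables. It assumes vmlinux.h carries struct bpf_perf_event_data_kern and
struct perf_sample_data, and the filter condition is made up:

/* filter.bpf.c - sketch only, not part of this series */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

void *bpf_cast_to_kern_ctx(void *) __ksym;

SEC("perf_event")
int filter_sample(struct bpf_perf_event_data *ctx)
{
	/* cast the user-visible ctx to the kernel-side context to reach
	 * the full perf_sample_data, not just period/addr */
	struct bpf_perf_event_data_kern *kctx = bpf_cast_to_kern_ctx(ctx);

	/* made-up condition: keep user-space samples only; without the
	 * perf_prepare_sample() call added by this patch, data->ip may
	 * still be uninitialized at this point */
	if (kctx->data->ip >= 0xffff800000000000ULL)
		return 0;	/* returning 0 drops the sample */

	return 1;
}

char _license[] SEC("license") = "GPL";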

Thanks,
Namhyung
Peter Zijlstra Dec. 22, 2022, 8:16 p.m. UTC | #4
On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:

> Sorry about that.  Let me rephrase it like below:
> 
> With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> can access perf sample data directly from the ctx. 

This is the bpf_prog_run() in bpf_overflow_handler(), right?

> But the perf sample
> data is not fully prepared at this point, and some fields can have invalid
> uninitialized values.  So it needs to call perf_prepare_sample() before
> calling the BPF overflow handler.

It never was, why is it a problem now?

> But just calling perf_prepare_sample() can be costly when the BPF

So you potentially call it twice now, how's that useful?
Namhyung Kim Dec. 22, 2022, 10:25 p.m. UTC | #5
On Thu, Dec 22, 2022 at 12:16 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:
>
> > Sorry about that.  Let me rephrase it like below:
> >
> > With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> > can access perf sample data directly from the ctx.
>
> This is the bpf_prog_run() in bpf_overflow_handler(), right?

Yes.

>
> > But the perf sample
> > data is not fully prepared at this point, and some fields can have invalid
> > uninitialized values.  So it needs to call perf_prepare_sample() before
> > calling the BPF overflow handler.
>
> It never was, why is it a problem now?

BPF used to allow selected fields only like period and addr, and they
are initialized always by perf_sample_data_init().  This is relaxed
by the bpf_cast_to_kern_ctx() and it can easily access arbitrary
fields of perf_sample_data now.

The background of this change is to use BPF as a filter for perf
event samples.  The code is there already and returning 0 from
BPF can drop perf samples.  With access to more sample data,
it'd make more educated decisions.

For example, I got some requests to limit perf samples in a
selected region of address (code or data).  Or it can collect
samples only if some hardware specific information is set in
the raw data like in AMD IBS.  We can easily extend it to other
sample info based on users' needs.
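
A sketch of the address-region case, with placeholder bounds (and assuming
the sample data has been prepared so data->addr is valid):

/* made-up data-address window; sketch only */
#define REGION_START	0x7f0000000000ULL
#define REGION_END	0x7f0000100000ULL

SEC("perf_event")
int filter_addr_region(struct bpf_perf_event_data *ctx)
{
	struct bpf_perf_event_data_kern *kctx = bpf_cast_to_kern_ctx(ctx);
	__u64 addr = kctx->data->addr;

	/* keep the sample only when it hits the region of interest */
	return addr >= REGION_START && addr < REGION_END;
}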

>
> > But just calling perf_prepare_sample() can be costly when the BPF
>
> So you potentially call it twice now, how's that useful?

Right.  I think we can check data->sample_flags in
perf_prepare_sample() to minimize the duplicate work.
It already does it for some fields, but misses others.
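
For reference, the existing guard for callchain in perf_prepare_sample()
looks roughly like this; the suggestion is to extend the same pattern to
the fields that don't have one yet:

	u64 filtered_sample_type = sample_type & ~data->sample_flags;

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		int size = 1;

		/* only walk the stack if nobody has filled it in yet */
		if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
			data->callchain = perf_callchain(event, regs);

		size += data->callchain->nr;
		header->size += size * sizeof(u64);
	}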

Thanks,
Namhyung
Jiri Olsa Dec. 23, 2022, 7:53 a.m. UTC | #6
On Thu, Dec 22, 2022 at 02:25:49PM -0800, Namhyung Kim wrote:
> On Thu, Dec 22, 2022 at 12:16 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:
> >
> > > Sorry about that.  Let me rephrase it like below:
> > >
> > > With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> > > can access perf sample data directly from the ctx.
> >
> > This is the bpf_prog_run() in bpf_overflow_handler(), right?
> 
> Yes.
> 
> >
> > > But the perf sample
> > > data is not fully prepared at this point, and some fields can have invalid
> > > uninitialized values.  So it needs to call perf_prepare_sample() before
> > > calling the BPF overflow handler.
> >
> > It never was, why is it a problem now?
> 
> BPF used to allow selected fields only like period and addr, and they
> are initialized always by perf_sample_data_init().  This is relaxed
> by the bpf_cast_to_kern_ctx() and it can easily access arbitrary
> fields of perf_sample_data now.
> 
> The background of this change is to use BPF as a filter for perf
> event samples.  The code is there already and returning 0 from
> BPF can drop perf samples.  With access to more sample data,
> it'd make more educated decisions.
> 
> For example, I got some requests to limit perf samples in a
> selected region of address (code or data).  Or it can collect
> samples only if some hardware specific information is set in
> the raw data like in AMD IBS.  We can easily extend it to other
> sample info based on users' needs.
> 
> >
> > > But just calling perf_prepare_sample() can be costly when the BPF
> >
> > So you potentially call it twice now, how's that useful?
> 
> Right.  I think we can check data->sample_flags in
> perf_prepare_sample() to minimize the duplicate work.
> It already does it for some fields, but misses others.

we used to have __PERF_SAMPLE_CALLCHAIN_EARLY to avoid extra perf_callchain,
could we add some flag like __PERF_SAMPLE_INIT_EARLY to avoid double call to
perf_prepare_sample?
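
Presumably something along these lines; the flag name and bit are
hypothetical, mirroring the old __PERF_SAMPLE_CALLCHAIN_EARLY:

/* hypothetical internal flag, not in the tree */
#define __PERF_SAMPLE_INIT_EARLY	(1ULL << 62)

	/* in perf_prepare_sample(): */
	if (data->sample_flags & __PERF_SAMPLE_INIT_EARLY)
		return;	/* fields were prepared in bpf_overflow_handler() */

	/* ... existing field preparation ... */
	data->sample_flags |= __PERF_SAMPLE_INIT_EARLY;

Note the early return would still leave header->size unfilled on the
second call, which is the wrinkle raised in the reply below.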

jirka 

> 
> Thanks,
> Namhyung
Namhyung Kim Dec. 27, 2022, 11:19 p.m. UTC | #7
On Thu, Dec 22, 2022 at 11:53 PM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Thu, Dec 22, 2022 at 02:25:49PM -0800, Namhyung Kim wrote:
> > On Thu, Dec 22, 2022 at 12:16 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:
> > >
> > > > Sorry about that.  Let me rephrase it like below:
> > > >
> > > > With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> > > > can access perf sample data directly from the ctx.
> > >
> > > This is the bpf_prog_run() in bpf_overflow_handler(), right?
> >
> > Yes.
> >
> > >
> > > > But the perf sample
> > > > data is not fully prepared at this point, and some fields can have invalid
> > > > uninitialized values.  So it needs to call perf_prepare_sample() before
> > > > calling the BPF overflow handler.
> > >
> > > It never was, why is it a problem now?
> >
> > BPF used to allow selected fields only like period and addr, and they
> > are initialized always by perf_sample_data_init().  This is relaxed
> > by the bpf_cast_to_kern_ctx() and it can easily access arbitrary
> > fields of perf_sample_data now.
> >
> > The background of this change is to use BPF as a filter for perf
> > event samples.  The code is there already and returning 0 from
> > BPF can drop perf samples.  With access to more sample data,
> > it'd make more educated decisions.
> >
> > For example, I got some requests to limit perf samples in a
> > selected region of address (code or data).  Or it can collect
> > samples only if some hardware specific information is set in
> > the raw data like in AMD IBS.  We can easily extend it to other
> > sample info based on users' needs.
> >
> > >
> > > > But just calling perf_prepare_sample() can be costly when the BPF
> > >
> > > So you potentially call it twice now, how's that useful?
> >
> > Right.  I think we can check data->sample_flags in
> > perf_prepare_sample() to minimize the duplicate work.
> > It already does it for some fields, but misses others.
>
> we used to have __PERF_SAMPLE_CALLCHAIN_EARLY to avoid extra perf_callchain,
> could we add some flag like __PERF_SAMPLE_INIT_EARLY to avoid double call to
> perf_prepare_sample?

I think we can check if the filtered_sample_type is 0.
But it still needs to update the perf_event_header.
I think we need to save the calculated size separately.
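
Roughly the shape that could take (a cached dyn_size field is only a
guess here, not existing code):

	/* sketch: dyn_size would accumulate the variable-length part of
	 * the record on the first pass, so a later call can rebuild the
	 * header without re-deriving every field */
	u64 filtered_sample_type = sample_type & ~data->sample_flags;

	if (filtered_sample_type == 0) {
		header->type = PERF_RECORD_SAMPLE;
		header->size = event->header_size + data->dyn_size;
		return;
	}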

Thanks,
Namhyung

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5fec2d1be6d7..6bd4c21a6dd4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1341,6 +1341,7 @@ struct bpf_prog {
 				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
 				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
 				call_get_func_ip:1, /* Do we call get_func_ip() */
+				call_cast_kctx:1, /* Do we call bpf_cast_to_kern_ctx() */
 				tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index faa358b3d5d7..23a9dc187292 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9236,6 +9236,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
 				regs[BPF_REG_0].btf = desc_btf;
 				regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+				env->prog->call_cast_kctx = 1;
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 				ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
 				if (!ret_t || !btf_type_is_struct(ret_t)) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e47914ac8732..a654a0cb6842 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10332,6 +10332,7 @@ static void bpf_overflow_handler(struct perf_event *event,
 		.event = event,
 	};
 	struct bpf_prog *prog;
+	struct perf_event_header dummy;
 	int ret = 0;
 
 	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
@@ -10346,6 +10347,8 @@ static void bpf_overflow_handler(struct perf_event *event,
 			data->callchain = perf_callchain(event, regs);
 			data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
 		}
+		if (prog->call_cast_kctx)
+			perf_prepare_sample(&dummy, data, event, regs);
 
 		ret = bpf_prog_run(prog, &ctx);
 	}
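
For completeness, driving such a filter from userspace would look roughly
like this; the object and program names match the sketch earlier in the
thread, and error handling is elided:

/* user.c - sketch: open a sampling event and attach the BPF filter */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.size = sizeof(attr),
		.sample_period = 100000,
		.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR,
	};
	struct bpf_object *obj;
	struct bpf_program *prog;
	int pfd;

	/* measure the calling thread on any CPU */
	pfd = syscall(SYS_perf_event_open, &attr, 0, -1, -1,
		      PERF_FLAG_FD_CLOEXEC);

	obj = bpf_object__open_file("filter.bpf.o", NULL);
	bpf_object__load(obj);
	prog = bpf_object__find_program_by_name(obj, "filter_sample");

	/* samples for which the program returns 0 never reach the buffer */
	bpf_program__attach_perf_event(prog, pfd);

	pause();
	return 0;
}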