[RFC,bpf-next] bpf: Use prog->active instead of bpf_prog_active for kprobe_multi

Message ID 20220525114003.61890-1-jolsa@kernel.org (mailing list archive)
State RFC
Delegated to: BPF
Series [RFC,bpf-next] bpf: Use prog->active instead of bpf_prog_active for kprobe_multi

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 11 this patch: 11
netdev/cc_maintainers warning 3 maintainers not CCed: kpsingh@kernel.org rostedt@goodmis.org mingo@redhat.com
netdev/build_clang success Errors and warnings before: 10 this patch: 10
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff fail author Signed-off-by missing
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 11 this patch: 11
netdev/checkpatch warning CHECK: Unnecessary parentheses around 'prog->active'; CHECK: Unnecessary parentheses around 'prog->active'
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Kernel LATEST on z15 with gcc

Commit Message

Jiri Olsa May 25, 2022, 11:40 a.m. UTC
hi,
Alexei suggested to use prog->active instead of the global bpf_prog_active
for programs attached with kprobe multi [1].

AFAICS this will bypass bpf_disable_instrumentation, which seems to be
ok for some places like hash map update, but I'm not sure about other
places, hence this RFC post.

I'm not sure how kprobes differ from trampolines in this regard,
because trampolines use prog->active and it's not a problem there.

thoughts?

thanks,
jirka


[1] https://lore.kernel.org/bpf/20220316185333.ytyh5irdftjcklk6@ast-mbp.dhcp.thefacebook.com/
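
For context, bpf_disable_instrumentation() is essentially a wrapper around
the same per-cpu counter; a rough sketch, paraphrased from include/linux/bpf.h
(details may differ slightly between kernel versions):

	/* paraphrased sketch of include/linux/bpf.h, not verbatim kernel code */
	static inline void bpf_disable_instrumentation(void)
	{
		migrate_disable();
		this_cpu_inc(bpf_prog_active);
	}

	static inline void bpf_enable_instrumentation(void)
	{
		this_cpu_dec(bpf_prog_active);
		migrate_enable();
	}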
---
 kernel/trace/bpf_trace.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

Comments

Yonghong Song May 26, 2022, 4:23 p.m. UTC | #1
On 5/25/22 4:40 AM, Jiri Olsa wrote:
> hi,
> Alexei suggested to use prog->active instead global bpf_prog_active
> for programs attached with kprobe multi [1].

Both prog->active and bpf_prog_active try to prevent program
recursion; bpf_prog_active provides stronger protection
as it prevents recursion across different programs, while prog->active
only prevents recursion of the same program.

Currently trampoline-based programs use the prog->active mechanism,
while kprobe, tracepoint and perf programs use bpf_prog_active.
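
Roughly, the two guards look like this (hedged sketch, adapted from
trace_call_bpf() and the patch below, not verbatim kernel code):

	/* bpf_prog_active: one per-cpu counter shared by all programs */
	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/* some bpf prog (any prog) is already running on this cpu */
		ret = 0;
		goto out;
	}
	ret = bpf_prog_run(prog, ctx);
 out:
	__this_cpu_dec(bpf_prog_active);

	/* prog->active: a per-cpu counter private to each program */
	if (likely(__this_cpu_inc_return(*(prog->active)) == 1))
		ret = bpf_prog_run(prog, ctx);	/* only the same prog recursing is blocked */
	__this_cpu_dec(*(prog->active));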

> 
> AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> ok for some places like hash map update, but I'm not sure about other
> places, hence this is RFC post.
> 
> I'm not sure how are kprobes different to trampolines in this regard,
> because trampolines use prog->active and it's not a problem.

The following is just my understanding.
In most cases, prog->active should be okay. The only tricky
case might be due to shared maps such that one prog did update/delete
map element and inside the lock in update/delete another
trampoline program is triggered and trying to update/delete the same
map (bucket). But this is a known issue and not a unique issue for
kprobe_multi.
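
For illustration, the hash map already guards that case with its own
per-bucket, per-cpu counter; a hedged sketch of the htab_lock_bucket()
idea from kernel/bpf/hashtab.c (simplified, details vary across kernel
versions):

	/* sketch of the idea behind htab_lock_bucket(), not verbatim code */
	static int htab_lock_bucket(struct bpf_htab *htab, struct bucket *b,
				    u32 hash, unsigned long *pflags)
	{
		unsigned long flags;

		hash &= HASHTAB_MAP_LOCK_MASK;

		preempt_disable();
		if (__this_cpu_inc_return(*(htab->map_locked[hash])) != 1) {
			/* a prog on this cpu already owns (or waits for) this
			 * bucket lock, back off instead of deadlocking
			 */
			__this_cpu_dec(*(htab->map_locked[hash]));
			preempt_enable();
			return -EBUSY;
		}
		raw_spin_lock_irqsave(&b->raw_lock, flags);
		*pflags = flags;
		return 0;
	}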

> 
> thoughts?
> 
> thanks,
> jirka
> 
> 
> [1] https://lore.kernel.org/bpf/20220316185333.ytyh5irdftjcklk6@ast-mbp.dhcp.thefacebook.com/
> ---
>   kernel/trace/bpf_trace.c | 31 +++++++++++++++++++------------
>   1 file changed, 19 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 10b157a6d73e..7aec39ae0a1c 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2385,8 +2385,8 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
>   }
>   
>   static int
> -kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> -			   unsigned long entry_ip, struct pt_regs *regs)
> +__kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> +			     unsigned long entry_ip, struct pt_regs *regs)
>   {
>   	struct bpf_kprobe_multi_run_ctx run_ctx = {
>   		.link = link,
> @@ -2395,21 +2395,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
>   	struct bpf_run_ctx *old_run_ctx;
>   	int err;
>   
> -	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
> -		err = 0;
> -		goto out;
> -	}
> -
> -	migrate_disable();
> -	rcu_read_lock();
>   	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
>   	err = bpf_prog_run(link->link.prog, regs);
>   	bpf_reset_run_ctx(old_run_ctx);
> +	return err;
> +}
> +
> +static int
> +kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> +			   unsigned long entry_ip, struct pt_regs *regs)
> +{
> +	struct bpf_prog *prog = link->link.prog;
> +	int err = 0;
> +
> +	migrate_disable();
> +	rcu_read_lock();
> +
> +	if (likely(__this_cpu_inc_return(*(prog->active)) == 1))
> +		err = __kprobe_multi_link_prog_run(link, entry_ip, regs);
> +
> +	__this_cpu_dec(*(prog->active));
>   	rcu_read_unlock();
>   	migrate_enable();
> -
> - out:
> -	__this_cpu_dec(bpf_prog_active);
>   	return err;
>   }
>
Andrii Nakryiko May 31, 2022, 11:24 p.m. UTC | #2
On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> hi,
> Alexei suggested to use prog->active instead global bpf_prog_active
> for programs attached with kprobe multi [1].
>
> AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> ok for some places like hash map update, but I'm not sure about other
> places, hence this is RFC post.
>
> I'm not sure how are kprobes different to trampolines in this regard,
> because trampolines use prog->active and it's not a problem.
>
> thoughts?
>

Let's say we have two kernel functions A and B, and B can be called from
a BPF program through some BPF helper. Now let's say I have two BPF
programs kprobeX and kretprobeX, both attached to A and B. With
prog->active used instead of the per-cpu bpf_prog_active, what would
the behavior be when A is called somewhere in the kernel?

1. A is called
2. kprobeX is activated for A, calls some helper which eventually calls B
  3. kprobeX is attempted to be called for B, but is skipped due to prog->active
  4. B runs
  5. kretprobeX is activated for B, calls some helper which eventually calls B
    6. kprobeX is ignored (prog->active > 0)
    7. B runs
    8. kretprobeX is ignored (prog->active > 0)
9. kretprobeX is activated for A, calls helper which calls B
  10. kprobeX is activated for B
    11. kprobeX is ignored (prog->active > 0)
    12. B runs
    13. kretprobeX is ignored (prog->active > 0)
  14. B runs
  15. kretprobeX is ignored (prog->active > 0)


If that's correct, we get:

1. kprobeX for A
2. kretprobeX for B
3. kretprobeX for A
4. kprobeX for B

It's quite mind-boggling and annoying in practice. I'd very much
prefer just kprobeX for A followed by kretprobeX for A. That's it.

I'm trying to protect against this in retsnoop with custom per-cpu
logic in each program, but I much prefer bpf_prog_active,
which basically says "no nested kprobe calls while a kprobe program is
running", which makes a lot of sense in practice.

Given that kprobe already uses the global bpf_prog_active, I'd say
multi-kprobe should stick to bpf_prog_active as well.


> thanks,
> jirka
>
>
> [1] https://lore.kernel.org/bpf/20220316185333.ytyh5irdftjcklk6@ast-mbp.dhcp.thefacebook.com/
> ---
>  kernel/trace/bpf_trace.c | 31 +++++++++++++++++++------------
>  1 file changed, 19 insertions(+), 12 deletions(-)
>

[...]
Alexei Starovoitov June 8, 2022, 4:29 a.m. UTC | #3
On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > hi,
> > Alexei suggested to use prog->active instead global bpf_prog_active
> > for programs attached with kprobe multi [1].
> >
> > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > ok for some places like hash map update, but I'm not sure about other
> > places, hence this is RFC post.
> >
> > I'm not sure how are kprobes different to trampolines in this regard,
> > because trampolines use prog->active and it's not a problem.
> >
> > thoughts?
> >
>
> Let's say we have two kernel functions A and B? B can be called from
> BPF program though some BPF helper, ok? Now let's say I have two BPF
> programs kprobeX and kretprobeX, both are attached to A and B. With
> using prog->active instead of per-cpu bpf_prog_active, what would be
> the behavior when A is called somewhere in the kernel.
>
> 1. A is called
> 2. kprobeX is activated for A, calls some helper which eventually calls B
>   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
>   4. B runs
>   5. kretprobeX is activated for B, calls some helper which eventually calls B
>     6. kprobeX is ignored (prog->active > 0)
>     7. B runs
>     8. kretprobeX is ignored (prog->active > 0)
> 9. kretprobeX is activated for A, calls helper which calls B
>   10. kprobeX is activated for B
>     11. kprobeX is ignored (prog->active > 0)

not correct. kprobeX actually runs.
but the end result is correct.

>     12. B runs
>     13. kretprobeX is ignored (prog->active > 0)
>   14. B runs
>   15. kretprobeX is ignored (prog->active > 0)
>
>
> If that's correct, we get:
>
> 1. kprobeX for A
> 2. kretprobeX for B
> 3. kretprobeX for A
> 4. kprobeX for B

Here it's correct.

> It's quite mind-boggling and annoying in practice. I'd very much
> prefer just kprobeX for A followed by kretprobeX for A. That's it.
>
> I'm trying to protect against this in retsnoop with custom per-cpu
> logic in each program, but I so much more prefer bpf_prog_active,
> which basically says "no nested kprobe calls while kprobe program is
> running", which makes a lot of sense in practice.

It makes sense for retsnoop, but does not make sense in general.

> Given kprobe already used global bpf_prog_active I'd say multi-kprobe
> should stick to bpf_prog_active as well.

I strongly disagree.
Both multi kprobe and kprobe should move to a per-prog counter
plus some other protection
(we cannot just move to per-prog due to syscalls).
It's true that the above order is mind-boggling,
but it's much better than
missing a kprobe invocation completely just because
another kprobe is running on the same cpu.
People complained numerous times about this kprobe behavior:
kprobeX attached to A,
kprobeY attached to B.
If kprobeX calls B, kprobeY is not going to be called,
which means that anything bpf itself is using is lost:
spin locks, lists, rcu, etc.
Sleepable uprobes are coming.
If I recall Delyan's patch correctly,
we will do migrate_disable and inc bpf_prog_active.
Now random kprobes on that cpu will be lost.
It's awful. We have to fix it.
Jiri Olsa June 8, 2022, 12:32 p.m. UTC | #4
On Tue, Jun 07, 2022 at 09:29:30PM -0700, Alexei Starovoitov wrote:
> On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > >
> > > hi,
> > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > for programs attached with kprobe multi [1].
> > >
> > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > ok for some places like hash map update, but I'm not sure about other
> > > places, hence this is RFC post.
> > >
> > > I'm not sure how are kprobes different to trampolines in this regard,
> > > because trampolines use prog->active and it's not a problem.
> > >
> > > thoughts?
> > >
> >
> > Let's say we have two kernel functions A and B? B can be called from
> > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > programs kprobeX and kretprobeX, both are attached to A and B. With
> > using prog->active instead of per-cpu bpf_prog_active, what would be
> > the behavior when A is called somewhere in the kernel.
> >
> > 1. A is called
> > 2. kprobeX is activated for A, calls some helper which eventually calls B
> >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> >   4. B runs
> >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> >     6. kprobeX is ignored (prog->active > 0)
> >     7. B runs
> >     8. kretprobeX is ignored (prog->active > 0)
> > 9. kretprobeX is activated for A, calls helper which calls B
> >   10. kprobeX is activated for B
> >     11. kprobeX is ignored (prog->active > 0)
> 
> not correct. kprobeX actually runs.
> but the end result is correct.
> 
> >     12. B runs
> >     13. kretprobeX is ignored (prog->active > 0)
> >   14. B runs
> >   15. kretprobeX is ignored (prog->active > 0)
> >
> >
> > If that's correct, we get:
> >
> > 1. kprobeX for A
> > 2. kretprobeX for B
> > 3. kretprobeX for A
> > 4. kprobeX for B
> 
> Here it's correct.
> 
> > It's quite mind-boggling and annoying in practice. I'd very much
> > prefer just kprobeX for A followed by kretprobeX for A. That's it.
> >
> > I'm trying to protect against this in retsnoop with custom per-cpu
> > logic in each program, but I so much more prefer bpf_prog_active,
> > which basically says "no nested kprobe calls while kprobe program is
> > running", which makes a lot of sense in practice.
> 
> It makes sense for retsnoop, but does not make sense in general.
> 
> > Given kprobe already used global bpf_prog_active I'd say multi-kprobe
> > should stick to bpf_prog_active as well.
> 
> I strongly disagree.
> Both multi kprobe and kprobe should move to per prog counter
> plus some other protection
> (we cannot just move to per-prog due to syscalls).
> It's true that the above order is mind-boggling,
> but it's much better than
> missing kprobe invocation completely just because
> another kprobe is running on the same cpu.
> People complained numerous times about this kprobe behavior.
> kprobeX attached to A
> kprobeY attached to B.
> If kprobeX calls B kprobeY is not going to be called.
> Means that anything that bpf is using is lost.
> spin locks, lists, rcu, etc.
> Sleepable uprobes are coming.
> iirc Delyan's patch correctly.
> We will do migrate_disable and inc bpf_prog_active.
> Now random kprobes on that cpu will be lost.
> It's awful. We have to fix it.

how about using bpf_prog_active just to disable instrumentation,
and only checking it before running the bpf prog,

plus adding a prog->active check for both kprobe and kprobe_multi
to the bpf_prog_run function, like in the patch below

jirka


---
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ed0c0ff42ad5..a27e947f8749 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -632,7 +632,12 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 
 static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
 {
-	return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
+	u32 ret = 0;
+
+	if (likely(__this_cpu_inc_return(*(prog->active)) == 1))
+		ret = __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
+	__this_cpu_dec(*(prog->active));
+	return ret;
 }
 
 /*
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 10b157a6d73e..62389ff0f15a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -103,16 +103,8 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 
 	cant_sleep();
 
-	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
-		/*
-		 * since some bpf program is already running on this cpu,
-		 * don't call into another bpf program (same or different)
-		 * and don't send kprobe event into ring-buffer,
-		 * so return zero here
-		 */
-		ret = 0;
-		goto out;
-	}
+	if (unlikely(this_cpu_read(bpf_prog_active)))
+		return 0;
 
 	/*
 	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
@@ -133,10 +125,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
 				 ctx, bpf_prog_run);
 	rcu_read_unlock();
-
- out:
-	__this_cpu_dec(bpf_prog_active);
-
 	return ret;
 }
 
@@ -2395,10 +2383,8 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 	struct bpf_run_ctx *old_run_ctx;
 	int err;
 
-	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
-		err = 0;
-		goto out;
-	}
+	if (unlikely(this_cpu_read(bpf_prog_active)))
+		return 0;
 
 	migrate_disable();
 	rcu_read_lock();
@@ -2407,9 +2393,6 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 	bpf_reset_run_ctx(old_run_ctx);
 	rcu_read_unlock();
 	migrate_enable();
-
- out:
-	__this_cpu_dec(bpf_prog_active);
 	return err;
 }
Andrii Nakryiko June 9, 2022, 6:26 p.m. UTC | #5
On Tue, Jun 7, 2022 at 9:29 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > >
> > > hi,
> > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > for programs attached with kprobe multi [1].
> > >
> > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > ok for some places like hash map update, but I'm not sure about other
> > > places, hence this is RFC post.
> > >
> > > I'm not sure how are kprobes different to trampolines in this regard,
> > > because trampolines use prog->active and it's not a problem.
> > >
> > > thoughts?
> > >
> >
> > Let's say we have two kernel functions A and B? B can be called from
> > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > programs kprobeX and kretprobeX, both are attached to A and B. With
> > using prog->active instead of per-cpu bpf_prog_active, what would be
> > the behavior when A is called somewhere in the kernel.
> >
> > 1. A is called
> > 2. kprobeX is activated for A, calls some helper which eventually calls B
> >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> >   4. B runs
> >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> >     6. kprobeX is ignored (prog->active > 0)
> >     7. B runs
> >     8. kretprobeX is ignored (prog->active > 0)
> > 9. kretprobeX is activated for A, calls helper which calls B
> >   10. kprobeX is activated for B
> >     11. kprobeX is ignored (prog->active > 0)
>
> not correct. kprobeX actually runs.
> but the end result is correct.
>

Right, it was a long sequence, but you got the idea :)

> >     12. B runs
> >     13. kretprobeX is ignored (prog->active > 0)
> >   14. B runs
> >   15. kretprobeX is ignored (prog->active > 0)
> >
> >
> > If that's correct, we get:
> >
> > 1. kprobeX for A
> > 2. kretprobeX for B
> > 3. kretprobeX for A
> > 4. kprobeX for B
>
> Here it's correct.
>
> > It's quite mind-boggling and annoying in practice. I'd very much
> > prefer just kprobeX for A followed by kretprobeX for A. That's it.
> >
> > I'm trying to protect against this in retsnoop with custom per-cpu
> > logic in each program, but I so much more prefer bpf_prog_active,
> > which basically says "no nested kprobe calls while kprobe program is
> > running", which makes a lot of sense in practice.
>
> It makes sense for retsnoop, but does not make sense in general.
>
> > Given kprobe already used global bpf_prog_active I'd say multi-kprobe
> > should stick to bpf_prog_active as well.
>
> I strongly disagree.
> Both multi kprobe and kprobe should move to per prog counter
> plus some other protection
> (we cannot just move to per-prog due to syscalls).
> It's true that the above order is mind-boggling,
> but it's much better than
> missing kprobe invocation completely just because
> another kprobe is running on the same cpu.
> People complained numerous times about this kprobe behavior.
> kprobeX attached to A
> kprobeY attached to B.
> If kprobeX calls B kprobeY is not going to be called.
> Means that anything that bpf is using is lost.
> spin locks, lists, rcu, etc.
> Sleepable uprobes are coming.
> iirc Delyan's patch correctly.
> We will do migrate_disable and inc bpf_prog_active.

This might be a different issue. I'm not sure why uprobes should be
protected by the same global bpf_prog_active; you can't trigger a uprobe
from a uprobe program. And especially for sleepable programs it makes no
sense to use per-CPU protection (we have bpf_run_ctx for such
protections, if needed).

> Now random kprobes on that cpu will be lost.

It's not random. The rule is you can't trace kernel functions and
tracepoints triggered from BPF kprobes/tracepoints. This prevents
nasty reentrance problems and makes sense. Isn't the kernel tracing
infra protecting itself similarly, preventing reentrancy and recursion?

> It's awful. We have to fix it.

You can call it "a fix" if you'd like, but it's changing a very
user-visible behavior and guarantees on which users relied for a
while. So even if we switch to per-prog protection it will have to be
some sort of opt-in (flag, new program type, whatever).
Alexei Starovoitov June 9, 2022, 10:03 p.m. UTC | #6
On Thu, Jun 9, 2022 at 11:27 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Jun 7, 2022 at 9:29 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > > >
> > > > hi,
> > > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > > for programs attached with kprobe multi [1].
> > > >
> > > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > > ok for some places like hash map update, but I'm not sure about other
> > > > places, hence this is RFC post.
> > > >
> > > > I'm not sure how are kprobes different to trampolines in this regard,
> > > > because trampolines use prog->active and it's not a problem.
> > > >
> > > > thoughts?
> > > >
> > >
> > > Let's say we have two kernel functions A and B? B can be called from
> > > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > > programs kprobeX and kretprobeX, both are attached to A and B. With
> > > using prog->active instead of per-cpu bpf_prog_active, what would be
> > > the behavior when A is called somewhere in the kernel.
> > >
> > > 1. A is called
> > > 2. kprobeX is activated for A, calls some helper which eventually calls B
> > >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> > >   4. B runs
> > >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> > >     6. kprobeX is ignored (prog->active > 0)
> > >     7. B runs
> > >     8. kretprobeX is ignored (prog->active > 0)
> > > 9. kretprobeX is activated for A, calls helper which calls B
> > >   10. kprobeX is activated for B
> > >     11. kprobeX is ignored (prog->active > 0)
> >
> > not correct. kprobeX actually runs.
> > but the end result is correct.
> >
>
> Right, it was a long sequence, but you got the idea :)
>
> > >     12. B runs
> > >     13. kretprobeX is ignored (prog->active > 0)
> > >   14. B runs
> > >   15. kretprobeX is ignored (prog->active > 0)
> > >
> > >
> > > If that's correct, we get:
> > >
> > > 1. kprobeX for A
> > > 2. kretprobeX for B
> > > 3. kretprobeX for A
> > > 4. kprobeX for B
> >
> > Here it's correct.
> >
> > > It's quite mind-boggling and annoying in practice. I'd very much
> > > prefer just kprobeX for A followed by kretprobeX for A. That's it.
> > >
> > > I'm trying to protect against this in retsnoop with custom per-cpu
> > > logic in each program, but I so much more prefer bpf_prog_active,
> > > which basically says "no nested kprobe calls while kprobe program is
> > > running", which makes a lot of sense in practice.
> >
> > It makes sense for retsnoop, but does not make sense in general.
> >
> > > Given kprobe already used global bpf_prog_active I'd say multi-kprobe
> > > should stick to bpf_prog_active as well.
> >
> > I strongly disagree.
> > Both multi kprobe and kprobe should move to per prog counter
> > plus some other protection
> > (we cannot just move to per-prog due to syscalls).
> > It's true that the above order is mind-boggling,
> > but it's much better than
> > missing kprobe invocation completely just because
> > another kprobe is running on the same cpu.
> > People complained numerous times about this kprobe behavior.
> > kprobeX attached to A
> > kprobeY attached to B.
> > If kprobeX calls B kprobeY is not going to be called.
> > Means that anything that bpf is using is lost.
> > spin locks, lists, rcu, etc.
> > Sleepable uprobes are coming.
> > iirc Delyan's patch correctly.
> > We will do migrate_disable and inc bpf_prog_active.
>
> This might be a different issue, I'm not sure why uprobes should be
> protected by the same global bpf_prog_active, you can't trigger uprobe
> from uprobe program. And especially for sleepable programs it makes no
> sense to use per-CPU protection (we have bpf_run_ctx for such
> protections, if needed).

you're forgetting about tracepoints and perf_events.
'perf record' will capture everything.
Whereas a bpf-powered 'perf record' will not see bpf progs
attached to [ku]probes, tracepoints and perf events.

> > Now random kprobes on that cpu will be lost.
>
> It's not random. The rule is you can't kernel functions and
> tracepoints triggered from BPF kprobes/tracepoints. This prevents
> nasty reentrance problems and makes sense.

From the user's point of view it makes no sense whatsoever.
bperf is losing info.
Try explaining that to users.

> Isn't kernel tracing infra
> is protecting itself similarly, preventing reentrancy and recursion?

Yes and that's what it should be doing.
It's not the job of the kernel to implement the policy.
"run one bpf prog per cpu at a time" is a policy
and very much a broken one.

> > It's awful. We have to fix it.
>
> You can call it "a fix" if you'd like, but it's changing a very
> user-visible behavior and guarantees on which users relied for a
> while. So even if we switch to per-prog protection it will have to be
> some sort of opt-in (flag, new program type, whatever).

No opt-in is allowed for fixes, and it's a bug fix.
No one should rely on broken kernel behavior.
If retsnoop relies on that, it's really retsnoop's fault.
Andrii Nakryiko June 10, 2022, 5:58 p.m. UTC | #7
On Thu, Jun 9, 2022 at 3:03 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Jun 9, 2022 at 11:27 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Jun 7, 2022 at 9:29 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > > > >
> > > > > hi,
> > > > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > > > for programs attached with kprobe multi [1].
> > > > >
> > > > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > > > ok for some places like hash map update, but I'm not sure about other
> > > > > places, hence this is RFC post.
> > > > >
> > > > > I'm not sure how are kprobes different to trampolines in this regard,
> > > > > because trampolines use prog->active and it's not a problem.
> > > > >
> > > > > thoughts?
> > > > >
> > > >
> > > > Let's say we have two kernel functions A and B? B can be called from
> > > > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > > > programs kprobeX and kretprobeX, both are attached to A and B. With
> > > > using prog->active instead of per-cpu bpf_prog_active, what would be
> > > > the behavior when A is called somewhere in the kernel.
> > > >
> > > > 1. A is called
> > > > 2. kprobeX is activated for A, calls some helper which eventually calls B
> > > >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> > > >   4. B runs
> > > >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> > > >     6. kprobeX is ignored (prog->active > 0)
> > > >     7. B runs
> > > >     8. kretprobeX is ignored (prog->active > 0)
> > > > 9. kretprobeX is activated for A, calls helper which calls B
> > > >   10. kprobeX is activated for B
> > > >     11. kprobeX is ignored (prog->active > 0)
> > >
> > > not correct. kprobeX actually runs.
> > > but the end result is correct.
> > >
> >
> > Right, it was a long sequence, but you got the idea :)
> >
> > > >     12. B runs
> > > >     13. kretprobeX is ignored (prog->active > 0)
> > > >   14. B runs
> > > >   15. kretprobeX is ignored (prog->active > 0)
> > > >
> > > >
> > > > If that's correct, we get:
> > > >
> > > > 1. kprobeX for A
> > > > 2. kretprobeX for B
> > > > 3. kretprobeX for A
> > > > 4. kprobeX for B
> > >
> > > Here it's correct.
> > >
> > > > It's quite mind-boggling and annoying in practice. I'd very much
> > > > prefer just kprobeX for A followed by kretprobeX for A. That's it.
> > > >
> > > > I'm trying to protect against this in retsnoop with custom per-cpu
> > > > logic in each program, but I so much more prefer bpf_prog_active,
> > > > which basically says "no nested kprobe calls while kprobe program is
> > > > running", which makes a lot of sense in practice.
> > >
> > > It makes sense for retsnoop, but does not make sense in general.
> > >
> > > > Given kprobe already used global bpf_prog_active I'd say multi-kprobe
> > > > should stick to bpf_prog_active as well.
> > >
> > > I strongly disagree.
> > > Both multi kprobe and kprobe should move to per prog counter
> > > plus some other protection
> > > (we cannot just move to per-prog due to syscalls).
> > > It's true that the above order is mind-boggling,
> > > but it's much better than
> > > missing kprobe invocation completely just because
> > > another kprobe is running on the same cpu.
> > > People complained numerous times about this kprobe behavior.
> > > kprobeX attached to A
> > > kprobeY attached to B.
> > > If kprobeX calls B kprobeY is not going to be called.
> > > Means that anything that bpf is using is lost.
> > > spin locks, lists, rcu, etc.
> > > Sleepable uprobes are coming.
> > > iirc Delyan's patch correctly.
> > > We will do migrate_disable and inc bpf_prog_active.
> >
> > This might be a different issue, I'm not sure why uprobes should be
> > protected by the same global bpf_prog_active, you can't trigger uprobe
> > from uprobe program. And especially for sleepable programs it makes no
> > sense to use per-CPU protection (we have bpf_run_ctx for such
> > protections, if needed).
>
> you're forgetting about tracepoints and perf_events.
> 'perf record' will capture everything.
> Whereas bpf powered 'perf record' will not see bpf progs
> attached to [ku]probe, tp, events.

I agree that using *the same bundled together* bpf_prog_active for
kprobes, uprobes, tracepoints and perf_events doesn't make sense. Can
we think about a more nuanced approach here? E.g., perf_events seem
unlikely to run into reentrancy and they have a quite different "mode
of operation" from kprobes, so per-program protection makes more
sense to me there. Similarly for uprobes, as I mentioned.

But for kprobes it's very common to use pairs of kprobe and kretprobe
together. And the sequence I mentioned above is already very
confusing, and if you think about two independent applications that
attach pairs of kprobe+kretprobe independently to the same subset of
functions, their interaction will be just plain weird and bug-like.
Specifically for kprobes, pretending that a kprobe BPF program doesn't
call any internals of the kernel and doesn't trigger any nested
kprobes seems like a sane thing to me. Surely from the kernel's POV
just doing a per-program (and per-CPU!) check is simple and "elegant"
in terms of implementation, but it's just shifting the burden to all
kprobe users. I'm not sure that's the right trade-off in this case.

I'm less clear about whether tracepoints should share protection with
kprobes, tbh. On one hand they have the same reentrancy problems
(though much less so), but they are also not used in coupled pairs
the way kprobe+kretprobe typically is, so per-program protection for
TP might be ok.

>
> > > Now random kprobes on that cpu will be lost.
> >
> > It's not random. The rule is you can't kernel functions and
> > tracepoints triggered from BPF kprobes/tracepoints. This prevents
> > nasty reentrance problems and makes sense.
>
> From the user point of view it makes no sense whatsoever.
> bperf is losing info.
> Try explaining that to users.
>

See above, I agree that perf_event should not be ignored if it happens
to NMI-interrupt some kprobe BPF program, but I think it's a bit of a
different problem (just like uprobe).

> > Isn't kernel tracing infra
> > is protecting itself similarly, preventing reentrancy and recursion?
>
> Yes and that's what it should be doing.
> It's not the job of the kernel to implement the policy.
> "run one bpf prog per cpu at a time" is a policy
> and very much a broken one.

Plain "one bpf prog per cpu" is bad policy, yes, but all-or-nothing
policy for kprobes/kretprobes attached to the same kernel function
seems to make a lot of sense to me

>
> > > It's awful. We have to fix it.
> >
> > You can call it "a fix" if you'd like, but it's changing a very
> > user-visible behavior and guarantees on which users relied for a
> > while. So even if we switch to per-prog protection it will have to be
> > some sort of opt-in (flag, new program type, whatever).
>
> No opt-in allowed for fixes and it's a bug fix.
> No one should rely on broken kernel behavior.
> If retsnoop rely on that it's really retsnoop's fault.

No point in arguing if we can't even agree on whether this is a bug or
not, sorry.

Getting a kretprobe invocation out of the blue without getting the
corresponding kprobe invocation first (both of which were successfully
attached) seems like more of a bug to me. But perhaps that's a matter
of subjective opinion.
Alexei Starovoitov June 11, 2022, 8:53 p.m. UTC | #8
On Fri, Jun 10, 2022 at 10:58:50AM -0700, Andrii Nakryiko wrote:
> On Thu, Jun 9, 2022 at 3:03 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Jun 9, 2022 at 11:27 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Tue, Jun 7, 2022 at 9:29 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> > > > <andrii.nakryiko@gmail.com> wrote:
> > > > >
> > > > > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > > > > >
> > > > > > hi,
> > > > > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > > > > for programs attached with kprobe multi [1].
> > > > > >
> > > > > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > > > > ok for some places like hash map update, but I'm not sure about other
> > > > > > places, hence this is RFC post.
> > > > > >
> > > > > > I'm not sure how are kprobes different to trampolines in this regard,
> > > > > > because trampolines use prog->active and it's not a problem.
> > > > > >
> > > > > > thoughts?
> > > > > >
> > > > >
> > > > > Let's say we have two kernel functions A and B? B can be called from
> > > > > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > > > > programs kprobeX and kretprobeX, both are attached to A and B. With
> > > > > using prog->active instead of per-cpu bpf_prog_active, what would be
> > > > > the behavior when A is called somewhere in the kernel.
> > > > >
> > > > > 1. A is called
> > > > > 2. kprobeX is activated for A, calls some helper which eventually calls B
> > > > >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> > > > >   4. B runs
> > > > >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> > > > >     6. kprobeX is ignored (prog->active > 0)
> > > > >     7. B runs
> > > > >     8. kretprobeX is ignored (prog->active > 0)
> > > > > 9. kretprobeX is activated for A, calls helper which calls B
> > > > >   10. kprobeX is activated for B
> > > > >     11. kprobeX is ignored (prog->active > 0)
> > > >
> > > > not correct. kprobeX actually runs.
> > > > but the end result is correct.
> > > >
> > >
> > > Right, it was a long sequence, but you got the idea :)

The above analysis was actually incorrect.
There are three kprobe flavors: int3, opt and ftrace,
while multi-kprobe is based on fprobe.
kretprobe can be traditional or rethook based.
In all of these mechanisms there is at least ftrace_test_recursion_trylock(),
and for kprobes there is the kprobe_running() (per-cpu current_kprobe) filter
that acts like bpf_prog_active.

So this:
1. kprobeX for A
2. kretprobeX for B
3. kretprobeX for A
4. kprobeX for B
doesn't seem possible.
Unless there is a reproducer of the above behavior, there is no point
in using it as a design argument.

> > > > It's awful. We have to fix it.
> > >
> > > You can call it "a fix" if you'd like, but it's changing a very
> > > user-visible behavior and guarantees on which users relied for a
> > > while. So even if we switch to per-prog protection it will have to be
> > > some sort of opt-in (flag, new program type, whatever).
> >
> > No opt-in allowed for fixes and it's a bug fix.
> > No one should rely on broken kernel behavior.
> > If retsnoop rely on that it's really retsnoop's fault.
> 
> No point in arguing if we can't even agree on whether this is a bug or
> not, sorry.
> 
> Getting kretprobe invocation out of the blue without getting
> corresponding kprobe invocation first (both of which were successfully
> attached) seems like more of a bug to me. But perhaps that's a matter
> of subjective opinion.

The issue of kprobe/kretprobe mismatch has been known for a long time.
First, maxactive was an issue. It should be solved by rethook now.
Then, kprobe/kretprobe attach is not atomic.
A bpf prog attaching a kprobe and a kretprobe to the same func cannot assume
that they will always pair. bcc scripts had to deal with this.

Say kprobe/kretprobe become fentry/fexit-like with prog->active only.
If retsnoop wants to do its own per-cpu prog_active counter it will
prevent out-of-order fentry/fexit for the case when the same prog
is attached to before-bpf-func and during-bpf-func. Only retsnoop's progs
will miss during-bpf-func events. Such a policy decision is localized to one tool.
All other users will see the events they care about.
kprobe/kretprobe/fprobe run handlers with preemption disabled, which makes
these mechanisms unfriendly to RT. Their design shows that they're not suitable
for always-on running. When bpf+kprobe was introduced 7 years ago it wasn't
meant to be 24-7 either. bpf_prog_active is modeled after current_kprobe.
It was addressing the deadlock issue with spinlocks in maps.
Recursion was not an issue.
Sadly, kprobe/kretprobe/fprobe look unfixable in this form. Too much work
needs to be done to enable something like:
user A attaches prog A to func X. X runs, prog A runs with migration disabled.
Preemption. Something else starts on this cpu. Another user B attaching prog B
to func Y should see its prog being executed.
With kprobes it looks impossible, while fentry was designed with this use case
in mind. Note it's not about sleepable progs. Normal bpf progs can be preempted.

Back to Jiri's question whether we can remove bpf_prog_active from
trace_call_bpf.  Yes. We can and we should. It will allow bperf to collect
stack traces that include bpf progs. It's an important fix. Incorrect retsnoop
assumptions about kprobes will not be affected.
Jiri Olsa June 13, 2022, 12:36 p.m. UTC | #9
On Sat, Jun 11, 2022 at 01:53:26PM -0700, Alexei Starovoitov wrote:
> On Fri, Jun 10, 2022 at 10:58:50AM -0700, Andrii Nakryiko wrote:
> > On Thu, Jun 9, 2022 at 3:03 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Jun 9, 2022 at 11:27 AM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Tue, Jun 7, 2022 at 9:29 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > >
> > > > > > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > > > > > >
> > > > > > > hi,
> > > > > > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > > > > > for programs attached with kprobe multi [1].
> > > > > > >
> > > > > > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > > > > > ok for some places like hash map update, but I'm not sure about other
> > > > > > > places, hence this is RFC post.
> > > > > > >
> > > > > > > I'm not sure how are kprobes different to trampolines in this regard,
> > > > > > > because trampolines use prog->active and it's not a problem.
> > > > > > >
> > > > > > > thoughts?
> > > > > > >
> > > > > >
> > > > > > Let's say we have two kernel functions A and B? B can be called from
> > > > > > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > > > > > programs kprobeX and kretprobeX, both are attached to A and B. With
> > > > > > using prog->active instead of per-cpu bpf_prog_active, what would be
> > > > > > the behavior when A is called somewhere in the kernel.
> > > > > >
> > > > > > 1. A is called
> > > > > > 2. kprobeX is activated for A, calls some helper which eventually calls B
> > > > > >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> > > > > >   4. B runs
> > > > > >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> > > > > >     6. kprobeX is ignored (prog->active > 0)
> > > > > >     7. B runs
> > > > > >     8. kretprobeX is ignored (prog->active > 0)
> > > > > > 9. kretprobeX is activated for A, calls helper which calls B
> > > > > >   10. kprobeX is activated for B
> > > > > >     11. kprobeX is ignored (prog->active > 0)
> > > > >
> > > > > not correct. kprobeX actually runs.
> > > > > but the end result is correct.
> > > > >
> > > >
> > > > Right, it was a long sequence, but you got the idea :)
> 
> The above analysis was actually incorrect.
> There are three kprobe flavors: int3, opt, ftrace.
> while multi-kprobe is based on fprobe.
> kretprobe can be traditional and rethook based.
> In all of these mechanisms there is at least ftrace_test_recursion_trylock()
> and for kprobes there is kprobe_running (per-cpu current_kprobe) filter
> that acts as bpf_prog_active.
> 
> So this:
> 1. kprobeX for A
> 2. kretprobeX for B
> 3. kretprobeX for A
> 4. kprobeX for B
> doesn't seem possible.
> Unless there is reproducer of above behavior there is no point using above
> as a design argument.

yes, I just experimentally verified that ;-) I have a selftest with a new test
helper doing Andrii's scenario (with kprobes on ftrace), and the kprobe_running
check will take care of the entry side:

        if (kprobe_running()) {
                kprobes_inc_nmissed_count(p);

and as a result the kretprobe won't be installed either
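
(for context, that check sits in the kprobe-on-ftrace handler; roughly,
paraphrasing arch/x86/kernel/kprobes/ftrace.c, simplified and not verbatim,
exact code varies by kernel version:

	void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
				   struct ftrace_ops *ops, struct ftrace_regs *fregs)
	{
		struct kprobe *p;
		int bit;

		bit = ftrace_test_recursion_trylock(ip, parent_ip);
		if (bit < 0)
			return;

		p = get_kprobe((kprobe_opcode_t *)ip);
		if (unlikely(!p) || kprobe_disabled(p))
			goto out;

		if (kprobe_running()) {
			/* nested kprobe on this cpu: count as missed and skip,
			 * so the kretprobe/rethook is never armed either
			 */
			kprobes_inc_nmissed_count(p);
		} else {
			__this_cpu_write(current_kprobe, p);
			/* pre handler runs the bpf prog; rethook/post handling
			 * and regs fixups are elided in this sketch
			 */
			if (p->pre_handler)
				p->pre_handler(p, ftrace_get_regs(fregs));
			__this_cpu_write(current_kprobe, NULL);
		}
	out:
		ftrace_test_recursion_unlock(bit);
	}
)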

> 
> > > > > It's awful. We have to fix it.
> > > >
> > > > You can call it "a fix" if you'd like, but it's changing a very
> > > > user-visible behavior and guarantees on which users relied for a
> > > > while. So even if we switch to per-prog protection it will have to be
> > > > some sort of opt-in (flag, new program type, whatever).
> > >
> > > No opt-in allowed for fixes and it's a bug fix.
> > > No one should rely on broken kernel behavior.
> > > If retsnoop rely on that it's really retsnoop's fault.
> > 
> > No point in arguing if we can't even agree on whether this is a bug or
> > not, sorry.
> > 
> > Getting kretprobe invocation out of the blue without getting
> > corresponding kprobe invocation first (both of which were successfully
> > attached) seems like more of a bug to me. But perhaps that's a matter
> > of subjective opinion.
> 
> The issue of kprobe/kretprobe mismatch was known for long time.
> First maxactive was an issue. It should be solved by rethook now.
> Then kprobe/kretprobe attach is not atomic.
> bpf prog attaching kprobe and kretprobe to the same func cannot assume
> that they will always pair. bcc scripts had to deal with this.
> 
> Say, kprobe/kretprobe will become fentry/fexit like with prog->active only.
> If retsnoop wants to do its own per-cpu prog_active counter it will
> prevent out-of-order fentry/fexit for the case when the same prog
> is attached to before-bpf-func and during-bpf-func. Only retsnoop's progs
> will miss during-bpf-func events. Such policy decisions is localized to one tool.
> All other users will see the events they care about.
> kprobe/kretprobe/fprobe run handlers with preemption disabled which makes
> these mechanisms unfriendly to RT. Their design shows that they're not suitable
> for always-on running. When bpf+kprobe was introduced 7 years ago it wasn't
> meant to be 24-7 either. bpf_prog_active is modeled like current_kprobe.
> It was addressing the deadlock issue with spinlocks in maps.
> Recursion was not an issue.
> Sadly kprobe/kretprobe/fprobe look unfixable in this form. Too much work
> needs to be done to enable something like:
> user A attaches prog A to func X. X runs, prog A runs with migration disabled.
> Preemption. Something else starts on this cpu. Another user B attaching prog B
> to func Y should see its prog being executed.
> With kprobes it looks impossible. While fentry was designed with this use case
> in mind. Note it's not about sleepable progs. Normal bpf progs can be preempted.
> 
> Back to Jiri's question whether we can remove bpf_prog_active from
> trace_call_bpf.  Yes. We can and we should. It will allow bperf to collect
> stack traces that include bpf progs. It's an important fix. Incorrect retsnoop
> assumptions about kprobes will not be affected.

which bperf tool are you talking about (I found 2)?

and given that the kprobe layer is effectively doing the bpf_prog_active check,
what's the benefit of the change then?

thanks,
jirka
Andrii Nakryiko June 13, 2022, 4:32 p.m. UTC | #10
On Mon, Jun 13, 2022 at 5:36 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Sat, Jun 11, 2022 at 01:53:26PM -0700, Alexei Starovoitov wrote:
> > On Fri, Jun 10, 2022 at 10:58:50AM -0700, Andrii Nakryiko wrote:
> > > On Thu, Jun 9, 2022 at 3:03 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 9, 2022 at 11:27 AM Andrii Nakryiko
> > > > <andrii.nakryiko@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jun 7, 2022 at 9:29 PM Alexei Starovoitov
> > > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> > > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > > >
> > > > > > > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > > > > > > >
> > > > > > > > hi,
> > > > > > > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > > > > > > for programs attached with kprobe multi [1].
> > > > > > > >
> > > > > > > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > > > > > > ok for some places like hash map update, but I'm not sure about other
> > > > > > > > places, hence this is RFC post.
> > > > > > > >
> > > > > > > > I'm not sure how are kprobes different to trampolines in this regard,
> > > > > > > > because trampolines use prog->active and it's not a problem.
> > > > > > > >
> > > > > > > > thoughts?
> > > > > > > >
> > > > > > >
> > > > > > > Let's say we have two kernel functions A and B? B can be called from
> > > > > > > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > > > > > > programs kprobeX and kretprobeX, both are attached to A and B. With
> > > > > > > using prog->active instead of per-cpu bpf_prog_active, what would be
> > > > > > > the behavior when A is called somewhere in the kernel.
> > > > > > >
> > > > > > > 1. A is called
> > > > > > > 2. kprobeX is activated for A, calls some helper which eventually calls B
> > > > > > >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> > > > > > >   4. B runs
> > > > > > >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> > > > > > >     6. kprobeX is ignored (prog->active > 0)
> > > > > > >     7. B runs
> > > > > > >     8. kretprobeX is ignored (prog->active > 0)
> > > > > > > 9. kretprobeX is activated for A, calls helper which calls B
> > > > > > >   10. kprobeX is activated for B
> > > > > > >     11. kprobeX is ignored (prog->active > 0)
> > > > > >
> > > > > > not correct. kprobeX actually runs.
> > > > > > but the end result is correct.
> > > > > >
> > > > >
> > > > > Right, it was a long sequence, but you got the idea :)
> >
> > The above analysis was actually incorrect.
> > There are three kprobe flavors: int3, opt, ftrace.
> > while multi-kprobe is based on fprobe.
> > kretprobe can be traditional and rethook based.
> > In all of these mechanisms there is at least ftrace_test_recursion_trylock()
> > and for kprobes there is kprobe_running (per-cpu current_kprobe) filter
> > that acts as bpf_prog_active.
> >
> > So this:
> > 1. kprobeX for A
> > 2. kretprobeX for B
> > 3. kretprobeX for A
> > 4. kprobeX for B
> > doesn't seem possible.
> > Unless there is reproducer of above behavior there is no point using above
> > as a design argument.
>
> yes, I just experimentally verified ;-) I have a selftest with new test
> helper doing Andrii's scenario (with kprobes on ftrace) and kprobe_running
> check will take care of the entry side:
>
>         if (kprobe_running()) {
>                 kprobes_inc_nmissed_count(p);
>
> and as a results kretprobe won't be installed as well

Great, then I rest my case, this is mostly what I've been worrying
about. Thanks, Jiri, for confirming!

>
> >
> > > > > > It's awful. We have to fix it.
> > > > >
> > > > > You can call it "a fix" if you'd like, but it's changing a very
> > > > > user-visible behavior and guarantees on which users relied for a
> > > > > while. So even if we switch to per-prog protection it will have to be
> > > > > some sort of opt-in (flag, new program type, whatever).
> > > >
> > > > No opt-in allowed for fixes and it's a bug fix.
> > > > No one should rely on broken kernel behavior.
> > > > If retsnoop rely on that it's really retsnoop's fault.
> > >
> > > No point in arguing if we can't even agree on whether this is a bug or
> > > not, sorry.
> > >
> > > Getting kretprobe invocation out of the blue without getting
> > > corresponding kprobe invocation first (both of which were successfully
> > > attached) seems like more of a bug to me. But perhaps that's a matter
> > > of subjective opinion.
> >
> > The issue of kprobe/kretprobe mismatch was known for long time.
> > First maxactive was an issue. It should be solved by rethook now.
> > Then kprobe/kretprobe attach is not atomic.
> > bpf prog attaching kprobe and kretprobe to the same func cannot assume
> > that they will always pair. bcc scripts had to deal with this.
> >
> > Say, kprobe/kretprobe will become fentry/fexit like with prog->active only.
> > If retsnoop wants to do its own per-cpu prog_active counter it will
> > prevent out-of-order fentry/fexit for the case when the same prog
> > is attached to before-bpf-func and during-bpf-func. Only retsnoop's progs
> > will miss during-bpf-func events. Such policy decisions is localized to one tool.
> > All other users will see the events they care about.
> > kprobe/kretprobe/fprobe run handlers with preemption disabled which makes
> > these mechanisms unfriendly to RT. Their design shows that they're not suitable
> > for always-on running. When bpf+kprobe was introduced 7 years ago it wasn't
> > meant to be 24-7 either. bpf_prog_active is modeled like current_kprobe.
> > It was addressing the deadlock issue with spinlocks in maps.
> > Recursion was not an issue.
> > Sadly kprobe/kretprobe/fprobe look unfixable in this form. Too much work
> > needs to be done to enable something like:
> > user A attaches prog A to func X. X runs, prog A runs with migration disabled.
> > Preemption. Something else starts on this cpu. Another user B attaching prog B
> > to func Y should see its prog being executed.
> > With kprobes it looks impossible. While fentry was designed with this use case
> > in mind. Note it's not about sleepable progs. Normal bpf progs can be preempted.
> >
> > Back to Jiri's question whether we can remove bpf_prog_active from
> > trace_call_bpf.  Yes. We can and we should. It will allow bperf to collect
> > stack traces that include bpf progs. It's an important fix. Incorrect retsnoop
> > assumptions about kprobes will not be affected.
>
> which bperf tool are you talking about (I found 2)?
>
> and given that the kprobe layer is effectively doing the bpf_prog_active check,
> what's the benefit of the change then?
>
> thanks,
> jirka
Andrii Nakryiko June 13, 2022, 4:38 p.m. UTC | #11
On Sat, Jun 11, 2022 at 1:53 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Jun 10, 2022 at 10:58:50AM -0700, Andrii Nakryiko wrote:
> > On Thu, Jun 9, 2022 at 3:03 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Jun 9, 2022 at 11:27 AM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Tue, Jun 7, 2022 at 9:29 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Tue, May 31, 2022 at 4:24 PM Andrii Nakryiko
> > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > >
> > > > > > On Wed, May 25, 2022 at 4:40 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > > > > > >
> > > > > > > hi,
> > > > > > > Alexei suggested to use prog->active instead global bpf_prog_active
> > > > > > > for programs attached with kprobe multi [1].
> > > > > > >
> > > > > > > AFAICS this will bypass bpf_disable_instrumentation, which seems to be
> > > > > > > ok for some places like hash map update, but I'm not sure about other
> > > > > > > places, hence this is RFC post.
> > > > > > >
> > > > > > > I'm not sure how are kprobes different to trampolines in this regard,
> > > > > > > because trampolines use prog->active and it's not a problem.
> > > > > > >
> > > > > > > thoughts?
> > > > > > >
> > > > > >
> > > > > > Let's say we have two kernel functions A and B? B can be called from
> > > > > > BPF program though some BPF helper, ok? Now let's say I have two BPF
> > > > > > programs kprobeX and kretprobeX, both are attached to A and B. With
> > > > > > using prog->active instead of per-cpu bpf_prog_active, what would be
> > > > > > the behavior when A is called somewhere in the kernel.
> > > > > >
> > > > > > 1. A is called
> > > > > > 2. kprobeX is activated for A, calls some helper which eventually calls B
> > > > > >   3. kprobeX is attempted to be called for B, but is skipped due to prog->active
> > > > > >   4. B runs
> > > > > >   5. kretprobeX is activated for B, calls some helper which eventually calls B
> > > > > >     6. kprobeX is ignored (prog->active > 0)
> > > > > >     7. B runs
> > > > > >     8. kretprobeX is ignored (prog->active > 0)
> > > > > > 9. kretprobeX is activated for A, calls helper which calls B
> > > > > >   10. kprobeX is activated for B
> > > > > >     11. kprobeX is ignored (prog->active > 0)
> > > > >
> > > > > not correct. kprobeX actually runs.
> > > > > but the end result is correct.
> > > > >
> > > >
> > > > Right, it was a long sequence, but you got the idea :)
>
> The above analysis was actually incorrect.
> There are three kprobe flavors: int3, opt, ftrace.
> while multi-kprobe is based on fprobe.
> kretprobe can be traditional and rethook based.
> In all of these mechanisms there is at least ftrace_test_recursion_trylock()
> and for kprobes there is kprobe_running (per-cpu current_kprobe) filter
> that acts as bpf_prog_active.
>
> So this:
> 1. kprobeX for A
> 2. kretprobeX for B
> 3. kretprobeX for A
> 4. kprobeX for B
> doesn't seem possible.
> Unless there is reproducer of above behavior there is no point using above
> as a design argument.
>

Cool, it seems like Jiri confirmed this can't in fact happen for kprobes,
so I'm good; this behavior was the one I was worried about, not the
global vs per-prog active flag per se.

> > > > > It's awful. We have to fix it.
> > > >
> > > > You can call it "a fix" if you'd like, but it's changing a very
> > > > user-visible behavior and guarantees on which users relied for a
> > > > while. So even if we switch to per-prog protection it will have to be
> > > > some sort of opt-in (flag, new program type, whatever).
> > >
> > > No opt-in allowed for fixes and it's a bug fix.
> > > No one should rely on broken kernel behavior.
> > > If retsnoop rely on that it's really retsnoop's fault.
> >
> > No point in arguing if we can't even agree on whether this is a bug or
> > not, sorry.
> >
> > Getting kretprobe invocation out of the blue without getting
> > corresponding kprobe invocation first (both of which were successfully
> > attached) seems like more of a bug to me. But perhaps that's a matter
> > of subjective opinion.
>
> The issue of kprobe/kretprobe mismatch was known for long time.
> First maxactive was an issue. It should be solved by rethook now.
> Then kprobe/kretprobe attach is not atomic.
> bpf prog attaching kprobe and kretprobe to the same func cannot assume
> that they will always pair. bcc scripts had to deal with this.
>
> Say, kprobe/kretprobe will become fentry/fexit like with prog->active only.
> If retsnoop wants to do its own per-cpu prog_active counter it will
> prevent out-of-order fentry/fexit for the case when the same prog
> is attached to before-bpf-func and during-bpf-func. Only retsnoop's progs
> will miss during-bpf-func events. Such policy decisions is localized to one tool.
> All other users will see the events they care about.
> kprobe/kretprobe/fprobe run handlers with preemption disabled which makes
> these mechanisms unfriendly to RT. Their design shows that they're not suitable
> for always-on running. When bpf+kprobe was introduced 7 years ago it wasn't
> meant to be 24-7 either. bpf_prog_active is modeled like current_kprobe.
> It was addressing the deadlock issue with spinlocks in maps.
> Recursion was not an issue.
> Sadly kprobe/kretprobe/fprobe look unfixable in this form. Too much work
> needs to be done to enable something like:
> user A attaches prog A to func X. X runs, prog A runs with migration disabled.
> Preemption. Something else starts on this cpu. Another user B attaching prog B
> to func Y should see its prog being executed.
> With kprobes it looks impossible. While fentry was designed with this use case
> in mind. Note it's not about sleepable progs. Normal bpf progs can be preempted.
>
> Back to Jiri's question whether we can remove bpf_prog_active from
> trace_call_bpf.  Yes. We can and we should. It will allow bperf to collect
> stack traces that include bpf progs. It's an important fix. Incorrect retsnoop
> assumptions about kprobes will not be affected.

Patch

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 10b157a6d73e..7aec39ae0a1c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2385,8 +2385,8 @@  static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
 }
 
 static int
-kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
-			   unsigned long entry_ip, struct pt_regs *regs)
+__kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+			     unsigned long entry_ip, struct pt_regs *regs)
 {
 	struct bpf_kprobe_multi_run_ctx run_ctx = {
 		.link = link,
@@ -2395,21 +2395,28 @@  kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 	struct bpf_run_ctx *old_run_ctx;
 	int err;
 
-	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
-		err = 0;
-		goto out;
-	}
-
-	migrate_disable();
-	rcu_read_lock();
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	err = bpf_prog_run(link->link.prog, regs);
 	bpf_reset_run_ctx(old_run_ctx);
+	return err;
+}
+
+static int
+kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+			   unsigned long entry_ip, struct pt_regs *regs)
+{
+	struct bpf_prog *prog = link->link.prog;
+	int err = 0;
+
+	migrate_disable();
+	rcu_read_lock();
+
+	if (likely(__this_cpu_inc_return(*(prog->active)) == 1))
+		err = __kprobe_multi_link_prog_run(link, entry_ip, regs);
+
+	__this_cpu_dec(*(prog->active));
 	rcu_read_unlock();
 	migrate_enable();
-
- out:
-	__this_cpu_dec(bpf_prog_active);
 	return err;
 }