diff mbox series

[PATCHv2,bpf-next,1/9] uprobe: Add support for session consumer

Message ID 20240701164115.723677-2-jolsa@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series uprobe, bpf: Add session support | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1540 this patch: 1540
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 15 maintainers not CCed: yonghong.song@linux.dev mark.rutland@arm.com acme@kernel.org linux-perf-users@vger.kernel.org mathieu.desnoyers@efficios.com song@kernel.org kan.liang@linux.intel.com irogers@google.com namhyung@kernel.org martin.lau@linux.dev alexander.shishkin@linux.intel.com eddyz87@gmail.com kpsingh@kernel.org adrian.hunter@intel.com mingo@redhat.com
netdev/build_clang success Errors and warnings before: 2034 this patch: 2034
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 16293 this patch: 16293
netdev/checkpatch warning WARNING: line length of 100 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0

Commit Message

Jiri Olsa July 1, 2024, 4:41 p.m. UTC
Adding support for uprobe consumer to be defined as session and have
new behaviour for consumer's 'handler' and 'ret_handler' callbacks.

The session means that 'handler' and 'ret_handler' callbacks are
connected in a way that allows to:

  - control execution of 'ret_handler' from 'handler' callback
  - share data between 'handler' and 'ret_handler' callbacks

The session is enabled by setting new 'session' bool field to true
in uprobe_consumer object.

We keep count of session consumers for uprobe and allocate session_consumer
object for each in return_instance object. This allows us to store
return values of 'handler' callbacks and data pointers of shared
data between both handlers.

The session concept fits to our common use case where we do filtering
on entry uprobe and based on the result we decide to run the return
uprobe (or not).

It's also convenient to share the data between session callbacks.

The control of 'ret_handler' callback execution is done via return
value of the 'handler' callback. If it's 0 we install and execute
return uprobe, if it's 1 we do not.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 include/linux/uprobes.h     |  16 ++++-
 kernel/events/uprobes.c     | 129 +++++++++++++++++++++++++++++++++---
 kernel/trace/bpf_trace.c    |   6 +-
 kernel/trace/trace_uprobe.c |  12 ++--
 4 files changed, 144 insertions(+), 19 deletions(-)

Comments

Peter Zijlstra July 2, 2024, 1:04 p.m. UTC | #1
On Mon, Jul 01, 2024 at 06:41:07PM +0200, Jiri Olsa wrote:

> +static void
> +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> +{
> +	static unsigned int session_id;
> +
> +	if (uc->session) {
> +		uprobe->sessions_cnt++;
> +		uc->session_id = ++session_id ?: ++session_id;
> +	}
> +}

The way I understand this code, you create a consumer every time you do
uprobe_register() and unregister makes it go away.

Now, register one, then 4g-1 times register+unregister, then register
again.

The above seems to then result in two consumers with the same
session_id, which leads to trouble.

Hmm?
Jiri Olsa July 2, 2024, 4:10 p.m. UTC | #2
On Tue, Jul 02, 2024 at 03:04:08PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 01, 2024 at 06:41:07PM +0200, Jiri Olsa wrote:
> 
> > +static void
> > +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > +{
> > +	static unsigned int session_id;
> > +
> > +	if (uc->session) {
> > +		uprobe->sessions_cnt++;
> > +		uc->session_id = ++session_id ?: ++session_id;
> > +	}
> > +}
> 
> The way I understand this code, you create a consumer every time you do
> uprobe_register() and unregister makes it go away.
> 
> Now, register one, then 4g-1 times register+unregister, then register
> again.
> 
> The above seems to then result in two consumers with the same
> session_id, which leads to trouble.
> 
> Hmm?

ugh true.. will make it u64 :)

I think we could store uprobe_consumer pointer+ref in session_consumer,
and that would make the unregister path more interesting.. will check

thanks,
jirka
Andrii Nakryiko July 2, 2024, 8:51 p.m. UTC | #3
On Mon, Jul 1, 2024 at 9:41 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Adding support for uprobe consumer to be defined as session and have
> new behaviour for consumer's 'handler' and 'ret_handler' callbacks.
>
> The session means that 'handler' and 'ret_handler' callbacks are
> connected in a way that allows to:
>
>   - control execution of 'ret_handler' from 'handler' callback
>   - share data between 'handler' and 'ret_handler' callbacks
>
> The session is enabled by setting new 'session' bool field to true
> in uprobe_consumer object.
>
> We keep count of session consumers for uprobe and allocate session_consumer
> object for each in return_instance object. This allows us to store
> return values of 'handler' callbacks and data pointers of shared
> data between both handlers.
>
> The session concept fits to our common use case where we do filtering
> on entry uprobe and based on the result we decide to run the return
> uprobe (or not).
>
> It's also convenient to share the data between session callbacks.
>
> The control of 'ret_handler' callback execution is done via return
> value of the 'handler' callback. If it's 0 we install and execute
> return uprobe, if it's 1 we do not.
>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  include/linux/uprobes.h     |  16 ++++-
>  kernel/events/uprobes.c     | 129 +++++++++++++++++++++++++++++++++---
>  kernel/trace/bpf_trace.c    |   6 +-
>  kernel/trace/trace_uprobe.c |  12 ++--
>  4 files changed, 144 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..903a860a8d01 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -34,15 +34,18 @@ enum uprobe_filter_ctx {
>  };
>
>  struct uprobe_consumer {
> -       int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> +       int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
>         int (*ret_handler)(struct uprobe_consumer *self,
>                                 unsigned long func,
> -                               struct pt_regs *regs);
> +                               struct pt_regs *regs, __u64 *data);
>         bool (*filter)(struct uprobe_consumer *self,
>                                 enum uprobe_filter_ctx ctx,
>                                 struct mm_struct *mm);
>
>         struct uprobe_consumer *next;
> +
> +       bool                    session;        /* marks uprobe session consumer */
> +       unsigned int            session_id;     /* set when uprobe_consumer is registered */
>  };
>
>  #ifdef CONFIG_UPROBES
> @@ -80,6 +83,12 @@ struct uprobe_task {
>         unsigned int                    depth;
>  };
>
> +struct session_consumer {
> +       __u64           cookie;
> +       unsigned int    id;
> +       int             rc;

you'll be using u64 for ID, right? so this struct will be 24 bytes.
Maybe we can just use topmost bit of ID to store whether uretprobe
should run or not? It's trivial to mask out during ID comparisons

> +};
> +
>  struct return_instance {
>         struct uprobe           *uprobe;
>         unsigned long           func;
> @@ -88,6 +97,9 @@ struct return_instance {
>         bool                    chained;        /* true, if instance is nested */
>
>         struct return_instance  *next;          /* keep as stack */
> +
> +       int                     sessions_cnt;

there is 7 byte gap before next field, let's put sessions_cnt there

> +       struct session_consumer sessions[];
>  };
>
>  enum rp_check {
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..4da410460f2a 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -63,6 +63,8 @@ struct uprobe {
>         loff_t                  ref_ctr_offset;
>         unsigned long           flags;
>
> +       unsigned int            sessions_cnt;
> +
>         /*
>          * The generic code assumes that it has two members of unknown type
>          * owned by the arch-specific code:
> @@ -750,11 +752,30 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
>         return uprobe;
>  }
>
> +static void
> +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> +{
> +       static unsigned int session_id;

(besides what Peter mentioned about wrap around of 32-bit counter)
let's use atomic here to not rely on any particular locking
(unnecessarily), this might make my life easier in the future, thanks.
This is registration time, low frequency, extra atomic won't hurt.

It might be already broken, actually, for two independently registering uprobes.

> +
> +       if (uc->session) {
> +               uprobe->sessions_cnt++;
> +               uc->session_id = ++session_id ?: ++session_id;
> +       }
> +}
> +
> +static void
> +uprobe_consumer_unaccount(struct uprobe *uprobe, struct uprobe_consumer *uc)

this fits in 100 characters, keep it single line, please. Same for
account function

> +{
> +       if (uc->session)
> +               uprobe->sessions_cnt--;
> +}
> +
>  static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  {
>         down_write(&uprobe->consumer_rwsem);
>         uc->next = uprobe->consumers;
>         uprobe->consumers = uc;
> +       uprobe_consumer_account(uprobe, uc);
>         up_write(&uprobe->consumer_rwsem);
>  }
>
> @@ -773,6 +794,7 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
>                 if (*con == uc) {
>                         *con = uc->next;
>                         ret = true;
> +                       uprobe_consumer_unaccount(uprobe, uc);
>                         break;
>                 }
>         }
> @@ -1744,6 +1766,23 @@ static struct uprobe_task *get_utask(void)
>         return current->utask;
>  }
>
> +static size_t ri_size(int sessions_cnt)
> +{
> +       struct return_instance *ri __maybe_unused;
> +
> +       return sizeof(*ri) + sessions_cnt * sizeof(ri->sessions[0]);

just use struct_size()?

> +}
> +
> +static struct return_instance *alloc_return_instance(int sessions_cnt)
> +{
> +       struct return_instance *ri;
> +
> +       ri = kzalloc(ri_size(sessions_cnt), GFP_KERNEL);
> +       if (ri)
> +               ri->sessions_cnt = sessions_cnt;
> +       return ri;
> +}
> +
>  static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
>  {
>         struct uprobe_task *n_utask;
> @@ -1756,11 +1795,11 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
>
>         p = &n_utask->return_instances;
>         for (o = o_utask->return_instances; o; o = o->next) {
> -               n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
> +               n = alloc_return_instance(o->sessions_cnt);
>                 if (!n)
>                         return -ENOMEM;
>
> -               *n = *o;
> +               memcpy(n, o, ri_size(o->sessions_cnt));
>                 get_uprobe(n->uprobe);
>                 n->next = NULL;
>
> @@ -1853,9 +1892,9 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
>         utask->return_instances = ri;
>  }
>
> -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
> +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
> +                             struct return_instance *ri)
>  {
> -       struct return_instance *ri;
>         struct uprobe_task *utask;
>         unsigned long orig_ret_vaddr, trampoline_vaddr;
>         bool chained;
> @@ -1874,9 +1913,11 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
>                 return;
>         }
>
> -       ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
> -       if (!ri)
> -               return;
> +       if (!ri) {
> +               ri = alloc_return_instance(0);
> +               if (!ri)
> +                       return;
> +       }
>
>         trampoline_vaddr = get_trampoline_vaddr();
>         orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
> @@ -2065,35 +2106,85 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
>         return uprobe;
>  }
>
> +static struct session_consumer *
> +session_consumer_next(struct return_instance *ri, struct session_consumer *sc,
> +                     int session_id)
> +{
> +       struct session_consumer *next;
> +
> +       next = sc ? sc + 1 : &ri->sessions[0];
> +       next->id = session_id;

it's kind of unexpected that "session_consumer_next" would actually
set an ID... Maybe drop int session_id as input argument and fill it
out outside of this function, this function being just a simple
iterator?

> +       return next;
> +}
> +
> +static struct session_consumer *
> +session_consumer_find(struct return_instance *ri, int *iter, int session_id)
> +{
> +       struct session_consumer *sc;
> +       int idx = *iter;
> +
> +       for (sc = &ri->sessions[idx]; idx < ri->sessions_cnt; idx++, sc++) {
> +               if (sc->id == session_id) {
> +                       *iter = idx + 1;
> +                       return sc;
> +               }
> +       }
> +       return NULL;
> +}
> +
>  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
>  {
>         struct uprobe_consumer *uc;
>         int remove = UPROBE_HANDLER_REMOVE;
> +       struct session_consumer *sc = NULL;
> +       struct return_instance *ri = NULL;
>         bool need_prep = false; /* prepare return uprobe, when needed */
>
>         down_read(&uprobe->register_rwsem);
> +       if (uprobe->sessions_cnt) {
> +               ri = alloc_return_instance(uprobe->sessions_cnt);
> +               if (!ri)
> +                       goto out;
> +       }
> +
>         for (uc = uprobe->consumers; uc; uc = uc->next) {
> +               __u64 *cookie = NULL;
>                 int rc = 0;
>
> +               if (uc->session) {
> +                       sc = session_consumer_next(ri, sc, uc->session_id);
> +                       cookie = &sc->cookie;
> +               }
> +
>                 if (uc->handler) {
> -                       rc = uc->handler(uc, regs);
> +                       rc = uc->handler(uc, regs, cookie);
>                         WARN(rc & ~UPROBE_HANDLER_MASK,
>                                 "bad rc=0x%x from %ps()\n", rc, uc->handler);
>                 }
>
> -               if (uc->ret_handler)
> +               if (uc->session) {
> +                       sc->rc = rc;
> +                       need_prep |= !rc;

nit:

if (rc == 0)
    need_prep = true;

and then it's *extremely obvious* what happens and under which conditions

> +               } else if (uc->ret_handler) {
>                         need_prep = true;
> +               }
>
>                 remove &= rc;
>         }
>
> +       /* no removal if there's at least one session consumer */
> +       remove &= !uprobe->sessions_cnt;

this is counter (not error, not pointer), let's stick to ` == 0`, please

is this

if (uprobe->sessions_cnt != 0)
   remove = 0;

? I can't tell (honestly), without spending ridiculous amounts of
mental resources (for the underlying simplicity of the condition).

> +
>         if (need_prep && !remove)
> -               prepare_uretprobe(uprobe, regs); /* put bp at return */
> +               prepare_uretprobe(uprobe, regs, ri); /* put bp at return */
> +       else
> +               kfree(ri);
>
>         if (remove && uprobe->consumers) {
>                 WARN_ON(!uprobe_is_active(uprobe));
>                 unapply_uprobe(uprobe, current->mm);
>         }
> + out:
>         up_read(&uprobe->register_rwsem);
>  }
>

[...]
Andrii Nakryiko July 2, 2024, 8:52 p.m. UTC | #4
On Tue, Jul 2, 2024 at 9:11 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Tue, Jul 02, 2024 at 03:04:08PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 01, 2024 at 06:41:07PM +0200, Jiri Olsa wrote:
> >
> > > +static void
> > > +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > > +{
> > > +   static unsigned int session_id;
> > > +
> > > +   if (uc->session) {
> > > +           uprobe->sessions_cnt++;
> > > +           uc->session_id = ++session_id ?: ++session_id;
> > > +   }
> > > +}
> >
> > The way I understand this code, you create a consumer every time you do
> > uprobe_register() and unregister makes it go away.
> >
> > Now, register one, then 4g-1 times register+unregister, then register
> > again.
> >
> > The above seems to then result in two consumers with the same
> > session_id, which leads to trouble.
> >
> > Hmm?
>
> ugh true.. will make it u64 :)
>
> I think we could store uprobe_consumer pointer+ref in session_consumer,
> and that would make the unregister path more interesting.. will check

More interesting how? It's actually a great idea, uprobe_consumer
pointer itself is a unique ID and 64-bit. We can still use lowest bit
for RC (see my other reply).

>
> thanks,
> jirka
Masami Hiramatsu (Google) July 2, 2024, 11:55 p.m. UTC | #5
Hi Jiri,

On Mon,  1 Jul 2024 18:41:07 +0200
Jiri Olsa <jolsa@kernel.org> wrote:

> Adding support for uprobe consumer to be defined as session and have
> new behaviour for consumer's 'handler' and 'ret_handler' callbacks.
> 
> The session means that 'handler' and 'ret_handler' callbacks are
> connected in a way that allows to:
> 
>   - control execution of 'ret_handler' from 'handler' callback
>   - share data between 'handler' and 'ret_handler' callbacks
> 
> The session is enabled by setting new 'session' bool field to true
> in uprobe_consumer object.
> 
> We keep count of session consumers for uprobe and allocate session_consumer
> object for each in return_instance object. This allows us to store
> return values of 'handler' callbacks and data pointers of shared
> data between both handlers.
> 
> The session concept fits to our common use case where we do filtering
> on entry uprobe and based on the result we decide to run the return
> uprobe (or not).
> 
> It's also convenient to share the data between session callbacks.
> 
> The control of 'ret_handler' callback execution is done via return
> value of the 'handler' callback. If it's 0 we install and execute
> return uprobe, if it's 1 we do not.
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  include/linux/uprobes.h     |  16 ++++-
>  kernel/events/uprobes.c     | 129 +++++++++++++++++++++++++++++++++---
>  kernel/trace/bpf_trace.c    |   6 +-
>  kernel/trace/trace_uprobe.c |  12 ++--
>  4 files changed, 144 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..903a860a8d01 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -34,15 +34,18 @@ enum uprobe_filter_ctx {
>  };
>  
>  struct uprobe_consumer {
> -	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> +	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
>  	int (*ret_handler)(struct uprobe_consumer *self,
>  				unsigned long func,
> -				struct pt_regs *regs);
> +				struct pt_regs *regs, __u64 *data);
>  	bool (*filter)(struct uprobe_consumer *self,
>  				enum uprobe_filter_ctx ctx,
>  				struct mm_struct *mm);
>  
>  	struct uprobe_consumer *next;
> +
> +	bool			session;	/* marks uprobe session consumer */
> +	unsigned int		session_id;	/* set when uprobe_consumer is registered */

Hmm, why this has both session and session_id?
I also think we can use the address of uprobe_consumer itself as a unique id.

Also, if we can set session enabled by default, and skip ret_handler by handler's
return value, it is more simpler. (If handler returns a specific value, skip ret_handler)

>  };
>  
>  #ifdef CONFIG_UPROBES
> @@ -80,6 +83,12 @@ struct uprobe_task {
>  	unsigned int			depth;
>  };
>  
> +struct session_consumer {
> +	__u64		cookie;

And this cookie looks not scalable. If we can pass a data to handler, I would like to
reuse it to pass the target function parameters to ret_handler as kretprobe/fprobe does.

	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, void *data);

uprobes can collect its uc's required sizes and allocate the memory (shadow stack frame)
at handler_chain().

> +	unsigned int	id;
> +	int		rc;
> +};
> +
>  struct return_instance {
>  	struct uprobe		*uprobe;
>  	unsigned long		func;
> @@ -88,6 +97,9 @@ struct return_instance {
>  	bool			chained;	/* true, if instance is nested */
>  
>  	struct return_instance	*next;		/* keep as stack */
> +
> +	int			sessions_cnt;
> +	struct session_consumer	sessions[];

In that case, we don't have this array, but 

	char data[];

And decode data array, which is a slice of variable length structure;

struct session_consumer {
	struct uprobe_consumer *uc;
	char data[];
};

The size of session_consumer is uc->session_data_size + sizeof(uc).

What would you think?

Thank you,

>  };
>  
>  enum rp_check {
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..4da410460f2a 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -63,6 +63,8 @@ struct uprobe {
>  	loff_t			ref_ctr_offset;
>  	unsigned long		flags;
>  
> +	unsigned int		sessions_cnt;
> +
>  	/*
>  	 * The generic code assumes that it has two members of unknown type
>  	 * owned by the arch-specific code:
> @@ -750,11 +752,30 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
>  	return uprobe;
>  }
>  
> +static void
> +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> +{
> +	static unsigned int session_id;
> +
> +	if (uc->session) {
> +		uprobe->sessions_cnt++;
> +		uc->session_id = ++session_id ?: ++session_id;
> +	}
> +}
> +
> +static void
> +uprobe_consumer_unaccount(struct uprobe *uprobe, struct uprobe_consumer *uc)
> +{
> +	if (uc->session)
> +		uprobe->sessions_cnt--;
> +}
> +
>  static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  {
>  	down_write(&uprobe->consumer_rwsem);
>  	uc->next = uprobe->consumers;
>  	uprobe->consumers = uc;
> +	uprobe_consumer_account(uprobe, uc);
>  	up_write(&uprobe->consumer_rwsem);
>  }
>  
> @@ -773,6 +794,7 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  		if (*con == uc) {
>  			*con = uc->next;
>  			ret = true;
> +			uprobe_consumer_unaccount(uprobe, uc);
>  			break;
>  		}
>  	}
> @@ -1744,6 +1766,23 @@ static struct uprobe_task *get_utask(void)
>  	return current->utask;
>  }
>  
> +static size_t ri_size(int sessions_cnt)
> +{
> +	struct return_instance *ri __maybe_unused;
> +
> +	return sizeof(*ri) + sessions_cnt * sizeof(ri->sessions[0]);
> +}
> +
> +static struct return_instance *alloc_return_instance(int sessions_cnt)
> +{
> +	struct return_instance *ri;
> +
> +	ri = kzalloc(ri_size(sessions_cnt), GFP_KERNEL);
> +	if (ri)
> +		ri->sessions_cnt = sessions_cnt;
> +	return ri;
> +}
> +
>  static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
>  {
>  	struct uprobe_task *n_utask;
> @@ -1756,11 +1795,11 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
>  
>  	p = &n_utask->return_instances;
>  	for (o = o_utask->return_instances; o; o = o->next) {
> -		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
> +		n = alloc_return_instance(o->sessions_cnt);
>  		if (!n)
>  			return -ENOMEM;
>  
> -		*n = *o;
> +		memcpy(n, o, ri_size(o->sessions_cnt));
>  		get_uprobe(n->uprobe);
>  		n->next = NULL;
>  
> @@ -1853,9 +1892,9 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
>  	utask->return_instances = ri;
>  }
>  
> -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
> +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
> +			      struct return_instance *ri)
>  {
> -	struct return_instance *ri;
>  	struct uprobe_task *utask;
>  	unsigned long orig_ret_vaddr, trampoline_vaddr;
>  	bool chained;
> @@ -1874,9 +1913,11 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
>  		return;
>  	}
>  
> -	ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
> -	if (!ri)
> -		return;
> +	if (!ri) {
> +		ri = alloc_return_instance(0);
> +		if (!ri)
> +			return;
> +	}
>  
>  	trampoline_vaddr = get_trampoline_vaddr();
>  	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
> @@ -2065,35 +2106,85 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
>  	return uprobe;
>  }
>  
> +static struct session_consumer *
> +session_consumer_next(struct return_instance *ri, struct session_consumer *sc,
> +		      int session_id)
> +{
> +	struct session_consumer *next;
> +
> +	next = sc ? sc + 1 : &ri->sessions[0];
> +	next->id = session_id;
> +	return next;
> +}
> +
> +static struct session_consumer *
> +session_consumer_find(struct return_instance *ri, int *iter, int session_id)
> +{
> +	struct session_consumer *sc;
> +	int idx = *iter;
> +
> +	for (sc = &ri->sessions[idx]; idx < ri->sessions_cnt; idx++, sc++) {
> +		if (sc->id == session_id) {
> +			*iter = idx + 1;
> +			return sc;
> +		}
> +	}
> +	return NULL;
> +}
> +
>  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
>  {
>  	struct uprobe_consumer *uc;
>  	int remove = UPROBE_HANDLER_REMOVE;
> +	struct session_consumer *sc = NULL;
> +	struct return_instance *ri = NULL;
>  	bool need_prep = false; /* prepare return uprobe, when needed */
>  
>  	down_read(&uprobe->register_rwsem);
> +	if (uprobe->sessions_cnt) {
> +		ri = alloc_return_instance(uprobe->sessions_cnt);
> +		if (!ri)
> +			goto out;
> +	}
> +
>  	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +		__u64 *cookie = NULL;
>  		int rc = 0;
>  
> +		if (uc->session) {
> +			sc = session_consumer_next(ri, sc, uc->session_id);
> +			cookie = &sc->cookie;
> +		}
> +
>  		if (uc->handler) {
> -			rc = uc->handler(uc, regs);
> +			rc = uc->handler(uc, regs, cookie);
>  			WARN(rc & ~UPROBE_HANDLER_MASK,
>  				"bad rc=0x%x from %ps()\n", rc, uc->handler);
>  		}
>  
> -		if (uc->ret_handler)
> +		if (uc->session) {
> +			sc->rc = rc;
> +			need_prep |= !rc;
> +		} else if (uc->ret_handler) {
>  			need_prep = true;
> +		}
>  
>  		remove &= rc;
>  	}
>  
> +	/* no removal if there's at least one session consumer */
> +	remove &= !uprobe->sessions_cnt;
> +
>  	if (need_prep && !remove)
> -		prepare_uretprobe(uprobe, regs); /* put bp at return */
> +		prepare_uretprobe(uprobe, regs, ri); /* put bp at return */
> +	else
> +		kfree(ri);
>  
>  	if (remove && uprobe->consumers) {
>  		WARN_ON(!uprobe_is_active(uprobe));
>  		unapply_uprobe(uprobe, current->mm);
>  	}
> + out:
>  	up_read(&uprobe->register_rwsem);
>  }
>  
> @@ -2101,12 +2192,28 @@ static void
>  handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
>  {
>  	struct uprobe *uprobe = ri->uprobe;
> +	struct session_consumer *sc;
>  	struct uprobe_consumer *uc;
> +	int session_iter = 0;
>  
>  	down_read(&uprobe->register_rwsem);
>  	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +		__u64 *cookie = NULL;
> +
> +		if (uc->session) {
> +			/*
> +			 * Consumers could be added and removed, but they will not
> +			 * change position, so we can iterate sessions just once
> +			 * and keep the last found session as base for next search.
> +			 */
> +			sc = session_consumer_find(ri, &session_iter, uc->session_id);
> +			if (!sc || sc->rc)
> +				continue;
> +			cookie = &sc->cookie;
> +		}
> +
>  		if (uc->ret_handler)
> -			uc->ret_handler(uc, ri->func, regs);
> +			uc->ret_handler(uc, ri->func, regs, cookie);
>  	}
>  	up_read(&uprobe->register_rwsem);
>  }
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index cd098846e251..02d052639dfe 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -3332,7 +3332,8 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx
>  }
>  
>  static int
> -uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
> +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
> +			  __u64 *data)
>  {
>  	struct bpf_uprobe *uprobe;
>  
> @@ -3341,7 +3342,8 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
>  }
>  
>  static int
> -uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
> +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
> +			      __u64 *data)
>  {
>  	struct bpf_uprobe *uprobe;
>  
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index c98e3b3386ba..7068c279a244 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -88,9 +88,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
>  static int register_uprobe_event(struct trace_uprobe *tu);
>  static int unregister_uprobe_event(struct trace_uprobe *tu);
>  
> -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
> +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
> +			     __u64 *data);
>  static int uretprobe_dispatcher(struct uprobe_consumer *con,
> -				unsigned long func, struct pt_regs *regs);
> +				unsigned long func, struct pt_regs *regs,
> +				__u64 *data);
>  
>  #ifdef CONFIG_STACK_GROWSUP
>  static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
> @@ -1504,7 +1506,8 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
>  	}
>  }
>  
> -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
> +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
> +			     __u64 *data)
>  {
>  	struct trace_uprobe *tu;
>  	struct uprobe_dispatch_data udd;
> @@ -1534,7 +1537,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
>  }
>  
>  static int uretprobe_dispatcher(struct uprobe_consumer *con,
> -				unsigned long func, struct pt_regs *regs)
> +				unsigned long func, struct pt_regs *regs,
> +				__u64 *data)
>  {
>  	struct trace_uprobe *tu;
>  	struct uprobe_dispatch_data udd;
> -- 
> 2.45.2
>
Andrii Nakryiko July 3, 2024, 12:13 a.m. UTC | #6
On Tue, Jul 2, 2024 at 4:55 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> Hi Jiri,
>
> On Mon,  1 Jul 2024 18:41:07 +0200
> Jiri Olsa <jolsa@kernel.org> wrote:
>
> > Adding support for uprobe consumer to be defined as session and have
> > new behaviour for consumer's 'handler' and 'ret_handler' callbacks.
> >
> > The session means that 'handler' and 'ret_handler' callbacks are
> > connected in a way that allows to:
> >
> >   - control execution of 'ret_handler' from 'handler' callback
> >   - share data between 'handler' and 'ret_handler' callbacks
> >
> > The session is enabled by setting new 'session' bool field to true
> > in uprobe_consumer object.
> >
> > We keep count of session consumers for uprobe and allocate session_consumer
> > object for each in return_instance object. This allows us to store
> > return values of 'handler' callbacks and data pointers of shared
> > data between both handlers.
> >
> > The session concept fits to our common use case where we do filtering
> > on entry uprobe and based on the result we decide to run the return
> > uprobe (or not).
> >
> > It's also convenient to share the data between session callbacks.
> >
> > The control of 'ret_handler' callback execution is done via return
> > value of the 'handler' callback. If it's 0 we install and execute
> > return uprobe, if it's 1 we do not.
> >
> > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > ---
> >  include/linux/uprobes.h     |  16 ++++-
> >  kernel/events/uprobes.c     | 129 +++++++++++++++++++++++++++++++++---
> >  kernel/trace/bpf_trace.c    |   6 +-
> >  kernel/trace/trace_uprobe.c |  12 ++--
> >  4 files changed, 144 insertions(+), 19 deletions(-)
> >
> > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > index f46e0ca0169c..903a860a8d01 100644
> > --- a/include/linux/uprobes.h
> > +++ b/include/linux/uprobes.h
> > @@ -34,15 +34,18 @@ enum uprobe_filter_ctx {
> >  };
> >
> >  struct uprobe_consumer {
> > -     int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> > +     int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
> >       int (*ret_handler)(struct uprobe_consumer *self,
> >                               unsigned long func,
> > -                             struct pt_regs *regs);
> > +                             struct pt_regs *regs, __u64 *data);
> >       bool (*filter)(struct uprobe_consumer *self,
> >                               enum uprobe_filter_ctx ctx,
> >                               struct mm_struct *mm);
> >
> >       struct uprobe_consumer *next;
> > +
> > +     bool                    session;        /* marks uprobe session consumer */
> > +     unsigned int            session_id;     /* set when uprobe_consumer is registered */
>
> Hmm, why this has both session and session_id?

session is caller's request to establish session semantics. Jiri, I
think it's better to move it higher next to
handler/ret_handler/filter, that's the part of uprobe_consumer struct
which has read-only caller-provided data (I'm adding offset and
ref_ctr_offset there as well).

> I also think we can use the address of uprobe_consumer itself as a unique id.

+1

>
> Also, if we can set session enabled by default, and skip ret_handler by handler's
> return value, it is more simpler. (If handler returns a specific value, skip ret_handler)

you mean derive if it's a session or not by both handler and
ret_handler being set? I guess this works fine for BPF side, because
there we never had them both set. If this doesn't regress others, I
think it's OK. We just need to make sure we don't unnecessarily
allocate session state for consumers that don't set both handler and
ret_handler. That would be a waste.

>
> >  };
> >
> >  #ifdef CONFIG_UPROBES
> > @@ -80,6 +83,12 @@ struct uprobe_task {
> >       unsigned int                    depth;
> >  };
> >
> > +struct session_consumer {
> > +     __u64           cookie;
>
> And this cookie looks not scalable. If we can pass a data to handler, I would like to
> reuse it to pass the target function parameters to ret_handler as kretprobe/fprobe does.
>
>         int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, void *data);
>
> uprobes can collect its uc's required sizes and allocate the memory (shadow stack frame)
> at handler_chain().

The goal here is to keep this simple and fast. I'd prefer to keep it
small and fixed size, if possible. I'm thinking about caching and
reusing return_instance as one of the future optimizations, so if we
can keep this more or less fixed (assuming there is typically not more
than 1 or 2 consumers per uprobe, which seems realistic), this will
provide a way to avoid excessive memory allocations.

>
> > +     unsigned int    id;
> > +     int             rc;
> > +};
> > +
> >  struct return_instance {
> >       struct uprobe           *uprobe;
> >       unsigned long           func;
> > @@ -88,6 +97,9 @@ struct return_instance {
> >       bool                    chained;        /* true, if instance is nested */
> >
> >       struct return_instance  *next;          /* keep as stack */
> > +
> > +     int                     sessions_cnt;
> > +     struct session_consumer sessions[];
>
> In that case, we don't have this array, but
>
>         char data[];
>
> And decode data array, which is a slice of variable length structure;
>
> struct session_consumer {
>         struct uprobe_consumer *uc;
>         char data[];
> };
>
> The size of session_consumer is uc->session_data_size + sizeof(uc).
>
> What would you think?
>
> Thank you,
>
> >  };
> >
> >  enum rp_check {
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 2c83ba776fc7..4da410460f2a 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -63,6 +63,8 @@ struct uprobe {
> >       loff_t                  ref_ctr_offset;
> >       unsigned long           flags;
> >
> > +     unsigned int            sessions_cnt;

[...]
Peter Zijlstra July 3, 2024, 8:10 a.m. UTC | #7
On Tue, Jul 02, 2024 at 01:51:28PM -0700, Andrii Nakryiko wrote:
> > +static size_t ri_size(int sessions_cnt)
> > +{
> > +       struct return_instance *ri __maybe_unused;
> > +
> > +       return sizeof(*ri) + sessions_cnt * sizeof(ri->sessions[0]);
> 
> just use struct_size()?

Yeah, lets not. This is readable, struct_size() is not.
Jiri Olsa July 3, 2024, 3:31 p.m. UTC | #8
On Tue, Jul 02, 2024 at 01:52:38PM -0700, Andrii Nakryiko wrote:
> On Tue, Jul 2, 2024 at 9:11 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Tue, Jul 02, 2024 at 03:04:08PM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 01, 2024 at 06:41:07PM +0200, Jiri Olsa wrote:
> > >
> > > > +static void
> > > > +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > > > +{
> > > > +   static unsigned int session_id;
> > > > +
> > > > +   if (uc->session) {
> > > > +           uprobe->sessions_cnt++;
> > > > +           uc->session_id = ++session_id ?: ++session_id;
> > > > +   }
> > > > +}
> > >
> > > The way I understand this code, you create a consumer every time you do
> > > uprobe_register() and unregister makes it go away.
> > >
> > > Now, register one, then 4g-1 times register+unregister, then register
> > > again.
> > >
> > > The above seems to then result in two consumers with the same
> > > session_id, which leads to trouble.
> > >
> > > Hmm?
> >
> > ugh true.. will make it u64 :)
> >
> > I think we could store uprobe_consumer pointer+ref in session_consumer,
> > and that would make the unregister path more interesting.. will check
> 
> More interesting how? It's actually a great idea, uprobe_consumer

nah, got confused ;-)

> pointer itself is a unique ID and 64-bit. We can still use lowest bit
> for RC (see my other reply).

I used pointers in the previous version, but then I thought what if the
consumer gets free-ed and new one created (with same address.. maybe not
likely but possible, right?) before the return probe is hit

jirka
Jiri Olsa July 3, 2024, 4:09 p.m. UTC | #9
On Tue, Jul 02, 2024 at 05:13:38PM -0700, Andrii Nakryiko wrote:
> On Tue, Jul 2, 2024 at 4:55 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > Hi Jiri,
> >
> > On Mon,  1 Jul 2024 18:41:07 +0200
> > Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > > Adding support for uprobe consumer to be defined as session and have
> > > new behaviour for consumer's 'handler' and 'ret_handler' callbacks.
> > >
> > > The session means that 'handler' and 'ret_handler' callbacks are
> > > connected in a way that allows to:
> > >
> > >   - control execution of 'ret_handler' from 'handler' callback
> > >   - share data between 'handler' and 'ret_handler' callbacks
> > >
> > > The session is enabled by setting new 'session' bool field to true
> > > in uprobe_consumer object.
> > >
> > > We keep count of session consumers for uprobe and allocate session_consumer
> > > object for each in return_instance object. This allows us to store
> > > return values of 'handler' callbacks and data pointers of shared
> > > data between both handlers.
> > >
> > > The session concept fits to our common use case where we do filtering
> > > on entry uprobe and based on the result we decide to run the return
> > > uprobe (or not).
> > >
> > > It's also convenient to share the data between session callbacks.
> > >
> > > The control of 'ret_handler' callback execution is done via return
> > > value of the 'handler' callback. If it's 0 we install and execute
> > > return uprobe, if it's 1 we do not.
> > >
> > > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > > ---
> > >  include/linux/uprobes.h     |  16 ++++-
> > >  kernel/events/uprobes.c     | 129 +++++++++++++++++++++++++++++++++---
> > >  kernel/trace/bpf_trace.c    |   6 +-
> > >  kernel/trace/trace_uprobe.c |  12 ++--
> > >  4 files changed, 144 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > > index f46e0ca0169c..903a860a8d01 100644
> > > --- a/include/linux/uprobes.h
> > > +++ b/include/linux/uprobes.h
> > > @@ -34,15 +34,18 @@ enum uprobe_filter_ctx {
> > >  };
> > >
> > >  struct uprobe_consumer {
> > > -     int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> > > +     int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
> > >       int (*ret_handler)(struct uprobe_consumer *self,
> > >                               unsigned long func,
> > > -                             struct pt_regs *regs);
> > > +                             struct pt_regs *regs, __u64 *data);
> > >       bool (*filter)(struct uprobe_consumer *self,
> > >                               enum uprobe_filter_ctx ctx,
> > >                               struct mm_struct *mm);
> > >
> > >       struct uprobe_consumer *next;
> > > +
> > > +     bool                    session;        /* marks uprobe session consumer */
> > > +     unsigned int            session_id;     /* set when uprobe_consumer is registered */
> >
> > Hmm, why this has both session and session_id?
> 
> session is caller's request to establish session semantics. Jiri, I

and session_id is set when uprobe is registered and used when
return uprobe is executed to find matching uprobe_consumer,
plz check handle_uretprobe_chain/session_consumer_find

> think it's better to move it higher next to
> handler/ret_handler/filter, that's the part of uprobe_consumer struct
> which has read-only caller-provided data (I'm adding offset and
> ref_ctr_offset there as well).

ok, makes sense

> 
> > I also think we can use the address of uprobe_consumer itself as a unique id.
> 
> +1
> 
> >
> > Also, if we can set session enabled by default, and skip ret_handler by handler's
> > return value, it is more simpler. (If handler returns a specific value, skip ret_handler)
> 
> you mean derive if it's a session or not by both handler and
> ret_handler being set? I guess this works fine for BPF side, because
> there we never had them both set. If this doesn't regress others, I
> think it's OK. We just need to make sure we don't unnecessarily
> allocate session state for consumers that don't set both handler and
> ret_handler. That would be a waste.

hum.. so the current code installs return uprobe if there's ret_handler
defined in consumer and also the entry 'handler' needs to return 0

if entry 'handler' returns 1 the uprobe is unregistered

we could define new return value from 'handler' to 'not execute the
'ret_handler' and have 'handler' return values:

  0 - execute 'ret_handler' if defined
  1 - remove the uprobe
  2 - do NOT execute 'ret_handler'  // this current triggers WARN

we could delay the allocation of 'return_instance' until the first
consumer returns 0, so there's no perf regression

that way we could treat all consumers the same and we wouldn't need
the session flag..

ok looks like good idea ;-) will try that

thanks,
jirka

> 
> >
> > >  };
> > >
> > >  #ifdef CONFIG_UPROBES
> > > @@ -80,6 +83,12 @@ struct uprobe_task {
> > >       unsigned int                    depth;
> > >  };
> > >
> > > +struct session_consumer {
> > > +     __u64           cookie;
> >
> > And this cookie looks not scalable. If we can pass a data to handler, I would like to
> > reuse it to pass the target function parameters to ret_handler as kretprobe/fprobe does.
> >
> >         int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, void *data);
> >
> > uprobes can collect its uc's required sizes and allocate the memory (shadow stack frame)
> > at handler_chain().
> 
> The goal here is to keep this simple and fast. I'd prefer to keep it
> small and fixed size, if possible. I'm thinking about caching and
> reusing return_instance as one of the future optimizations, so if we
> can keep this more or less fixed (assuming there is typically not more
> than 1 or 2 consumers per uprobe, which seems realistic), this will
> provide a way to avoid excessive memory allocations.
> 
> >
> > > +     unsigned int    id;
> > > +     int             rc;
> > > +};
> > > +
> > >  struct return_instance {
> > >       struct uprobe           *uprobe;
> > >       unsigned long           func;
> > > @@ -88,6 +97,9 @@ struct return_instance {
> > >       bool                    chained;        /* true, if instance is nested */
> > >
> > >       struct return_instance  *next;          /* keep as stack */
> > > +
> > > +     int                     sessions_cnt;
> > > +     struct session_consumer sessions[];
> >
> > In that case, we don't have this array, but
> >
> >         char data[];
> >
> > And decode data array, which is a slice of variable length structure;
> >
> > struct session_consumer {
> >         struct uprobe_consumer *uc;
> >         char data[];
> > };
> >
> > The size of session_consumer is uc->session_data_size + sizeof(uc).
> >
> > What would you think?
> >
> > Thank you,
> >
> > >  };
> > >
> > >  enum rp_check {
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index 2c83ba776fc7..4da410460f2a 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -63,6 +63,8 @@ struct uprobe {
> > >       loff_t                  ref_ctr_offset;
> > >       unsigned long           flags;
> > >
> > > +     unsigned int            sessions_cnt;
> 
> [...]
Jiri Olsa July 3, 2024, 4:20 p.m. UTC | #10
On Wed, Jul 03, 2024 at 05:31:32PM +0200, Jiri Olsa wrote:
> On Tue, Jul 02, 2024 at 01:52:38PM -0700, Andrii Nakryiko wrote:
> > On Tue, Jul 2, 2024 at 9:11 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> > >
> > > On Tue, Jul 02, 2024 at 03:04:08PM +0200, Peter Zijlstra wrote:
> > > > On Mon, Jul 01, 2024 at 06:41:07PM +0200, Jiri Olsa wrote:
> > > >
> > > > > +static void
> > > > > +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > > > > +{
> > > > > +   static unsigned int session_id;
> > > > > +
> > > > > +   if (uc->session) {
> > > > > +           uprobe->sessions_cnt++;
> > > > > +           uc->session_id = ++session_id ?: ++session_id;
> > > > > +   }
> > > > > +}
> > > >
> > > > The way I understand this code, you create a consumer every time you do
> > > > uprobe_register() and unregister makes it go away.
> > > >
> > > > Now, register one, then 4g-1 times register+unregister, then register
> > > > again.
> > > >
> > > > The above seems to then result in two consumers with the same
> > > > session_id, which leads to trouble.
> > > >
> > > > Hmm?
> > >
> > > ugh true.. will make it u64 :)
> > >
> > > I think we could store uprobe_consumer pointer+ref in session_consumer,
> > > and that would make the unregister path more interesting.. will check
> > 
> > More interesting how? It's actually a great idea, uprobe_consumer
> 
> nah, got confused ;-)

actually.. the idea was to store uprobe_consumer pointers in 'return_instance'
and iterate them on return probe (not uprobe->consumers).. but that would
require the unregister code to somehow remove them (replace with null)

but we wouldn't need to do the search for matching consumer with session_id

also it probably regress current code, because we would execute only uprobe
return consumers that were registered at the time the function entry was hit,
whereas in current code the return uprobe executes all registered return
consumers

jirka

> 
> > pointer itself is a unique ID and 64-bit. We can still use lowest bit
> > for RC (see my other reply).
> 
> I used pointers in the previous version, but then I thought what if the
> consumer gets free-ed and new one created (with same address.. maybe not
> likely but possible, right?) before the return probe is hit
> 
> jirka
Jiri Olsa July 3, 2024, 5:13 p.m. UTC | #11
On Tue, Jul 02, 2024 at 01:51:28PM -0700, Andrii Nakryiko wrote:

SNIP

> >  #ifdef CONFIG_UPROBES
> > @@ -80,6 +83,12 @@ struct uprobe_task {
> >         unsigned int                    depth;
> >  };
> >
> > +struct session_consumer {
> > +       __u64           cookie;
> > +       unsigned int    id;
> > +       int             rc;
> 
> you'll be using u64 for ID, right? so this struct will be 24 bytes.

yes

> Maybe we can just use topmost bit of ID to store whether uretprobe
> should run or not? It's trivial to mask out during ID comparisons

actually.. I think we could store just consumers that need to be
executed in return probe so there will be no need for 'rc' value

> 
> > +};
> > +
> >  struct return_instance {
> >         struct uprobe           *uprobe;
> >         unsigned long           func;
> > @@ -88,6 +97,9 @@ struct return_instance {
> >         bool                    chained;        /* true, if instance is nested */
> >
> >         struct return_instance  *next;          /* keep as stack */
> > +
> > +       int                     sessions_cnt;
> 
> there is 7 byte gap before next field, let's put sessions_cnt there

ok

> 
> > +       struct session_consumer sessions[];
> >  };
> >
> >  enum rp_check {
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 2c83ba776fc7..4da410460f2a 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -63,6 +63,8 @@ struct uprobe {
> >         loff_t                  ref_ctr_offset;
> >         unsigned long           flags;
> >
> > +       unsigned int            sessions_cnt;
> > +
> >         /*
> >          * The generic code assumes that it has two members of unknown type
> >          * owned by the arch-specific code:
> > @@ -750,11 +752,30 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
> >         return uprobe;
> >  }
> >
> > +static void
> > +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > +{
> > +       static unsigned int session_id;
> 
> (besides what Peter mentioned about wrap around of 32-bit counter)
> let's use atomic here to not rely on any particular locking
> (unnecessarily), this might make my life easier in the future, thanks.
> This is registration time, low frequency, extra atomic won't hurt.
> 
> It might be already broken, actually, for two independently registering uprobes.

ok, will try

> 
> > +
> > +       if (uc->session) {
> > +               uprobe->sessions_cnt++;
> > +               uc->session_id = ++session_id ?: ++session_id;
> > +       }
> > +}
> > +
> > +static void
> > +uprobe_consumer_unaccount(struct uprobe *uprobe, struct uprobe_consumer *uc)
> 
> this fits in 100 characters, keep it single line, please. Same for
> account function

ok

> 
> > +{
> > +       if (uc->session)
> > +               uprobe->sessions_cnt--;
> > +}
> > +
> >  static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >  {
> >         down_write(&uprobe->consumer_rwsem);
> >         uc->next = uprobe->consumers;
> >         uprobe->consumers = uc;
> > +       uprobe_consumer_account(uprobe, uc);
> >         up_write(&uprobe->consumer_rwsem);
> >  }
> >
> > @@ -773,6 +794,7 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >                 if (*con == uc) {
> >                         *con = uc->next;
> >                         ret = true;
> > +                       uprobe_consumer_unaccount(uprobe, uc);
> >                         break;
> >                 }
> >         }
> > @@ -1744,6 +1766,23 @@ static struct uprobe_task *get_utask(void)
> >         return current->utask;
> >  }
> >
> > +static size_t ri_size(int sessions_cnt)
> > +{
> > +       struct return_instance *ri __maybe_unused;
> > +
> > +       return sizeof(*ri) + sessions_cnt * sizeof(ri->sessions[0]);
> 
> just use struct_size()?
> 
> > +}
> > +
> > +static struct return_instance *alloc_return_instance(int sessions_cnt)
> > +{
> > +       struct return_instance *ri;
> > +
> > +       ri = kzalloc(ri_size(sessions_cnt), GFP_KERNEL);
> > +       if (ri)
> > +               ri->sessions_cnt = sessions_cnt;
> > +       return ri;
> > +}
> > +
> >  static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
> >  {
> >         struct uprobe_task *n_utask;
> > @@ -1756,11 +1795,11 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
> >
> >         p = &n_utask->return_instances;
> >         for (o = o_utask->return_instances; o; o = o->next) {
> > -               n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
> > +               n = alloc_return_instance(o->sessions_cnt);
> >                 if (!n)
> >                         return -ENOMEM;
> >
> > -               *n = *o;
> > +               memcpy(n, o, ri_size(o->sessions_cnt));
> >                 get_uprobe(n->uprobe);
> >                 n->next = NULL;
> >
> > @@ -1853,9 +1892,9 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
> >         utask->return_instances = ri;
> >  }
> >
> > -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
> > +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
> > +                             struct return_instance *ri)
> >  {
> > -       struct return_instance *ri;
> >         struct uprobe_task *utask;
> >         unsigned long orig_ret_vaddr, trampoline_vaddr;
> >         bool chained;
> > @@ -1874,9 +1913,11 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
> >                 return;
> >         }
> >
> > -       ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
> > -       if (!ri)
> > -               return;
> > +       if (!ri) {
> > +               ri = alloc_return_instance(0);
> > +               if (!ri)
> > +                       return;
> > +       }
> >
> >         trampoline_vaddr = get_trampoline_vaddr();
> >         orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
> > @@ -2065,35 +2106,85 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> >         return uprobe;
> >  }
> >
> > +static struct session_consumer *
> > +session_consumer_next(struct return_instance *ri, struct session_consumer *sc,
> > +                     int session_id)
> > +{
> > +       struct session_consumer *next;
> > +
> > +       next = sc ? sc + 1 : &ri->sessions[0];
> > +       next->id = session_id;
> 
> it's kind of unexpected that "session_consumer_next" would actually
> set an ID... Maybe drop int session_id as input argument and fill it
> out outside of this function, this function being just a simple
> iterator?

yea, I was going back and forth on what to have in that function
or not, to keep the change minimal, but makes sense, will move

> 
> > +       return next;
> > +}
> > +
> > +static struct session_consumer *
> > +session_consumer_find(struct return_instance *ri, int *iter, int session_id)
> > +{
> > +       struct session_consumer *sc;
> > +       int idx = *iter;
> > +
> > +       for (sc = &ri->sessions[idx]; idx < ri->sessions_cnt; idx++, sc++) {
> > +               if (sc->id == session_id) {
> > +                       *iter = idx + 1;
> > +                       return sc;
> > +               }
> > +       }
> > +       return NULL;
> > +}
> > +
> >  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> >  {
> >         struct uprobe_consumer *uc;
> >         int remove = UPROBE_HANDLER_REMOVE;
> > +       struct session_consumer *sc = NULL;
> > +       struct return_instance *ri = NULL;
> >         bool need_prep = false; /* prepare return uprobe, when needed */
> >
> >         down_read(&uprobe->register_rwsem);
> > +       if (uprobe->sessions_cnt) {
> > +               ri = alloc_return_instance(uprobe->sessions_cnt);
> > +               if (!ri)
> > +                       goto out;
> > +       }
> > +
> >         for (uc = uprobe->consumers; uc; uc = uc->next) {
> > +               __u64 *cookie = NULL;
> >                 int rc = 0;
> >
> > +               if (uc->session) {
> > +                       sc = session_consumer_next(ri, sc, uc->session_id);
> > +                       cookie = &sc->cookie;
> > +               }
> > +
> >                 if (uc->handler) {
> > -                       rc = uc->handler(uc, regs);
> > +                       rc = uc->handler(uc, regs, cookie);
> >                         WARN(rc & ~UPROBE_HANDLER_MASK,
> >                                 "bad rc=0x%x from %ps()\n", rc, uc->handler);
> >                 }
> >
> > -               if (uc->ret_handler)
> > +               if (uc->session) {
> > +                       sc->rc = rc;
> > +                       need_prep |= !rc;
> 
> nit:
> 
> if (rc == 0)
>     need_prep = true;
> 
> and then it's *extremely obvious* what happens and under which conditions

ok

> 
> > +               } else if (uc->ret_handler) {
> >                         need_prep = true;
> > +               }
> >
> >                 remove &= rc;
> >         }
> >
> > +       /* no removal if there's at least one session consumer */
> > +       remove &= !uprobe->sessions_cnt;
> 
> this is counter (not error, not pointer), let's stick to ` == 0`, please
> 
> is this
> 
> if (uprobe->sessions_cnt != 0)
>    remove = 0;

yes ;-) will change

jirka

> 
> ? I can't tell (honestly), without spending ridiculous amounts of
> mental resources (for the underlying simplicity of the condition).

SNIP
Andrii Nakryiko July 3, 2024, 6:31 p.m. UTC | #12
On Wed, Jul 3, 2024 at 1:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jul 02, 2024 at 01:51:28PM -0700, Andrii Nakryiko wrote:
> > > +static size_t ri_size(int sessions_cnt)
> > > +{
> > > +       struct return_instance *ri __maybe_unused;
> > > +
> > > +       return sizeof(*ri) + sessions_cnt * sizeof(ri->sessions[0]);
> >
> > just use struct_size()?
>
> Yeah, lets not. This is readable, struct_size() is not.

This hack with __maybe_unused is more readable than the standard
struct_size() helper that was added specifically for cases like this,
really?

I wonder if Kees agrees and whether there are any downsides to using
struct_size()

struct_size(struct return_instance, sessions, sessions_cnt) seems
readable enough to me, in any case.
diff mbox series

Patch

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f46e0ca0169c..903a860a8d01 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -34,15 +34,18 @@  enum uprobe_filter_ctx {
 };
 
 struct uprobe_consumer {
-	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
+	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
 	int (*ret_handler)(struct uprobe_consumer *self,
 				unsigned long func,
-				struct pt_regs *regs);
+				struct pt_regs *regs, __u64 *data);
 	bool (*filter)(struct uprobe_consumer *self,
 				enum uprobe_filter_ctx ctx,
 				struct mm_struct *mm);
 
 	struct uprobe_consumer *next;
+
+	bool			session;	/* marks uprobe session consumer */
+	unsigned int		session_id;	/* set when uprobe_consumer is registered */
 };
 
 #ifdef CONFIG_UPROBES
@@ -80,6 +83,12 @@  struct uprobe_task {
 	unsigned int			depth;
 };
 
+struct session_consumer {
+	__u64		cookie;
+	unsigned int	id;
+	int		rc;
+};
+
 struct return_instance {
 	struct uprobe		*uprobe;
 	unsigned long		func;
@@ -88,6 +97,9 @@  struct return_instance {
 	bool			chained;	/* true, if instance is nested */
 
 	struct return_instance	*next;		/* keep as stack */
+
+	int			sessions_cnt;
+	struct session_consumer	sessions[];
 };
 
 enum rp_check {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2c83ba776fc7..4da410460f2a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -63,6 +63,8 @@  struct uprobe {
 	loff_t			ref_ctr_offset;
 	unsigned long		flags;
 
+	unsigned int		sessions_cnt;
+
 	/*
 	 * The generic code assumes that it has two members of unknown type
 	 * owned by the arch-specific code:
@@ -750,11 +752,30 @@  static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 	return uprobe;
 }
 
+static void
+uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+	static unsigned int session_id;
+
+	if (uc->session) {
+		uprobe->sessions_cnt++;
+		uc->session_id = ++session_id ?: ++session_id;
+	}
+}
+
+static void
+uprobe_consumer_unaccount(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+	if (uc->session)
+		uprobe->sessions_cnt--;
+}
+
 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
 	down_write(&uprobe->consumer_rwsem);
 	uc->next = uprobe->consumers;
 	uprobe->consumers = uc;
+	uprobe_consumer_account(uprobe, uc);
 	up_write(&uprobe->consumer_rwsem);
 }
 
@@ -773,6 +794,7 @@  static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 		if (*con == uc) {
 			*con = uc->next;
 			ret = true;
+			uprobe_consumer_unaccount(uprobe, uc);
 			break;
 		}
 	}
@@ -1744,6 +1766,23 @@  static struct uprobe_task *get_utask(void)
 	return current->utask;
 }
 
+static size_t ri_size(int sessions_cnt)
+{
+	struct return_instance *ri __maybe_unused;
+
+	return sizeof(*ri) + sessions_cnt * sizeof(ri->sessions[0]);
+}
+
+static struct return_instance *alloc_return_instance(int sessions_cnt)
+{
+	struct return_instance *ri;
+
+	ri = kzalloc(ri_size(sessions_cnt), GFP_KERNEL);
+	if (ri)
+		ri->sessions_cnt = sessions_cnt;
+	return ri;
+}
+
 static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
 {
 	struct uprobe_task *n_utask;
@@ -1756,11 +1795,11 @@  static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
 
 	p = &n_utask->return_instances;
 	for (o = o_utask->return_instances; o; o = o->next) {
-		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+		n = alloc_return_instance(o->sessions_cnt);
 		if (!n)
 			return -ENOMEM;
 
-		*n = *o;
+		memcpy(n, o, ri_size(o->sessions_cnt));
 		get_uprobe(n->uprobe);
 		n->next = NULL;
 
@@ -1853,9 +1892,9 @@  static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
 	utask->return_instances = ri;
 }
 
-static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
+			      struct return_instance *ri)
 {
-	struct return_instance *ri;
 	struct uprobe_task *utask;
 	unsigned long orig_ret_vaddr, trampoline_vaddr;
 	bool chained;
@@ -1874,9 +1913,11 @@  static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 		return;
 	}
 
-	ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
-	if (!ri)
-		return;
+	if (!ri) {
+		ri = alloc_return_instance(0);
+		if (!ri)
+			return;
+	}
 
 	trampoline_vaddr = get_trampoline_vaddr();
 	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
@@ -2065,35 +2106,85 @@  static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
 	return uprobe;
 }
 
+static struct session_consumer *
+session_consumer_next(struct return_instance *ri, struct session_consumer *sc,
+		      int session_id)
+{
+	struct session_consumer *next;
+
+	next = sc ? sc + 1 : &ri->sessions[0];
+	next->id = session_id;
+	return next;
+}
+
+static struct session_consumer *
+session_consumer_find(struct return_instance *ri, int *iter, int session_id)
+{
+	struct session_consumer *sc;
+	int idx = *iter;
+
+	for (sc = &ri->sessions[idx]; idx < ri->sessions_cnt; idx++, sc++) {
+		if (sc->id == session_id) {
+			*iter = idx + 1;
+			return sc;
+		}
+	}
+	return NULL;
+}
+
 static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 {
 	struct uprobe_consumer *uc;
 	int remove = UPROBE_HANDLER_REMOVE;
+	struct session_consumer *sc = NULL;
+	struct return_instance *ri = NULL;
 	bool need_prep = false; /* prepare return uprobe, when needed */
 
 	down_read(&uprobe->register_rwsem);
+	if (uprobe->sessions_cnt) {
+		ri = alloc_return_instance(uprobe->sessions_cnt);
+		if (!ri)
+			goto out;
+	}
+
 	for (uc = uprobe->consumers; uc; uc = uc->next) {
+		__u64 *cookie = NULL;
 		int rc = 0;
 
+		if (uc->session) {
+			sc = session_consumer_next(ri, sc, uc->session_id);
+			cookie = &sc->cookie;
+		}
+
 		if (uc->handler) {
-			rc = uc->handler(uc, regs);
+			rc = uc->handler(uc, regs, cookie);
 			WARN(rc & ~UPROBE_HANDLER_MASK,
 				"bad rc=0x%x from %ps()\n", rc, uc->handler);
 		}
 
-		if (uc->ret_handler)
+		if (uc->session) {
+			sc->rc = rc;
+			need_prep |= !rc;
+		} else if (uc->ret_handler) {
 			need_prep = true;
+		}
 
 		remove &= rc;
 	}
 
+	/* no removal if there's at least one session consumer */
+	remove &= !uprobe->sessions_cnt;
+
 	if (need_prep && !remove)
-		prepare_uretprobe(uprobe, regs); /* put bp at return */
+		prepare_uretprobe(uprobe, regs, ri); /* put bp at return */
+	else
+		kfree(ri);
 
 	if (remove && uprobe->consumers) {
 		WARN_ON(!uprobe_is_active(uprobe));
 		unapply_uprobe(uprobe, current->mm);
 	}
+ out:
 	up_read(&uprobe->register_rwsem);
 }
 
@@ -2101,12 +2192,28 @@  static void
 handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
 {
 	struct uprobe *uprobe = ri->uprobe;
+	struct session_consumer *sc;
 	struct uprobe_consumer *uc;
+	int session_iter = 0;
 
 	down_read(&uprobe->register_rwsem);
 	for (uc = uprobe->consumers; uc; uc = uc->next) {
+		__u64 *cookie = NULL;
+
+		if (uc->session) {
+			/*
+			 * Consumers could be added and removed, but they will not
+			 * change position, so we can iterate sessions just once
+			 * and keep the last found session as base for next search.
+			 */
+			sc = session_consumer_find(ri, &session_iter, uc->session_id);
+			if (!sc || sc->rc)
+				continue;
+			cookie = &sc->cookie;
+		}
+
 		if (uc->ret_handler)
-			uc->ret_handler(uc, ri->func, regs);
+			uc->ret_handler(uc, ri->func, regs, cookie);
 	}
 	up_read(&uprobe->register_rwsem);
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cd098846e251..02d052639dfe 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3332,7 +3332,8 @@  uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx
 }
 
 static int
-uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
+			  __u64 *data)
 {
 	struct bpf_uprobe *uprobe;
 
@@ -3341,7 +3342,8 @@  uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
 }
 
 static int
-uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
+			      __u64 *data)
 {
 	struct bpf_uprobe *uprobe;
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c98e3b3386ba..7068c279a244 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -88,9 +88,11 @@  static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
 static int register_uprobe_event(struct trace_uprobe *tu);
 static int unregister_uprobe_event(struct trace_uprobe *tu);
 
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+			     __u64 *data);
 static int uretprobe_dispatcher(struct uprobe_consumer *con,
-				unsigned long func, struct pt_regs *regs);
+				unsigned long func, struct pt_regs *regs,
+				__u64 *data);
 
 #ifdef CONFIG_STACK_GROWSUP
 static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
@@ -1504,7 +1506,8 @@  trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
 	}
 }
 
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+			     __u64 *data)
 {
 	struct trace_uprobe *tu;
 	struct uprobe_dispatch_data udd;
@@ -1534,7 +1537,8 @@  static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 }
 
 static int uretprobe_dispatcher(struct uprobe_consumer *con,
-				unsigned long func, struct pt_regs *regs)
+				unsigned long func, struct pt_regs *regs,
+				__u64 *data)
 {
 	struct trace_uprobe *tu;
 	struct uprobe_dispatch_data udd;