
[v2,11/11] perf/uprobe: Add uretprobe timer

Message ID 20240711110401.412779774@infradead.org (mailing list archive)
State New
Series perf/uprobe: Optimize uprobes

Commit Message

Peter Zijlstra July 11, 2024, 11:02 a.m. UTC
In order to put a bound on the uretprobe_srcu critical section, add a
timer to uprobe_task. Upon every RI added or removed the timer is
pushed forward to now + 1s. If the timer were ever to fire, it would
convert the SRCU 'reference' to a refcount reference if possible.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/uprobes.h |    8 +++++
 kernel/events/uprobes.c |   67 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 69 insertions(+), 6 deletions(-)
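In code terms (a sketch condensed from the full patch at the bottom of this
page, not an exact hunk), the work the timer ultimately queues walks the
return_instance stack and trades each SRCU 'reference' for a refcount:

	for (ri = utask->return_instances; ri; ri = ri->next) {
		if (!ri->uprobe || ri->has_ref)
			continue;
		if (refcount_inc_not_zero(&ri->uprobe->ref))
			ri->has_ref = true;	/* pin the uprobe with a real reference */
		else
			ri->uprobe = NULL;	/* uprobe is already going away */
		__srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
	}

Once every return_instance either holds a real reference or has been
invalidated, the SRCU read-side critical section for it can end.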

Comments

Oleg Nesterov July 11, 2024, 1:19 p.m. UTC | #1
Not sure I read this patch correctly, but at first glance it looks
suspicious..

On 07/11, Peter Zijlstra wrote:
>
> +static void return_instance_timer(struct timer_list *timer)
> +{
> +	struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
> +	task_work_add(utask->task, &utask->ri_task_work, TWA_SIGNAL);
> +}

What if utask->task sleeps in TASK_STOPPED/TASK_TRACED state before
return from the ret-probed function?

In this case it won't react to TWA_SIGNAL until debugger or SIGCONT
wakes it up.

---------------------------------------------------------------------------
And it seems that even task_work_add() itself is not safe...

Suppose we have 2 ret-probed functions

	void f2() { ... }
	void f1() { ...; f2(); }

A task T calls f1(), hits the bp, and calls prepare_uretprobe() which does

	mod_timer(&utask->ri_timer, jiffies + HZ);

Then later it calls f2() and the pending timer expires after it enters the
kernel, but before the next prepare_uretprobe() -> mod_timer().

In this case ri_task_work is already queued and the timer is pending again.

Now. Even if T goes to the exit_to_user_mode_loop() path immediately, in
theory nothing can guarantee that it will call get_signal/task_work_run
in less than 1 second, it can be preempted.

But T can sleep in xol_take_insn_slot() before return from handle_swbp(),
and this is not so theoretical.

Oleg.
Peter Zijlstra July 11, 2024, 3 p.m. UTC | #2
On Thu, Jul 11, 2024 at 03:19:19PM +0200, Oleg Nesterov wrote:
> Not sure I read this patch correctly, but at first glance it looks
> suspicious..
> 
> On 07/11, Peter Zijlstra wrote:
> >
> > +static void return_instance_timer(struct timer_list *timer)
> > +{
> > +	struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
> > +	task_work_add(utask->task, &utask->ri_task_work, TWA_SIGNAL);
> > +}
> 
> What if utask->task sleeps in TASK_STOPPED/TASK_TRACED state before
> return from the ret-probed function?
> 
> In this case it won't react to TWA_SIGNAL until debugger or SIGCONT
> wakes it up.

Or FROZEN etc.. Yeah.

> ---------------------------------------------------------------------------
> And it seems that even task_work_add() itself is not safe...
> 
> Suppose we have 2 ret-probed functions
> 
> 	void f2() { ... }
> 	void f1() { ...; f2(); }
> 
> A task T calls f1(), hits the bp, and calls prepare_uretprobe() which does
> 
> 	mod_timer(&utask->ri_timer, jiffies + HZ);
> 
> Then later it calls f2() and the pending timer expires after it enters the
> kernel, but before the next prepare_uretprobe() -> mod_timer().
> 
> In this case ri_task_work is already queued and the timer is pending again.

You're saying we can hit a double enqueue, right? Yeah, that's a
problem. But that can be fairly easily rectified.
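One way to avoid the double enqueue (and the guard sketched in the follow-up
message further down this thread) is to only queue the work while it is not
already pending, roughly:

	/* Sketch: ri_work_pending is set here and cleared by the task_work
	 * itself, so at most one ri_task_work is ever queued at a time. */
	if (!utask->ri_work_pending) {
		utask->ri_work_pending = true;
		task_work_add(utask->task, &utask->ri_task_work, TWA_SIGNAL);
	}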

> Now. Even if T goes to the exit_to_user_mode_loop() path immediately, in
> theory nothing can guarantee that it will call get_signal/task_work_run
> in less than 1 second, it can be preempted.
> 
> But T can sleep in xol_take_insn_slot() before return from handle_swbp(),
> and this is not so theoretical.

So the assumption is that kernel code makes forward progress. If we get
preempted, we'll get scheduled again. If the machine is so overloaded
this takes more than a second, stretching the SRCU period is the least
of your problems.

Same with sleeps, it'll get a wakeup.

The only thing that is out of our control is userspace. And yes, I had
not considered STOPPED/TRACED/FROZEN.

So the reason I did that task_work is because the return_instance list
is strictly current, so a random timer cannot safely poke at it. And
barring those pesky states, it does as desired.

Let me ponder that a little, I *can* make it work, but all 'solutions'
I've come up with so far are really rather vile.
Peter Zijlstra July 11, 2024, 3:55 p.m. UTC | #3
On Thu, Jul 11, 2024 at 05:00:54PM +0200, Peter Zijlstra wrote:

> Let me ponder that a little, I *can* make it work, but all 'solutions'
> I've come up with so far are really rather vile.

This is the least horrible solution I could come up with...

---
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -83,6 +83,7 @@ struct uprobe_task {
 
 	struct timer_list		ri_timer;
 	struct callback_head		ri_task_work;
+	bool				ri_work_pending;
 	struct task_struct		*task;
 };
 
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1797,9 +1797,8 @@ void uprobe_free_utask(struct task_struc
 	t->utask = NULL;
 }
 
-static void return_instance_task_work(struct callback_head *head)
+static void __return_instance_put(struct uprobe_task *utask)
 {
-	struct uprobe_task *utask = container_of(head, struct uprobe_task, ri_task_work);
 	struct return_instance *ri;
 
 	for (ri = utask->return_instances; ri; ri = ri->next) {
@@ -1815,9 +1814,43 @@ static void return_instance_task_work(st
 	}
 }
 
+static void return_instance_task_work(struct callback_head *head)
+{
+	struct uprobe_task *utask = container_of(head, struct uprobe_task, ri_task_work);
+	utask->ri_work_pending = false;
+	__return_instance_put(utask);
+}
+
+static int return_instance_blocked(struct task_struct *p, void *arg)
+{
+	unsigned int state = READ_ONCE(p->__state);
+
+	if (state == TASK_RUNNING || state == TASK_WAKING)
+		return 0;
+
+	smp_rmb();
+	if (p->on_rq)
+		return 0;
+
+	/*
+	 * Per __task_needs_rq_locked() we now have: !p->on_cpu and only hold
+	 * p->pi_lock, and can consider the task fully blocked.
+	 */
+
+	__return_instance_put(p->utask);
+	return 1;
+}
+
 static void return_instance_timer(struct timer_list *timer)
 {
 	struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+	if (utask->ri_work_pending)
+		return;
+
+	if (task_call_func(utask->task, return_instance_blocked, NULL))
+		return;
+
+	utask->ri_work_pending = true;
 	task_work_add(utask->task, &utask->ri_task_work, TWA_SIGNAL);
 }
Peter Zijlstra July 11, 2024, 4:06 p.m. UTC | #4
On Thu, Jul 11, 2024 at 05:55:42PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 11, 2024 at 05:00:54PM +0200, Peter Zijlstra wrote:
> 
> > Let me ponder that a little, I *can* make it work, but all 'solutions'
> > I've come up with so far are really rather vile.
> 
> This is the least horrible solution I could come up with...
> 
> ---
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -83,6 +83,7 @@ struct uprobe_task {
>  
>  	struct timer_list		ri_timer;
>  	struct callback_head		ri_task_work;
> +	bool				ri_work_pending;
>  	struct task_struct		*task;
>  };
>  
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1797,9 +1797,8 @@ void uprobe_free_utask(struct task_struc
>  	t->utask = NULL;
>  }
>  
> -static void return_instance_task_work(struct callback_head *head)
> +static void __return_instance_put(struct uprobe_task *utask)
>  {
> -	struct uprobe_task *utask = container_of(head, struct uprobe_task, ri_task_work);
>  	struct return_instance *ri;
>  
>  	for (ri = utask->return_instances; ri; ri = ri->next) {
> @@ -1815,9 +1814,43 @@ static void return_instance_task_work(st
>  	}
>  }
>  
> +static void return_instance_task_work(struct callback_head *head)
> +{
> +	struct uprobe_task *utask = container_of(head, struct uprobe_task, ri_task_work);
> +	utask->ri_work_pending = false;
> +	__return_instance_put(utask);
> +}
> +
> +static int return_instance_blocked(struct task_struct *p, void *arg)
> +{
> +	unsigned int state = READ_ONCE(p->__state);
> +
> +	if (state == TASK_RUNNING || state == TASK_WAKING)
> +		return 0;
> +
> +	smp_rmb();
> +	if (p->on_rq)
> +		return 0;
> +
> +	/*
> +	 * Per __task_needs_rq_locked() we now have: !p->on_cpu and only hold
> +	 * p->pi_lock, and can consider the task fully blocked.
> +	 */
> +
> +	__return_instance_put(p->utask);

PREEMPT_RT might not like this though, doing the full RI iteration under
a raw_spinlock_t...

I just can't think of anything saner just now. Oh well, let me go make
dinner, perhaps something will come to me.
Andrii Nakryiko July 12, 2024, 9:43 p.m. UTC | #5
+ bpf

On Thu, Jul 11, 2024 at 4:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> In order to put a bound on the uretprobe_srcu critical section, add a
> timer to uprobe_task. Upon every RI added or removed the timer is
> pushed forward to now + 1s. If the timer were ever to fire, it would
> convert the SRCU 'reference' to a refcount reference if possible.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/uprobes.h |    8 +++++
>  kernel/events/uprobes.c |   67 ++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 69 insertions(+), 6 deletions(-)
>
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -15,6 +15,7 @@
>  #include <linux/rbtree.h>
>  #include <linux/types.h>
>  #include <linux/wait.h>
> +#include <linux/timer.h>
>
>  struct vm_area_struct;
>  struct mm_struct;
> @@ -79,6 +80,10 @@ struct uprobe_task {
>         struct return_instance          *return_instances;
>         unsigned int                    depth;
>         unsigned int                    active_srcu_idx;
> +
> +       struct timer_list               ri_timer;
> +       struct callback_head            ri_task_work;
> +       struct task_struct              *task;
>  };
>
>  struct return_instance {
> @@ -86,7 +91,8 @@ struct return_instance {
>         unsigned long           func;
>         unsigned long           stack;          /* stack pointer */
>         unsigned long           orig_ret_vaddr; /* original return address */
> -       bool                    chained;        /* true, if instance is nested */
> +       u8                      chained;        /* true, if instance is nested */
> +       u8                      has_ref;

Why bool -> u8 switch? You don't touch chained, so why change its
type? And for has_ref you interchangeably use 0 and true for the same
field. Let's stick to bool as there is nothing wrong with it?

>         int                     srcu_idx;
>
>         struct return_instance  *next;          /* keep as stack */

[...]

> @@ -1822,13 +1864,20 @@ static int dup_utask(struct task_struct
>                         return -ENOMEM;
>
>                 *n = *o;
> -               __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
> +               if (n->uprobe) {
> +                       if (n->has_ref)
> +                               get_uprobe(n->uprobe);
> +                       else
> +                               __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
> +               }
>                 n->next = NULL;
>
>                 *p = n;
>                 p = &n->next;
>                 n_utask->depth++;
>         }
> +       if (n_utask->return_instances)
> +               mod_timer(&n_utask->ri_timer, jiffies + HZ);

let's add #define for HZ, so it's adjusted in just one place (instead
of 3 as it is right now)

Also, we can have up to 64 levels of uretprobe nesting, so,
technically, the user can cause a delay of 64 seconds in total. Maybe
let's use something smaller than a full second? After all, if the
user-space function has high latency, then this refcount congestion is
much less of a problem. I'd set it to something like 50-100 ms for
starters.
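
A minimal sketch of both suggestions (the macro name and the 100 ms value
are placeholders, not taken from the patch):

	/* One knob shared by all three mod_timer() call sites. */
	#define RI_TIMER_INTERVAL	msecs_to_jiffies(100)

	mod_timer(&utask->ri_timer, jiffies + RI_TIMER_INTERVAL);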

>
>         return 0;
>  }
> @@ -1967,6 +2016,7 @@ static void prepare_uretprobe(struct upr
>
>         ri->srcu_idx = __srcu_read_lock(&uretprobes_srcu);
>         ri->uprobe = uprobe;
> +       ri->has_ref = 0;
>         ri->func = instruction_pointer(regs);
>         ri->stack = user_stack_pointer(regs);
>         ri->orig_ret_vaddr = orig_ret_vaddr;
> @@ -1976,6 +2026,8 @@ static void prepare_uretprobe(struct upr
>         ri->next = utask->return_instances;
>         utask->return_instances = ri;
>
> +       mod_timer(&utask->ri_timer, jiffies + HZ);
> +
>         return;
>
>  err_mem:
> @@ -2204,6 +2256,9 @@ handle_uretprobe_chain(struct return_ins
>         struct uprobe *uprobe = ri->uprobe;
>         struct uprobe_consumer *uc;
>
> +       if (!uprobe)
> +               return;
> +
>         guard(srcu)(&uprobes_srcu);
>
>         for_each_consumer_rcu(uc, uprobe->consumers) {
> @@ -2250,8 +2305,10 @@ static void handle_trampoline(struct pt_
>
>                 instruction_pointer_set(regs, ri->orig_ret_vaddr);
>                 do {
> -                       if (valid)
> +                       if (valid) {
>                                 handle_uretprobe_chain(ri, regs);
> +                               mod_timer(&utask->ri_timer, jiffies + HZ);
> +                       }
>                         ri = free_ret_instance(ri);
>                         utask->depth--;
>                 } while (ri != next);
>
>
Peter Zijlstra July 15, 2024, 11:41 a.m. UTC | #6
On Fri, Jul 12, 2024 at 02:43:52PM -0700, Andrii Nakryiko wrote:
> + bpf
> 
> On Thu, Jul 11, 2024 at 4:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > In order to put a bound on the uretprobe_srcu critical section, add a
> > timer to uprobe_task. Upon every RI added or removed the timer is
> > pushed forward to now + 1s. If the timer were ever to fire, it would
> > convert the SRCU 'reference' to a refcount reference if possible.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  include/linux/uprobes.h |    8 +++++
> >  kernel/events/uprobes.c |   67 ++++++++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 69 insertions(+), 6 deletions(-)
> >
> > --- a/include/linux/uprobes.h
> > +++ b/include/linux/uprobes.h
> > @@ -15,6 +15,7 @@
> >  #include <linux/rbtree.h>
> >  #include <linux/types.h>
> >  #include <linux/wait.h>
> > +#include <linux/timer.h>
> >
> >  struct vm_area_struct;
> >  struct mm_struct;
> > @@ -79,6 +80,10 @@ struct uprobe_task {
> >         struct return_instance          *return_instances;
> >         unsigned int                    depth;
> >         unsigned int                    active_srcu_idx;
> > +
> > +       struct timer_list               ri_timer;
> > +       struct callback_head            ri_task_work;
> > +       struct task_struct              *task;
> >  };
> >
> >  struct return_instance {
> > @@ -86,7 +91,8 @@ struct return_instance {
> >         unsigned long           func;
> >         unsigned long           stack;          /* stack pointer */
> >         unsigned long           orig_ret_vaddr; /* original return address */
> > -       bool                    chained;        /* true, if instance is nested */
> > +       u8                      chained;        /* true, if instance is nested */
> > +       u8                      has_ref;
> 
> Why bool -> u8 switch? You don't touch chained, so why change its
> type? And for has_ref you interchangeably use 0 and true for the same
> field. Let's stick to bool as there is nothing wrong with it?

sizeof(_Bool) is implementation defined. It is 1 for x86_64, but there
are platforms where it ends up being 4 (some PowerPC ABIs among others).
I didn't want to grow this structure for no reason.
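
The difference is easy to see with a trivial userspace check on a given ABI,
for example:

	#include <stdio.h>

	int main(void)
	{
		/* 1 on x86-64; 4 on some PowerPC ABIs, per the above. */
		printf("sizeof(_Bool) = %zu\n", sizeof(_Bool));
		return 0;
	}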

> >         int                     srcu_idx;
> >
> >         struct return_instance  *next;          /* keep as stack */
> 
> [...]
> 
> > @@ -1822,13 +1864,20 @@ static int dup_utask(struct task_struct
> >                         return -ENOMEM;
> >
> >                 *n = *o;
> > -               __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
> > +               if (n->uprobe) {
> > +                       if (n->has_ref)
> > +                               get_uprobe(n->uprobe);
> > +                       else
> > +                               __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
> > +               }
> >                 n->next = NULL;
> >
> >                 *p = n;
> >                 p = &n->next;
> >                 n_utask->depth++;
> >         }
> > +       if (n_utask->return_instances)
> > +               mod_timer(&n_utask->ri_timer, jiffies + HZ);
> 
> let's add #define for HZ, so it's adjusted in just one place (instead
> of 3 as it is right now)

Can do I suppose.

> Also, we can have up to 64 levels of uretprobe nesting, so,
> technically, the user can cause a delay of 64 seconds in total. Maybe
> let's use something smaller than a full second? After all, if the
> user-space function has high latency, then this refcount congestion is
> much less of a problem. I'd set it to something like 50-100 ms for
> starters.

Before you know it we'll have a sysctl :/ But sure, we can do something
shorter.
Andrii Nakryiko July 15, 2024, 5:34 p.m. UTC | #7
On Mon, Jul 15, 2024 at 4:41 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jul 12, 2024 at 02:43:52PM -0700, Andrii Nakryiko wrote:
> > + bpf
> >
> > On Thu, Jul 11, 2024 at 4:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > In order to put a bound on the uretprobe_srcu critical section, add a
> > > timer to uprobe_task. Upon every RI added or removed the timer is
> > > pushed forward to now + 1s. If the timer were ever to fire, it would
> > > convert the SRCU 'reference' to a refcount reference if possible.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > >  include/linux/uprobes.h |    8 +++++
> > >  kernel/events/uprobes.c |   67 ++++++++++++++++++++++++++++++++++++++++++++----
> > >  2 files changed, 69 insertions(+), 6 deletions(-)
> > >
> > > --- a/include/linux/uprobes.h
> > > +++ b/include/linux/uprobes.h
> > > @@ -15,6 +15,7 @@
> > >  #include <linux/rbtree.h>
> > >  #include <linux/types.h>
> > >  #include <linux/wait.h>
> > > +#include <linux/timer.h>
> > >
> > >  struct vm_area_struct;
> > >  struct mm_struct;
> > > @@ -79,6 +80,10 @@ struct uprobe_task {
> > >         struct return_instance          *return_instances;
> > >         unsigned int                    depth;
> > >         unsigned int                    active_srcu_idx;
> > > +
> > > +       struct timer_list               ri_timer;
> > > +       struct callback_head            ri_task_work;
> > > +       struct task_struct              *task;
> > >  };
> > >
> > >  struct return_instance {
> > > @@ -86,7 +91,8 @@ struct return_instance {
> > >         unsigned long           func;
> > >         unsigned long           stack;          /* stack pointer */
> > >         unsigned long           orig_ret_vaddr; /* original return address */
> > > -       bool                    chained;        /* true, if instance is nested */
> > > +       u8                      chained;        /* true, if instance is nested */
> > > +       u8                      has_ref;
> >
> > Why bool -> u8 switch? You don't touch chained, so why change its
> > type? And for has_ref you interchangeably use 0 and true for the same
> > field. Let's stick to bool as there is nothing wrong with it?
>
> sizeof(_Bool) is implementation defined. It is 1 for x86_64, but there
> are platforms where it ends up being 4 (some PowerPC ABIs among others).
> I didn't want to grow this structure for no reason.

There are tons of bools in the kernel, surprised that we (kernel
makefiles) don't do anything on PowerPC to keep it consistent with the
rest of the world. Oh well, it just kind of looks off when there is a
mix of 0 and true used for the same field.

>
> > >         int                     srcu_idx;
> > >
> > >         struct return_instance  *next;          /* keep as stack */
> >
> > [...]
> >
> > > @@ -1822,13 +1864,20 @@ static int dup_utask(struct task_struct
> > >                         return -ENOMEM;
> > >
> > >                 *n = *o;
> > > -               __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
> > > +               if (n->uprobe) {
> > > +                       if (n->has_ref)
> > > +                               get_uprobe(n->uprobe);
> > > +                       else
> > > +                               __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
> > > +               }
> > >                 n->next = NULL;
> > >
> > >                 *p = n;
> > >                 p = &n->next;
> > >                 n_utask->depth++;
> > >         }
> > > +       if (n_utask->return_instances)
> > > +               mod_timer(&n_utask->ri_timer, jiffies + HZ);
> >
> > let's add #define for HZ, so it's adjusted in just one place (instead
> > of 3 as it is right now)
>
> Can do I suppose.

thanks!

>
> > Also, we can have up to 64 levels of uretprobe nesting, so,
> > technically, the user can cause a delay of 64 seconds in total. Maybe
> > let's use something smaller than a full second? After all, if the
> > user-space function has high latency, then this refcount congestion is
> > much less of a problem. I'd set it to something like 50-100 ms for
> > starters.
>
> Before you know it we'll have a sysctl :/ But sure, we can do something
> shorter.

:) let's hope we won't need sysctl (I don't think we will, FWIW)

Patch

--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -15,6 +15,7 @@ 
 #include <linux/rbtree.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/timer.h>
 
 struct vm_area_struct;
 struct mm_struct;
@@ -79,6 +80,10 @@  struct uprobe_task {
 	struct return_instance		*return_instances;
 	unsigned int			depth;
 	unsigned int			active_srcu_idx;
+
+	struct timer_list		ri_timer;
+	struct callback_head		ri_task_work;
+	struct task_struct		*task;
 };
 
 struct return_instance {
@@ -86,7 +91,8 @@  struct return_instance {
 	unsigned long		func;
 	unsigned long		stack;		/* stack pointer */
 	unsigned long		orig_ret_vaddr; /* original return address */
-	bool			chained;	/* true, if instance is nested */
+	u8			chained;	/* true, if instance is nested */
+	u8			has_ref;
 	int			srcu_idx;
 
 	struct return_instance	*next;		/* keep as stack */
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1761,7 +1761,12 @@  unsigned long uprobe_get_trap_addr(struc
 static struct return_instance *free_ret_instance(struct return_instance *ri)
 {
 	struct return_instance *next = ri->next;
-	__srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
+	if (ri->uprobe) {
+		if (ri->has_ref)
+			put_uprobe(ri->uprobe);
+		else
+			__srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
+	}
 	kfree(ri);
 	return next;
 }
@@ -1785,11 +1790,48 @@  void uprobe_free_utask(struct task_struc
 	while (ri)
 		ri = free_ret_instance(ri);
 
+	timer_delete_sync(&utask->ri_timer);
+	task_work_cancel(utask->task, &utask->ri_task_work);
 	xol_free_insn_slot(t);
 	kfree(utask);
 	t->utask = NULL;
 }
 
+static void return_instance_task_work(struct callback_head *head)
+{
+	struct uprobe_task *utask = container_of(head, struct uprobe_task, ri_task_work);
+	struct return_instance *ri;
+
+	for (ri = utask->return_instances; ri; ri = ri->next) {
+		if (!ri->uprobe)
+			continue;
+		if (ri->has_ref)
+			continue;
+		if (refcount_inc_not_zero(&ri->uprobe->ref))
+			ri->has_ref = true;
+		else
+			ri->uprobe = NULL;
+		__srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
+	}
+}
+
+static void return_instance_timer(struct timer_list *timer)
+{
+	struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+	task_work_add(utask->task, &utask->ri_task_work, TWA_SIGNAL);
+}
+
+static struct uprobe_task *alloc_utask(struct task_struct *task)
+{
+	struct uprobe_task *utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+	if (!utask)
+		return NULL;
+	timer_setup(&utask->ri_timer, return_instance_timer, 0);
+	init_task_work(&utask->ri_task_work, return_instance_task_work);
+	utask->task = task;
+	return utask;
+}
+
 /*
  * Allocate a uprobe_task object for the task if necessary.
  * Called when the thread hits a breakpoint.
@@ -1801,7 +1843,7 @@  void uprobe_free_utask(struct task_struc
 static struct uprobe_task *get_utask(void)
 {
 	if (!current->utask)
-		current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+		current->utask = alloc_utask(current);
 	return current->utask;
 }
 
@@ -1810,7 +1852,7 @@  static int dup_utask(struct task_struct
 	struct uprobe_task *n_utask;
 	struct return_instance **p, *o, *n;
 
-	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+	n_utask = alloc_utask(t);
 	if (!n_utask)
 		return -ENOMEM;
 	t->utask = n_utask;
@@ -1822,13 +1864,20 @@  static int dup_utask(struct task_struct
 			return -ENOMEM;
 
 		*n = *o;
-		__srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
+		if (n->uprobe) {
+			if (n->has_ref)
+				get_uprobe(n->uprobe);
+			else
+				__srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
+		}
 		n->next = NULL;
 
 		*p = n;
 		p = &n->next;
 		n_utask->depth++;
 	}
+	if (n_utask->return_instances)
+		mod_timer(&n_utask->ri_timer, jiffies + HZ);
 
 	return 0;
 }
@@ -1967,6 +2016,7 @@  static void prepare_uretprobe(struct upr
 
 	ri->srcu_idx = __srcu_read_lock(&uretprobes_srcu);
 	ri->uprobe = uprobe;
+	ri->has_ref = 0;
 	ri->func = instruction_pointer(regs);
 	ri->stack = user_stack_pointer(regs);
 	ri->orig_ret_vaddr = orig_ret_vaddr;
@@ -1976,6 +2026,8 @@  static void prepare_uretprobe(struct upr
 	ri->next = utask->return_instances;
 	utask->return_instances = ri;
 
+	mod_timer(&utask->ri_timer, jiffies + HZ);
+
 	return;
 
 err_mem:
@@ -2204,6 +2256,9 @@  handle_uretprobe_chain(struct return_ins
 	struct uprobe *uprobe = ri->uprobe;
 	struct uprobe_consumer *uc;
 
+	if (!uprobe)
+		return;
+
 	guard(srcu)(&uprobes_srcu);
 
 	for_each_consumer_rcu(uc, uprobe->consumers) {
@@ -2250,8 +2305,10 @@  static void handle_trampoline(struct pt_
 
 		instruction_pointer_set(regs, ri->orig_ret_vaddr);
 		do {
-			if (valid)
+			if (valid) {
 				handle_uretprobe_chain(ri, regs);
+				mod_timer(&utask->ri_timer, jiffies + HZ);
+			}
 			ri = free_ret_instance(ri);
 			utask->depth--;
 		} while (ri != next);