
[RFC,v3,09/13] uprobes: SRCU-protect uretprobe lifetime (with timeout)

Message ID 20240813042917.506057-10-andrii@kernel.org (mailing list archive)
State Handled Elsewhere
Series uprobes: RCU-protected hot path optimizations

Checks

Context Check Description
netdev/tree_selection success Not a local patch
bpf/vmtest-bpf-PR fail merge-conflict
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-14 pending Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / veristat

Commit Message

Andrii Nakryiko Aug. 13, 2024, 4:29 a.m. UTC
Avoid taking refcount on uprobe in prepare_uretprobe(), instead take
uretprobe-specific SRCU lock and keep it active as kernel transfers
control back to user space.

Given that we can't rely on user space returning from the traced
function within a reasonable time period, we need to make sure not to
keep the SRCU lock active for too long. To that end, we employ a timer
callback which terminates the SRCU lock region after a predefined
timeout (currently set to 100ms) and transfers the underlying struct
uprobe's lifetime protection to refcounting.

Falling back to less scalable refcounting after 100ms is a fine
tradeoff from uretprobe's scalability and performance perspective,
because uretprobing long-running user functions inherently doesn't run
into scalability issues (such functions simply don't fire frequently
enough to cause noticeable performance or scalability problems).

The overall trick is in ensuring synchronization between the current
thread and the timer callback, which can fire concurrently in another
context. To keep the logic complications to a minimum, we add an
hprobe wrapper which hides all the racy synchronization details behind
a small number of basic helpers: hprobe_expire() and a matching pair
of hprobe_consume() and hprobe_finalize(). Other than that, the current
thread's logic stays the same, as the timer callback cannot modify
return_instance state (or add new/remove old return_instances). It only
takes care of SRCU unlocking and uprobe refcounting, which is hidden
from the higher-level uretprobe handling logic.

We use an atomic xchg() in hprobe_consume(), which is called from the
performance-critical handle_uretprobe_chain() function running in the
current context. When uncontended, this xchg() doesn't seem to hurt
performance, as there are no other competing CPUs fighting for the same
cache line. We also mark struct return_instance as
____cacheline_aligned to ensure no false sharing can happen.

One more technical note: we need to make sure that the list of return
instances can be safely traversed under RCU from the timer callback, so
we delay return_instance freeing with kfree_rcu() and make sure that
all list modifications use RCU-aware operations.

Also, given that the SRCU lock survives the transition from kernel to
user space and back, we need to use the lower-level __srcu_read_lock()
and __srcu_read_unlock() to avoid lockdep complaining.
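
Schematically, the lifetime handling added here looks roughly like this
(a simplified sketch of the helpers introduced below, eliding error
handling, lost-race cleanup, and the __UPROBE_DEAD marker):

	/* prepare_uretprobe(): current task, before returning to user space */
	srcu_idx = __srcu_read_lock(&uretprobes_srcu);
	hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);   /* leased == uprobe */
	rcu_assign_pointer(utask->return_instances, ri);
	if (!timer_pending(&utask->ri_timer))
		mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);

	/* ri_timer() -> hprobe_expire(): softirq, ~100ms later, if still leased */
	if (try_get_uprobe(uprobe) &&
	    cmpxchg(&hprobe->leased, uprobe, NULL) == uprobe)
		__srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);

	/* uprobe_handle_trampoline(): current task, when the function returns */
	uprobe = hprobe_consume(&ri->hprobe, &under_rcu);    /* xchg(&leased, NULL) */
	handle_uretprobe_chain(ri, uprobe, regs);
	hprobe_finalize(&ri->hprobe, uprobe, under_rcu);     /* SRCU unlock or put_uprobe() */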

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/uprobes.h |  49 ++++++-
 kernel/events/uprobes.c | 294 ++++++++++++++++++++++++++++++++++------
 2 files changed, 301 insertions(+), 42 deletions(-)

Comments

Oleg Nesterov Aug. 19, 2024, 1:41 p.m. UTC | #1
On 08/12, Andrii Nakryiko wrote:
>
> Avoid taking refcount on uprobe in prepare_uretprobe(), instead take
> uretprobe-specific SRCU lock and keep it active as kernel transfers
> control back to user space.
...
>  include/linux/uprobes.h |  49 ++++++-
>  kernel/events/uprobes.c | 294 ++++++++++++++++++++++++++++++++++------
>  2 files changed, 301 insertions(+), 42 deletions(-)

Oh. To be honest I don't like this patch.

I would like to know what other reviewers think, but to me it adds too many
complications that I can't even fully understand...

And how much does it help performance-wise?

I'll try to take another look, and I'll try to think about other approaches,
not that I have something better in mind...


But let's forget this patch for the moment. The next one adds even more
complications, and I think it doesn't make sense.

As I have already mentioned in the previous discussions, we can simply kill
utask->active_uprobe. And utask->auprobe.

So can't we start with the patch below? On top of your 08/13. It doesn't kill
utask->auprobe yet, this needs a bit more trivial changes.

What do you think?

Oleg.

-------------------------------------------------------------------------------
From d7cb674eb6f7bb891408b2b6a5fb872a6c2f0f6c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 19 Aug 2024 15:34:55 +0200
Subject: [RFC PATCH] uprobe: kill uprobe_task->active_uprobe

Untested, not for inclusion yet, and I need to split it into 2 changes.
It does 2 simple things:

	1. active_uprobe != NULL is possible if and only if utask->state != 0,
	   so it turns the active_uprobe checks into the utask->state checks.

	2. handle_singlestep() doesn't really need ->active_uprobe, it only
	   needs uprobe->arch which is "const" after prepare_uprobe().

	   So this patch adds the new "arch_uprobe uarch" member into utask
	   and changes pre_ssout() to do memcpy(&utask->uarch, &uprobe->arch).
---
 include/linux/uprobes.h |  2 +-
 kernel/events/uprobes.c | 37 +++++++++++--------------------------
 2 files changed, 12 insertions(+), 27 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 3a3154b74fe0..df6f3dab032c 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -56,6 +56,7 @@ struct uprobe_task {
 
 	union {
 		struct {
+			struct arch_uprobe	uarch;
 			struct arch_uprobe_task	autask;
 			unsigned long		vaddr;
 		};
@@ -66,7 +67,6 @@ struct uprobe_task {
 		};
 	};
 
-	struct uprobe			*active_uprobe;
 	unsigned long			xol_vaddr;
 
 	struct arch_uprobe              *auprobe;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index acc73c1bc54c..9689b557a5cf 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1721,7 +1721,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
 {
 	struct uprobe_task *utask = current->utask;
 
-	if (unlikely(utask && utask->active_uprobe))
+	if (unlikely(utask && utask->state))
 		return utask->vaddr;
 
 	return instruction_pointer(regs);
@@ -1747,9 +1747,6 @@ void uprobe_free_utask(struct task_struct *t)
 	if (!utask)
 		return;
 
-	if (utask->active_uprobe)
-		put_uprobe(utask->active_uprobe);
-
 	ri = utask->return_instances;
 	while (ri)
 		ri = free_ret_instance(ri);
@@ -1965,14 +1962,9 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
 	if (!utask)
 		return -ENOMEM;
 
-	if (!try_get_uprobe(uprobe))
-		return -EINVAL;
-
 	xol_vaddr = xol_get_insn_slot(uprobe);
-	if (!xol_vaddr) {
-		err = -ENOMEM;
-		goto err_out;
-	}
+	if (!xol_vaddr)
+		return -ENOMEM;
 
 	utask->xol_vaddr = xol_vaddr;
 	utask->vaddr = bp_vaddr;
@@ -1980,15 +1972,12 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
 	err = arch_uprobe_pre_xol(&uprobe->arch, regs);
 	if (unlikely(err)) {
 		xol_free_insn_slot(current);
-		goto err_out;
+		return err;
 	}
 
-	utask->active_uprobe = uprobe;
+	memcpy(&utask->uarch, &uprobe->arch, sizeof(utask->uarch));
 	utask->state = UTASK_SSTEP;
 	return 0;
-err_out:
-	put_uprobe(uprobe);
-	return err;
 }
 
 /*
@@ -2005,7 +1994,7 @@ bool uprobe_deny_signal(void)
 	struct task_struct *t = current;
 	struct uprobe_task *utask = t->utask;
 
-	if (likely(!utask || !utask->active_uprobe))
+	if (likely(!utask || !utask->state))
 		return false;
 
 	WARN_ON_ONCE(utask->state != UTASK_SSTEP);
@@ -2313,19 +2302,15 @@ static void handle_swbp(struct pt_regs *regs)
  */
 static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
 {
-	struct uprobe *uprobe;
 	int err = 0;
 
-	uprobe = utask->active_uprobe;
 	if (utask->state == UTASK_SSTEP_ACK)
-		err = arch_uprobe_post_xol(&uprobe->arch, regs);
+		err = arch_uprobe_post_xol(&utask->uarch, regs);
 	else if (utask->state == UTASK_SSTEP_TRAPPED)
-		arch_uprobe_abort_xol(&uprobe->arch, regs);
+		arch_uprobe_abort_xol(&utask->uarch, regs);
 	else
 		WARN_ON_ONCE(1);
 
-	put_uprobe(uprobe);
-	utask->active_uprobe = NULL;
 	utask->state = UTASK_RUNNING;
 	xol_free_insn_slot(current);
 
@@ -2342,7 +2327,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
 /*
  * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
  * allows the thread to return from interrupt. After that handle_swbp()
- * sets utask->active_uprobe.
+ * sets utask->state != 0.
  *
  * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
  * and allows the thread to return from interrupt.
@@ -2357,7 +2342,7 @@ void uprobe_notify_resume(struct pt_regs *regs)
 	clear_thread_flag(TIF_UPROBE);
 
 	utask = current->utask;
-	if (utask && utask->active_uprobe)
+	if (utask && utask->state)
 		handle_singlestep(utask, regs);
 	else
 		handle_swbp(regs);
@@ -2388,7 +2373,7 @@ int uprobe_post_sstep_notifier(struct pt_regs *regs)
 {
 	struct uprobe_task *utask = current->utask;
 
-	if (!current->mm || !utask || !utask->active_uprobe)
+	if (!current->mm || !utask || !utask->state)
 		/* task is currently not uprobed */
 		return 0;
Andrii Nakryiko Aug. 19, 2024, 8:34 p.m. UTC | #2
On Mon, Aug 19, 2024 at 6:41 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 08/12, Andrii Nakryiko wrote:
> >
> > Avoid taking refcount on uprobe in prepare_uretprobe(), instead take
> > uretprobe-specific SRCU lock and keep it active as kernel transfers
> > control back to user space.
> ...
> >  include/linux/uprobes.h |  49 ++++++-
> >  kernel/events/uprobes.c | 294 ++++++++++++++++++++++++++++++++++------
> >  2 files changed, 301 insertions(+), 42 deletions(-)
>
> Oh. To be honest I don't like this patch.
>
> I would like to know what other reviewers think, but to me it adds too many
> complications that I can't even fully understand...

Which parts? The atomic xchg() and cmpxchg() parts? What exactly do
you feel like you don't fully understand?

>
> And how much does it help performance-wise?

A lot, as we increase uprobe parallelism. Here's a subset of
benchmarks for 1-4, 8, 16, 32, 64, and 80 threads firing uretprobes,
with and without this SRCU change (both configurations include all the
other changes, including the lockless VMA lookup). The difference is
noticeable already with just two competing CPUs/threads, and it just
gets much worse from there.

Of course in production you shouldn't come close to such rates of
uprobe/uretprobe firing, so this is definitely a microbenchmark
emphasizing the sharing between CPUs, but it still adds up. And we do
have production use cases that would like to fire uprobes at 100K+ per
second rates.

WITH SRCU for uretprobes
========================
uretprobe-nop         ( 1 cpus):    1.968 ± 0.001M/s  (  1.968M/s/cpu)
uretprobe-nop         ( 2 cpus):    3.739 ± 0.003M/s  (  1.869M/s/cpu)
uretprobe-nop         ( 3 cpus):    5.616 ± 0.003M/s  (  1.872M/s/cpu)
uretprobe-nop         ( 4 cpus):    7.286 ± 0.002M/s  (  1.822M/s/cpu)
uretprobe-nop         ( 8 cpus):   13.657 ± 0.007M/s  (  1.707M/s/cpu)
uretprobe-nop         (32 cpus):   45.305 ± 0.066M/s  (  1.416M/s/cpu)
uretprobe-nop         (64 cpus):   42.390 ± 0.922M/s  (  0.662M/s/cpu)
uretprobe-nop         (80 cpus):   47.554 ± 2.411M/s  (  0.594M/s/cpu)

WITHOUT SRCU for uretprobes
===========================
uretprobe-nop         ( 1 cpus):    2.197 ± 0.002M/s  (  2.197M/s/cpu)
uretprobe-nop         ( 2 cpus):    3.325 ± 0.001M/s  (  1.662M/s/cpu)
uretprobe-nop         ( 3 cpus):    4.129 ± 0.002M/s  (  1.376M/s/cpu)
uretprobe-nop         ( 4 cpus):    6.180 ± 0.003M/s  (  1.545M/s/cpu)
uretprobe-nop         ( 8 cpus):    7.323 ± 0.005M/s  (  0.915M/s/cpu)
uretprobe-nop         (16 cpus):    6.943 ± 0.005M/s  (  0.434M/s/cpu)
uretprobe-nop         (32 cpus):    5.931 ± 0.014M/s  (  0.185M/s/cpu)
uretprobe-nop         (64 cpus):    5.145 ± 0.003M/s  (  0.080M/s/cpu)
uretprobe-nop         (80 cpus):    4.925 ± 0.005M/s  (  0.062M/s/cpu)

>
> I'll try to take another look, and I'll try to think about other approaches,
> not that I have something better in mind...

Ok.

>
>
> But let's forget this patch for the moment. The next one adds even more
> complications, and I think it doesn't make sense.
>

"Even more complications" is a bit of an overstatement. It just
applies everything we do for uretprobes in this patch to a very
straightforward single-stepped case.

> As I have already mentioned in the previous discussions, we can simply kill
> utask->active_uprobe. And utask->auprobe.

I don't have anything against that, in principle, but let's benchmark
and test that thoroughly. I'm a bit uneasy about the possibility that
some arch-specific code will do container_of() on this arch_uprobe in
order to get to uprobe; we'd need to audit all the code to make sure
that can't happen. Also, it's a bit unfortunate that we have to assume
that struct arch_uprobe is small on all architectures, and that there
is no code that assumes it can't be moved, etc., etc. (I also don't get
why you need memcpy.)

>
> So can't we start with the patch below? On top of your 08/13. It doesn't kill
> utask->auprobe yet, this needs a bit more trivial changes.
>
> What do you think?

I think that the single-stepped case isn't the main use case (typically
a uprobe/uretprobe will be installed on a nop or push %rbp, both of
which are emulated). uretprobes, though, are the main use case (along
with optimized entry uprobes). So what we do about the single-stepped
case is a bit secondary (for me, looking at production use cases).

But we do need to do something with uretprobes first and foremost.

>
> Oleg.
>
> -------------------------------------------------------------------------------
> From d7cb674eb6f7bb891408b2b6a5fb872a6c2f0f6c Mon Sep 17 00:00:00 2001
> From: Oleg Nesterov <oleg@redhat.com>
> Date: Mon, 19 Aug 2024 15:34:55 +0200
> Subject: [RFC PATCH] uprobe: kill uprobe_task->active_uprobe
>
> Untested, not for inclusion yet, and I need to split it into 2 changes.
> It does 2 simple things:
>
>         1. active_uprobe != NULL is possible if and only if utask->state != 0,
>            so it turns the active_uprobe checks into the utask->state checks.
>
>         2. handle_singlestep() doesn't really need ->active_uprobe, it only
>            needs uprobe->arch which is "const" after prepare_uprobe().
>
>            So this patch adds the new "arch_uprobe uarch" member into utask
>            and changes pre_ssout() to do memcpy(&utask->uarch, &uprobe->arch).
> ---
>  include/linux/uprobes.h |  2 +-
>  kernel/events/uprobes.c | 37 +++++++++++--------------------------
>  2 files changed, 12 insertions(+), 27 deletions(-)

[...]
Oleg Nesterov Aug. 20, 2024, 3:05 p.m. UTC | #3
On 08/19, Andrii Nakryiko wrote:
>
> On Mon, Aug 19, 2024 at 6:41 AM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > On 08/12, Andrii Nakryiko wrote:
> > >
> > > Avoid taking refcount on uprobe in prepare_uretprobe(), instead take
> > > uretprobe-specific SRCU lock and keep it active as kernel transfers
> > > control back to user space.
> > ...
> > >  include/linux/uprobes.h |  49 ++++++-
> > >  kernel/events/uprobes.c | 294 ++++++++++++++++++++++++++++++++++------
> > >  2 files changed, 301 insertions(+), 42 deletions(-)
> >
> > Oh. To be honest I don't like this patch.
> >
> > I would like to know what other reviewers think, but to me it adds too many
> > complications that I can't even fully understand...
>
> Which parts? The atomic xchg() and cmpxchg() parts? What exactly do
> you feel like you don't fully understand?

Heh, everything looks too complex for me ;)

Say, hprobe_expire(). It is also called by ri_timer() in softirq context,
right? And it does

	/* We lost the race, undo our refcount bump. It can drop to zero. */
	put_uprobe(uprobe);

How so? If the refcount goes to zero, put_uprobe() does mutex_lock(),
but it must not sleep in softirq context.


Or, prepare_uretprobe() which does

	rcu_assign_pointer(utask->return_instances, ri);

	if (!timer_pending(&utask->ri_timer))
		mod_timer(&utask->ri_timer, ...);

Suppose that the timer was pending and it was fired right before
rcu_assign_pointer(). What guarantees that prepare_uretprobe() will see
timer_pending() == false?

rcu_assign_pointer()->smp_store_release() is a one-way barrier. This
timer_pending() check may appear to happen before rcu_assign_pointer()
completes.

In this (yes, theoretical) case ri_timer() can miss the new return_instance,
while prepare_uretprobe() can miss the necessary mod_timer(). I think this
needs another mb() in between.
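
IOW, roughly this interleaving (the timer_pending() load on the
prepare_uretprobe() side is allowed to be satisfied before the release
store publishes the new entry):

	CPU 0 (prepare_uretprobe)              CPU 1 (ri_timer)
	-------------------------              ----------------
	timer_pending() load returns true
	  (reordered before the store below)
	                                       timer fires, becomes !pending
	                                       walks return_instances, does
	                                         not see the new ri
	rcu_assign_pointer(return_instances, ri)
	skips mod_timer()
	/* the new ri is never visited by ri_timer() */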


And I can't convince myself hprobe_expire() is correct... OK, I don't
fully understand the logic, but why data_race(READ_ONCE(hprobe->leased)) ?
READ_ONCE() should be enough in this case?


> > As I have already mentioned in the previous discussions, we can simply kill
> > utask->active_uprobe. And utask->auprobe.
>
> I don't have anything against that, in principle, but let's benchmark
> and test that thoroughly. I'm a bit uneasy about the possibility that
> some arch-specific code will do container_of() on this arch_uprobe in
> order to get to uprobe,

Well, struct uprobe is not "exported", the arch-specific code can't do this.

Oleg.
Andrii Nakryiko Aug. 20, 2024, 6:01 p.m. UTC | #4
On Tue, Aug 20, 2024 at 8:06 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 08/19, Andrii Nakryiko wrote:
> >
> > On Mon, Aug 19, 2024 at 6:41 AM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > On 08/12, Andrii Nakryiko wrote:
> > > >
> > > > Avoid taking refcount on uprobe in prepare_uretprobe(), instead take
> > > > uretprobe-specific SRCU lock and keep it active as kernel transfers
> > > > control back to user space.
> > > ...
> > > >  include/linux/uprobes.h |  49 ++++++-
> > > >  kernel/events/uprobes.c | 294 ++++++++++++++++++++++++++++++++++------
> > > >  2 files changed, 301 insertions(+), 42 deletions(-)
> > >
> > > Oh. To be honest I don't like this patch.
> > >
> > > I would like to know what other reviewers think, but to me it adds too many
> > > complications that I can't even fully understand...
> >
> > Which parts? The atomic xchg() and cmpxchg() parts? What exactly do
> > you feel like you don't fully understand?
>
> Heh, everything looks too complex for me ;)

Well, the best code is no code. But I'm not doing this just for fun,
so I'm happy with the simplest solution *that works*.

>
> Say, hprobe_expire(). It is also called by ri_timer() in softirq context,
> right? And it does
>
>         /* We lost the race, undo our refcount bump. It can drop to zero. */
>         put_uprobe(uprobe);
>
> How so? If the refcount goes to zero, put_uprobe() does mutex_lock(),
> but it must not sleep in softirq context.
>

Now we are talking about specific issues, thank you! It's hard to
discuss "I don't like".

Yes, I think you are right and it is certainly a problem with
put_uprobe() that it can't be called from softirq (just having to
remember that is error-prone, as is evidenced by me forgetting about
this issue).

It's easy enough to solve. We can either schedule work from the timer
callback (and that will solve this particular issue only), or we can
teach put_uprobe() to schedule a work item if it drops the refcount to
zero from softirq or another restricted context.

I vote for making put_uprobe() flexible in this regard: add a
work_struct to struct uprobe and schedule all of this to be done from
a workqueue callback. WDYT?
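
Roughly along these lines (an untested sketch; uprobe_free() here is a
hypothetical helper holding the current tail of put_uprobe(), and
struct uprobe grows a work_struct member):

	static void uprobe_free_deferred(struct work_struct *work)
	{
		struct uprobe *uprobe = container_of(work, struct uprobe, work);

		/* process context, so delayed_uprobe_lock etc. are fine here */
		uprobe_free(uprobe);
	}

	static void put_uprobe(struct uprobe *uprobe)
	{
		if (!refcount_dec_and_test(&uprobe->ref))
			return;

		if (!in_task()) {
			/* softirq (e.g. ri_timer()) or other atomic context */
			INIT_WORK(&uprobe->work, uprobe_free_deferred);
			schedule_work(&uprobe->work);
			return;
		}

		uprobe_free(uprobe);
	}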

>
> Or, prepare_uretprobe() which does
>
>         rcu_assign_pointer(utask->return_instances, ri);
>
>         if (!timer_pending(&utask->ri_timer))
>                 mod_timer(&utask->ri_timer, ...);
>
> Suppose that the timer was pending and it was fired right before
> rcu_assign_pointer(). What guarantees that prepare_uretprobe() will see
> timer_pending() == false?
>
> rcu_assign_pointer()->smp_store_release() is a one-way barrier. This
> timer_pending() check may appear to happen before rcu_assign_pointer()
> completes.
>
> In this (yes, theoretical) case ri_timer() can miss the new return_instance,
> while prepare_uretprobe() can miss the necessary mod_timer(). I think this
> needs another mb() in between.
>

Ok, that's fair. I felt like this pattern might be a bit problematic,
but I also felt like it's good to have it, to ensure that we do
occasionally get a race between the timer callback and the uretprobe,
even if the uretprobe returns very quickly. (And I did confirm we get
those races and they seem to be handled fine, i.e., I saw uprobes being
"expired" into refcounted ones from ri_timer.)

But the really simple way to solve this is to do unconditional
mod_timer(), so I can do just that to keep this less tricky. Would you
be ok with that?
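
I.e. something like this at the end of prepare_uretprobe() (sketch):

	hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
	ri->next = utask->return_instances;
	rcu_assign_pointer(utask->return_instances, ri);

	/*
	 * Re-arm unconditionally: mod_timer() on an already-pending timer
	 * just updates its expiry, and we no longer depend on the ordering
	 * between the rcu_assign_pointer() above and a timer_pending() read.
	 */
	mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);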

>
> And I can't convince myself hprobe_expire() is correct... OK, I don't
> fully understand the logic, but why data_race(READ_ONCE(hprobe->leased)) ?
> READ_ONCE() should be enough in this case?

You mean why the data_race() annotation? To appease KCSAN, given that
we modify hprobe->leased with xchg()/cmpxchg() but read it here with
READ_ONCE(). Maybe I'm overthinking it, not sure. There is a reason
why this is an RFC ;)

>
>
> > > As I have already mentioned in the previous discussions, we can simply kill
> > > utask->active_uprobe. And utask->auprobe.
> >
> > I don't have anything against that, in principle, but let's benchmark
> > and test that thoroughly. I'm a bit uneasy about the possibility that
> > some arch-specific code will do container_of() on this arch_uprobe in
> > order to get to uprobe,
>
> Well, struct uprobe is not "exported", the arch-specific code can't do this.
>

Ah, good point, that's great. But as I said, uretprobes are actually
*the common* use case, not single-stepped uprobes. I'm still not very
happy about that memcpy() and the assumption that it's cheap, but
that's minor.

But no matter what we do for the single-stepped case, uretprobes need
some solution. (And if that solution works for uretprobes, why wouldn't
it work for single-step?...)

> Oleg.
>

Patch

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index e41cdae5597b..9a0aa0b2a5fe 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -15,6 +15,7 @@ 
 #include <linux/rbtree.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/timer.h>
 
 struct uprobe;
 struct vm_area_struct;
@@ -48,6 +49,45 @@  enum uprobe_task_state {
 	UTASK_SSTEP_TRAPPED,
 };
 
+/*
+ * Hybrid lifetime uprobe. Represents a uprobe instance that could be either
+ * SRCU protected (with SRCU protection eventually potentially timing out),
+ * refcounted using uprobe->ref, or there could be no valid uprobe (NULL).
+ *
+ * hprobe's internal state is set up such that background timer thread can
+ * atomically "downgrade" temporarily RCU-protected uprobe into refcounted one
+ * (or no uprobe, if refcounting failed).
+ *
+ * *stable* pointer always points to the uprobe (or could be NULL if there
+ * was no valid underlying uprobe to begin with).
+ *
+ * *leased* pointer is the key to achieving race-free atomic lifetime state
+ * transition and can have three possible states:
+ *   - either the same non-NULL value as *stable*, in which case uprobe is
+ *     SRCU-protected;
+ *   - NULL, in which case uprobe (if there is any) is refcounted;
+ *   - special __UPROBE_DEAD value, which represents an uprobe that was SRCU
+ *     protected initially, but SRCU period timed out and we attempted to
+ *     convert it to refcounted, but refcount_inc_not_zero() failed, because
+ *     uprobe effectively went away (the last consumer unsubscribed). In this
+ *     case it's important to know that *stable* pointer (which still has
+ *     non-NULL uprobe pointer) shouldn't be used, because lifetime of
+ *     underlying uprobe is not guaranteed anymore. __UPROBE_DEAD is just an
+ *     internal marker and is handled transparently by hprobe_consume() helper.
+ *
+ * When uprobe is SRCU-protected, we also record srcu_idx value, necessary for
+ * SRCU unlocking.
+ *
+ * See hprobe_expire() and hprobe_consume() for details of race-free uprobe
+ * state transitioning. It all hinges on atomic xchg() over the *leased*
+ * pointer. *stable* pointer, once initially set, is not modified concurrently.
+ */
+struct hprobe {
+	struct uprobe *stable;
+	struct uprobe *leased;
+	int srcu_idx;
+};
+
 /*
  * uprobe_task: Metadata of a task while it singlesteps.
  */
@@ -68,6 +108,7 @@  struct uprobe_task {
 	};
 
 	struct uprobe			*active_uprobe;
+	struct timer_list		ri_timer;
 	unsigned long			xol_vaddr;
 
 	struct arch_uprobe              *auprobe;
@@ -77,14 +118,18 @@  struct uprobe_task {
 };
 
 struct return_instance {
-	struct uprobe		*uprobe;
 	unsigned long		func;
 	unsigned long		stack;		/* stack pointer */
 	unsigned long		orig_ret_vaddr; /* original return address */
 	bool			chained;	/* true, if instance is nested */
 
 	struct return_instance	*next;		/* keep as stack */
-};
+
+	union {
+		struct hprobe		hprobe;
+		struct rcu_head		rcu;
+	};
+} ____cacheline_aligned;
 
 enum rp_check {
 	RP_CHECK_CALL,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0480ad841942..26acd06871e6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@ 
 #include <linux/task_work.h>
 #include <linux/shmem_fs.h>
 #include <linux/khugepaged.h>
+#include <linux/srcu.h>
 
 #include <linux/uprobes.h>
 
@@ -49,6 +50,9 @@  static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
 
 DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
 
+/* Covers return_instance's uprobe lifetime. */
+DEFINE_STATIC_SRCU(uretprobes_srcu);
+
 /* Have a copy of original instruction */
 #define UPROBE_COPY_INSN	0
 
@@ -594,13 +598,6 @@  set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
 			*(uprobe_opcode_t *)&auprobe->insn);
 }
 
-/* uprobe should have guaranteed positive refcount */
-static struct uprobe *get_uprobe(struct uprobe *uprobe)
-{
-	refcount_inc(&uprobe->ref);
-	return uprobe;
-}
-
 /*
  * uprobe should have guaranteed lifetime, which can be either of:
  *   - caller already has refcount taken (and wants an extra one);
@@ -619,13 +616,20 @@  static inline bool uprobe_is_active(struct uprobe *uprobe)
 	return !RB_EMPTY_NODE(&uprobe->rb_node);
 }
 
-static void uprobe_free_rcu(struct rcu_head *rcu)
+static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
 {
 	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
 
 	kfree(uprobe);
 }
 
+static void uprobe_free_srcu(struct rcu_head *rcu)
+{
+	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
+}
+
 static void put_uprobe(struct uprobe *uprobe)
 {
 	if (!refcount_dec_and_test(&uprobe->ref))
@@ -650,7 +654,146 @@  static void put_uprobe(struct uprobe *uprobe)
 	delayed_uprobe_remove(uprobe, NULL);
 	mutex_unlock(&delayed_uprobe_lock);
 
-	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
+	/* start srcu -> rcu_tasks_trace -> kfree chain */
+	call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
+}
+
+/*
+ * Special marker pointer for when ri_timer() expired, unlocking RCU, but
+ * failed to acquire refcount on uprobe (because it doesn't have any
+ * associated consumer anymore, for example). In such case it's important for
+ * hprobe_consume() to return NULL uprobe, instead of "stable" uprobe pointer,
+ * as that one isn't protected by either refcount or RCU region now.
+ */
+#define __UPROBE_DEAD ((struct uprobe *)(-0xdead))
+
+#define RI_TIMER_PERIOD (HZ/10) /* 100 ms */
+
+/* Initialize hprobe as SRCU-protected "leased" uprobe */
+static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
+{
+	hprobe->srcu_idx = srcu_idx;
+	hprobe->stable = uprobe;
+	hprobe->leased = uprobe;
+}
+
+/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
+static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
+{
+	hprobe->srcu_idx = -1;
+	hprobe->stable = uprobe;
+	hprobe->leased = NULL;
+}
+
+/*
+ * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
+ * uprobe is still SRCU protected, or is refcounted. hprobe_consume() can be
+ * used only once for a given hprobe.
+ *
+ * Caller has to perform SRCU unlock if under_rcu is set to true;
+ * otherwise, either properly refcounted uprobe is returned or NULL.
+ */
+static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, bool *under_rcu)
+{
+	struct uprobe *uprobe;
+
+	uprobe = xchg(&hprobe->leased, NULL);
+	if (uprobe) {
+		if (unlikely(uprobe == __UPROBE_DEAD)) {
+			*under_rcu = false;
+			return NULL;
+		}
+
+		*under_rcu = true;
+		return uprobe;
+	}
+
+	*under_rcu = false;
+	return hprobe->stable;
+}
+
+/*
+ * Reset hprobe state and, if under_rcu is true, release SRCU lock.
+ * hprobe_finalize() can only be used from current context after
+ * hprobe_consume() call (which determines uprobe and under_rcu value).
+ */
+static void hprobe_finalize(struct hprobe *hprobe, struct uprobe *uprobe, bool under_rcu)
+{
+	if (under_rcu)
+		__srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+	else if (uprobe)
+		put_uprobe(uprobe);
+	/* prevent free_ret_instance() from double-putting uprobe */
+	hprobe->stable = NULL;
+}
+
+/*
+ * Attempt to switch (atomically) uprobe from being RCU protected ("leased")
+ * to refcounted ("stable") state. Competes with hprobe_consume(); only one of
+ * them can win the race to perform SRCU unlocking. Whoever wins must perform
+ * SRCU unlock.
+ *
+ * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
+ * to begin with or we failed to bump its refcount and it's going away.
+ *
+ * Returned non-NULL uprobe can still be safely used within an ongoing SRCU
+ * locked region. It's not guaranteed that returned uprobe has a positive
+ * refcount, so caller has to attempt try_get_uprobe(), if it needs to use
+ * returned uprobe instance beyond ongoing SRCU lock region. See dup_utask().
+ */
+static struct uprobe *hprobe_expire(struct hprobe *hprobe)
+{
+	struct uprobe *uprobe;
+
+	/*
+	 * return_instance's hprobe is protected by RCU.
+	 * Underlying uprobe is itself protected from reuse by SRCU.
+	 */
+	lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu));
+
+	/*
+	 * Leased pointer can only be NULL, __UPROBE_DEAD, or some valid uprobe
+	 * pointer. This pointer can only be updated to NULL or __UPROBE_DEAD,
+	 * not any other valid uprobe pointer. So it's safe to fetch it with
+	 * READ_ONCE() and try to refcount it, if it's not NULL or __UPROBE_DEAD.
+	 */
+	uprobe = data_race(READ_ONCE(hprobe->leased));
+	if (!uprobe || uprobe == __UPROBE_DEAD)
+		return NULL;
+
+	if (!try_get_uprobe(uprobe)) {
+		/*
+		 * hprobe_consume() might have xchg()'ed to NULL already,
+		 * in which case we shouldn't set __UPROBE_DEAD.
+		 */
+		cmpxchg(&hprobe->leased, uprobe, __UPROBE_DEAD);
+		return NULL;
+	}
+
+	/*
+	 * Even if hprobe_consume() won and unlocked SRCU, we still have
+	 * a guarantee that uprobe won't be freed (and thus won't be reused)
+	 * because our caller maintains its own SRCU locked region.
+	 * So cmpxchg() below is well-formed.
+	 */
+	if (cmpxchg(&hprobe->leased, uprobe, NULL)) {
+		/*
+		 * At this point uprobe is properly refcounted, so it's safe
+		 * to end its original SRCU locked region.
+		 */
+		__srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+		return uprobe;
+	}
+
+	/* We lost the race, undo our refcount bump. It can drop to zero. */
+	put_uprobe(uprobe);
+
+	/*
+	 * We return underlying uprobe nevertheless because it's still valid
+	 * until the end of current SRCU locked region, and can be used to
+	 * try_get_uprobe(). This is used in dup_utask().
+	 */
+	return uprobe;
 }
 
 static __always_inline
@@ -1727,11 +1870,18 @@  unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
 	return instruction_pointer(regs);
 }
 
-static struct return_instance *free_ret_instance(struct return_instance *ri)
+static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe)
 {
 	struct return_instance *next = ri->next;
-	put_uprobe(ri->uprobe);
-	kfree(ri);
+	struct uprobe *uprobe;
+	bool under_rcu;
+
+	if (cleanup_hprobe) {
+		uprobe = hprobe_consume(&ri->hprobe, &under_rcu);
+		hprobe_finalize(&ri->hprobe, uprobe, under_rcu);
+	}
+
+	kfree_rcu(ri, rcu);
 	return next;
 }
 
@@ -1747,18 +1897,51 @@  void uprobe_free_utask(struct task_struct *t)
 	if (!utask)
 		return;
 
+	timer_delete_sync(&utask->ri_timer);
+
 	if (utask->active_uprobe)
 		put_uprobe(utask->active_uprobe);
 
 	ri = utask->return_instances;
 	while (ri)
-		ri = free_ret_instance(ri);
+		ri = free_ret_instance(ri, true /* cleanup_hprobe */);
 
 	xol_free_insn_slot(t);
 	kfree(utask);
 	t->utask = NULL;
 }
 
+#define for_each_ret_instance_rcu(pos, head) \
+	for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
+
+static void ri_timer(struct timer_list *timer)
+{
+	struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+	struct return_instance *ri;
+
+	/* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
+	guard(srcu)(&uretprobes_srcu);
+	/* RCU protects return_instance from freeing. */
+	guard(rcu)();
+
+	for_each_ret_instance_rcu(ri, utask->return_instances) {
+		hprobe_expire(&ri->hprobe);
+	}
+}
+
+static struct uprobe_task *alloc_utask(void)
+{
+	struct uprobe_task *utask;
+
+	utask = kzalloc(sizeof(*utask), GFP_KERNEL);
+	if (!utask)
+		return NULL;
+
+	timer_setup(&utask->ri_timer, ri_timer, 0);
+
+	return utask;
+}
+
 /*
  * Allocate a uprobe_task object for the task if necessary.
  * Called when the thread hits a breakpoint.
@@ -1770,7 +1953,7 @@  void uprobe_free_utask(struct task_struct *t)
 static struct uprobe_task *get_utask(void)
 {
 	if (!current->utask)
-		current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+		current->utask = alloc_utask();
 	return current->utask;
 }
 
@@ -1778,12 +1961,16 @@  static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
 {
 	struct uprobe_task *n_utask;
 	struct return_instance **p, *o, *n;
+	struct uprobe *uprobe;
 
-	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+	n_utask = alloc_utask();
 	if (!n_utask)
 		return -ENOMEM;
 	t->utask = n_utask;
 
+	/* protect uprobes from freeing; we'll need to try_get_uprobe() them */
+	guard(srcu)(&uretprobes_srcu);
+
 	p = &n_utask->return_instances;
 	for (o = o_utask->return_instances; o; o = o->next) {
 		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
@@ -1791,17 +1978,24 @@  static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
 			return -ENOMEM;
 
 		*n = *o;
+
+		/* see hprobe_expire() comments */
+		uprobe = hprobe_expire(&o->hprobe);
+		if (uprobe) /* refcount bump for new utask */
+			uprobe = try_get_uprobe(uprobe);
+
 		/*
-		 * uprobe's refcnt has to be positive at this point, kept by
-		 * utask->return_instances items; return_instances can't be
-		 * removed right now, as task is blocked due to duping; so
-		 * get_uprobe() is safe to use here.
+		 * New utask will have stable properly refcounted uprobe or NULL.
+		 * Even if we failed to get refcounted uprobe, we still need
+		 * to preserve full set of return_instances for proper
+		 * uretprobe handling and nesting in forked task.
 		 */
-		get_uprobe(n->uprobe);
-		n->next = NULL;
+		hprobe_init_stable(&n->hprobe, uprobe);
 
-		*p = n;
+		n->next = NULL;
+		rcu_assign_pointer(*p, n);
 		p = &n->next;
+
 		n_utask->depth++;
 	}
 
@@ -1877,10 +2071,10 @@  static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
 	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
 
 	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
-		ri = free_ret_instance(ri);
+		ri = free_ret_instance(ri, true /* cleanup_hprobe */);
 		utask->depth--;
 	}
-	utask->return_instances = ri;
+	rcu_assign_pointer(utask->return_instances, ri);
 }
 
 static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
@@ -1889,6 +2083,7 @@  static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 	struct uprobe_task *utask;
 	unsigned long orig_ret_vaddr, trampoline_vaddr;
 	bool chained;
+	int srcu_idx;
 
 	if (!get_xol_area())
 		return;
@@ -1904,10 +2099,6 @@  static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 		return;
 	}
 
-	/* we need to bump refcount to store uprobe in utask */
-	if (!try_get_uprobe(uprobe))
-		return;
-
 	ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
 	if (!ri)
 		goto fail;
@@ -1937,20 +2128,36 @@  static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 		}
 		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
 	}
-	ri->uprobe = uprobe;
+
+	/* __srcu_read_lock() because SRCU lock survives switch to user space */
+	srcu_idx = __srcu_read_lock(&uretprobes_srcu);
+
 	ri->func = instruction_pointer(regs);
 	ri->stack = user_stack_pointer(regs);
 	ri->orig_ret_vaddr = orig_ret_vaddr;
 	ri->chained = chained;
 
 	utask->depth++;
+
+	hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
 	ri->next = utask->return_instances;
-	utask->return_instances = ri;
+	rcu_assign_pointer(utask->return_instances, ri);
+
+	/*
+	 * Don't reschedule if timer is already active. This way we have
+	 * a guaranteed cap on maximum timer period (SRCU expiration duration)
+	 * regardless of how long and well-timed a uretprobe chain user space
+	 * might cause. At worst we'll just have a few extra inconsequential
+	 * refcount bumps even if we could, technically, get away with just an
+	 * SRCU lock. On the other hand, we get timer expiration logic
+	 * triggered and tested regularly even for very short-running uretprobes.
+	 */
+	if (!timer_pending(&utask->ri_timer))
+		mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);
 
 	return;
 fail:
 	kfree(ri);
-	put_uprobe(uprobe);
 }
 
 /* Prepare to single-step probed instruction out of line. */
@@ -2144,11 +2351,14 @@  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 }
 
 static void
-handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
 {
-	struct uprobe *uprobe = ri->uprobe;
 	struct uprobe_consumer *uc;
 
+	/* all consumers unsubscribed meanwhile */
+	if (unlikely(!uprobe))
+		return;
+
 	rcu_read_lock_trace();
 	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
 		if (uc->ret_handler)
@@ -2173,7 +2383,8 @@  void uprobe_handle_trampoline(struct pt_regs *regs)
 {
 	struct uprobe_task *utask;
 	struct return_instance *ri, *next;
-	bool valid;
+	struct uprobe *uprobe;
+	bool valid, under_rcu;
 
 	utask = current->utask;
 	if (!utask)
@@ -2203,21 +2414,24 @@  void uprobe_handle_trampoline(struct pt_regs *regs)
 			 * trampoline addresses on the stack are replaced with correct
 			 * original return addresses
 			 */
-			utask->return_instances = ri->next;
+			rcu_assign_pointer(utask->return_instances, ri->next);
+
+			uprobe = hprobe_consume(&ri->hprobe, &under_rcu);
 			if (valid)
-				handle_uretprobe_chain(ri, regs);
-			ri = free_ret_instance(ri);
+				handle_uretprobe_chain(ri, uprobe, regs);
+			hprobe_finalize(&ri->hprobe, uprobe, under_rcu);
+
+			/* We already took care of hprobe, no need to waste more time on that. */
+			ri = free_ret_instance(ri, false /* !cleanup_hprobe */);
 			utask->depth--;
 		} while (ri != next);
 	} while (!valid);
 
-	utask->return_instances = ri;
 	return;
 
- sigill:
+sigill:
 	uprobe_warn(current, "handle uretprobe, sending SIGILL.");
 	force_sig(SIGILL);
-
 }
 
 bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)