[v2,tip/perf/core,1/2] uprobes: allow put_uprobe() from non-sleepable softirq context

Message ID: 20241008002556.2332835-2-andrii@kernel.org (mailing list archive)
State: Superseded
Delegated to: Masami Hiramatsu
Series: SRCU-protected uretprobes hot path

Commit Message

Andrii Nakryiko Oct. 8, 2024, 12:25 a.m. UTC
Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
makes it unsuitable to be called from more restricted contexts, such as softirq.

Let's make put_uprobe() agnostic to the context in which it is called,
and use a work queue to defer the mutex-protected cleanup steps.

The RB tree removal step is also moved into the work-deferred callback
to avoid a potential deadlock between the softirq-based timer callback,
added in the next patch, and the rest of the uprobe code.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/events/uprobes.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)
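
For readers skimming the thread: the change boils down to the classic
"punt to process context" pattern -- drop the refcount from wherever you
are, but defer the sleeping cleanup to a workqueue. Below is a minimal,
self-contained sketch of that pattern (the obj_* names are illustrative
only; the actual patch appears at the bottom of this page):

#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(obj_cleanup_mutex);

struct obj {
        refcount_t ref;
        struct work_struct work;
};

static void obj_free_deferred(struct work_struct *work)
{
        struct obj *obj = container_of(work, struct obj, work);

        /* Runs in process context, so sleeping locks are fine here. */
        mutex_lock(&obj_cleanup_mutex);
        /* ... mutex-protected cleanup steps ... */
        mutex_unlock(&obj_cleanup_mutex);

        kfree(obj);
}

static void obj_put(struct obj *obj)
{
        if (!refcount_dec_and_test(&obj->ref))
                return;

        /* No sleeping lock on this path, so softirq callers are fine. */
        INIT_WORK(&obj->work, obj_free_deferred);
        schedule_work(&obj->work);
}

schedule_work() may be called from hardirq, softirq, and process context
alike, which is exactly the property put_uprobe() needs here.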

Comments

Peter Zijlstra Oct. 18, 2024, 8:26 a.m. UTC | #1
On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted contexts, such as softirq.

This is delayed_uprobe_lock, right?

So can't we do something like so instead? 

---
 kernel/events/uprobes.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2a0059464383..d17a9046de35 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -83,9 +83,11 @@ struct delayed_uprobe {
 	struct list_head list;
 	struct uprobe *uprobe;
 	struct mm_struct *mm;
+	struct rcu_head rcu;
 };
 
-static DEFINE_MUTEX(delayed_uprobe_lock);
+/* XXX global state; use per mm list instead ? */
+static DEFINE_SPINLOCK(delayed_uprobe_lock);
 static LIST_HEAD(delayed_uprobe_list);
 
 /*
@@ -289,9 +291,11 @@ delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
 {
 	struct delayed_uprobe *du;
 
-	list_for_each_entry(du, &delayed_uprobe_list, list)
+	guard(rcu)();
+	list_for_each_entry_rcu(du, &delayed_uprobe_list, list) {
 		if (du->uprobe == uprobe && du->mm == mm)
 			return du;
+	}
 	return NULL;
 }
 
@@ -308,7 +312,8 @@ static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
 
 	du->uprobe = uprobe;
 	du->mm = mm;
-	list_add(&du->list, &delayed_uprobe_list);
+	scoped_guard(spinlock, &delayed_uprobe_lock)
+		list_add_rcu(&du->list, &delayed_uprobe_list);
 	return 0;
 }
 
@@ -316,19 +321,21 @@ static void delayed_uprobe_delete(struct delayed_uprobe *du)
 {
 	if (WARN_ON(!du))
 		return;
-	list_del(&du->list);
-	kfree(du);
+	scoped_guard(spinlock, &delayed_uprobe_lock)
+		list_del(&du->list);
+	kfree_rcu(du, rcu);
 }
 
 static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
 {
-	struct list_head *pos, *q;
 	struct delayed_uprobe *du;
+	struct list_head *pos;
 
 	if (!uprobe && !mm)
 		return;
 
-	list_for_each_safe(pos, q, &delayed_uprobe_list) {
+	guard(rcu)();
+	list_for_each_rcu(pos, &delayed_uprobe_list) {
 		du = list_entry(pos, struct delayed_uprobe, list);
 
 		if (uprobe && du->uprobe != uprobe)
@@ -434,12 +441,10 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
 			return ret;
 	}
 
-	mutex_lock(&delayed_uprobe_lock);
 	if (d > 0)
 		ret = delayed_uprobe_add(uprobe, mm);
 	else
 		delayed_uprobe_remove(uprobe, mm);
-	mutex_unlock(&delayed_uprobe_lock);
 
 	return ret;
 }
@@ -645,9 +650,7 @@ static void put_uprobe(struct uprobe *uprobe)
 	 * gets called, we don't get a chance to remove uprobe from
 	 * delayed_uprobe_list from remove_breakpoint(). Do it here.
 	 */
-	mutex_lock(&delayed_uprobe_lock);
 	delayed_uprobe_remove(uprobe, NULL);
-	mutex_unlock(&delayed_uprobe_lock);
 
 	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
 }
@@ -1350,13 +1353,18 @@ static void build_probe_list(struct inode *inode,
 /* @vma contains reference counter, not the probed instruction. */
 static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
 {
-	struct list_head *pos, *q;
 	struct delayed_uprobe *du;
+	struct list_head *pos;
 	unsigned long vaddr;
 	int ret = 0, err = 0;
 
-	mutex_lock(&delayed_uprobe_lock);
-	list_for_each_safe(pos, q, &delayed_uprobe_list) {
+	/*
+	 * delayed_uprobe_list is added to when the ref_ctr is not mapped
+	 * and is consulted (this function) when adding maps. And since
+	 * mmap_lock serializes these, it is not possible to miss an entry.
+	 */
+	guard(rcu)();
+	list_for_each_rcu(pos, &delayed_uprobe_list) {
 		du = list_entry(pos, struct delayed_uprobe, list);
 
 		if (du->mm != vma->vm_mm ||
@@ -1370,9 +1378,9 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
 			if (!err)
 				err = ret;
 		}
+
 		delayed_uprobe_delete(du);
 	}
-	mutex_unlock(&delayed_uprobe_lock);
 	return err;
 }
 
@@ -1596,9 +1604,7 @@ void uprobe_clear_state(struct mm_struct *mm)
 {
 	struct xol_area *area = mm->uprobes_state.xol_area;
 
-	mutex_lock(&delayed_uprobe_lock);
 	delayed_uprobe_remove(NULL, mm);
-	mutex_unlock(&delayed_uprobe_lock);
 
 	if (!area)
 		return;
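
For context on the guard(rcu)() and scoped_guard(spinlock, ...)
constructs in the diff above: these are the kernel's scope-based cleanup
helpers (<linux/cleanup.h>), which end the RCU read-side section or drop
the lock automatically when the enclosing scope is left, including on
early returns. A rough sketch of the idiom, using hypothetical demo_*
names:

#include <linux/cleanup.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>

struct demo_node {
        struct list_head list;
        int key;
};

static DEFINE_SPINLOCK(demo_lock);
static LIST_HEAD(demo_list);

static void demo_add(struct demo_node *n)
{
        /* The lock is dropped automatically at the end of the scope. */
        scoped_guard(spinlock, &demo_lock)
                list_add_rcu(&n->list, &demo_list);
}

static bool demo_find(int key)
{
        struct demo_node *n;

        /* rcu_read_unlock() runs automatically at function exit ... */
        guard(rcu)();
        list_for_each_entry_rcu(n, &demo_list, list) {
                if (n->key == key)
                        return true;    /* ... even on this early return. */
        }
        return false;
}

This is what lets delayed_uprobe_check() above return du from the middle
of the loop without leaking the RCU read-side critical section.
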
Andrii Nakryiko Oct. 18, 2024, 6:22 p.m. UTC | #2
On Fri, Oct 18, 2024 at 1:26 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > makes it unsuitable to be called from more restricted contexts, such as softirq.
>
> This is delayed_uprobe_lock, right?

Not just delayed_uprobe_lock, there is also uprobes_treelock (I forgot
to update the commit message to mention that). Oleg had concerns (see
[0]) with that being taken from the timer thread, so I just moved all
of the locking into the deferred work callback.

  [0] https://lore.kernel.org/linux-trace-kernel/20240915144910.GA27726@redhat.com/

>
> So can't we do something like so instead?

I'll need to look at this more thoroughly (and hopefully Oleg will get
a chance as well); dropping the lock from delayed_ref_ctr_inc() is a bit
scary, but might be ok.

But generally speaking, what's your concern with doing deferred work
in put_uprobe()? It's not a hot path by any means, worst case we'll
have maybe thousands of uprobes attached/detached.

> [... rest of quoted diff trimmed; see Peter's message #1 above for the full diff ...]
Peter Zijlstra Oct. 21, 2024, 10:31 a.m. UTC | #3
On Fri, Oct 18, 2024 at 11:22:00AM -0700, Andrii Nakryiko wrote:
> On Fri, Oct 18, 2024 at 1:26 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > makes it unsuitable to be called from more restricted contexts, such as softirq.
> >
> > This is delayed_uprobe_lock, right?
> 
> Not just delayed_uprobe_lock, there is also uprobes_treelock (I forgot
> to update the commit message to mention that). Oleg had concerns (see
> [0]) with that being taken from the timer thread, so I just moved all
> of the locking into the deferred work callback.
> 
>   [0] https://lore.kernel.org/linux-trace-kernel/20240915144910.GA27726@redhat.com/

Right, but at least that's not a sleeping lock. He's right about it
needing to become a softirq-safe lock though. And yeah, unfortunate
that.

> > So can't we do something like so instead?
> 
> > I'll need to look at this more thoroughly (and hopefully Oleg will get
> > a chance as well); dropping the lock from delayed_ref_ctr_inc() is a bit
> > scary, but might be ok.

So I figured that update_ref_ctr() is already doing the
__update_ref_ctr() thing without holding the lock, so that lock really
is only there to manage the list.

And that list is super offensive... That really wants to be a per-mm
rb-tree or somesuch.

AFAICT the only reason it is a mutex is because doing unbounded list
iteration under a spinlock is a really bad idea.

> But generally speaking, what's your concern with doing deferred work
> in put_uprobe()? It's not a hot path by any means, worst case we'll
> have maybe thousands of uprobes attached/detached.

Mostly I got offended by the level of crap in that code, and working
around crap instead of fixing crap just ain't right.
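
To make the per-mm rb-tree suggestion concrete, one possible shape is
sketched below. This is purely hypothetical: the rb_root field does not
exist in today's uprobes_state, and the keying and locking would need
real design work.

#include <linux/rbtree.h>

struct uprobe;

/* Hypothetical: delayed uprobes tracked per mm instead of globally. */
struct delayed_uprobe {
        struct rb_node node;            /* keyed by uprobe pointer */
        struct uprobe *uprobe;
};

/*
 * Imagined extension of mm->uprobes_state (does not exist today):
 *      struct rb_root delayed_uprobes;  protected by mmap_lock
 */
static struct delayed_uprobe *
delayed_uprobe_find(struct rb_root *root, struct uprobe *uprobe)
{
        struct rb_node *n = root->rb_node;

        while (n) {
                struct delayed_uprobe *du =
                        rb_entry(n, struct delayed_uprobe, node);

                if (uprobe < du->uprobe)
                        n = n->rb_left;
                else if (uprobe > du->uprobe)
                        n = n->rb_right;
                else
                        return du;
        }
        return NULL;
}

The structural point is to turn the O(n) scan of a global list into an
O(log n) lookup scoped to a single mm.
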
Andrii Nakryiko Oct. 21, 2024, 5:04 p.m. UTC | #4
On Mon, Oct 21, 2024 at 3:31 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Oct 18, 2024 at 11:22:00AM -0700, Andrii Nakryiko wrote:
> > On Fri, Oct 18, 2024 at 1:26 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> > > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > > makes it unsuitable to be called from more restricted contexts, such as softirq.
> > >
> > > This is delayed_uprobe_lock, right?
> >
> > Not just delayed_uprobe_lock, there is also uprobes_treelock (I forgot
> > to update the commit message to mention that). Oleg had concerns (see
> > [0]) with that being taken from the timer thread, so I just moved all
> > of the locking into the deferred work callback.
> >
> >   [0] https://lore.kernel.org/linux-trace-kernel/20240915144910.GA27726@redhat.com/
>
> Right, but at least that's not a sleeping lock. He's right about it
> needing to become a softirq-safe lock though. And yeah, unfortunate
> that.
>
> > > So can't we do something like so instead?
> >
> > I'll need to look at this more thoroughly (and hopefully Oleg will get
> > a chance as well); dropping the lock from delayed_ref_ctr_inc() is a bit
> > scary, but might be ok.
>
> So I figured that update_ref_ctr() is already doing the
> __update_ref_ctr() thing without holding the lock, so that lock really
> is only there to manage the list.
>
> And that list is super offensive... That really wants to be a per-mm
> rb-tree or somesuch.

Probably hard to justify adding that to mm_struct, tbh, given that the
uprobe+refcnt case (which is USDT with semaphore) isn't all that
frequent, and even then it will most probably be active on only a very
small subset of processes in the system. But even if we did (see
below), that should probably be a separate change.

>
> AFAICT the only reason it is a mutex is because doing unbounded list
> iteration under a spinlock is a really bad idea.
>
> > But generally speaking, what's your concern with doing deferred work
> > in put_uprobe()? It's not a hot path by any means, worst case we'll
> > have maybe thousands of uprobes attached/detached.
>
> Mostly I got offended by the level of crap in that code, and working
> around crap instead of fixing crap just ain't right.
>

Ok, so where are we at? Do you insist on the delayed_ref_ctr_inc()
rework, switching uprobes_treelock to be softirq-safe and leaving
put_uprobe() mostly as is? Or is it ok to do a quick deferred work
change for put_uprobe() to unblock uretprobe+SRCU and land it sooner?
What if we split this work into two independent patch sets, go with
deferred work for uretprobe + SRCU, and then work with Oleg and you on
simplifying and improving the delayed_uprobe_lock-related stuff?

After all, neither the deferred work nor the delayed_ref_ctr_inc() change
has much practical bearing on real-world performance. WDYT?

Patch

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a2e6a57f79f2..9d3ab472200d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/khugepaged.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
 
 #include <linux/uprobes.h>
 
@@ -61,7 +62,10 @@  struct uprobe {
 	struct list_head	pending_list;
 	struct list_head	consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
-	struct rcu_head		rcu;
+	union {
+		struct rcu_head		rcu;
+		struct work_struct	work;
+	};
 	loff_t			offset;
 	loff_t			ref_ctr_offset;
 	unsigned long		flags;
@@ -627,10 +631,9 @@  static void uprobe_free_rcu(struct rcu_head *rcu)
 	kfree(uprobe);
 }
 
-static void put_uprobe(struct uprobe *uprobe)
+static void uprobe_free_deferred(struct work_struct *work)
 {
-	if (!refcount_dec_and_test(&uprobe->ref))
-		return;
+	struct uprobe *uprobe = container_of(work, struct uprobe, work);
 
 	write_lock(&uprobes_treelock);
 
@@ -654,6 +657,15 @@  static void put_uprobe(struct uprobe *uprobe)
 	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
 }
 
+static void put_uprobe(struct uprobe *uprobe)
+{
+	if (!refcount_dec_and_test(&uprobe->ref))
+		return;
+
+	INIT_WORK(&uprobe->work, uprobe_free_deferred);
+	schedule_work(&uprobe->work);
+}
+
 static __always_inline
 int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
 	       const struct uprobe *r)
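
One detail of the patch worth spelling out: rcu_head and work_struct can
share a union because their lifetimes never overlap -- uprobe_free_deferred()
is finished with 'work' before call_rcu_tasks_trace() arms 'rcu'. A
minimal sketch of the same storage-sharing trick, again with
hypothetical obj_* names:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
        union {
                struct rcu_head    rcu;  /* live only after the work ran */
                struct work_struct work; /* live only before call_rcu() */
        };
};

static void obj_free_rcu(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct obj, rcu));
}

static void obj_free_deferred(struct work_struct *work)
{
        struct obj *obj = container_of(work, struct obj, work);

        /*
         * The workqueue core no longer touches 'work' once this callback
         * is running, so the storage can be reused as an rcu_head.
         */
        call_rcu(&obj->rcu, obj_free_rcu);
}

Sharing the storage keeps struct uprobe from growing, since only one of
the two callbacks is ever pending at a time.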