diff mbox series

[1/3] uprobes: allow put_uprobe() from non-sleepable softirq context

Message ID 20240909224903.3498207-2-andrii@kernel.org (mailing list archive)
State Not Applicable
Headers show
Series SRCU-protected uretprobes hot path | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Andrii Nakryiko Sept. 9, 2024, 10:49 p.m. UTC
Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
makes it unsuitable to be called from more restricted contexts, such as
softirq.

Let's make put_uprobe() agnostic to the context in which it is called,
and use a work queue to defer the mutex-protected cleanup steps.

To avoid unnecessarily increasing the size of struct uprobe, we colocate
work_struct in a union with rb_node and rcu, both of which are unused
by the time we get to scheduling the cleanup work.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)
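
For context on the layout trick: the union costs nothing because the
overlaid fields are together at least as large as a work_struct. A rough
size sketch, assuming a 64-bit kernel without debug options (illustrative
only, not part of the patch):

	/*
	 * struct rb_node     = 3 pointer-sized words        = 24 bytes
	 * struct rcu_head    = next + func pointers         = 16 bytes
	 *   -> anonymous struct { rb_node; rcu; }           = 40 bytes
	 * struct work_struct = data + list_head + func      = 32 bytes
	 *   -> union { struct { rb_node; rcu; }; work; } stays 40 bytes,
	 *      i.e. no growth over the original layout
	 */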

Comments

Alexei Starovoitov Sept. 10, 2024, 2:51 a.m. UTC | #1
On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted contexts, such as
> softirq.
>
> Let's make put_uprobe() agnostic to the context in which it is called,
> and use a work queue to defer the mutex-protected cleanup steps.
>
> To avoid unnecessarily increasing the size of struct uprobe, we colocate
> work_struct in a union with rb_node and rcu, both of which are unused
> by the time we get to scheduling the cleanup work.
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
>  1 file changed, 27 insertions(+), 3 deletions(-)
>
> [...]
>
> @@ -620,11 +627,28 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
>         return !RB_EMPTY_NODE(&uprobe->rb_node);
>  }
>
> +static void uprobe_free_deferred(struct work_struct *work)
> +{
> +       struct uprobe *uprobe = container_of(work, struct uprobe, work);
> +
> +       /*
> +        * If application munmap(exec_vma) before uprobe_unregister()
> +        * gets called, we don't get a chance to remove uprobe from
> +        * delayed_uprobe_list from remove_breakpoint(). Do it here.
> +        */
> +       mutex_lock(&delayed_uprobe_lock);
> +       delayed_uprobe_remove(uprobe, NULL);
> +       mutex_unlock(&delayed_uprobe_lock);
> +
> +       kfree(uprobe);
> +}
> +
>  static void uprobe_free_rcu(struct rcu_head *rcu)
>  {
>         struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
>
> -       kfree(uprobe);
> +       INIT_WORK(&uprobe->work, uprobe_free_deferred);
> +       schedule_work(&uprobe->work);
>  }
>
>  static void put_uprobe(struct uprobe *uprobe)

It seems put_uprobe hunk was lost, since the patch is not doing
what commit log describes.
Andrii Nakryiko Sept. 10, 2024, 5:13 a.m. UTC | #2
On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > [...]
>
> It seems put_uprobe hunk was lost, since the patch is not doing
> what commit log describes.


Hmm, put_uprobe() has:

call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);

at the end (see [0], which added that), so we do schedule_work() in the
RCU callback, similarly to what we do with bpf_map freeing in the BPF
subsystem.
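
So the overall free chain after this patch is (a sketch pieced together
from [0] plus this patch, not verbatim kernel code):

	put_uprobe()                 /* callable from any context after 2/3 */
	  -> call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);

	uprobe_free_rcu()            /* SRCU callback, non-sleepable */
	  -> INIT_WORK(&uprobe->work, uprobe_free_deferred);
	     schedule_work(&uprobe->work);

	uprobe_free_deferred()       /* workqueue, process context, may sleep */
	  -> mutex_lock(&delayed_uprobe_lock);
	     delayed_uprobe_remove(uprobe, NULL);
	     mutex_unlock(&delayed_uprobe_lock);
	     kfree(uprobe);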

This patch set is based on the latest tip/perf/core (and also assuming
the RCU Tasks Trace patch that mysteriously disappeared is actually
there, hopefully it will just as magically be restored).

  [0] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/core&id=8617408f7a01e94ce1f73e40a7704530e5dfb25c
Alexei Starovoitov Sept. 10, 2024, 3:56 p.m. UTC | #3
On Mon, Sep 9, 2024 at 10:13 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > [...]
> >
> > It seems put_uprobe hunk was lost, since the patch is not doing
> > what commit log describes.
>
>
> Hmm, put_uprobe() has:
>
> call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
>
> at the end (see [0], which added that), so we do schedule_work() in the
> RCU callback, similarly to what we do with bpf_map freeing in the BPF
> subsystem.
>
> This patch set is based on the latest tip/perf/core (and also assuming
> the RCU Tasks Trace patch that mysteriously disappeared is actually
> there, hopefully it will just as magically be restored).
>
>   [0] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/core&id=8617408f7a01e94ce1f73e40a7704530e5dfb25c

I'm still not following.
put_uprobe() did delayed_uprobe_remove() and this patch is doing it again.

The commit log implies that mutex+delayed_uprobe_remove should be removed
from put_uprobe(), but that's not what the patch is doing.
Andrii Nakryiko Sept. 10, 2024, 5:46 p.m. UTC | #4
On Tue, Sep 10, 2024 at 8:56 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 10:13 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > [...]
> >
> > Hmm, put_uprobe() has:
> >
> > call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
> >
> > at the end (see [0], which added that), so we do schedule_work() in the
> > RCU callback, similarly to what we do with bpf_map freeing in the BPF
> > subsystem.
> >
> > [...]
>
> I'm still not following.
> put_uprobe() did delayed_uprobe_remove() and this patch is doing it again.
>
> The commit log implies that mutex+delayed_uprobe_remove should be removed
> from put_uprobe(), but that's not what the patch is doing.

Ah, that part. You are right, it's my bad. I copied that locking into the
work queue callback, but didn't remove it from put_uprobe(). Will fix it
in v2, thanks for catching that!
Oleg Nesterov Sept. 15, 2024, 2:49 p.m. UTC | #5
On 09/09, Andrii Nakryiko wrote:
>
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted contexts, such as
> softirq.
>
> Let's make put_uprobe() agnostic to the context in which it is called,
> and use a work queue to defer the mutex-protected cleanup steps.

...

> +static void uprobe_free_deferred(struct work_struct *work)
> +{
> +	struct uprobe *uprobe = container_of(work, struct uprobe, work);
> +
> +	/*
> +	 * If application munmap(exec_vma) before uprobe_unregister()
> +	 * gets called, we don't get a chance to remove uprobe from
> +	 * delayed_uprobe_list from remove_breakpoint(). Do it here.
> +	 */
> +	mutex_lock(&delayed_uprobe_lock);
> +	delayed_uprobe_remove(uprobe, NULL);
> +	mutex_unlock(&delayed_uprobe_lock);
> +
> +	kfree(uprobe);
> +}
> +
>  static void uprobe_free_rcu(struct rcu_head *rcu)
>  {
>  	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
>
> -	kfree(uprobe);
> +	INIT_WORK(&uprobe->work, uprobe_free_deferred);
> +	schedule_work(&uprobe->work);
>  }

This is still wrong afaics...

If put_uprobe() can be called from softirq (after the next patch), then
put_uprobe() and all other users of uprobes_treelock should use
write_lock_bh/read_lock_bh to avoid the deadlock.
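
That is, without the _bh variants the following can happen (a sketch of
the deadlock, assuming put_uprobe() can run from a timer softirq as the
next patch wants):

	/* CPU0, process context, e.g. the register/unregister path */
	write_lock(&uprobes_treelock);
		/* timer softirq fires on this CPU while the lock is held;
		 * the handler drops the last ref and put_uprobe() takes
		 * the same lock for rb_erase(): */
		write_lock(&uprobes_treelock);	/* spins on itself: deadlock */

	/* only safe if every treelock user disables softirqs around it: */
	write_lock_bh(&uprobes_treelock);
	...
	write_unlock_bh(&uprobes_treelock);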

To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
try to do this later, but I am sure I will never like it, sorry.

Oleg.
Andrii Nakryiko Sept. 17, 2024, 8:19 a.m. UTC | #6
On Sun, Sep 15, 2024 at 4:49 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 09/09, Andrii Nakryiko wrote:
> >
> > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > makes it unsuitable to be called from more restricted context like softirq.
> >
> > Let's make put_uprobe() agnostic to the context in which it is called,
> > and use work queue to defer the mutex-protected clean up steps.
>
> ...
>
> > +static void uprobe_free_deferred(struct work_struct *work)
> > +{
> > +     struct uprobe *uprobe = container_of(work, struct uprobe, work);
> > +
> > +     /*
> > +      * If application munmap(exec_vma) before uprobe_unregister()
> > +      * gets called, we don't get a chance to remove uprobe from
> > +      * delayed_uprobe_list from remove_breakpoint(). Do it here.
> > +      */
> > +     mutex_lock(&delayed_uprobe_lock);
> > +     delayed_uprobe_remove(uprobe, NULL);
> > +     mutex_unlock(&delayed_uprobe_lock);
> > +
> > +     kfree(uprobe);
> > +}
> > +
> >  static void uprobe_free_rcu(struct rcu_head *rcu)
> >  {
> >       struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> >
> > -     kfree(uprobe);
> > +     INIT_WORK(&uprobe->work, uprobe_free_deferred);
> > +     schedule_work(&uprobe->work);
> >  }
>
> This is still wrong afaics...
>
> If put_uprobe() can be called from softirq (after the next patch), then
> put_uprobe() and all other users of uprobes_treelock should use
> write_lock_bh/read_lock_bh to avoid the deadlock.

Ok, I see the problem, that's unfortunate.

I see three ways to handle that:

1) keep put_uprobe() as is, and instead do schedule_work() from the
timer thread to postpone put_uprobe() (but I'm not a big fan of this);
2) move the uprobes_treelock part of put_uprobe() into the RCU callback;
I think it has no bearing on correctness, since uprobe_is_active() is
already there to handle races between putting a uprobe and removing it
from uprobes_tree (I prefer this one over #1);
3) you might like this the most ;) I think I can simplify
hprobe_expire() from patch #2 to not need put_uprobe() at all, if I
protect uprobe lifetime with non-sleepable
rcu_read_lock()/rcu_read_unlock() and perform try_get_uprobe() as the
very last step after cmpxchg() succeeds.

I'm leaning towards #3, but #2 seems fine to me as well.

>
> To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
> try to do this later, but I am sure I will never like it, sorry.

This might sound rude, but the goal here is not to make you like it :)
The goal is to improve performance with minimal complexity. And I'm
very open to any alternative proposals as to how to make uretprobes
RCU-protected to avoid refcounting in the hot path.

I think the #3 proposal above will make it a bit more palatable (but there
is still locklessness, cmpxchg, etc.; I see no way around that,
unfortunately).

>
> Oleg.
>
Andrii Nakryiko Oct. 4, 2024, 8:18 p.m. UTC | #7
On Tue, Sep 17, 2024 at 1:19 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Sun, Sep 15, 2024 at 4:49 PM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > [...]
> >
> > This is still wrong afaics...
> >
> > If put_uprobe() can be called from softirq (after the next patch), then
> > put_uprobe() and all other users of uprobes_treelock should use
> > write_lock_bh/read_lock_bh to avoid the deadlock.
>
> Ok, I see the problem, that's unfortunate.
>
> I see three ways to handle that:
>
> 1) keep put_uprobe() as is, and instead do schedule_work() from the
> timer thread to postpone put_uprobe() (but I'm not a big fan of this);
> 2) move the uprobes_treelock part of put_uprobe() into the RCU callback;
> I think it has no bearing on correctness, since uprobe_is_active() is
> already there to handle races between putting a uprobe and removing it
> from uprobes_tree (I prefer this one over #1);
> 3) you might like this the most ;) I think I can simplify
> hprobe_expire() from patch #2 to not need put_uprobe() at all, if I
> protect uprobe lifetime with non-sleepable
> rcu_read_lock()/rcu_read_unlock() and perform try_get_uprobe() as the
> very last step after cmpxchg() succeeds.
>
> I'm leaning towards #3, but #2 seems fine to me as well.

Ok, so just a short update. I don't think #3 works: I do need
try_get_uprobe() before I know for sure that cmpxchg() succeeds, which
means I'd need a compensating put_uprobe() if cmpxchg() fails. So for
put_uprobe(), I just made it do all the locking in the deferred work
callback (which is #2 above), which I think resolves the issue you
pointed out with the potential deadlock and removes any limitations on
put_uprobe().
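
In sketch form, the v2 shape would be roughly this (reconstructed from
the description above, not the actual v2 code; presumably rb_node then
also has to move out of the union with work, since the work callback
still needs it):

	static void uprobe_free_deferred(struct work_struct *work)
	{
		struct uprobe *uprobe = container_of(work, struct uprobe, work);

		/* process context: both the treelock and the mutex are fine here */
		write_lock(&uprobes_treelock);
		if (uprobe_is_active(uprobe))
			rb_erase(&uprobe->rb_node, &uprobes_tree);
		write_unlock(&uprobes_treelock);

		mutex_lock(&delayed_uprobe_lock);
		delayed_uprobe_remove(uprobe, NULL);
		mutex_unlock(&delayed_uprobe_lock);

		kfree(uprobe);
	}

	static void put_uprobe(struct uprobe *uprobe)
	{
		if (!refcount_dec_and_test(&uprobe->ref))
			return;

		/* no locks taken here, so safe from any context, incl. softirq */
		call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
	}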

Also, I rewrote hprobe_consume() and hprobe_expire() in terms of an
explicit state machine with 4 possible states (LEASED, STABLE, GONE,
CONSUMED), which I think makes the logic a bit more straightforward to
follow. Hopefully that will make the change more palatable for you. I'm
probably going to post the patches sometime next week.
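
In sketch form (state names from the description above; the exact v2
encoding may differ):

	/* lifetime state of a uretprobe's uprobe reference */
	enum hprobe_state {
		HPROBE_LEASED,		/* uprobes SRCU lease is still active */
		HPROBE_STABLE,		/* uprobe is pinned with a refcount */
		HPROBE_GONE,		/* underlying uprobe is gone, nothing to put */
		HPROBE_CONSUMED,	/* uprobe was consumed by a uretprobe handler */
	};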

> [...]
diff mbox series

Patch

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a2e6a57f79f2..377bd524bc8b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/khugepaged.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
 
 #include <linux/uprobes.h>
 
@@ -54,14 +55,20 @@  DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
 #define UPROBE_COPY_INSN	0
 
 struct uprobe {
-	struct rb_node		rb_node;	/* node in the rb tree */
+	union {
+		struct {
+			struct rb_node		rb_node;	/* node in the rb tree */
+			struct rcu_head		rcu;
+		};
+		/* work is used only during freeing, rcu and rb_node are unused at that point */
+		struct work_struct work;
+	};
 	refcount_t		ref;
 	struct rw_semaphore	register_rwsem;
 	struct rw_semaphore	consumer_rwsem;
 	struct list_head	pending_list;
 	struct list_head	consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
-	struct rcu_head		rcu;
 	loff_t			offset;
 	loff_t			ref_ctr_offset;
 	unsigned long		flags;
@@ -620,11 +627,28 @@  static inline bool uprobe_is_active(struct uprobe *uprobe)
 	return !RB_EMPTY_NODE(&uprobe->rb_node);
 }
 
+static void uprobe_free_deferred(struct work_struct *work)
+{
+	struct uprobe *uprobe = container_of(work, struct uprobe, work);
+
+	/*
+	 * If application munmap(exec_vma) before uprobe_unregister()
+	 * gets called, we don't get a chance to remove uprobe from
+	 * delayed_uprobe_list from remove_breakpoint(). Do it here.
+	 */
+	mutex_lock(&delayed_uprobe_lock);
+	delayed_uprobe_remove(uprobe, NULL);
+	mutex_unlock(&delayed_uprobe_lock);
+
+	kfree(uprobe);
+}
+
 static void uprobe_free_rcu(struct rcu_head *rcu)
 {
 	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
 
-	kfree(uprobe);
+	INIT_WORK(&uprobe->work, uprobe_free_deferred);
+	schedule_work(&uprobe->work);
 }
 
 static void put_uprobe(struct uprobe *uprobe)