Message ID | 20240909224903.3498207-2-andrii@kernel.org (mailing list archive)
---|---
State | Not Applicable
Series | SRCU-protected uretprobes hot path

Context | Check | Description
---|---|---
netdev/tree_selection | success | Not a local patch
On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted context like softirq.
>
> Let's make put_uprobe() agnostic to the context in which it is called,
> and use work queue to defer the mutex-protected clean up steps.
>
> To avoid unnecessarily increasing the size of struct uprobe, we colocate
> work_struct in parallel with rb_node and rcu, both of which are unused
> by the time we get to schedule clean up work.
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
...
> +static void uprobe_free_deferred(struct work_struct *work)
> +{
> +       struct uprobe *uprobe = container_of(work, struct uprobe, work);
> +
> +       /*
> +        * If application munmap(exec_vma) before uprobe_unregister()
> +        * gets called, we don't get a chance to remove uprobe from
> +        * delayed_uprobe_list from remove_breakpoint(). Do it here.
> +        */
> +       mutex_lock(&delayed_uprobe_lock);
> +       delayed_uprobe_remove(uprobe, NULL);
> +       mutex_unlock(&delayed_uprobe_lock);
> +
> +       kfree(uprobe);
> +}
> +
>  static void uprobe_free_rcu(struct rcu_head *rcu)
>  {
>         struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
>
> -       kfree(uprobe);
> +       INIT_WORK(&uprobe->work, uprobe_free_deferred);
> +       schedule_work(&uprobe->work);
>  }
>
>  static void put_uprobe(struct uprobe *uprobe)

It seems put_uprobe hunk was lost, since the patch is not doing
what commit log describes.
On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > makes it unsuitable to be called from more restricted context like softirq.
> >
> > Let's make put_uprobe() agnostic to the context in which it is called,
> > and use work queue to defer the mutex-protected clean up steps.
...
>
> It seems put_uprobe hunk was lost, since the patch is not doing
> what commit log describes.

Hmm, put_uprobe() has:

call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);

at the end (see [0], which added that), so we do schedule_work() in
RCU callback, similarly to what we do with bpf_map freeing in the BPF
subsystem.

This patch set is based on the latest tip/perf/core (and also assuming
the RCU Tasks Trace patch that mysteriously disappeared is actually
there, hopefully it will just as magically be restored).

[0] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/core&id=8617408f7a01e94ce1f73e40a7704530e5dfb25c
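For orientation, this is the full deferral chain Andrii is describing, as an illustrative kernel-style C sketch rather than the actual patch: the put_uprobe() tail and uprobes_srcu are assumed from the commit linked as [0], while uprobe_free_rcu()/uprobe_free_deferred() are the callbacks from the patch above.

```c
/*
 * Sketch only (put_uprobe() tail and uprobes_srcu assumed from [0]):
 *
 *   put_uprobe()                     last ref dropped, any context
 *     -> call_srcu()                 queue SRCU callback
 *        -> uprobe_free_rcu()        callback context, must not take mutexes
 *           -> schedule_work()       punt to process context
 *              -> uprobe_free_deferred()  take delayed_uprobe_lock, kfree()
 */
static void put_uprobe(struct uprobe *uprobe)
{
	if (!refcount_dec_and_test(&uprobe->ref))
		return;

	/* ... rb-tree and delayed_uprobe_list clean-up elided here ... */

	call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
```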
On Mon, Sep 9, 2024 at 10:13 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > >
> > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > makes it unsuitable to be called from more restricted context like softirq.
...
> >
> > It seems put_uprobe hunk was lost, since the patch is not doing
> > what commit log describes.
>
> Hmm, put_uprobe() has:
>
> call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
>
> at the end (see [0], which added that), so we do schedule_work() in
> RCU callback, similarly to what we do with bpf_map freeing in the BPF
> subsystem.
>
> This patch set is based on the latest tip/perf/core (and also assuming
> the RCU Tasks Trace patch that mysteriously disappeared is actually
> there, hopefully it will just as magically be restored).
>
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/core&id=8617408f7a01e94ce1f73e40a7704530e5dfb25c

I'm still not following.
put_uprobe() did delayed_uprobe_remove() and this patch is doing it again.

The commit log implies that mutex+delayed_uprobe_remove should be removed
from put_uprobe(), but that's not what the patch is doing.
On Tue, Sep 10, 2024 at 8:56 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 10:13 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > It seems put_uprobe hunk was lost, since the patch is not doing
> > > what commit log describes.
> >
> > Hmm, put_uprobe() has:
> >
> > call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
> >
> > at the end (see [0], which added that), so we do schedule_work() in
> > RCU callback, similarly to what we do with bpf_map freeing in the BPF
> > subsystem.
...
>
> I'm still not following.
> put_uprobe() did delayed_uprobe_remove() and this patch is doing it again.
>
> The commit log implies that mutex+delayed_uprobe_remove should be removed
> from put_uprobe(), but that's not what the patch is doing.

Ah, that part. You are right, it's my bad. I copied that lock to a work
queue callback, but didn't remove it from the put_uprobe(). Will fix it
in v2, thanks for catching!
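The v2 direction agreed on above would presumably look something like the sketch below (not the actual v2 patch; the surrounding put_uprobe() body is assumed from the tip/perf/core code referenced via [0]): the delayed_uprobe_lock block disappears from put_uprobe() because uprobe_free_deferred() now owns that clean-up. As Oleg points out in the next message, the remaining uprobes_treelock usage then becomes the next problem once put_uprobe() can run from softirq.

```c
/* Hypothetical v2 shape, for illustration only */
static void put_uprobe(struct uprobe *uprobe)
{
	if (!refcount_dec_and_test(&uprobe->ref))
		return;

	write_lock(&uprobes_treelock);
	if (uprobe_is_active(uprobe))
		rb_erase(&uprobe->rb_node, &uprobes_tree);
	write_unlock(&uprobes_treelock);

	/*
	 * delayed_uprobe_remove() and kfree() now happen only in
	 * uprobe_free_deferred(), scheduled from uprobe_free_rcu().
	 */
	call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
```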
On 09/09, Andrii Nakryiko wrote:
>
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted context like softirq.
>
> Let's make put_uprobe() agnostic to the context in which it is called,
> and use work queue to defer the mutex-protected clean up steps.

...

> +static void uprobe_free_deferred(struct work_struct *work)
> +{
> +	struct uprobe *uprobe = container_of(work, struct uprobe, work);
> +
> +	/*
> +	 * If application munmap(exec_vma) before uprobe_unregister()
> +	 * gets called, we don't get a chance to remove uprobe from
> +	 * delayed_uprobe_list from remove_breakpoint(). Do it here.
> +	 */
> +	mutex_lock(&delayed_uprobe_lock);
> +	delayed_uprobe_remove(uprobe, NULL);
> +	mutex_unlock(&delayed_uprobe_lock);
> +
> +	kfree(uprobe);
> +}
> +
>  static void uprobe_free_rcu(struct rcu_head *rcu)
>  {
>  	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
>
> -	kfree(uprobe);
> +	INIT_WORK(&uprobe->work, uprobe_free_deferred);
> +	schedule_work(&uprobe->work);
>  }

This is still wrong afaics...

If put_uprobe() can be called from softirq (after the next patch), then
put_uprobe() and all other users of uprobes_treelock should use
write_lock_bh/read_lock_bh to avoid the deadlock.

To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
try to do this later, but I am sure I will never like it, sorry.

Oleg.
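To make the deadlock concrete: if put_uprobe() can run from a softirq (the timer in patch 2/3), it can interrupt a task on the same CPU that already holds uprobes_treelock and then spin on that lock forever. The standard fix Oleg refers to is to disable bottom halves around every acquisition. A generic sketch follows (assuming uprobes_treelock is an rwlock_t, as the write_lock_bh/read_lock_bh mention suggests; this is not actual uprobes code):

```c
#include <linux/spinlock.h>
#include <linux/rbtree.h>

static DEFINE_RWLOCK(uprobes_treelock);	/* assumed type, per the _bh variants above */

/*
 * Without _bh:
 *   task on CPU0:    write_lock(&uprobes_treelock)
 *   softirq on CPU0: put_uprobe() -> write_lock(&uprobes_treelock)
 *                    spins forever, because the lock owner cannot run
 *                    again until the softirq returns.
 */
static void tree_erase_task_context(struct rb_node *node, struct rb_root *root)
{
	write_lock_bh(&uprobes_treelock);	/* keeps softirqs off while the lock is held */
	rb_erase(node, root);
	write_unlock_bh(&uprobes_treelock);
}
```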
On Sun, Sep 15, 2024 at 4:49 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 09/09, Andrii Nakryiko wrote:
> >
> > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > makes it unsuitable to be called from more restricted context like softirq.
> >
> > Let's make put_uprobe() agnostic to the context in which it is called,
> > and use work queue to defer the mutex-protected clean up steps.
>
> ...
>
> > +static void uprobe_free_deferred(struct work_struct *work)
> > +{
> > +	struct uprobe *uprobe = container_of(work, struct uprobe, work);
> > +
> > +	/*
> > +	 * If application munmap(exec_vma) before uprobe_unregister()
> > +	 * gets called, we don't get a chance to remove uprobe from
> > +	 * delayed_uprobe_list from remove_breakpoint(). Do it here.
> > +	 */
> > +	mutex_lock(&delayed_uprobe_lock);
> > +	delayed_uprobe_remove(uprobe, NULL);
> > +	mutex_unlock(&delayed_uprobe_lock);
> > +
> > +	kfree(uprobe);
> > +}
> > +
> >  static void uprobe_free_rcu(struct rcu_head *rcu)
> >  {
> >  	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> >
> > -	kfree(uprobe);
> > +	INIT_WORK(&uprobe->work, uprobe_free_deferred);
> > +	schedule_work(&uprobe->work);
> >  }
>
> This is still wrong afaics...
>
> If put_uprobe() can be called from softirq (after the next patch), then
> put_uprobe() and all other users of uprobes_treelock should use
> write_lock_bh/read_lock_bh to avoid the deadlock.

Ok, I see the problem, that's unfortunate.

I see three ways to handle that:

1) keep put_uprobe() as is, and instead do schedule_work() from the
timer thread to postpone put_uprobe(). (but I'm not a big fan of this)
2) move uprobes_treelock part of put_uprobe() into rcu callback, I
think it has no bearing on correctness, uprobe_is_active() is there
already to handle races between putting uprobe and removing it from
uprobes_tree (I prefer this one over #1 )
3) you might like this the most ;) I think I can simplify
hprobes_expire() from patch #2 to not need put_uprobe() at all, if I
protect uprobe lifetime with non-sleepable
rcu_read_lock()/rcu_read_unlock() and perform try_get_uprobe() as the
very last step after cmpxchg() succeeded.

I'm leaning towards #3, but #2 seems fine to me as well.

>
> To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
> try to do this later, but I am sure I will never like it, sorry.

This might sound rude, but the goal here is not to make you like it :)
The goal is to improve performance with minimal complexity. And I'm
very open to any alternative proposals as to how to make uretprobes
RCU-protected to avoid refcounting in the hot path.

I think #3 proposal above will make it a bit more palatable (but there
is still locklessness, cmpxchg, etc, I see no way around that,
unfortunately).

>
> Oleg.
>
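Option #2 could look roughly like the sketch below (illustrative only, not a real patch): put_uprobe() takes no locks at all, the rb-tree erase happens in the SRCU callback while rb_node is still valid (i.e. before the union member work is reused), and the mutex-protected clean-up stays in the work callback. This assumes uprobe_free_deferred() from the patch above and that the SRCU callback runs in a context where a non-sleeping rwlock is allowed.

```c
/* Option #2 sketch: lock-free put_uprobe(), deferred tree removal */
static void uprobe_free_rcu(struct rcu_head *rcu)
{
	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);

	/* rb_node is still intact here; erase before 'work' overlays it */
	write_lock(&uprobes_treelock);
	if (uprobe_is_active(uprobe))
		rb_erase(&uprobe->rb_node, &uprobes_tree);
	write_unlock(&uprobes_treelock);

	INIT_WORK(&uprobe->work, uprobe_free_deferred);
	schedule_work(&uprobe->work);
}

static void put_uprobe(struct uprobe *uprobe)
{
	if (!refcount_dec_and_test(&uprobe->ref))
		return;

	/* no locks taken here, so calling this from a softirq/timer is fine */
	call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
```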
On Tue, Sep 17, 2024 at 1:19 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>
> On Sun, Sep 15, 2024 at 4:49 PM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > This is still wrong afaics...
> >
> > If put_uprobe() can be called from softirq (after the next patch), then
> > put_uprobe() and all other users of uprobes_treelock should use
> > write_lock_bh/read_lock_bh to avoid the deadlock.
>
> Ok, I see the problem, that's unfortunate.
>
> I see three ways to handle that:
>
> 1) keep put_uprobe() as is, and instead do schedule_work() from the
> timer thread to postpone put_uprobe(). (but I'm not a big fan of this)
> 2) move uprobes_treelock part of put_uprobe() into rcu callback, I
> think it has no bearing on correctness, uprobe_is_active() is there
> already to handle races between putting uprobe and removing it from
> uprobes_tree (I prefer this one over #1 )
> 3) you might like this the most ;) I think I can simplify
> hprobes_expire() from patch #2 to not need put_uprobe() at all, if I
> protect uprobe lifetime with non-sleepable
> rcu_read_lock()/rcu_read_unlock() and perform try_get_uprobe() as the
> very last step after cmpxchg() succeeded.
>
> I'm leaning towards #3, but #2 seems fine to me as well.

Ok, so just a short update. I don't think #3 works, I do need
try_get_uprobe() before I know for sure that cmpxchg() succeeds. Which
means I'd need a compensating put_uprobe() if cmpxchg() fails. So for
put_uprobe(), I just made it do all the locking in deferred work
callback (which is #2 above), which I think resolved the issue you
pointed out with potential deadlock and removes any limitations on
put_uprobe().

Also, I rewrote the hprobe_consume() and hprobe_expire() in terms of an
explicit state machine with 4 possible states (LEASED, STABLE, GONE,
CONSUMED), which I think makes the logic a bit more straightforward to
follow. Hopefully that will make the change more palatable for you. I'm
probably going to post patches next week, though.

> >
> > To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
> > try to do this later, but I am sure I will never like it, sorry.
>
> This might sound rude, but the goal here is not to make you like it :)
> The goal is to improve performance with minimal complexity. And I'm
> very open to any alternative proposals as to how to make uretprobes
> RCU-protected to avoid refcounting in the hot path.
>
> I think #3 proposal above will make it a bit more palatable (but there
> is still locklessness, cmpxchg, etc, I see no way around that,
> unfortunately).
>
> > Oleg.
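Since patch 2/3 is not shown in this thread, the following is a purely hypothetical sketch of what a four-state hprobe lifetime machine might look like; the state names come from the message above, and everything else (struct layout, helpers) is invented for illustration.

```c
/* Hypothetical illustration only; not the actual patch 2/3 code. */
enum hprobe_state {
	HPROBE_LEASED,		/* uprobe still protected by the SRCU "lease" */
	HPROBE_STABLE,		/* lease upgraded to a plain refcount */
	HPROBE_GONE,		/* uprobe went away before it could be pinned */
	HPROBE_CONSUMED,	/* claimed by the uretprobe handler */
};

struct hprobe {
	enum hprobe_state	state;
	struct uprobe		*uprobe;
};

/* single-winner transition; e.g. the handler does LEASED -> CONSUMED */
static bool hprobe_try_transition(struct hprobe *hp,
				  enum hprobe_state from, enum hprobe_state to)
{
	return cmpxchg(&hp->state, from, to) == from;
}
```

In this shape, the expiry path would follow the ordering described in the message: take the reference with try_get_uprobe() first, attempt the LEASED -> STABLE transition, and drop the reference again (a compensating put_uprobe()) if the handler already moved the state to CONSUMED.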
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a2e6a57f79f2..377bd524bc8b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/khugepaged.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
 
 #include <linux/uprobes.h>
 
@@ -54,14 +55,20 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
 #define UPROBE_COPY_INSN	0
 
 struct uprobe {
-	struct rb_node		rb_node;	/* node in the rb tree */
+	union {
+		struct {
+			struct rb_node		rb_node;	/* node in the rb tree */
+			struct rcu_head		rcu;
+		};
+		/* work is used only during freeing, rcu and rb_node are unused at that point */
+		struct work_struct	work;
+	};
 	refcount_t		ref;
 	struct rw_semaphore	register_rwsem;
 	struct rw_semaphore	consumer_rwsem;
 	struct list_head	pending_list;
 	struct list_head	consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
-	struct rcu_head		rcu;
 	loff_t			offset;
 	loff_t			ref_ctr_offset;
 	unsigned long		flags;
@@ -620,11 +627,28 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
 	return !RB_EMPTY_NODE(&uprobe->rb_node);
 }
 
+static void uprobe_free_deferred(struct work_struct *work)
+{
+	struct uprobe *uprobe = container_of(work, struct uprobe, work);
+
+	/*
+	 * If application munmap(exec_vma) before uprobe_unregister()
+	 * gets called, we don't get a chance to remove uprobe from
+	 * delayed_uprobe_list from remove_breakpoint(). Do it here.
+	 */
+	mutex_lock(&delayed_uprobe_lock);
+	delayed_uprobe_remove(uprobe, NULL);
+	mutex_unlock(&delayed_uprobe_lock);
+
+	kfree(uprobe);
+}
+
 static void uprobe_free_rcu(struct rcu_head *rcu)
 {
 	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
 
-	kfree(uprobe);
+	INIT_WORK(&uprobe->work, uprobe_free_deferred);
+	schedule_work(&uprobe->work);
 }
 
 static void put_uprobe(struct uprobe *uprobe)
Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
makes it unsuitable to be called from more restricted context like softirq.

Let's make put_uprobe() agnostic to the context in which it is called,
and use work queue to defer the mutex-protected clean up steps.

To avoid unnecessarily increasing the size of struct uprobe, we colocate
work_struct in parallel with rb_node and rcu, both of which are unused
by the time we get to schedule clean up work.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)
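A minimal userspace C sketch of the size argument in the last paragraph (the type names are stand-ins roughly matching the 64-bit sizes of the kernel types, not the real definitions): overlaying the teardown-only work member with rb_node and rcu means the struct does not grow.

```c
#include <stdio.h>

/* stand-ins roughly the size of the kernel types on 64-bit */
struct rb_node_like  { void *p[3]; };
struct rcu_head_like { void *p[2]; };
struct work_like     { void *p[4]; };	/* assumed <= rb_node + rcu combined */

struct with_union {
	union {
		struct {
			struct rb_node_like  rb_node;
			struct rcu_head_like rcu;
		};
		struct work_like work;		/* reused only once freeing has begun */
	};
	int ref;
};

struct without_union {
	struct rb_node_like  rb_node;
	struct rcu_head_like rcu;
	struct work_like     work;
	int ref;
};

int main(void)
{
	/* the union variant pays nothing extra for the teardown-only member */
	printf("with union:    %zu bytes\n", sizeof(struct with_union));
	printf("without union: %zu bytes\n", sizeof(struct without_union));
	return 0;
}
```

The overlay is safe only because of the ordering the commit message spells out: by the time INIT_WORK() reuses those bytes, the uprobe is already out of the rb-tree and its RCU callback has run, so rb_node and rcu are dead.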