| Message ID | 20241008002556.2332835-2-andrii@kernel.org (mailing list archive) |
|---|---|
| State | Not Applicable |
| Series | SRCU-protected uretprobes hot path |

| Context | Check | Description |
|---|---|---|
| netdev/tree_selection | success | Not a local patch |
On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted context like softirq.

This is delayed_uprobe_lock, right?

So can't we do something like so instead?

---
 kernel/events/uprobes.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2a0059464383..d17a9046de35 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -83,9 +83,11 @@ struct delayed_uprobe {
 	struct list_head list;
 	struct uprobe *uprobe;
 	struct mm_struct *mm;
+	struct rcu_head rcu;
 };
 
-static DEFINE_MUTEX(delayed_uprobe_lock);
+/* XXX global state; use per mm list instead ? */
+static DEFINE_SPINLOCK(delayed_uprobe_lock);
 static LIST_HEAD(delayed_uprobe_list);
 
 /*
@@ -289,9 +291,11 @@ delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
 {
 	struct delayed_uprobe *du;
 
-	list_for_each_entry(du, &delayed_uprobe_list, list)
+	guard(rcu)();
+	list_for_each_entry_rcu(du, &delayed_uprobe_list, list) {
 		if (du->uprobe == uprobe && du->mm == mm)
 			return du;
+	}
 	return NULL;
 }
 
@@ -308,7 +312,8 @@ static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
 
 	du->uprobe = uprobe;
 	du->mm = mm;
-	list_add(&du->list, &delayed_uprobe_list);
+	scoped_guard(spinlock, &delayed_uprobe_lock)
+		list_add_rcu(&du->list, &delayed_uprobe_list);
 	return 0;
 }
 
@@ -316,19 +321,21 @@ static void delayed_uprobe_delete(struct delayed_uprobe *du)
 {
 	if (WARN_ON(!du))
 		return;
-	list_del(&du->list);
-	kfree(du);
+	scoped_guard(spinlock, &delayed_uprobe_lock)
+		list_del(&du->list);
+	kfree_rcu(du, rcu);
 }
 
 static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
 {
-	struct list_head *pos, *q;
 	struct delayed_uprobe *du;
+	struct list_head *pos;
 
 	if (!uprobe && !mm)
 		return;
 
-	list_for_each_safe(pos, q, &delayed_uprobe_list) {
+	guard(rcu)();
+	list_for_each_rcu(pos, &delayed_uprobe_list) {
 		du = list_entry(pos, struct delayed_uprobe, list);
 
 		if (uprobe && du->uprobe != uprobe)
@@ -434,12 +441,10 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
 		return ret;
 	}
 
-	mutex_lock(&delayed_uprobe_lock);
 	if (d > 0)
 		ret = delayed_uprobe_add(uprobe, mm);
 	else
 		delayed_uprobe_remove(uprobe, mm);
-	mutex_unlock(&delayed_uprobe_lock);
 
 	return ret;
 }
@@ -645,9 +650,7 @@ static void put_uprobe(struct uprobe *uprobe)
 	 * gets called, we don't get a chance to remove uprobe from
 	 * delayed_uprobe_list from remove_breakpoint(). Do it here.
 	 */
-	mutex_lock(&delayed_uprobe_lock);
 	delayed_uprobe_remove(uprobe, NULL);
-	mutex_unlock(&delayed_uprobe_lock);
 
 	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
 }
@@ -1350,13 +1353,18 @@ static void build_probe_list(struct inode *inode,
 /* @vma contains reference counter, not the probed instruction. */
 static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
 {
-	struct list_head *pos, *q;
 	struct delayed_uprobe *du;
+	struct list_head *pos;
 	unsigned long vaddr;
 	int ret = 0, err = 0;
 
-	mutex_lock(&delayed_uprobe_lock);
-	list_for_each_safe(pos, q, &delayed_uprobe_list) {
+	/*
+	 * delayed_uprobe_list is added to when the ref_ctr is not mapped
+	 * and is consulted (this function) when adding maps. And since
+	 * mmap_lock serializes these, it is not possible miss an entry.
+	 */
+	guard(rcu)();
+	list_for_each_rcu(pos, &delayed_uprobe_list) {
 		du = list_entry(pos, struct delayed_uprobe, list);
 
 		if (du->mm != vma->vm_mm ||
@@ -1370,9 +1378,9 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
 			if (!err)
 				err = ret;
 		}
+
 		delayed_uprobe_delete(du);
 	}
-	mutex_unlock(&delayed_uprobe_lock);
 	return err;
 }
 
@@ -1596,9 +1604,7 @@ void uprobe_clear_state(struct mm_struct *mm)
 {
 	struct xol_area *area = mm->uprobes_state.xol_area;
 
-	mutex_lock(&delayed_uprobe_lock);
 	delayed_uprobe_remove(NULL, mm);
-	mutex_unlock(&delayed_uprobe_lock);
 
 	if (!area)
 		return;
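The guard(rcu)() and scoped_guard(spinlock, ...) constructs in the diff above come from the kernel's scope-based cleanup helpers (<linux/cleanup.h> and the guard classes defined alongside rcupdate.h and spinlock.h). Below is a minimal, hypothetical illustration of the same RCU-read / spinlocked-write list pattern; all names here (example_node, example_lock, example_list) are invented for illustration and are not from the patch.

#include <linux/cleanup.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>

/* Invented example type; not from the patch. */
struct example_node {
	struct list_head list;
	int key;
};

static DEFINE_SPINLOCK(example_lock);	/* writers only, like delayed_uprobe_lock above */
static LIST_HEAD(example_list);

static void example_add(struct example_node *n)
{
	/* the lock is dropped automatically at the end of the guarded statement */
	scoped_guard(spinlock, &example_lock)
		list_add_rcu(&n->list, &example_list);
}

static bool example_contains(int key)
{
	struct example_node *n;

	/* rcu_read_lock() held for the rest of this scope, released on any return */
	guard(rcu)();
	list_for_each_entry_rcu(n, &example_list, list) {
		if (n->key == key)
			return true;
	}
	return false;
}

The trade-off in Peter's diff is the same as in this sketch: readers never take the spinlock, so it only serializes list mutation, while freeing goes through kfree_rcu() and an RCU grace period instead of happening under the lock.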
On Fri, Oct 18, 2024 at 1:26 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > makes it unsuitable to be called from more restricted context like softirq.
>
> This is delayed_uprobe_lock, right?

Not just delayed_uprobe_lock, there is also uprobes_treelock (I forgot
to update the commit message to mention that). Oleg had concerns (see
[0]) with that being taken from the timer thread, so I just moved all
of the locking into deferred work callback.

[0] https://lore.kernel.org/linux-trace-kernel/20240915144910.GA27726@redhat.com/

> So can't we do something like so instead?

I'll need to look at this more thoroughly (and hopefully Oleg will get
a chance as well), dropping lock from delayed_ref_ctr_inc() is a bit
scary, but might be ok.

But generally speaking, what's your concern with doing deferred work
in put_uprobe()? It's not a hot path by any means, worst case we'll
have maybe thousands of uprobes attached/detached.
On Fri, Oct 18, 2024 at 11:22:00AM -0700, Andrii Nakryiko wrote:
> On Fri, Oct 18, 2024 at 1:26 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > makes it unsuitable to be called from more restricted context like softirq.
> >
> > This is delayed_uprobe_lock, right?
>
> Not just delayed_uprobe_lock, there is also uprobes_treelock (I forgot
> to update the commit message to mention that). Oleg had concerns (see
> [0]) with that being taken from the timer thread, so I just moved all
> of the locking into deferred work callback.
>
> [0] https://lore.kernel.org/linux-trace-kernel/20240915144910.GA27726@redhat.com/

Right, but at least that's not a sleeping lock. He's right about it
needing to become a softirq-safe lock though. And yeah, unfortunate
that.

> > So can't we do something like so instead?
>
> I'll need to look at this more thoroughly (and hopefully Oleg will get
> a chance as well), dropping lock from delayed_ref_ctr_inc() is a bit
> scary, but might be ok.

So I figured that update_ref_ctr() is already doing the
__update_ref_ctr() thing without holding the lock, so that lock really
is only there to manage the list.

And that list is super offensive... That really wants to be a per-mm
rb-tree or somesuch.

AFAICT the only reason it is a mutex, is because doing unbounded list
iteration under a spinlock is a really bad idea.

> But generally speaking, what's your concern with doing deferred work
> in put_uprobe()? It's not a hot path by any means, worst case we'll
> have maybe thousands of uprobes attached/detached.

Mostly I got offended by the level of crap in that code, and working
around crap instead of fixing crap just ain't right.
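To make the per-mm idea above concrete, here is a rough, purely illustrative sketch that is not part of either patch: delayed entries hang off the mm they belong to, so a lookup only walks that mm's own entries. The mm->delayed_uprobes list head and the per-mm lock it would need are hypothetical fields that do not exist in mm_struct today.

#include <linux/list.h>
#include <linux/mm_types.h>

struct uprobe;

/* Sketch only: a per-mm variant of struct delayed_uprobe (no mm field needed). */
struct delayed_uprobe {
	struct list_head list;	/* linked into the hypothetical mm->delayed_uprobes */
	struct uprobe *uprobe;
};

static struct delayed_uprobe *
delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
{
	struct delayed_uprobe *du;

	/* caller holds the (hypothetical) per-mm lock protecting this list */
	list_for_each_entry(du, &mm->delayed_uprobes, list) {
		if (du->uprobe == uprobe)
			return du;
	}
	return NULL;
}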
On Mon, Oct 21, 2024 at 3:31 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Oct 18, 2024 at 11:22:00AM -0700, Andrii Nakryiko wrote:
> > On Fri, Oct 18, 2024 at 1:26 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Oct 07, 2024 at 05:25:55PM -0700, Andrii Nakryiko wrote:
> > > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > > makes it unsuitable to be called from more restricted context like softirq.
> > >
> > > This is delayed_uprobe_lock, right?
> >
> > Not just delayed_uprobe_lock, there is also uprobes_treelock (I forgot
> > to update the commit message to mention that). Oleg had concerns (see
> > [0]) with that being taken from the timer thread, so I just moved all
> > of the locking into deferred work callback.
> >
> > [0] https://lore.kernel.org/linux-trace-kernel/20240915144910.GA27726@redhat.com/
>
> Right, but at least that's not a sleeping lock. He's right about it
> needing to become a softirq-safe lock though. And yeah, unfortunate
> that.
>
> > > So can't we do something like so instead?
> >
> > I'll need to look at this more thoroughly (and hopefully Oleg will get
> > a chance as well), dropping lock from delayed_ref_ctr_inc() is a bit
> > scary, but might be ok.
>
> So I figured that update_ref_ctr() is already doing the
> __update_ref_ctr() thing without holding the lock, so that lock really
> is only there to manage the list.
>
> And that list is super offensive... That really wants to be a per-mm
> rb-tree or somesuch.

Probably hard to justify adding that to mm_struct, tbh, given that the
uprobe+refcnt case (which is USDT with semaphore) isn't all that
frequent, and even then it will be active on a very small subset of
processes in the system, most probably. But even if we do that (see
below), it probably should be a separate change.

> AFAICT the only reason it is a mutex, is because doing unbounded list
> iteration under a spinlock is a really bad idea.
>
> > But generally speaking, what's your concern with doing deferred work
> > in put_uprobe()? It's not a hot path by any means, worst case we'll
> > have maybe thousands of uprobes attached/detached.
>
> Mostly I got offended by the level of crap in that code, and working
> around crap instead of fixing crap just ain't right.

Ok, so where are we at? Do you insist on the delayed_ref_ctr_inc()
rework, switching uprobes_treelock to be softirq-safe, and leaving
put_uprobe() mostly as is? Or is it ok to do a quick deferred work
change for put_uprobe() to unblock uretprobe+SRCU and land it sooner?

What if we split this work into two independent patch sets: go with
deferred work for uretprobe + SRCU, and then work with Oleg and you on
simplifying and improving the delayed_uprobe_lock-related stuff? After
all, neither the deferred work nor the delayed_ref_ctr_inc() change has
much practical bearing on real-world performance. WDYT?
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a2e6a57f79f2..9d3ab472200d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/khugepaged.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
 
 #include <linux/uprobes.h>
 
@@ -61,7 +62,10 @@ struct uprobe {
 	struct list_head	pending_list;
 	struct list_head	consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
-	struct rcu_head		rcu;
+	union {
+		struct rcu_head		rcu;
+		struct work_struct	work;
+	};
 	loff_t			offset;
 	loff_t			ref_ctr_offset;
 	unsigned long		flags;
@@ -627,10 +631,9 @@ static void uprobe_free_rcu(struct rcu_head *rcu)
 	kfree(uprobe);
 }
 
-static void put_uprobe(struct uprobe *uprobe)
+static void uprobe_free_deferred(struct work_struct *work)
 {
-	if (!refcount_dec_and_test(&uprobe->ref))
-		return;
+	struct uprobe *uprobe = container_of(work, struct uprobe, work);
 
 	write_lock(&uprobes_treelock);
 
@@ -654,6 +657,15 @@ static void put_uprobe(struct uprobe *uprobe)
 	call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
 }
 
+static void put_uprobe(struct uprobe *uprobe)
+{
+	if (!refcount_dec_and_test(&uprobe->ref))
+		return;
+
+	INIT_WORK(&uprobe->work, uprobe_free_deferred);
+	schedule_work(&uprobe->work);
+}
+
 static __always_inline
 int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
 	       const struct uprobe *r)
Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
makes it unsuitable to be called from more restricted context like softirq.

Let's make put_uprobe() agnostic to the context in which it is called,
and use a work queue to defer the mutex-protected clean-up steps.

The RB tree removal step is also moved into the work-deferred callback to
avoid a potential deadlock between the softirq-based timer callback, added
in the next patch, and the rest of the uprobe code.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/events/uprobes.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)
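The pattern the patch relies on is compact enough to show in isolation. Below is a minimal, self-contained sketch of the same defer-to-workqueue idea, with a made-up refcounted object; struct obj, cleanup_lock and the function names are invented for illustration and are not from the patch. The final reference may be dropped from softirq or timer context, and everything that needs a sleeping lock runs later from process context via schedule_work().

#include <linux/container_of.h>
#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical object standing in for struct uprobe. */
struct obj {
	refcount_t ref;
	struct work_struct work;	/* only used once ref drops to zero */
};

static DEFINE_MUTEX(cleanup_lock);	/* stands in for delayed_uprobe_lock */

static void obj_free_deferred(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, work);

	/* A sleeping lock is fine here: this always runs in process context. */
	mutex_lock(&cleanup_lock);
	/* ... unlink the object from whatever lists/trees reference it ... */
	mutex_unlock(&cleanup_lock);

	kfree(o);
}

/* Safe to call from softirq/timer context: it never sleeps. */
static void obj_put(struct obj *o)
{
	if (!refcount_dec_and_test(&o->ref))
		return;

	INIT_WORK(&o->work, obj_free_deferred);
	schedule_work(&o->work);
}

As in the patch above, the work_struct can share storage with the rcu_head in a union: the object only travels through one mechanism at a time, first the workqueue, and only then, from inside the work handler, the RCU callback.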