diff mbox series

[5/8] uprobes: travers uprobe's consumer list locklessly under SRCU protection

Message ID 20240731214256.3588718-6-andrii@kernel.org (mailing list archive)
State Superseded
Headers show
Series uprobes: RCU-protected hot path optimizations | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Andrii Nakryiko July 31, 2024, 9:42 p.m. UTC
uprobe->register_rwsem is one of a few big bottlenecks to scalability of
uprobes, so we need to get rid of it to improve uprobe performance and
multi-CPU scalability.

First, we turn uprobe's consumer list to a typical doubly-linked list
and utilize existing RCU-aware helpers for traversing such lists, as
well as adding and removing elements from it.

For entry uprobes we already have SRCU protection active since before
uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe
won't go away from under us, but we add SRCU protection around consumer
list traversal.

Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
we remember whether any removal was requested during handler calls, but
then we double-check the decision under a proper register_rwsem using
consumers' filter callbacks. Handler removal is very rare, so this extra
lock won't hurt performance, overall, but we also avoid the need for any
extra protection (e.g., seqcount locks).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/uprobes.h |  2 +-
 kernel/events/uprobes.c | 97 ++++++++++++++++++++---------------------
 2 files changed, 48 insertions(+), 51 deletions(-)

Comments

Jiri Olsa Aug. 1, 2024, 2:27 p.m. UTC | #1
On Wed, Jul 31, 2024 at 02:42:53PM -0700, Andrii Nakryiko wrote:

SNIP

>  static int __copy_insn(struct address_space *mapping, struct file *filp,
>  			void *insn, int nbytes, loff_t offset)
>  {
> @@ -924,7 +901,8 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
>  	bool ret = false;
>  
>  	down_read(&uprobe->consumer_rwsem);
> -	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
>  		ret = consumer_filter(uc, mm);
>  		if (ret)
>  			break;
> @@ -1120,17 +1098,19 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  	int err;
>  
>  	down_write(&uprobe->register_rwsem);
> -	if (WARN_ON(!consumer_del(uprobe, uc))) {
> -		err = -ENOENT;
> -	} else {
> -		err = register_for_each_vma(uprobe, NULL);
> -		/* TODO : cant unregister? schedule a worker thread */
> -		WARN(err, "leaking uprobe due to failed unregistration");
> -	}
> +
> +	list_del_rcu(&uc->cons_node);

hum, so previous code had a check to verify that consumer is actually
registered in the uprobe, so it'd survive wrong argument while the new
code could likely do things?

> +	err = register_for_each_vma(uprobe, NULL);
> +
>  	up_write(&uprobe->register_rwsem);
>  
> -	if (!err)
> -		put_uprobe(uprobe);
> +	/* TODO : cant unregister? schedule a worker thread */
> +	if (WARN(err, "leaking uprobe due to failed unregistration"))
> +		return;
> +
> +	put_uprobe(uprobe);
> +
> +	synchronize_srcu(&uprobes_srcu);

could you comment on why it's needed in here? there's already potential
call_srcu(&uprobes_srcu, ... ) call in put_uprobe above

thanks,
jirka
Andrii Nakryiko Aug. 1, 2024, 4:49 p.m. UTC | #2
On Thu, Aug 1, 2024 at 7:27 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Wed, Jul 31, 2024 at 02:42:53PM -0700, Andrii Nakryiko wrote:
>
> SNIP
>
> >  static int __copy_insn(struct address_space *mapping, struct file *filp,
> >                       void *insn, int nbytes, loff_t offset)
> >  {
> > @@ -924,7 +901,8 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
> >       bool ret = false;
> >
> >       down_read(&uprobe->consumer_rwsem);
> > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > +                              srcu_read_lock_held(&uprobes_srcu)) {
> >               ret = consumer_filter(uc, mm);
> >               if (ret)
> >                       break;
> > @@ -1120,17 +1098,19 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >       int err;
> >
> >       down_write(&uprobe->register_rwsem);
> > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > -             err = -ENOENT;
> > -     } else {
> > -             err = register_for_each_vma(uprobe, NULL);
> > -             /* TODO : cant unregister? schedule a worker thread */
> > -             WARN(err, "leaking uprobe due to failed unregistration");
> > -     }
> > +
> > +     list_del_rcu(&uc->cons_node);
>
> hum, so previous code had a check to verify that consumer is actually
> registered in the uprobe, so it'd survive wrong argument while the new
> code could likely do things?

correct, passing consumer that's not really registered to
uprobe_unregister() is a huge violation of uprobe API contract and it
should never happen (and it doesn't), so it feels like we can drop
this overly cautious and permissive part (we don't protect against
passing wrong pointers, NULLs, etc, right? so why would we protect
against wrong unregister or say double unregister?)

>
> > +     err = register_for_each_vma(uprobe, NULL);
> > +
> >       up_write(&uprobe->register_rwsem);
> >
> > -     if (!err)
> > -             put_uprobe(uprobe);
> > +     /* TODO : cant unregister? schedule a worker thread */
> > +     if (WARN(err, "leaking uprobe due to failed unregistration"))
> > +             return;
> > +
> > +     put_uprobe(uprobe);
> > +
> > +     synchronize_srcu(&uprobes_srcu);
>
> could you comment on why it's needed in here? there's already potential
> call_srcu(&uprobes_srcu, ... ) call in put_uprobe above
>

yep, I should. This is because we might have handle_swbp() traversing
the consumer list in parallel with unregistration, and so it might
have already seen this consumer and is calling its callback. So we
need to wait for srcu grace period to make sure we don't have any
calls to consumer's callback. If we don't do that, the caller can free
the consumer's memory as handle_swbp() is still using/calling into it.

> thanks,
> jirka
Oleg Nesterov Aug. 5, 2024, 3:59 p.m. UTC | #3
On 07/31, Andrii Nakryiko wrote:
>
> @@ -1120,17 +1098,19 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  	int err;
>
>  	down_write(&uprobe->register_rwsem);
> -	if (WARN_ON(!consumer_del(uprobe, uc))) {
> -		err = -ENOENT;

OK, I agree, this should never happen.

But if you remove this check, then

>  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
>  {
>  	struct uprobe_consumer *con;
> -	int ret = -ENOENT;
> +	int ret = -ENOENT, srcu_idx;
>
>  	down_write(&uprobe->register_rwsem);
> -	for (con = uprobe->consumers; con && con != uc ; con = con->next)
> -		;
> -	if (con)
> -		ret = register_for_each_vma(uprobe, add ? uc : NULL);
> +
> +	srcu_idx = srcu_read_lock(&uprobes_srcu);
> +	list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
> +		if (con == uc) {
> +			ret = register_for_each_vma(uprobe, add ? uc : NULL);
> +			break;
> +		}
> +	}

we can probably remove the similar check above?

I mean, why do we need the list_for_each_entry_srcu() above? Is it possible
that uprobe_apply(uprobe, uc) is called when "uc" is not on the ->consumers
list?

At first glance I see no problems in this patch... but you know, my eyes are
already blurring, I'll continue tomorrow and read this patch again.

Oleg.
Andrii Nakryiko Aug. 5, 2024, 5:31 p.m. UTC | #4
On Mon, Aug 5, 2024 at 8:59 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 07/31, Andrii Nakryiko wrote:
> >
> > @@ -1120,17 +1098,19 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >       int err;
> >
> >       down_write(&uprobe->register_rwsem);
> > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > -             err = -ENOENT;
>
> OK, I agree, this should never happen.
>
> But if you remove this check, then
>
> >  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
> >  {
> >       struct uprobe_consumer *con;
> > -     int ret = -ENOENT;
> > +     int ret = -ENOENT, srcu_idx;
> >
> >       down_write(&uprobe->register_rwsem);
> > -     for (con = uprobe->consumers; con && con != uc ; con = con->next)
> > -             ;
> > -     if (con)
> > -             ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > +
> > +     srcu_idx = srcu_read_lock(&uprobes_srcu);
> > +     list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > +             if (con == uc) {
> > +                     ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > +                     break;
> > +             }
> > +     }
>
> we can probably remove the similar check above?
>
> I mean, why do we need the list_for_each_entry_srcu() above? Is it possible
> that uprobe_apply(uprobe, uc) is called when "uc" is not on the ->consumers
> list?

Tbh, I just don't completely understand how (and why) uprobe_apply()
is used from kernel/trace/trace_uprobe.c, so I wanted to preserve the
logic exactly. I still don't see when this consumer is added before
uprobe_apply()... Exposing uprobe_apply() seems like a huge API
violation to me and I'd rather get rid of its users. But one step at a
time.


>
> At first glance I see no problems in this patch... but you know, my eyes are
> already blurring, I'll continue tomorrow and read this patch again.
>
> Oleg.
>
Oleg Nesterov Aug. 6, 2024, 10:54 a.m. UTC | #5
On 08/05, Andrii Nakryiko wrote:
>
> On Mon, Aug 5, 2024 at 8:59 AM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > >  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
> > >  {
> > >       struct uprobe_consumer *con;
> > > -     int ret = -ENOENT;
> > > +     int ret = -ENOENT, srcu_idx;
> > >
> > >       down_write(&uprobe->register_rwsem);
> > > -     for (con = uprobe->consumers; con && con != uc ; con = con->next)
> > > -             ;
> > > -     if (con)
> > > -             ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > > +
> > > +     srcu_idx = srcu_read_lock(&uprobes_srcu);
> > > +     list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> > > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > > +             if (con == uc) {
> > > +                     ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > > +                     break;
> > > +             }
> > > +     }
> >
> > we can probably remove the similar check above?
> >
> > I mean, why do we need the list_for_each_entry_srcu() above? Is it possible
> > that uprobe_apply(uprobe, uc) is called when "uc" is not on the ->consumers
> > list?
>
> Tbh, I just don't completely understand how (and why) uprobe_apply()
> is used from kernel/trace/trace_uprobe.c, so I wanted to preserve the
> logic exactly. I still don't see when this consumer is added before
> uprobe_apply()... Exposing uprobe_apply() seems like a huge API
> violation to me and I'd rather get rid of its users. But one step at a
> time.

Agreed. Unlike uprobe_unregister(), uprobe_apply() doesn't WARN() or
even explains this check, lets preserve the current logic for now.

And just in case... I am not sure too that the con == NULL case is not
possible with the current code. The recent discussions forced me to recall
some bits in uprobe.c, but not in trace_uprobe.c ;)

Oleg.
diff mbox series

Patch

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 8d5bbad2048c..a1686c1ebcb6 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -35,7 +35,7 @@  struct uprobe_consumer {
 				struct pt_regs *regs);
 	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
-	struct uprobe_consumer *next;
+	struct list_head cons_node;
 };
 
 #ifdef CONFIG_UPROBES
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 71a8886608b1..3b42fd355256 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -59,7 +59,7 @@  struct uprobe {
 	struct rw_semaphore	register_rwsem;
 	struct rw_semaphore	consumer_rwsem;
 	struct list_head	pending_list;
-	struct uprobe_consumer	*consumers;
+	struct list_head	consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
 	struct rcu_head		rcu;
 	loff_t			offset;
@@ -778,6 +778,7 @@  static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 	uprobe->inode = inode;
 	uprobe->offset = offset;
 	uprobe->ref_ctr_offset = ref_ctr_offset;
+	INIT_LIST_HEAD(&uprobe->consumers);
 	init_rwsem(&uprobe->register_rwsem);
 	init_rwsem(&uprobe->consumer_rwsem);
 	RB_CLEAR_NODE(&uprobe->rb_node);
@@ -803,34 +804,10 @@  static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
 	down_write(&uprobe->consumer_rwsem);
-	uc->next = uprobe->consumers;
-	uprobe->consumers = uc;
+	list_add_rcu(&uc->cons_node, &uprobe->consumers);
 	up_write(&uprobe->consumer_rwsem);
 }
 
-/*
- * For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
- */
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
-{
-	struct uprobe_consumer **con;
-	bool ret = false;
-
-	down_write(&uprobe->consumer_rwsem);
-	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
-		if (*con == uc) {
-			*con = uc->next;
-			ret = true;
-			break;
-		}
-	}
-	up_write(&uprobe->consumer_rwsem);
-
-	return ret;
-}
-
 static int __copy_insn(struct address_space *mapping, struct file *filp,
 			void *insn, int nbytes, loff_t offset)
 {
@@ -924,7 +901,8 @@  static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
 	bool ret = false;
 
 	down_read(&uprobe->consumer_rwsem);
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		ret = consumer_filter(uc, mm);
 		if (ret)
 			break;
@@ -1120,17 +1098,19 @@  void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 	int err;
 
 	down_write(&uprobe->register_rwsem);
-	if (WARN_ON(!consumer_del(uprobe, uc))) {
-		err = -ENOENT;
-	} else {
-		err = register_for_each_vma(uprobe, NULL);
-		/* TODO : cant unregister? schedule a worker thread */
-		WARN(err, "leaking uprobe due to failed unregistration");
-	}
+
+	list_del_rcu(&uc->cons_node);
+	err = register_for_each_vma(uprobe, NULL);
+
 	up_write(&uprobe->register_rwsem);
 
-	if (!err)
-		put_uprobe(uprobe);
+	/* TODO : cant unregister? schedule a worker thread */
+	if (WARN(err, "leaking uprobe due to failed unregistration"))
+		return;
+
+	put_uprobe(uprobe);
+
+	synchronize_srcu(&uprobes_srcu);
 }
 EXPORT_SYMBOL_GPL(uprobe_unregister);
 
@@ -1208,13 +1188,20 @@  EXPORT_SYMBOL_GPL(uprobe_register);
 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
 {
 	struct uprobe_consumer *con;
-	int ret = -ENOENT;
+	int ret = -ENOENT, srcu_idx;
 
 	down_write(&uprobe->register_rwsem);
-	for (con = uprobe->consumers; con && con != uc ; con = con->next)
-		;
-	if (con)
-		ret = register_for_each_vma(uprobe, add ? uc : NULL);
+
+	srcu_idx = srcu_read_lock(&uprobes_srcu);
+	list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
+		if (con == uc) {
+			ret = register_for_each_vma(uprobe, add ? uc : NULL);
+			break;
+		}
+	}
+	srcu_read_unlock(&uprobes_srcu, srcu_idx);
+
 	up_write(&uprobe->register_rwsem);
 
 	return ret;
@@ -2088,9 +2075,10 @@  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 	struct uprobe_consumer *uc;
 	int remove = UPROBE_HANDLER_REMOVE;
 	bool need_prep = false; /* prepare return uprobe, when needed */
+	bool has_consumers = false;
 
-	down_read(&uprobe->register_rwsem);
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		int rc = 0;
 
 		if (uc->handler) {
@@ -2103,16 +2091,23 @@  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 			need_prep = true;
 
 		remove &= rc;
+		has_consumers = true;
 	}
 
 	if (need_prep && !remove)
 		prepare_uretprobe(uprobe, regs); /* put bp at return */
 
-	if (remove && uprobe->consumers) {
-		WARN_ON(!uprobe_is_active(uprobe));
-		unapply_uprobe(uprobe, current->mm);
+	if (remove && has_consumers) {
+		down_read(&uprobe->register_rwsem);
+
+		/* re-check that removal is still required, this time under lock */
+		if (!filter_chain(uprobe, current->mm)) {
+			WARN_ON(!uprobe_is_active(uprobe));
+			unapply_uprobe(uprobe, current->mm);
+		}
+
+		up_read(&uprobe->register_rwsem);
 	}
-	up_read(&uprobe->register_rwsem);
 }
 
 static void
@@ -2120,13 +2115,15 @@  handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
 {
 	struct uprobe *uprobe = ri->uprobe;
 	struct uprobe_consumer *uc;
+	int srcu_idx;
 
-	down_read(&uprobe->register_rwsem);
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+	srcu_idx = srcu_read_lock(&uprobes_srcu);
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		if (uc->ret_handler)
 			uc->ret_handler(uc, ri->func, regs);
 	}
-	up_read(&uprobe->register_rwsem);
+	srcu_read_unlock(&uprobes_srcu, srcu_idx);
 }
 
 static struct return_instance *find_next_ret_chain(struct return_instance *ri)