diff mbox series

[v3,04/13] uprobes: traverse uprobe's consumer list locklessly under SRCU protection

Message ID 20240813042917.506057-5-andrii@kernel.org (mailing list archive)
State New
Headers show
Series uprobes: RCU-protected hot path optimizations | expand

Commit Message

Andrii Nakryiko Aug. 13, 2024, 4:29 a.m. UTC
uprobe->register_rwsem is one of a few big bottlenecks to scalability of
uprobes, so we need to get rid of it to improve uprobe performance and
multi-CPU scalability.

First, we turn uprobe's consumer list into a typical doubly-linked list
and utilize existing RCU-aware helpers for traversing such lists, as
well as adding and removing elements from it.

For entry uprobes we already have SRCU protection active since before
uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe
won't go away from under us, but we add SRCU protection around consumer
list traversal.

Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
we remember whether any removal was requested during handler calls, but
then we double-check the decision under a proper register_rwsem using
consumers' filter callbacks. Handler removal is very rare, so this extra
lock won't hurt performance, overall, but we also avoid the need for any
extra protection (e.g., seqcount locks).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/uprobes.h |   2 +-
 kernel/events/uprobes.c | 111 ++++++++++++++++++++++------------------
 2 files changed, 61 insertions(+), 52 deletions(-)

Comments

Jiri Olsa Aug. 22, 2024, 2:22 p.m. UTC | #1
On Mon, Aug 12, 2024 at 09:29:08PM -0700, Andrii Nakryiko wrote:

SNIP

> @@ -1125,18 +1103,31 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  	int err;
>  
>  	down_write(&uprobe->register_rwsem);
> -	if (WARN_ON(!consumer_del(uprobe, uc))) {
> -		err = -ENOENT;
> -	} else {
> -		err = register_for_each_vma(uprobe, NULL);
> -		/* TODO : cant unregister? schedule a worker thread */
> -		if (unlikely(err))
> -			uprobe_warn(current, "unregister, leaking uprobe");
> -	}
> +
> +	list_del_rcu(&uc->cons_node);

hi,
I'm using this patchset as base for my changes and stumbled on this today,
I'm probably missing something, but should we keep the 'uprobe->consumer_rwsem'
lock around the list_del_rcu?

jirka


> +	err = register_for_each_vma(uprobe, NULL);
> +
>  	up_write(&uprobe->register_rwsem);
>  
> -	if (!err)
> -		put_uprobe(uprobe);
> +	/* TODO : cant unregister? schedule a worker thread */
> +	if (unlikely(err)) {
> +		uprobe_warn(current, "unregister, leaking uprobe");
> +		goto out_sync;
> +	}
> +
> +	put_uprobe(uprobe);
> +
> +out_sync:
> +	/*
> +	 * Now that handler_chain() and handle_uretprobe_chain() iterate over
> +	 * uprobe->consumers list under RCU protection without holding
> +	 * uprobe->register_rwsem, we need to wait for RCU grace period to
> +	 * make sure that we can't call into just unregistered
> +	 * uprobe_consumer's callbacks anymore. If we don't do that, fast and
> +	 * unlucky enough caller can free consumer's memory and cause
> +	 * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
> +	 */
> +	synchronize_srcu(&uprobes_srcu);
>  }
>  EXPORT_SYMBOL_GPL(uprobe_unregister);
>  
> @@ -1214,13 +1205,20 @@ EXPORT_SYMBOL_GPL(uprobe_register);
>  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
>  {
>  	struct uprobe_consumer *con;
> -	int ret = -ENOENT;
> +	int ret = -ENOENT, srcu_idx;
>  
>  	down_write(&uprobe->register_rwsem);
> -	for (con = uprobe->consumers; con && con != uc ; con = con->next)
> -		;
> -	if (con)
> -		ret = register_for_each_vma(uprobe, add ? uc : NULL);
> +
> +	srcu_idx = srcu_read_lock(&uprobes_srcu);
> +	list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
> +		if (con == uc) {
> +			ret = register_for_each_vma(uprobe, add ? uc : NULL);
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&uprobes_srcu, srcu_idx);
> +
>  	up_write(&uprobe->register_rwsem);
>  
>  	return ret;
> @@ -2085,10 +2083,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
>  	struct uprobe_consumer *uc;
>  	int remove = UPROBE_HANDLER_REMOVE;
>  	bool need_prep = false; /* prepare return uprobe, when needed */
> +	bool has_consumers = false;
>  
> -	down_read(&uprobe->register_rwsem);
>  	current->utask->auprobe = &uprobe->arch;
> -	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +
> +	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
>  		int rc = 0;
>  
>  		if (uc->handler) {
> @@ -2101,17 +2101,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
>  			need_prep = true;
>  
>  		remove &= rc;
> +		has_consumers = true;
>  	}
>  	current->utask->auprobe = NULL;
>  
>  	if (need_prep && !remove)
>  		prepare_uretprobe(uprobe, regs); /* put bp at return */
>  
> -	if (remove && uprobe->consumers) {
> -		WARN_ON(!uprobe_is_active(uprobe));
> -		unapply_uprobe(uprobe, current->mm);
> +	if (remove && has_consumers) {
> +		down_read(&uprobe->register_rwsem);
> +
> +		/* re-check that removal is still required, this time under lock */
> +		if (!filter_chain(uprobe, current->mm)) {
> +			WARN_ON(!uprobe_is_active(uprobe));
> +			unapply_uprobe(uprobe, current->mm);
> +		}
> +
> +		up_read(&uprobe->register_rwsem);
>  	}
> -	up_read(&uprobe->register_rwsem);
>  }
>  
>  static void
> @@ -2119,13 +2126,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
>  {
>  	struct uprobe *uprobe = ri->uprobe;
>  	struct uprobe_consumer *uc;
> +	int srcu_idx;
>  
> -	down_read(&uprobe->register_rwsem);
> -	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +	srcu_idx = srcu_read_lock(&uprobes_srcu);
> +	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
>  		if (uc->ret_handler)
>  			uc->ret_handler(uc, ri->func, regs);
>  	}
> -	up_read(&uprobe->register_rwsem);
> +	srcu_read_unlock(&uprobes_srcu, srcu_idx);
>  }
>  
>  static struct return_instance *find_next_ret_chain(struct return_instance *ri)
> -- 
> 2.43.5
>
Andrii Nakryiko Aug. 22, 2024, 4:59 p.m. UTC | #2
On Thu, Aug 22, 2024 at 7:22 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Mon, Aug 12, 2024 at 09:29:08PM -0700, Andrii Nakryiko wrote:
>
> SNIP
>
> > @@ -1125,18 +1103,31 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >       int err;
> >
> >       down_write(&uprobe->register_rwsem);
> > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > -             err = -ENOENT;
> > -     } else {
> > -             err = register_for_each_vma(uprobe, NULL);
> > -             /* TODO : cant unregister? schedule a worker thread */
> > -             if (unlikely(err))
> > -                     uprobe_warn(current, "unregister, leaking uprobe");
> > -     }
> > +
> > +     list_del_rcu(&uc->cons_node);
>
> hi,
> I'm using this patchset as base for my changes and stumbled on this today,
> I'm probably missing something, but should we keep the 'uprobe->consumer_rwsem'
> lock around the list_del_rcu?
>

Note that original code also didn't take consumer_rwsem, but rather
kept register_rwsem (which we still use).

There is a bit of a mix of using register_rwsem and consumer_rwsem for
working with the consumer list. Code hints at this as being undesirable
and "temporary", but you know, it's not broken :)

Anyways, my point is that we didn't change the behavior, this should
be fine. That _rcu() in list_del_rcu() is not about lockless
modification of the list, but rather modification in such a way as to
keep lockless RCU-protected *readers* correct. It just does some more
memory barrier/release operations more carefully.

> jirka
>
>
> > +     err = register_for_each_vma(uprobe, NULL);
> > +
> >       up_write(&uprobe->register_rwsem);
> >
> > -     if (!err)
> > -             put_uprobe(uprobe);
> > +     /* TODO : cant unregister? schedule a worker thread */
> > +     if (unlikely(err)) {
> > +             uprobe_warn(current, "unregister, leaking uprobe");
> > +             goto out_sync;
> > +     }
> > +
> > +     put_uprobe(uprobe);
> > +
Jiri Olsa Aug. 22, 2024, 5:35 p.m. UTC | #3
On Thu, Aug 22, 2024 at 09:59:29AM -0700, Andrii Nakryiko wrote:
> On Thu, Aug 22, 2024 at 7:22 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Mon, Aug 12, 2024 at 09:29:08PM -0700, Andrii Nakryiko wrote:
> >
> > SNIP
> >
> > > @@ -1125,18 +1103,31 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > >       int err;
> > >
> > >       down_write(&uprobe->register_rwsem);
> > > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > > -             err = -ENOENT;
> > > -     } else {
> > > -             err = register_for_each_vma(uprobe, NULL);
> > > -             /* TODO : cant unregister? schedule a worker thread */
> > > -             if (unlikely(err))
> > > -                     uprobe_warn(current, "unregister, leaking uprobe");
> > > -     }
> > > +
> > > +     list_del_rcu(&uc->cons_node);
> >
> > hi,
> > I'm using this patchset as base for my changes and stumbled on this today,
> > I'm probably missing something, but should we keep the 'uprobe->consumer_rwsem'
> > lock around the list_del_rcu?
> >
> 
> Note that original code also didn't take consumer_rwsem, but rather
> kept register_rwsem (which we still use).

humm, consumer_del took consumer_rwsem, right?

jirka

> 
> There is a bit of mix of using register_rwsem and consumer_rwsem for
> working with consumer list. Code hints at this as being undesirable
> and "temporary", but you know, it's not broken :)
> 
> Anyways, my point is that we didn't change the behavior, this should
> be fine. That _rcu() in list_del_rcu() is not about lockless
> modification of the list, but rather modification in such a way as to
> keep lockless RCU-protected *readers* correct. It just does some more
> memory barrier/release operations more carefully.
> 
> > jirka
> >
> >
> > > +     err = register_for_each_vma(uprobe, NULL);
> > > +
> > >       up_write(&uprobe->register_rwsem);
> > >
> > > -     if (!err)
> > > -             put_uprobe(uprobe);
> > > +     /* TODO : cant unregister? schedule a worker thread */
> > > +     if (unlikely(err)) {
> > > +             uprobe_warn(current, "unregister, leaking uprobe");
> > > +             goto out_sync;
> > > +     }
> > > +
> > > +     put_uprobe(uprobe);
> > > +
Andrii Nakryiko Aug. 22, 2024, 5:51 p.m. UTC | #4
On Thu, Aug 22, 2024 at 10:35 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Thu, Aug 22, 2024 at 09:59:29AM -0700, Andrii Nakryiko wrote:
> > On Thu, Aug 22, 2024 at 7:22 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> > >
> > > On Mon, Aug 12, 2024 at 09:29:08PM -0700, Andrii Nakryiko wrote:
> > >
> > > SNIP
> > >
> > > > @@ -1125,18 +1103,31 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > > >       int err;
> > > >
> > > >       down_write(&uprobe->register_rwsem);
> > > > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > > > -             err = -ENOENT;
> > > > -     } else {
> > > > -             err = register_for_each_vma(uprobe, NULL);
> > > > -             /* TODO : cant unregister? schedule a worker thread */
> > > > -             if (unlikely(err))
> > > > -                     uprobe_warn(current, "unregister, leaking uprobe");
> > > > -     }
> > > > +
> > > > +     list_del_rcu(&uc->cons_node);
> > >
> > > hi,
> > > I'm using this patchset as base for my changes and stumbled on this today,
> > > I'm probably missing something, but should we keep the 'uprobe->consumer_rwsem'
> > > lock around the list_del_rcu?
> > >
> >
> > Note that original code also didn't take consumer_rwsem, but rather
> > kept register_rwsem (which we still use).
>
> humm, consumer_del took consumer_rwsem, right?
>

Ah, it was inside consumer_del(), sorry, my bad. I can add nested
consumer_rwsem back, but, as I mentioned earlier, register_rwsem is
sort of interchangeable and sufficient enough for working with
consumer list, it seems. There are a bunch of places where we iterated
this list without holding consumer_rwsem lock and that doesn't break
anything.

Also, consumer_add() and consumer_del() are always called with
register_rwsem, so that consumer_rwsem isn't necessary.

We also have prepare_uprobe() holding consumer_rwsem and there is a
comment about abuse of that rwsem and suggestion to move it to
registration, I never completely understood that. But prepare_uprobe()
doesn't seem to modify consumers list at all.

And the one remaining use of consumer_rwsem is filter_chain(), which
for handler_chain() will be also called under register_rwsem, if
purely lockless traversal is not enough.

There are two other calls to filter_chain() that are not protected by
register_rwsem, so just because of those two maybe we should keep
consumer_rwsem, but so far all the stress testing never caught any
problem.


> jirka
>
> >
> > There is a bit of mix of using register_rwsem and consumer_rwsem for
> > working with consumer list. Code hints at this as being undesirable
> > and "temporary", but you know, it's not broken :)
> >
> > Anyways, my point is that we didn't change the behavior, this should
> > be fine. That _rcu() in list_del_rcu() is not about lockless
> > modification of the list, but rather modification in such a way as to
> > keep lockless RCU-protected *readers* correct. It just does some more
> > memory barrier/release operations more carefully.
> >
> > > jirka
> > >
> > >
> > > > +     err = register_for_each_vma(uprobe, NULL);
> > > > +
> > > >       up_write(&uprobe->register_rwsem);
> > > >
> > > > -     if (!err)
> > > > -             put_uprobe(uprobe);
> > > > +     /* TODO : cant unregister? schedule a worker thread */
> > > > +     if (unlikely(err)) {
> > > > +             uprobe_warn(current, "unregister, leaking uprobe");
> > > > +             goto out_sync;
> > > > +     }
> > > > +
> > > > +     put_uprobe(uprobe);
> > > > +
diff mbox series

Patch

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 9cf0dce62e4c..29c935b0d504 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -35,7 +35,7 @@  struct uprobe_consumer {
 				struct pt_regs *regs);
 	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
-	struct uprobe_consumer *next;
+	struct list_head cons_node;
 };
 
 #ifdef CONFIG_UPROBES
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8bdcdc6901b2..7de1aaf50394 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -59,7 +59,7 @@  struct uprobe {
 	struct rw_semaphore	register_rwsem;
 	struct rw_semaphore	consumer_rwsem;
 	struct list_head	pending_list;
-	struct uprobe_consumer	*consumers;
+	struct list_head	consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
 	struct rcu_head		rcu;
 	loff_t			offset;
@@ -783,6 +783,7 @@  static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 	uprobe->inode = inode;
 	uprobe->offset = offset;
 	uprobe->ref_ctr_offset = ref_ctr_offset;
+	INIT_LIST_HEAD(&uprobe->consumers);
 	init_rwsem(&uprobe->register_rwsem);
 	init_rwsem(&uprobe->consumer_rwsem);
 	RB_CLEAR_NODE(&uprobe->rb_node);
@@ -808,34 +809,10 @@  static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
 	down_write(&uprobe->consumer_rwsem);
-	uc->next = uprobe->consumers;
-	uprobe->consumers = uc;
+	list_add_rcu(&uc->cons_node, &uprobe->consumers);
 	up_write(&uprobe->consumer_rwsem);
 }
 
-/*
- * For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
- */
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
-{
-	struct uprobe_consumer **con;
-	bool ret = false;
-
-	down_write(&uprobe->consumer_rwsem);
-	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
-		if (*con == uc) {
-			*con = uc->next;
-			ret = true;
-			break;
-		}
-	}
-	up_write(&uprobe->consumer_rwsem);
-
-	return ret;
-}
-
 static int __copy_insn(struct address_space *mapping, struct file *filp,
 			void *insn, int nbytes, loff_t offset)
 {
@@ -929,7 +906,8 @@  static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
 	bool ret = false;
 
 	down_read(&uprobe->consumer_rwsem);
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		ret = consumer_filter(uc, mm);
 		if (ret)
 			break;
@@ -1125,18 +1103,31 @@  void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 	int err;
 
 	down_write(&uprobe->register_rwsem);
-	if (WARN_ON(!consumer_del(uprobe, uc))) {
-		err = -ENOENT;
-	} else {
-		err = register_for_each_vma(uprobe, NULL);
-		/* TODO : cant unregister? schedule a worker thread */
-		if (unlikely(err))
-			uprobe_warn(current, "unregister, leaking uprobe");
-	}
+
+	list_del_rcu(&uc->cons_node);
+	err = register_for_each_vma(uprobe, NULL);
+
 	up_write(&uprobe->register_rwsem);
 
-	if (!err)
-		put_uprobe(uprobe);
+	/* TODO : cant unregister? schedule a worker thread */
+	if (unlikely(err)) {
+		uprobe_warn(current, "unregister, leaking uprobe");
+		goto out_sync;
+	}
+
+	put_uprobe(uprobe);
+
+out_sync:
+	/*
+	 * Now that handler_chain() and handle_uretprobe_chain() iterate over
+	 * uprobe->consumers list under RCU protection without holding
+	 * uprobe->register_rwsem, we need to wait for RCU grace period to
+	 * make sure that we can't call into just unregistered
+	 * uprobe_consumer's callbacks anymore. If we don't do that, fast and
+	 * unlucky enough caller can free consumer's memory and cause
+	 * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
+	 */
+	synchronize_srcu(&uprobes_srcu);
 }
 EXPORT_SYMBOL_GPL(uprobe_unregister);
 
@@ -1214,13 +1205,20 @@  EXPORT_SYMBOL_GPL(uprobe_register);
 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
 {
 	struct uprobe_consumer *con;
-	int ret = -ENOENT;
+	int ret = -ENOENT, srcu_idx;
 
 	down_write(&uprobe->register_rwsem);
-	for (con = uprobe->consumers; con && con != uc ; con = con->next)
-		;
-	if (con)
-		ret = register_for_each_vma(uprobe, add ? uc : NULL);
+
+	srcu_idx = srcu_read_lock(&uprobes_srcu);
+	list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
+		if (con == uc) {
+			ret = register_for_each_vma(uprobe, add ? uc : NULL);
+			break;
+		}
+	}
+	srcu_read_unlock(&uprobes_srcu, srcu_idx);
+
 	up_write(&uprobe->register_rwsem);
 
 	return ret;
@@ -2085,10 +2083,12 @@  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 	struct uprobe_consumer *uc;
 	int remove = UPROBE_HANDLER_REMOVE;
 	bool need_prep = false; /* prepare return uprobe, when needed */
+	bool has_consumers = false;
 
-	down_read(&uprobe->register_rwsem);
 	current->utask->auprobe = &uprobe->arch;
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		int rc = 0;
 
 		if (uc->handler) {
@@ -2101,17 +2101,24 @@  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 			need_prep = true;
 
 		remove &= rc;
+		has_consumers = true;
 	}
 	current->utask->auprobe = NULL;
 
 	if (need_prep && !remove)
 		prepare_uretprobe(uprobe, regs); /* put bp at return */
 
-	if (remove && uprobe->consumers) {
-		WARN_ON(!uprobe_is_active(uprobe));
-		unapply_uprobe(uprobe, current->mm);
+	if (remove && has_consumers) {
+		down_read(&uprobe->register_rwsem);
+
+		/* re-check that removal is still required, this time under lock */
+		if (!filter_chain(uprobe, current->mm)) {
+			WARN_ON(!uprobe_is_active(uprobe));
+			unapply_uprobe(uprobe, current->mm);
+		}
+
+		up_read(&uprobe->register_rwsem);
 	}
-	up_read(&uprobe->register_rwsem);
 }
 
 static void
@@ -2119,13 +2126,15 @@  handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
 {
 	struct uprobe *uprobe = ri->uprobe;
 	struct uprobe_consumer *uc;
+	int srcu_idx;
 
-	down_read(&uprobe->register_rwsem);
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+	srcu_idx = srcu_read_lock(&uprobes_srcu);
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		if (uc->ret_handler)
 			uc->ret_handler(uc, ri->func, regs);
 	}
-	up_read(&uprobe->register_rwsem);
+	srcu_read_unlock(&uprobes_srcu, srcu_idx);
 }
 
 static struct return_instance *find_next_ret_chain(struct return_instance *ri)