[v4,4/8] uprobes: traverse uprobe's consumer list locklessly under SRCU protection

Message ID 20240829183741.3331213-5-andrii@kernel.org (mailing list archive)
State Superseded
Series uprobes: RCU-protected hot path optimizations

Commit Message

Andrii Nakryiko Aug. 29, 2024, 6:37 p.m. UTC
uprobe->register_rwsem is one of a few big bottlenecks to scalability of
uprobes, so we need to get rid of it to improve uprobe performance and
multi-CPU scalability.

First, we turn uprobe's consumer list into a typical doubly-linked list
and utilize existing RCU-aware helpers for traversing such lists, as
well as for adding and removing elements from them.

For entry uprobes we already have SRCU protection active since before
uprobe lookup. For uretprobes we keep a refcount, guaranteeing that the
uprobe won't go away from under us, but we add SRCU protection around
the consumer list traversal.

Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
we remember whether any removal was requested during handler calls, and
then double-check that decision under a proper register_rwsem using
consumers' filter callbacks. Handler removal is very rare, so this extra
lock won't hurt performance overall, while we also avoid the need for any
extra protection (e.g., seqcount locks).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/uprobes.h |   2 +-
 kernel/events/uprobes.c | 104 +++++++++++++++++++++++-----------------
 2 files changed, 62 insertions(+), 44 deletions(-)

Comments

Jiri Olsa Aug. 29, 2024, 11:09 p.m. UTC | #1
On Thu, Aug 29, 2024 at 11:37:37AM -0700, Andrii Nakryiko wrote:
> uprobe->register_rwsem is one of a few big bottlenecks to scalability of
> uprobes, so we need to get rid of it to improve uprobe performance and
> multi-CPU scalability.
> 
> First, we turn uprobe's consumer list to a typical doubly-linked list
> and utilize existing RCU-aware helpers for traversing such lists, as
> well as adding and removing elements from it.
> 
> For entry uprobes we already have SRCU protection active since before
> uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe
> won't go away from under us, but we add SRCU protection around consumer
> list traversal.
> 
> Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
> we remember whether any removal was requested during handler calls, but
> then we double-check the decision under a proper register_rwsem using
> consumers' filter callbacks. Handler removal is very rare, so this extra
> lock won't hurt performance, overall, but we also avoid the need for any
> extra protection (e.g., seqcount locks).
> 
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  include/linux/uprobes.h |   2 +-
>  kernel/events/uprobes.c | 104 +++++++++++++++++++++++-----------------
>  2 files changed, 62 insertions(+), 44 deletions(-)
> 
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index 9cf0dce62e4c..29c935b0d504 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -35,7 +35,7 @@ struct uprobe_consumer {
>  				struct pt_regs *regs);
>  	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
>  
> -	struct uprobe_consumer *next;
> +	struct list_head cons_node;
>  };
>  
>  #ifdef CONFIG_UPROBES
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 8bdcdc6901b2..97e58d160647 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -59,7 +59,7 @@ struct uprobe {
>  	struct rw_semaphore	register_rwsem;
>  	struct rw_semaphore	consumer_rwsem;
>  	struct list_head	pending_list;
> -	struct uprobe_consumer	*consumers;
> +	struct list_head	consumers;
>  	struct inode		*inode;		/* Also hold a ref to inode */
>  	struct rcu_head		rcu;
>  	loff_t			offset;
> @@ -783,6 +783,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
>  	uprobe->inode = inode;
>  	uprobe->offset = offset;
>  	uprobe->ref_ctr_offset = ref_ctr_offset;
> +	INIT_LIST_HEAD(&uprobe->consumers);
>  	init_rwsem(&uprobe->register_rwsem);
>  	init_rwsem(&uprobe->consumer_rwsem);
>  	RB_CLEAR_NODE(&uprobe->rb_node);
> @@ -808,32 +809,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
>  static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  {
>  	down_write(&uprobe->consumer_rwsem);
> -	uc->next = uprobe->consumers;
> -	uprobe->consumers = uc;
> +	list_add_rcu(&uc->cons_node, &uprobe->consumers);
>  	up_write(&uprobe->consumer_rwsem);
>  }
>  
>  /*
>   * For uprobe @uprobe, delete the consumer @uc.
> - * Return true if the @uc is deleted successfully
> - * or return false.
> + * Should never be called with consumer that's not part of @uprobe->consumers.
>   */
> -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  {
> -	struct uprobe_consumer **con;
> -	bool ret = false;
> -
>  	down_write(&uprobe->consumer_rwsem);
> -	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
> -		if (*con == uc) {
> -			*con = uc->next;
> -			ret = true;
> -			break;
> -		}
> -	}
> +	list_del_rcu(&uc->cons_node);
>  	up_write(&uprobe->consumer_rwsem);
> -
> -	return ret;
>  }
>  
>  static int __copy_insn(struct address_space *mapping, struct file *filp,
> @@ -929,7 +917,8 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
>  	bool ret = false;
>  
>  	down_read(&uprobe->consumer_rwsem);
> -	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
>  		ret = consumer_filter(uc, mm);
>  		if (ret)
>  			break;
> @@ -1125,18 +1114,29 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  	int err;
>  
>  	down_write(&uprobe->register_rwsem);
> -	if (WARN_ON(!consumer_del(uprobe, uc))) {
> -		err = -ENOENT;
> -	} else {
> -		err = register_for_each_vma(uprobe, NULL);
> -		/* TODO : cant unregister? schedule a worker thread */
> -		if (unlikely(err))
> -			uprobe_warn(current, "unregister, leaking uprobe");
> -	}
> +	consumer_del(uprobe, uc);
> +	err = register_for_each_vma(uprobe, NULL);
>  	up_write(&uprobe->register_rwsem);
>  
> -	if (!err)
> -		put_uprobe(uprobe);
> +	/* TODO : cant unregister? schedule a worker thread */
> +	if (unlikely(err)) {
> +		uprobe_warn(current, "unregister, leaking uprobe");
> +		goto out_sync;
> +	}
> +
> +	put_uprobe(uprobe);
> +
> +out_sync:
> +	/*
> +	 * Now that handler_chain() and handle_uretprobe_chain() iterate over
> +	 * uprobe->consumers list under RCU protection without holding
> +	 * uprobe->register_rwsem, we need to wait for RCU grace period to
> +	 * make sure that we can't call into just unregistered
> +	 * uprobe_consumer's callbacks anymore. If we don't do that, fast and
> +	 * unlucky enough caller can free consumer's memory and cause
> +	 * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
> +	 */
> +	synchronize_srcu(&uprobes_srcu);
>  }
>  EXPORT_SYMBOL_GPL(uprobe_unregister);
>  
> @@ -1214,13 +1214,20 @@ EXPORT_SYMBOL_GPL(uprobe_register);
>  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
>  {
>  	struct uprobe_consumer *con;
> -	int ret = -ENOENT;
> +	int ret = -ENOENT, srcu_idx;
>  
>  	down_write(&uprobe->register_rwsem);
> -	for (con = uprobe->consumers; con && con != uc ; con = con->next)
> -		;
> -	if (con)
> -		ret = register_for_each_vma(uprobe, add ? uc : NULL);
> +
> +	srcu_idx = srcu_read_lock(&uprobes_srcu);
> +	list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
> +		if (con == uc) {
> +			ret = register_for_each_vma(uprobe, add ? uc : NULL);
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&uprobes_srcu, srcu_idx);
> +
>  	up_write(&uprobe->register_rwsem);
>  
>  	return ret;
> @@ -2085,10 +2092,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
>  	struct uprobe_consumer *uc;
>  	int remove = UPROBE_HANDLER_REMOVE;
>  	bool need_prep = false; /* prepare return uprobe, when needed */
> +	bool has_consumers = false;
>  
> -	down_read(&uprobe->register_rwsem);
>  	current->utask->auprobe = &uprobe->arch;
> -	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +
> +	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
>  		int rc = 0;
>  
>  		if (uc->handler) {
> @@ -2101,17 +2110,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
>  			need_prep = true;
>  
>  		remove &= rc;
> +		has_consumers = true;
>  	}
>  	current->utask->auprobe = NULL;
>  
>  	if (need_prep && !remove)
>  		prepare_uretprobe(uprobe, regs); /* put bp at return */
>  
> -	if (remove && uprobe->consumers) {
> -		WARN_ON(!uprobe_is_active(uprobe));
> -		unapply_uprobe(uprobe, current->mm);
> +	if (remove && has_consumers) {
> +		down_read(&uprobe->register_rwsem);
> +
> +		/* re-check that removal is still required, this time under lock */
> +		if (!filter_chain(uprobe, current->mm)) {

sorry for the late question, but I do not follow this change.. 

at this point we got 1 as handler's return value from all the uprobe's consumers,
why do we need to call filter_chain in here.. IIUC this will likely skip over
the removal?

with single uprobe_multi consumer:

  handler_chain
    uprobe_multi_link_handler
      uprobe_prog_run
        bpf_prog returns 1

    remove = 1

    if (remove && has_consumers) {

      filter_chain - uprobe_multi_link_filter returns true.. so the uprobe stays?

maybe I just need to write test for it ;-)

thanks,
jirka


> +			WARN_ON(!uprobe_is_active(uprobe));
> +			unapply_uprobe(uprobe, current->mm);
> +		}
> +
> +		up_read(&uprobe->register_rwsem);
>  	}
> -	up_read(&uprobe->register_rwsem);
>  }
>  
>  static void
> @@ -2119,13 +2135,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
>  {
>  	struct uprobe *uprobe = ri->uprobe;
>  	struct uprobe_consumer *uc;
> +	int srcu_idx;
>  
> -	down_read(&uprobe->register_rwsem);
> -	for (uc = uprobe->consumers; uc; uc = uc->next) {
> +	srcu_idx = srcu_read_lock(&uprobes_srcu);
> +	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> +				 srcu_read_lock_held(&uprobes_srcu)) {
>  		if (uc->ret_handler)
>  			uc->ret_handler(uc, ri->func, regs);
>  	}
> -	up_read(&uprobe->register_rwsem);
> +	srcu_read_unlock(&uprobes_srcu, srcu_idx);
>  }
>  
>  static struct return_instance *find_next_ret_chain(struct return_instance *ri)
> -- 
> 2.43.5
>
Andrii Nakryiko Aug. 29, 2024, 11:31 p.m. UTC | #2
On Thu, Aug 29, 2024 at 4:10 PM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Thu, Aug 29, 2024 at 11:37:37AM -0700, Andrii Nakryiko wrote:
> > uprobe->register_rwsem is one of a few big bottlenecks to scalability of
> > uprobes, so we need to get rid of it to improve uprobe performance and
> > multi-CPU scalability.
> >
> > First, we turn uprobe's consumer list to a typical doubly-linked list
> > and utilize existing RCU-aware helpers for traversing such lists, as
> > well as adding and removing elements from it.
> >
> > For entry uprobes we already have SRCU protection active since before
> > uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe
> > won't go away from under us, but we add SRCU protection around consumer
> > list traversal.
> >
> > Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
> > we remember whether any removal was requested during handler calls, but
> > then we double-check the decision under a proper register_rwsem using
> > consumers' filter callbacks. Handler removal is very rare, so this extra
> > lock won't hurt performance, overall, but we also avoid the need for any
> > extra protection (e.g., seqcount locks).
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  include/linux/uprobes.h |   2 +-
> >  kernel/events/uprobes.c | 104 +++++++++++++++++++++++-----------------
> >  2 files changed, 62 insertions(+), 44 deletions(-)
> >
> > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > index 9cf0dce62e4c..29c935b0d504 100644
> > --- a/include/linux/uprobes.h
> > +++ b/include/linux/uprobes.h
> > @@ -35,7 +35,7 @@ struct uprobe_consumer {
> >                               struct pt_regs *regs);
> >       bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
> >
> > -     struct uprobe_consumer *next;
> > +     struct list_head cons_node;
> >  };
> >
> >  #ifdef CONFIG_UPROBES
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 8bdcdc6901b2..97e58d160647 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -59,7 +59,7 @@ struct uprobe {
> >       struct rw_semaphore     register_rwsem;
> >       struct rw_semaphore     consumer_rwsem;
> >       struct list_head        pending_list;
> > -     struct uprobe_consumer  *consumers;
> > +     struct list_head        consumers;
> >       struct inode            *inode;         /* Also hold a ref to inode */
> >       struct rcu_head         rcu;
> >       loff_t                  offset;
> > @@ -783,6 +783,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
> >       uprobe->inode = inode;
> >       uprobe->offset = offset;
> >       uprobe->ref_ctr_offset = ref_ctr_offset;
> > +     INIT_LIST_HEAD(&uprobe->consumers);
> >       init_rwsem(&uprobe->register_rwsem);
> >       init_rwsem(&uprobe->consumer_rwsem);
> >       RB_CLEAR_NODE(&uprobe->rb_node);
> > @@ -808,32 +809,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
> >  static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >  {
> >       down_write(&uprobe->consumer_rwsem);
> > -     uc->next = uprobe->consumers;
> > -     uprobe->consumers = uc;
> > +     list_add_rcu(&uc->cons_node, &uprobe->consumers);
> >       up_write(&uprobe->consumer_rwsem);
> >  }
> >
> >  /*
> >   * For uprobe @uprobe, delete the consumer @uc.
> > - * Return true if the @uc is deleted successfully
> > - * or return false.
> > + * Should never be called with consumer that's not part of @uprobe->consumers.
> >   */
> > -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >  {
> > -     struct uprobe_consumer **con;
> > -     bool ret = false;
> > -
> >       down_write(&uprobe->consumer_rwsem);
> > -     for (con = &uprobe->consumers; *con; con = &(*con)->next) {
> > -             if (*con == uc) {
> > -                     *con = uc->next;
> > -                     ret = true;
> > -                     break;
> > -             }
> > -     }
> > +     list_del_rcu(&uc->cons_node);
> >       up_write(&uprobe->consumer_rwsem);
> > -
> > -     return ret;
> >  }
> >
> >  static int __copy_insn(struct address_space *mapping, struct file *filp,
> > @@ -929,7 +917,8 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
> >       bool ret = false;
> >
> >       down_read(&uprobe->consumer_rwsem);
> > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > +                              srcu_read_lock_held(&uprobes_srcu)) {
> >               ret = consumer_filter(uc, mm);
> >               if (ret)
> >                       break;
> > @@ -1125,18 +1114,29 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >       int err;
> >
> >       down_write(&uprobe->register_rwsem);
> > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > -             err = -ENOENT;
> > -     } else {
> > -             err = register_for_each_vma(uprobe, NULL);
> > -             /* TODO : cant unregister? schedule a worker thread */
> > -             if (unlikely(err))
> > -                     uprobe_warn(current, "unregister, leaking uprobe");
> > -     }
> > +     consumer_del(uprobe, uc);
> > +     err = register_for_each_vma(uprobe, NULL);
> >       up_write(&uprobe->register_rwsem);
> >
> > -     if (!err)
> > -             put_uprobe(uprobe);
> > +     /* TODO : cant unregister? schedule a worker thread */
> > +     if (unlikely(err)) {
> > +             uprobe_warn(current, "unregister, leaking uprobe");
> > +             goto out_sync;
> > +     }
> > +
> > +     put_uprobe(uprobe);
> > +
> > +out_sync:
> > +     /*
> > +      * Now that handler_chain() and handle_uretprobe_chain() iterate over
> > +      * uprobe->consumers list under RCU protection without holding
> > +      * uprobe->register_rwsem, we need to wait for RCU grace period to
> > +      * make sure that we can't call into just unregistered
> > +      * uprobe_consumer's callbacks anymore. If we don't do that, fast and
> > +      * unlucky enough caller can free consumer's memory and cause
> > +      * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
> > +      */
> > +     synchronize_srcu(&uprobes_srcu);
> >  }
> >  EXPORT_SYMBOL_GPL(uprobe_unregister);
> >
> > @@ -1214,13 +1214,20 @@ EXPORT_SYMBOL_GPL(uprobe_register);
> >  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
> >  {
> >       struct uprobe_consumer *con;
> > -     int ret = -ENOENT;
> > +     int ret = -ENOENT, srcu_idx;
> >
> >       down_write(&uprobe->register_rwsem);
> > -     for (con = uprobe->consumers; con && con != uc ; con = con->next)
> > -             ;
> > -     if (con)
> > -             ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > +
> > +     srcu_idx = srcu_read_lock(&uprobes_srcu);
> > +     list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > +             if (con == uc) {
> > +                     ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > +                     break;
> > +             }
> > +     }
> > +     srcu_read_unlock(&uprobes_srcu, srcu_idx);
> > +
> >       up_write(&uprobe->register_rwsem);
> >
> >       return ret;
> > @@ -2085,10 +2092,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> >       struct uprobe_consumer *uc;
> >       int remove = UPROBE_HANDLER_REMOVE;
> >       bool need_prep = false; /* prepare return uprobe, when needed */
> > +     bool has_consumers = false;
> >
> > -     down_read(&uprobe->register_rwsem);
> >       current->utask->auprobe = &uprobe->arch;
> > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > +
> > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > +                              srcu_read_lock_held(&uprobes_srcu)) {
> >               int rc = 0;
> >
> >               if (uc->handler) {
> > @@ -2101,17 +2110,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> >                       need_prep = true;
> >
> >               remove &= rc;
> > +             has_consumers = true;
> >       }
> >       current->utask->auprobe = NULL;
> >
> >       if (need_prep && !remove)
> >               prepare_uretprobe(uprobe, regs); /* put bp at return */
> >
> > -     if (remove && uprobe->consumers) {
> > -             WARN_ON(!uprobe_is_active(uprobe));
> > -             unapply_uprobe(uprobe, current->mm);
> > +     if (remove && has_consumers) {
> > +             down_read(&uprobe->register_rwsem);
> > +
> > +             /* re-check that removal is still required, this time under lock */
> > +             if (!filter_chain(uprobe, current->mm)) {
>
> sorry for late question, but I do not follow this change..
>
> at this point we got 1 as handler's return value from all the uprobe's consumers,
> why do we need to call filter_chain in here.. IIUC this will likely skip over
> the removal?
>

Because we don't hold register_rwsem we are now racing with
registration. So even if every consumer requested removal at the time
we iterated over the consumer list, a parallel CPU can add another
consumer that needs this uprobe+PID combination. If we don't
double-check, we risk having a consumer that will not be triggered
for the desired process.

Does it make sense? Given removal is rare, it's OK to take the lock if
we *suspect* removal, and then check authoritatively again under the
lock.
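
To make that concrete, here is a minimal userspace sketch of the pattern
(pthreads; all names here are illustrative and not part of this series):
the lockless pass only produces a removal *hint*, and we act on it only
after re-validating the authoritative state under the lock.

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t register_lock = PTHREAD_RWLOCK_INITIALIZER;
static int nr_interested;	/* consumers still wanting the probe; under register_lock */
static bool probe_applied = true;

/* stand-in for filter_chain(): does any consumer still want this probe? */
static bool still_needed(void)
{
	return nr_interested > 0;
}

/* 'remove_hint' stands for the decision made during the lockless handler pass */
static void maybe_unapply(bool remove_hint)
{
	if (!remove_hint)
		return;

	pthread_rwlock_rdlock(&register_lock);
	/* re-check under the lock: a racing registration may have added a consumer */
	if (!still_needed())
		probe_applied = false;	/* stand-in for unapply_uprobe() */
	pthread_rwlock_unlock(&register_lock);
}

int main(void)
{
	maybe_unapply(true);	/* nr_interested == 0: removal confirmed */
	nr_interested = 1;	/* a racing registration showed up */
	probe_applied = true;
	maybe_unapply(true);	/* removal skipped, probe stays applied */
	return 0;
}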

> with single uprobe_multi consumer:
>
>   handler_chain
>     uprobe_multi_link_handler
>       uprobe_prog_run
>         bpf_prog returns 1
>
>     remove = 1
>
>     if (remove && has_consumers) {
>
>       filter_chain - uprobe_multi_link_filter returns true.. so the uprobe stays?
>
> maybe I just need to write test for it ;-)
>
> thanks,
> jirka
>
>
> > +                     WARN_ON(!uprobe_is_active(uprobe));
> > +                     unapply_uprobe(uprobe, current->mm);
> > +             }
> > +
> > +             up_read(&uprobe->register_rwsem);
> >       }
> > -     up_read(&uprobe->register_rwsem);
> >  }
> >
> >  static void
> > @@ -2119,13 +2135,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
> >  {
> >       struct uprobe *uprobe = ri->uprobe;
> >       struct uprobe_consumer *uc;
> > +     int srcu_idx;
> >
> > -     down_read(&uprobe->register_rwsem);
> > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > +     srcu_idx = srcu_read_lock(&uprobes_srcu);
> > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > +                              srcu_read_lock_held(&uprobes_srcu)) {
> >               if (uc->ret_handler)
> >                       uc->ret_handler(uc, ri->func, regs);
> >       }
> > -     up_read(&uprobe->register_rwsem);
> > +     srcu_read_unlock(&uprobes_srcu, srcu_idx);
> >  }
> >
> >  static struct return_instance *find_next_ret_chain(struct return_instance *ri)
> > --
> > 2.43.5
> >
Jiri Olsa Aug. 30, 2024, 1:45 p.m. UTC | #3
On Thu, Aug 29, 2024 at 04:31:18PM -0700, Andrii Nakryiko wrote:
> On Thu, Aug 29, 2024 at 4:10 PM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Thu, Aug 29, 2024 at 11:37:37AM -0700, Andrii Nakryiko wrote:
> > > uprobe->register_rwsem is one of a few big bottlenecks to scalability of
> > > uprobes, so we need to get rid of it to improve uprobe performance and
> > > multi-CPU scalability.
> > >
> > > First, we turn uprobe's consumer list to a typical doubly-linked list
> > > and utilize existing RCU-aware helpers for traversing such lists, as
> > > well as adding and removing elements from it.
> > >
> > > For entry uprobes we already have SRCU protection active since before
> > > uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe
> > > won't go away from under us, but we add SRCU protection around consumer
> > > list traversal.
> > >
> > > Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
> > > we remember whether any removal was requested during handler calls, but
> > > then we double-check the decision under a proper register_rwsem using
> > > consumers' filter callbacks. Handler removal is very rare, so this extra
> > > lock won't hurt performance, overall, but we also avoid the need for any
> > > extra protection (e.g., seqcount locks).
> > >
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > ---
> > >  include/linux/uprobes.h |   2 +-
> > >  kernel/events/uprobes.c | 104 +++++++++++++++++++++++-----------------
> > >  2 files changed, 62 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > > index 9cf0dce62e4c..29c935b0d504 100644
> > > --- a/include/linux/uprobes.h
> > > +++ b/include/linux/uprobes.h
> > > @@ -35,7 +35,7 @@ struct uprobe_consumer {
> > >                               struct pt_regs *regs);
> > >       bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
> > >
> > > -     struct uprobe_consumer *next;
> > > +     struct list_head cons_node;
> > >  };
> > >
> > >  #ifdef CONFIG_UPROBES
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index 8bdcdc6901b2..97e58d160647 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -59,7 +59,7 @@ struct uprobe {
> > >       struct rw_semaphore     register_rwsem;
> > >       struct rw_semaphore     consumer_rwsem;
> > >       struct list_head        pending_list;
> > > -     struct uprobe_consumer  *consumers;
> > > +     struct list_head        consumers;
> > >       struct inode            *inode;         /* Also hold a ref to inode */
> > >       struct rcu_head         rcu;
> > >       loff_t                  offset;
> > > @@ -783,6 +783,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
> > >       uprobe->inode = inode;
> > >       uprobe->offset = offset;
> > >       uprobe->ref_ctr_offset = ref_ctr_offset;
> > > +     INIT_LIST_HEAD(&uprobe->consumers);
> > >       init_rwsem(&uprobe->register_rwsem);
> > >       init_rwsem(&uprobe->consumer_rwsem);
> > >       RB_CLEAR_NODE(&uprobe->rb_node);
> > > @@ -808,32 +809,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
> > >  static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > >  {
> > >       down_write(&uprobe->consumer_rwsem);
> > > -     uc->next = uprobe->consumers;
> > > -     uprobe->consumers = uc;
> > > +     list_add_rcu(&uc->cons_node, &uprobe->consumers);
> > >       up_write(&uprobe->consumer_rwsem);
> > >  }
> > >
> > >  /*
> > >   * For uprobe @uprobe, delete the consumer @uc.
> > > - * Return true if the @uc is deleted successfully
> > > - * or return false.
> > > + * Should never be called with consumer that's not part of @uprobe->consumers.
> > >   */
> > > -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > > +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > >  {
> > > -     struct uprobe_consumer **con;
> > > -     bool ret = false;
> > > -
> > >       down_write(&uprobe->consumer_rwsem);
> > > -     for (con = &uprobe->consumers; *con; con = &(*con)->next) {
> > > -             if (*con == uc) {
> > > -                     *con = uc->next;
> > > -                     ret = true;
> > > -                     break;
> > > -             }
> > > -     }
> > > +     list_del_rcu(&uc->cons_node);
> > >       up_write(&uprobe->consumer_rwsem);
> > > -
> > > -     return ret;
> > >  }
> > >
> > >  static int __copy_insn(struct address_space *mapping, struct file *filp,
> > > @@ -929,7 +917,8 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
> > >       bool ret = false;
> > >
> > >       down_read(&uprobe->consumer_rwsem);
> > > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > >               ret = consumer_filter(uc, mm);
> > >               if (ret)
> > >                       break;
> > > @@ -1125,18 +1114,29 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > >       int err;
> > >
> > >       down_write(&uprobe->register_rwsem);
> > > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > > -             err = -ENOENT;
> > > -     } else {
> > > -             err = register_for_each_vma(uprobe, NULL);
> > > -             /* TODO : cant unregister? schedule a worker thread */
> > > -             if (unlikely(err))
> > > -                     uprobe_warn(current, "unregister, leaking uprobe");
> > > -     }
> > > +     consumer_del(uprobe, uc);
> > > +     err = register_for_each_vma(uprobe, NULL);
> > >       up_write(&uprobe->register_rwsem);
> > >
> > > -     if (!err)
> > > -             put_uprobe(uprobe);
> > > +     /* TODO : cant unregister? schedule a worker thread */
> > > +     if (unlikely(err)) {
> > > +             uprobe_warn(current, "unregister, leaking uprobe");
> > > +             goto out_sync;
> > > +     }
> > > +
> > > +     put_uprobe(uprobe);
> > > +
> > > +out_sync:
> > > +     /*
> > > +      * Now that handler_chain() and handle_uretprobe_chain() iterate over
> > > +      * uprobe->consumers list under RCU protection without holding
> > > +      * uprobe->register_rwsem, we need to wait for RCU grace period to
> > > +      * make sure that we can't call into just unregistered
> > > +      * uprobe_consumer's callbacks anymore. If we don't do that, fast and
> > > +      * unlucky enough caller can free consumer's memory and cause
> > > +      * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
> > > +      */
> > > +     synchronize_srcu(&uprobes_srcu);
> > >  }
> > >  EXPORT_SYMBOL_GPL(uprobe_unregister);
> > >
> > > @@ -1214,13 +1214,20 @@ EXPORT_SYMBOL_GPL(uprobe_register);
> > >  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
> > >  {
> > >       struct uprobe_consumer *con;
> > > -     int ret = -ENOENT;
> > > +     int ret = -ENOENT, srcu_idx;
> > >
> > >       down_write(&uprobe->register_rwsem);
> > > -     for (con = uprobe->consumers; con && con != uc ; con = con->next)
> > > -             ;
> > > -     if (con)
> > > -             ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > > +
> > > +     srcu_idx = srcu_read_lock(&uprobes_srcu);
> > > +     list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> > > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > > +             if (con == uc) {
> > > +                     ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > > +                     break;
> > > +             }
> > > +     }
> > > +     srcu_read_unlock(&uprobes_srcu, srcu_idx);
> > > +
> > >       up_write(&uprobe->register_rwsem);
> > >
> > >       return ret;
> > > @@ -2085,10 +2092,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> > >       struct uprobe_consumer *uc;
> > >       int remove = UPROBE_HANDLER_REMOVE;
> > >       bool need_prep = false; /* prepare return uprobe, when needed */
> > > +     bool has_consumers = false;
> > >
> > > -     down_read(&uprobe->register_rwsem);
> > >       current->utask->auprobe = &uprobe->arch;
> > > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > > +
> > > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > >               int rc = 0;
> > >
> > >               if (uc->handler) {
> > > @@ -2101,17 +2110,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> > >                       need_prep = true;
> > >
> > >               remove &= rc;
> > > +             has_consumers = true;
> > >       }
> > >       current->utask->auprobe = NULL;
> > >
> > >       if (need_prep && !remove)
> > >               prepare_uretprobe(uprobe, regs); /* put bp at return */
> > >
> > > -     if (remove && uprobe->consumers) {
> > > -             WARN_ON(!uprobe_is_active(uprobe));
> > > -             unapply_uprobe(uprobe, current->mm);
> > > +     if (remove && has_consumers) {
> > > +             down_read(&uprobe->register_rwsem);
> > > +
> > > +             /* re-check that removal is still required, this time under lock */
> > > +             if (!filter_chain(uprobe, current->mm)) {
> >
> > sorry for late question, but I do not follow this change..
> >
> > at this point we got 1 as handler's return value from all the uprobe's consumers,
> > why do we need to call filter_chain in here.. IIUC this will likely skip over
> > the removal?
> >
> 
> Because we don't hold register_rwsem we are now racing with
> registration. So while we can get all consumers at the time we were
> iterating over the consumer list to request deletion, a parallel CPU
> can add another consumer that needs this uprobe+PID combination. So if
> we don't double-check, we are risking having a consumer that will not
> be triggered for the desired process.
> 
> Does it make sense? Given removal is rare, it's ok to take lock if we
> *suspect* removal, and then check authoritatively again under lock.

with this change the probe will not get removed in the attached test,
it'll get 2 hits, without this change just 1 hit

but I'm not sure it's a big problem, because it seems that's not the
intended way the removal should be used anyway, as explained by Oleg [1]

jirka


[1] https://lore.kernel.org/linux-trace-kernel/ZtHKTtn7sqaLeVxV@krava/T/#m07cdc37307cfd06f17f5755a067c9b300a19ee78

---
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index bf6ca8e3eb13..86d37a8e6169 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -6,6 +6,7 @@
 #include "uprobe_multi.skel.h"
 #include "uprobe_multi_bench.skel.h"
 #include "uprobe_multi_usdt.skel.h"
+#include "uprobe_multi_removal.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "testing_helpers.h"
 #include "../sdt.h"
@@ -687,6 +688,28 @@ static void test_bench_attach_usdt(void)
 	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
 }
 
+static void test_removal(void)
+{
+	struct uprobe_multi_removal *skel = NULL;
+	int err;
+
+	skel = uprobe_multi_removal__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi_removal__open_and_load"))
+		return;
+
+	err = uprobe_multi_removal__attach(skel);
+	if (!ASSERT_OK(err, "uprobe_multi_removal__attach"))
+		goto cleanup;
+
+	uprobe_multi_func_1();
+	uprobe_multi_func_1();
+
+	ASSERT_EQ(skel->bss->test, 1, "test");
+
+cleanup:
+	uprobe_multi_removal__destroy(skel);
+}
+
 void test_uprobe_multi_test(void)
 {
 	if (test__start_subtest("skel_api"))
@@ -703,4 +726,6 @@ void test_uprobe_multi_test(void)
 		test_bench_attach_usdt();
 	if (test__start_subtest("attach_api_fails"))
 		test_attach_api_fails();
+	if (test__start_subtest("removal"))
+		test_removal();
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_removal.c b/tools/testing/selftests/bpf/progs/uprobe_multi_removal.c
new file mode 100644
index 000000000000..0a948cc1e05b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_removal.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/usdt.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+int test;
+
+SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_1")
+int uprobe(struct pt_regs *ctx)
+{
+	test++;
+	return 1;
+}
Oleg Nesterov Aug. 30, 2024, 2:18 p.m. UTC | #4
On 08/29, Andrii Nakryiko wrote:
>
> On Thu, Aug 29, 2024 at 4:10 PM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > > @@ -2101,17 +2110,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> > >                       need_prep = true;
> > >
> > >               remove &= rc;
> > > +             has_consumers = true;
> > >       }
> > >       current->utask->auprobe = NULL;
> > >
> > >       if (need_prep && !remove)
> > >               prepare_uretprobe(uprobe, regs); /* put bp at return */
> > >
> > > -     if (remove && uprobe->consumers) {
> > > -             WARN_ON(!uprobe_is_active(uprobe));
> > > -             unapply_uprobe(uprobe, current->mm);
> > > +     if (remove && has_consumers) {
> > > +             down_read(&uprobe->register_rwsem);
> > > +
> > > +             /* re-check that removal is still required, this time under lock */
> > > +             if (!filter_chain(uprobe, current->mm)) {
> >
> > sorry for late question, but I do not follow this change..
> >
> > at this point we got 1 as handler's return value from all the uprobe's consumers,
> > why do we need to call filter_chain in here.. IIUC this will likely skip over
> > the removal?
> >
>
> Because we don't hold register_rwsem we are now racing with
> registration. So while we can get all consumers at the time we were
> iterating over the consumer list to request deletion, a parallel CPU
> can add another consumer that needs this uprobe+PID combination. So if
> we don't double-check, we are risking having a consumer that will not
> be triggered for the desired process.

Oh, yes, but this logic is wrong in that it assumes that uc->filter != NULL.
At least it adds a noticeable change in behaviour.

Suppose we have a single consumer UC with ->filter == NULL. Now suppose
that UC->handler() returns UPROBE_HANDLER_REMOVE.

Before this patch handler_chain() calls unapply_uprobe(), and I think
we should keep this behaviour.

After this patch unapply_uprobe() won't be called: consumer_filter(UC)
returns true, since UC->filter == NULL means "probe everything". But I
think that UPROBE_HANDLER_REMOVE must be respected in this case anyway.
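
(For reference, and modulo exact details, this is because consumer_filter()
in kernel/events/uprobes.c boils down to roughly the following, so a NULL
->filter is treated as "match every mm":)

static bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
	/* no filter callback means the consumer wants every mm */
	return !uc->filter || uc->filter(uc, mm);
}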

Thanks Jiri, I missed that too :/

Oleg.
Oleg Nesterov Aug. 30, 2024, 2:31 p.m. UTC | #5
On 08/30, Jiri Olsa wrote:
>
> with this change the probe will not get removed in the attached test,
> it'll get 2 hits, without this change just 1 hit

I don't understand the code in tools/...bpf../ at all, can't comment,

> but I'm not sure it's a big problem, because seems like that's not the
> intended way the removal should be used anyway, as explained by Oleg [1]

It seems that I confused you again ;)

No, I think you found a problem. UPROBE_HANDLER_REMOVE can be lost if
uc->filter == NULL or if it returns true. See another reply I sent a
minute ago.

I think the fix is simple, plus we need to cleanup this logic anyway,
I'll try to send some code on Monday.

Oleg.
Andrii Nakryiko Aug. 30, 2024, 3:44 p.m. UTC | #6
On Fri, Aug 30, 2024 at 7:33 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 08/30, Jiri Olsa wrote:
> >
> > with this change the probe will not get removed in the attached test,
> > it'll get 2 hits, without this change just 1 hit
>
> I don't understand the code in tools/...bpf../ at all, can't comment,
>
> > but I'm not sure it's a big problem, because seems like that's not the
> > intended way the removal should be used anyway, as explained by Oleg [1]
>
> It seems that I confused you again ;)
>
> No, I think you found a problem. UPROBE_HANDLER_REMOVE can be lost if
> uc->filter == NULL of if it returns true. See another reply I sent a
> minute ago.
>

For better or worse, I think there is (or has to be) an implicit
contract that if a uprobe (or uretprobe, for that matter, but that's a
separate issue) handler can return UPROBE_HANDLER_REMOVE, then it *has
to* also provide a filter. If it doesn't provide a filter callback, it
doesn't care about PID filtering and thus can't and shouldn't cause
unregistration.

In an ideal world, we wouldn't need the handler to do the filtering;
instead, generic uprobe/uretprobe code would just call uc->filter to
know whether to trigger the consumer or not. Unfortunately, that's a lot
of overhead due to the indirect function call, especially with retpolines
and the like.

So I think it's reasonable to have an (implicit, yeah...) contract that
whoever cares about UPROBE_HANDLER_REMOVE has to provide a filter;
they go together.
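
As a rough illustration of that contract (hypothetical names, a sketch
assuming <linux/uprobes.h>, not code from this series), a consumer whose
handler may return UPROBE_HANDLER_REMOVE would pair it with a filter that
reflects the same decision:

#include <linux/uprobes.h>

struct my_consumer {
	struct uprobe_consumer uc;
	bool done;	/* set once this consumer no longer needs the probe */
};

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	struct my_consumer *mc = container_of(self, struct my_consumer, uc);

	/* ... consume the event, decide we are done with this process ... */
	mc->done = true;
	return UPROBE_HANDLER_REMOVE;
}

static bool my_filter(struct uprobe_consumer *self, struct mm_struct *mm)
{
	struct my_consumer *mc = container_of(self, struct my_consumer, uc);

	/* returning false here "confirms" the removal requested by my_handler() */
	return !mc->done;
}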

Jiri, the fact that uprobe/uretprobe can cause detachment by returning
1 is a bug, we should not allow that. But that's a separate issue
which we can fix in bpf-next tree. Please send a patch.

> I think the fix is simple, plus we need to cleanup this logic anyway,
> I'll try to send some code on Monday.

Can we please let me land these patches first? It's been a while. I
don't think anything is really broken with the logic.

>
> Oleg.
>
Oleg Nesterov Aug. 30, 2024, 8:20 p.m. UTC | #7
On 08/30, Andrii Nakryiko wrote:
>

Andrii, let me reply to your email "out of order". First of all:

> Can we please let me land these patches first? It's been a while. I
> don't think anything is really broken with the logic.

OK, agreed.

I'll probably write another email (too late for me today), but I agree
that "avoid register_rwsem in handler_chain" is obviously a good goal,
let's discuss the possible cleanups or even fixlets later, when this
series is already applied.



> On Fri, Aug 30, 2024 at 7:33 AM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > No, I think you found a problem. UPROBE_HANDLER_REMOVE can be lost if
> > uc->filter == NULL of if it returns true. See another reply I sent a
> > minute ago.
> >
>
> For better or worse, but I think there is (or has to be) and implicit
> contract that if uprobe (or uretprobe for that matter as well, but
> that's a separate issue) handler can return UPROBE_HANDLER_REMOVE,
> then it *has to* also provide filter.

IOW, uc->handler and uc->filter must be consistent. But the current API
doesn't require this contract, so this patch adds a difference which I
didn't notice when I reviewed this change.

(In fact I noticed the difference, but I thought that it should be fine).

> If it doesn't provide filter
> callback, it doesn't care about PID filtering and thus can't and
> shouldn't cause unregistration.

At first glance I disagree, but see above.

> > I think the fix is simple, plus we need to cleanup this logic anyway,
> > I'll try to send some code on Monday.

Damn I am stupid. Nothing new ;) The "simple" fix I had in mind can't work.
But we can do other things which we can discuss later.

Oleg.
Andrii Nakryiko Aug. 30, 2024, 8:43 p.m. UTC | #8
On Fri, Aug 30, 2024 at 1:21 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 08/30, Andrii Nakryiko wrote:
> >
>
> Andrii, let me reply to your email "out of order". First of all:
>
> > Can we please let me land these patches first? It's been a while. I
> > don't think anything is really broken with the logic.
>
> OK, agreed.
>
> I'll probably write another email (too late for me today), but I agree
> that "avoid register_rwsem in handler_chain" is obviously a good goal,
> lets discuss the possible cleanups or even fixlets later, when this
> series is already applied.
>

Sounds good. It seems like I'll need another revision due to a missing
include, so if there is any reasonably straightforward clean up we
should do, I can just incorporate that into my series.

>
>
> > On Fri, Aug 30, 2024 at 7:33 AM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > No, I think you found a problem. UPROBE_HANDLER_REMOVE can be lost if
> > > uc->filter == NULL of if it returns true. See another reply I sent a
> > > minute ago.
> > >
> >
> > For better or worse, but I think there is (or has to be) and implicit
> > contract that if uprobe (or uretprobe for that matter as well, but
> > that's a separate issue) handler can return UPROBE_HANDLER_REMOVE,
> > then it *has to* also provide filter.
>
> IOW, uc->handler and uc->filter must be consistent. But the current API
> doesn't require this contract, so this patch adds a difference which I
> didn't notice when I reviewed this change.
>
> (In fact I noticed the difference, but I thought that it should be fine).
>
> > If it doesn't provide filter
> > callback, it doesn't care about PID filtering and thus can't and
> > shouldn't cause unregistration.
>
> At first glance I disagree, but see above.

I still think it's fine, tbh. Which uprobe user violates this contract
in the kernel? Maybe we should just fix that while at it? Because
otherwise we are allowing some frankly too-dynamic and unnecessarily
complicated behavior where we can dynamically unsubscribe without
actually having corresponding filter logic.

As I mentioned earlier, I actually considered calling filter
explicitly to enforce this contract, but then got concerned about
indirect callback overhead and dropped that idea.

>
> > > I think the fix is simple, plus we need to cleanup this logic anyway,
> > > I'll try to send some code on Monday.
>
> Damn I am stupid. Nothing new ;) The "simple" fix I had in mind can't work.
> But we can do other things which we can discuss later.
>

I actually don't see how anything reasonably simple and
straightforward (short of just taking register_rwsem) can work if we
want this weird out-of-sync filter and dynamic UPROBE_HANDLER_REMOVE
behavior to remain. But again, does anyone actually rely on that and
should it be even allowed?

> Oleg.
>
Oleg Nesterov Aug. 31, 2024, 4:19 p.m. UTC | #9
On 08/30, Andrii Nakryiko wrote:
>
> On Fri, Aug 30, 2024 at 1:21 PM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > I'll probably write another email (too late for me today), but I agree
> > that "avoid register_rwsem in handler_chain" is obviously a good goal,
> > lets discuss the possible cleanups or even fixlets later, when this
> > series is already applied.
> >
>
> Sounds good. It seems like I'll need another revision due to missing
> include, so if there is any reasonably straightforward clean up we
> should do, I can just incorporate that into my series.

I was thinking about another seq counter incremented in register(), so
that handler_chain() can detect the race with uprobe_register() and skip
unapply_uprobe() in this case. This is what Peter did in one of his series.
Still changes the current behaviour, but not too much.

But see below,

> I still think it's fine, tbh.

and perhaps you are right,

> Which uprobe user violates this contract
> in the kernel?

The only in-kernel user of UPROBE_HANDLER_REMOVE is perf, and it is fine.

But there are out-of-tree users, say systemtap, and I have no idea if
this change can affect them.

And in general, this change makes the API less "flexible".

But once again, I agree that it would be better to apply your series first,
then add the fixes in the (unlikely) case it breaks something.

But. Since you are going to send another version, may I ask you to add a
note into the changelog to explain that this patch assumes (and enforces)
the rule about handler/filter consistency?

Oleg.
Oleg Nesterov Aug. 31, 2024, 5:25 p.m. UTC | #10
On 08/30, Jiri Olsa wrote:
>
> with this change the probe will not get removed in the attached test,
> it'll get 2 hits, without this change just 1 hit

Thanks again for pointing out the subtle change in behaviour, but could
you add more details for me? ;)

I was going to read the test below today, but no. As I said many times,
I know nothing about bpf; I simply can't understand what this test-case
actually does in kernel-space.

According to git grep, the only in-kernel user of UPROBE_HANDLER_REMOVE
is uprobe_perf_func(), but if it returns UPROBE_HANDLER_REMOVE then
consumer->filter == uprobe_perf_filter() should return false?

So could you explain how/why exactly this change affects your test-case?


But perhaps it uses bpf_uprobe_multi_link_attach() and ->handler is
uprobe_multi_link_handler()? But uprobe_prog_run() returns zero if
current->mm != link->task->mm.

OTOH, otherwise it returns the error code from bpf_prog_run() and this looks
confusing to me. I have no idea what prog->bpf_func(ctx, insnsi) can return
in this case, but note the WARN(rc & ~UPROBE_HANDLER_MASK) in handler_chain...

Hmm... looking at your test-case again,

> +SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_1")
> +int uprobe(struct pt_regs *ctx)
> +{
> +	test++;
> +	return 1;
> +}

So maybe this (compiled to ebpf) is what prog->bpf_func() actually executes?
If yes, everything is clear. And this "proves" that the patch makes the current
API less flexible, as I mentioned in my reply to Andrii.

If I got it right, I'd suggest adding a comment to this code to explain
that we return UPROBE_HANDLER_REMOVE after the 1st hit, for git-grep.

Oleg.
Jiri Olsa Sept. 1, 2024, 9:24 a.m. UTC | #11
On Sat, Aug 31, 2024 at 07:25:44PM +0200, Oleg Nesterov wrote:
> On 08/30, Jiri Olsa wrote:
> >
> > with this change the probe will not get removed in the attached test,
> > it'll get 2 hits, without this change just 1 hit
> 
> Thanks again for pointing out the subtle change in behaviour, but could
> you add more details for me? ;)
> 
> I was going to read the test below today, but no. As I said many times
> I know nothing about bpf, I simply can't understand what this test-case
> actually do in kernel-space.
> 
> According to git grep, the only in kernel user of UPROBE_HANDLER_REMOVE
> is uprobe_perf_func(), but if it returns UPROBE_HANDLER_REMOVE then
> consumer->filter == uprobe_perf_filter() should return false?
> 
> So could you explay how/why exactly this changes affects your test-case?
> 
> 
> But perhaps it uses bpf_uprobe_multi_link_attach() and ->handler is
> uprobe_multi_link_handler() ? But uprobe_prog_run() returns zero if
> current->mm != link->task->mm.
> 
> OTOH, otherwise it returns the error code from bpf_prog_run() and this looks
> confusing to me. I have no idea what prog->bpf_func(ctx, insnsi) can return
> in this case, but note the WARN(rc & ~UPROBE_HANDLER_MASK) in handler_chain...
> 
> Hmm... looking at your test-case again,
> 
> > +SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_1")
> > +int uprobe(struct pt_regs *ctx)
> > +{
> > +	test++;
> > +	return 1;
> > +}
> 
> So may be this (compiled to ebpf) is what prog->bpf_func() actually executes?

yep, that's correct, it goes like:

  uprobe_multi_link_handler
    uprobe_prog_run
    {
      err = bpf_prog_run - runs above bpf program and returns its return
                           value (1 - UPROBE_HANDLER_REMOVE)

      return err;
    }       

> If yes, everything is clear. And this "proves" that the patch makes the current
> API less flexible, as I mentioned in my reply to Andrii.
> 
> If I got it right, I'd suggest to add a comment into this code to explain
> that we return UPROBE_HANDLER_REMOVE after the 1st hit, for git-grep.

ok, I'll add a comment about that.
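
E.g. something along these lines (just a sketch of where such a comment
could go, not the actual follow-up patch):

SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_1")
int uprobe(struct pt_regs *ctx)
{
	test++;
	/*
	 * Returning 1 (UPROBE_HANDLER_REMOVE) asks the kernel to detach
	 * this uprobe for the current process after the first hit.
	 */
	return 1;
}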

thanks,
jirka
Jiri Olsa Sept. 2, 2024, 9:14 a.m. UTC | #12
On Sat, Aug 31, 2024 at 06:19:15PM +0200, Oleg Nesterov wrote:
> On 08/30, Andrii Nakryiko wrote:
> >
> > On Fri, Aug 30, 2024 at 1:21 PM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > I'll probably write another email (too late for me today), but I agree
> > > that "avoid register_rwsem in handler_chain" is obviously a good goal,
> > > lets discuss the possible cleanups or even fixlets later, when this
> > > series is already applied.
> > >
> >
> > Sounds good. It seems like I'll need another revision due to missing
> > include, so if there is any reasonably straightforward clean up we
> > should do, I can just incorporate that into my series.
> 
> I was thinking about another seq counter incremented in register(), so
> that handler_chain() can detect the race with uprobe_register() and skip
> unapply_uprobe() in this case. This is what Peter did in one of his series.
> Still changes the current behaviour, but not too much.
> 
> But see below,
> 
> > I still think it's fine, tbh.
> 
> and perhaps you are right,
> 
> > Which uprobe user violates this contract
> > in the kernel?
> 
> The only in-kernel user of UPROBE_HANDLER_REMOVE is perf, and it is fine.
> 
> But there are out-of-tree users, say systemtap, I have no idea if this
> change can affect them.
> 
> And in general, this change makes the API less "flexible".
> 
> But once again, I agree that it would be better to apply your series first,
> then add the fixes in (unlikely) case it breaks something.

FWIW I (strongly) agree with merging this change and fixing the rest as a follow-up

thanks,
jirka

> 
> But. Since you are going to send another version, may I ask you to add a
> note into the changelog to explain that this patch assumes (and enforces)
> the rule about handler/filter consistency?
> 
> Oleg.
>
Andrii Nakryiko Sept. 3, 2024, 5:27 p.m. UTC | #13
On Sat, Aug 31, 2024 at 9:19 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 08/30, Andrii Nakryiko wrote:
> >
> > On Fri, Aug 30, 2024 at 1:21 PM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > I'll probably write another email (too late for me today), but I agree
> > > that "avoid register_rwsem in handler_chain" is obviously a good goal,
> > > lets discuss the possible cleanups or even fixlets later, when this
> > > series is already applied.
> > >
> >
> > Sounds good. It seems like I'll need another revision due to missing
> > include, so if there is any reasonably straightforward clean up we
> > should do, I can just incorporate that into my series.
>
> I was thinking about another seq counter incremented in register(), so
> that handler_chain() can detect the race with uprobe_register() and skip
> unapply_uprobe() in this case. This is what Peter did in one of his series.
> Still changes the current behaviour, but not too much.

We could do that, but then, worst case, when we do detect a registration
race, what do we do? We still have to do the same thing. So instead of
polluting the logic with a seq counter, it's best to just codify the
protocol and take advantage of it.

But as you said, this all can/should be addressed as a follow-up
discussion. You mentioned some cleanups you wanted to do; let's
discuss all that as part of that?

>
> But see below,
>
> > I still think it's fine, tbh.
>
> and perhaps you are right,
>
> > Which uprobe user violates this contract
> > in the kernel?
>
> The only in-kernel user of UPROBE_HANDLER_REMOVE is perf, and it is fine.
>

Well, a BPF program can accidentally trigger this as well, but that's a
bug; we should fix it ASAP in the bpf tree.


> But there are out-of-tree users, say systemtap, I have no idea if this
> change can affect them.
>
> And in general, this change makes the API less "flexible".

It maybe makes a weird and too-flexible case a bit more work to
implement, because if a consumer wants to be that flexible, it can
still define a filter that is coordinated with its handler()
implementation.

>
> But once again, I agree that it would be better to apply your series first,
> then add the fixes in (unlikely) case it breaks something.

Yep, agreed, thanks! Will send a new version ASAP, so we have a common
base to work on top of.

>
> But. Since you are going to send another version, may I ask you to add a
> note into the changelog to explain that this patch assumes (and enforces)
> the rule about handler/filter consistency?

Yep, will do. I will also leave a comment next to the filter callback
definition in uprobe_consumer about this.

>
> Oleg.
>
Andrii Nakryiko Sept. 3, 2024, 5:35 p.m. UTC | #14
On Tue, Sep 3, 2024 at 10:27 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Sat, Aug 31, 2024 at 9:19 AM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > On 08/30, Andrii Nakryiko wrote:
> > >
> > > On Fri, Aug 30, 2024 at 1:21 PM Oleg Nesterov <oleg@redhat.com> wrote:
> > > >
> > > > I'll probably write another email (too late for me today), but I agree
> > > > that "avoid register_rwsem in handler_chain" is obviously a good goal,
> > > > lets discuss the possible cleanups or even fixlets later, when this
> > > > series is already applied.
> > > >
> > >
> > > Sounds good. It seems like I'll need another revision due to missing
> > > include, so if there is any reasonably straightforward clean up we
> > > should do, I can just incorporate that into my series.
> >
> > I was thinking about another seq counter incremented in register(), so
> > that handler_chain() can detect the race with uprobe_register() and skip
> > unapply_uprobe() in this case. This is what Peter did in one of his series.
> > Still changes the current behaviour, but not too much.
>
> We could do that, but then worst case, when we do detect registration
> race, what do we do? We still have to do the same. So instead of
> polluting the logic with seq counter it's best to just codify the
> protocol and take advantage of that.
>
> But as you said, this all can/should be addressed as a follow up
> discussion. You mentioned some clean ups you wanted to do, let's
> discuss all that as part of that?
>
> >
> > But see below,
> >
> > > I still think it's fine, tbh.
> >
> > and perhaps you are right,
> >
> > > Which uprobe user violates this contract
> > > in the kernel?
> >
> > The only in-kernel user of UPROBE_HANDLER_REMOVE is perf, and it is fine.
> >
>
> Well, BPF program can accidentally trigger this as well, but that's a
> bug, we should fix it ASAP in the bpf tree.
>
>
> > But there are out-of-tree users, say systemtap, I have no idea if this
> > change can affect them.
> >
> > And in general, this change makes the API less "flexible".
>
> it maybe makes a weird and too-flexible case a bit more work to
> implement. Because if consumer want to be that flexible, they can
> still define filter that will be coordinated between filter() and
> handler() implementation.
>
> >
> > But once again, I agree that it would be better to apply your series first,
> > then add the fixes in (unlikely) case it breaks something.
>
> Yep, agreed, thanks! Will send a new version ASAP, so we have a common
> base to work on top of.
>
> >
> > But. Since you are going to send another version, may I ask you to add a
> > note into the changelog to explain that this patch assumes (and enforces)
> > the rule about handler/filter consistency?
>
> Yep, will do. I will also leave a comment next to the filter callback
> definition in uprobe_consumer about this.
>

Ok, I'm adding this:

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 29c935b0d504..33236d689d60 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -29,6 +29,14 @@ struct page;
 #define MAX_URETPROBE_DEPTH            64

 struct uprobe_consumer {
+       /*
+        * handler() can return UPROBE_HANDLER_REMOVE to signal the need to
+        * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is
+        * returned, filter() callback has to be implemented as well and it
+        * should return false to "confirm" the decision to uninstall uprobe
+        * for the current process. If filter() is omitted or returns true,
+        * UPROBE_HANDLER_REMOVE is effectively ignored.
+        */
        int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
        int (*ret_handler)(struct uprobe_consumer *self,
                                unsigned long func,


> >
> > Oleg.
> >
Oleg Nesterov Sept. 3, 2024, 6:25 p.m. UTC | #15
On 09/03, Andrii Nakryiko wrote:
>
> On Sat, Aug 31, 2024 at 9:19 AM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > I was thinking about another seq counter incremented in register(), so
> > that handler_chain() can detect the race with uprobe_register() and skip
> > unapply_uprobe() in this case. This is what Peter did in one of his series.
> > Still changes the current behaviour, but not too much.
>
> We could do that, but then worst case, when we do detect registration
> race, what do we do?

Do nothing and skip unapply_uprobe().
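
Roughly something like this sketch (illustrative only; register_seq is
a hypothetical field, nothing your series adds):

/* hypothetical new field in struct uprobe: unsigned int register_seq; */

/* consumer_add() additionally bumps the counter on every registration */
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
	down_write(&uprobe->consumer_rwsem);
	WRITE_ONCE(uprobe->register_seq, uprobe->register_seq + 1);
	list_add_rcu(&uc->cons_node, &uprobe->consumers);
	up_write(&uprobe->consumer_rwsem);
}

/* and in handler_chain(), instead of re-checking filter_chain() under the lock: */

	unsigned int seq = READ_ONCE(uprobe->register_seq);

	/* ... run the consumers, accumulate 'remove' ... */

	if (remove && has_consumers && seq == READ_ONCE(uprobe->register_seq))
		unapply_uprobe(uprobe, current->mm);
	/* else: uprobe_register() raced with us, do nothing and keep the uprobe */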

> But as you said, this all can/should be addressed as a follow up
> discussion.

Yes, yes,

> You mentioned some clean ups you wanted to do, let's
> discuss all that as part of that?

Yes, sure.

And please note that in reply to myself I also mentioned that I am stupid
and these cleanups won't help change/improve this behaviour ;)

> > The only in-kernel user of UPROBE_HANDLER_REMOVE is perf, and it is fine.
> >
>
> Well, BPF program can accidentally trigger this as well, but that's a
> bug, we should fix it ASAP in the bpf tree.

not sure, but...

> > And in general, this change makes the API less "flexible".
>
> it maybe makes a weird and too-flexible case a bit more work to
> implement. If a consumer wants to be that flexible, it can still
> define a filter() that is coordinated with its handler()
> implementation.

perhaps, but let's discuss this later, on top of your series.

> > But once again, I agree that it would be better to apply your series first,
> > then add the fixes in (unlikely) case it breaks something.
>
> Yep, agreed, thanks! Will send a new version ASAP, so we have a common
> base to work on top of.

Thanks. Hopefully Peter will queue your V5 soon.

Oleg.
Oleg Nesterov Sept. 3, 2024, 6:27 p.m. UTC | #16
On 09/03, Andrii Nakryiko wrote:
>
> On Tue, Sep 3, 2024 at 10:27 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Sat, Aug 31, 2024 at 9:19 AM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > But. Since you are going to send another version, may I ask you to add a
> > > note into the changelog to explain that this patch assumes (and enforces)
> > > the rule about handler/filter consistency?
> >
> > Yep, will do. I will also leave a comment next to the filter callback
> > definition in uprobe_consumer about this.
> >
>
> Ok, I'm adding this:
>
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index 29c935b0d504..33236d689d60 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -29,6 +29,14 @@ struct page;
>  #define MAX_URETPROBE_DEPTH            64
> 
>  struct uprobe_consumer {
> +       /*
> +        * handler() can return UPROBE_HANDLER_REMOVE to signal the need to
> +        * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is
> +        * returned, filter() callback has to be implemented as well and it
> +        * should return false to "confirm" the decision to uninstall uprobe
> +        * for the current process. If filter() is omitted or returns true,
> +        * UPROBE_HANDLER_REMOVE is effectively ignored.
> +        */

Thanks, LGTM.

Oleg.

Patch

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 9cf0dce62e4c..29c935b0d504 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -35,7 +35,7 @@  struct uprobe_consumer {
 				struct pt_regs *regs);
 	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
-	struct uprobe_consumer *next;
+	struct list_head cons_node;
 };
 
 #ifdef CONFIG_UPROBES
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8bdcdc6901b2..97e58d160647 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -59,7 +59,7 @@  struct uprobe {
 	struct rw_semaphore	register_rwsem;
 	struct rw_semaphore	consumer_rwsem;
 	struct list_head	pending_list;
-	struct uprobe_consumer	*consumers;
+	struct list_head	consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
 	struct rcu_head		rcu;
 	loff_t			offset;
@@ -783,6 +783,7 @@  static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 	uprobe->inode = inode;
 	uprobe->offset = offset;
 	uprobe->ref_ctr_offset = ref_ctr_offset;
+	INIT_LIST_HEAD(&uprobe->consumers);
 	init_rwsem(&uprobe->register_rwsem);
 	init_rwsem(&uprobe->consumer_rwsem);
 	RB_CLEAR_NODE(&uprobe->rb_node);
@@ -808,32 +809,19 @@  static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
 	down_write(&uprobe->consumer_rwsem);
-	uc->next = uprobe->consumers;
-	uprobe->consumers = uc;
+	list_add_rcu(&uc->cons_node, &uprobe->consumers);
 	up_write(&uprobe->consumer_rwsem);
 }
 
 /*
  * For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
+ * Should never be called with consumer that's not part of @uprobe->consumers.
  */
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
-	struct uprobe_consumer **con;
-	bool ret = false;
-
 	down_write(&uprobe->consumer_rwsem);
-	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
-		if (*con == uc) {
-			*con = uc->next;
-			ret = true;
-			break;
-		}
-	}
+	list_del_rcu(&uc->cons_node);
 	up_write(&uprobe->consumer_rwsem);
-
-	return ret;
 }
 
 static int __copy_insn(struct address_space *mapping, struct file *filp,
@@ -929,7 +917,8 @@  static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
 	bool ret = false;
 
 	down_read(&uprobe->consumer_rwsem);
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		ret = consumer_filter(uc, mm);
 		if (ret)
 			break;
@@ -1125,18 +1114,29 @@  void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 	int err;
 
 	down_write(&uprobe->register_rwsem);
-	if (WARN_ON(!consumer_del(uprobe, uc))) {
-		err = -ENOENT;
-	} else {
-		err = register_for_each_vma(uprobe, NULL);
-		/* TODO : cant unregister? schedule a worker thread */
-		if (unlikely(err))
-			uprobe_warn(current, "unregister, leaking uprobe");
-	}
+	consumer_del(uprobe, uc);
+	err = register_for_each_vma(uprobe, NULL);
 	up_write(&uprobe->register_rwsem);
 
-	if (!err)
-		put_uprobe(uprobe);
+	/* TODO : cant unregister? schedule a worker thread */
+	if (unlikely(err)) {
+		uprobe_warn(current, "unregister, leaking uprobe");
+		goto out_sync;
+	}
+
+	put_uprobe(uprobe);
+
+out_sync:
+	/*
+	 * Now that handler_chain() and handle_uretprobe_chain() iterate over
+	 * uprobe->consumers list under RCU protection without holding
+	 * uprobe->register_rwsem, we need to wait for RCU grace period to
+	 * make sure that we can't call into just unregistered
+	 * uprobe_consumer's callbacks anymore. If we don't do that, fast and
+	 * unlucky enough caller can free consumer's memory and cause
+	 * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
+	 */
+	synchronize_srcu(&uprobes_srcu);
 }
 EXPORT_SYMBOL_GPL(uprobe_unregister);
 
@@ -1214,13 +1214,20 @@  EXPORT_SYMBOL_GPL(uprobe_register);
 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
 {
 	struct uprobe_consumer *con;
-	int ret = -ENOENT;
+	int ret = -ENOENT, srcu_idx;
 
 	down_write(&uprobe->register_rwsem);
-	for (con = uprobe->consumers; con && con != uc ; con = con->next)
-		;
-	if (con)
-		ret = register_for_each_vma(uprobe, add ? uc : NULL);
+
+	srcu_idx = srcu_read_lock(&uprobes_srcu);
+	list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
+		if (con == uc) {
+			ret = register_for_each_vma(uprobe, add ? uc : NULL);
+			break;
+		}
+	}
+	srcu_read_unlock(&uprobes_srcu, srcu_idx);
+
 	up_write(&uprobe->register_rwsem);
 
 	return ret;
@@ -2085,10 +2092,12 @@  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 	struct uprobe_consumer *uc;
 	int remove = UPROBE_HANDLER_REMOVE;
 	bool need_prep = false; /* prepare return uprobe, when needed */
+	bool has_consumers = false;
 
-	down_read(&uprobe->register_rwsem);
 	current->utask->auprobe = &uprobe->arch;
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		int rc = 0;
 
 		if (uc->handler) {
@@ -2101,17 +2110,24 @@  static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 			need_prep = true;
 
 		remove &= rc;
+		has_consumers = true;
 	}
 	current->utask->auprobe = NULL;
 
 	if (need_prep && !remove)
 		prepare_uretprobe(uprobe, regs); /* put bp at return */
 
-	if (remove && uprobe->consumers) {
-		WARN_ON(!uprobe_is_active(uprobe));
-		unapply_uprobe(uprobe, current->mm);
+	if (remove && has_consumers) {
+		down_read(&uprobe->register_rwsem);
+
+		/* re-check that removal is still required, this time under lock */
+		if (!filter_chain(uprobe, current->mm)) {
+			WARN_ON(!uprobe_is_active(uprobe));
+			unapply_uprobe(uprobe, current->mm);
+		}
+
+		up_read(&uprobe->register_rwsem);
 	}
-	up_read(&uprobe->register_rwsem);
 }
 
 static void
@@ -2119,13 +2135,15 @@  handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
 {
 	struct uprobe *uprobe = ri->uprobe;
 	struct uprobe_consumer *uc;
+	int srcu_idx;
 
-	down_read(&uprobe->register_rwsem);
-	for (uc = uprobe->consumers; uc; uc = uc->next) {
+	srcu_idx = srcu_read_lock(&uprobes_srcu);
+	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+				 srcu_read_lock_held(&uprobes_srcu)) {
 		if (uc->ret_handler)
 			uc->ret_handler(uc, ri->func, regs);
 	}
-	up_read(&uprobe->register_rwsem);
+	srcu_read_unlock(&uprobes_srcu, srcu_idx);
 }
 
 static struct return_instance *find_next_ret_chain(struct return_instance *ri)