diff mbox series

[v2,bpf-next,3/5] ftrace: introduce FTRACE_OPS_FL_SHARE_IPMODIFY

Message ID 20220602193706.2607681-4-song@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series ftrace: host klp and bpf trampoline together | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 7168 this patch: 7176
netdev/cc_maintainers warning 6 maintainers not CCed: songliubraving@fb.com mingo@redhat.com yhs@fb.com john.fastabend@gmail.com kafai@fb.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 525 this patch: 525
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 6093 this patch: 6101
netdev/checkpatch warning CHECK: Blank lines aren't necessary before a close brace '}' CHECK: Please don't use multiple blank lines WARNING: line length of 81 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 27 this patch: 27
netdev/source_inline fail Was 0 now: 1
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 fail Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 fail Logs for Kernel LATEST on ubuntu-latest with llvm-15
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for Kernel LATEST on z15 with gcc

Commit Message

Song Liu June 2, 2022, 7:37 p.m. UTC
live patch and BPF trampoline (kfunc/kretfunc in bpftrace) are important
features for modern systems. Currently, it is not possible to use live
patch and BPF trampoline on the same kernel function at the same time.
This is because of the restriction that only one ftrace_ops with flag
FTRACE_OPS_FL_IPMODIFY can be registered on the same kernel function.

BPF trampoline uses direct ftrace_ops, which assumes IPMODIFY. However,
not all direct ftrace_ops would overwrite the actual function. This means
it is possible to have a non-IPMODIFY direct ftrace_ops to share the same
kernel function with an IPMODIFY ftrace_ops.

Introduce FTRACE_OPS_FL_SHARE_IPMODIFY, which allows the direct ftrace_ops
to share with IPMODIFY ftrace_ops. With FTRACE_OPS_FL_SHARE_IPMODIFY flag
set, the direct ftrace_ops would call the target function picked by the
IPMODIFY ftrace_ops.

Comment "IPMODIFY, DIRECT, and SHARE_IPMODIFY" in include/linux/ftrace.h
contains more information about how SHARE_IPMODIFY interacts with IPMODIFY
and DIRECT flags.

Signed-off-by: Song Liu <song@kernel.org>
---
 include/linux/ftrace.h |  74 +++++++++++++++++
 kernel/trace/ftrace.c  | 179 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 242 insertions(+), 11 deletions(-)

Comments

Jiri Olsa June 6, 2022, 8:20 a.m. UTC | #1
On Thu, Jun 02, 2022 at 12:37:04PM -0700, Song Liu wrote:
> live patch and BPF trampoline (kfunc/kretfunc in bpftrace) are important
> features for modern systems. Currently, it is not possible to use live
> patch and BPF trampoline on the same kernel function at the same time.
> This is because of the resitriction that only one ftrace_ops with flag
> FTRACE_OPS_FL_IPMODIFY on the same kernel function.

is it hard to make live patch test? would be great to have
selftest for this, or at least sample module that does that,
there are already sample modules for direct interface

> 
> BPF trampoline uses direct ftrace_ops, which assumes IPMODIFY. However,
> not all direct ftrace_ops would overwrite the actual function. This means
> it is possible to have a non-IPMODIFY direct ftrace_ops to share the same
> kernel function with an IPMODIFY ftrace_ops.
> 
> Introduce FTRACE_OPS_FL_SHARE_IPMODIFY, which allows the direct ftrace_ops
> to share with IPMODIFY ftrace_ops. With FTRACE_OPS_FL_SHARE_IPMODIFY flag
> set, the direct ftrace_ops would call the target function picked by the
> IPMODIFY ftrace_ops.
> 
> Comment "IPMODIFY, DIRECT, and SHARE_IPMODIFY" in include/linux/ftrace.h
> contains more information about how SHARE_IPMODIFY interacts with IPMODIFY
> and DIRECT flags.
> 
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  include/linux/ftrace.h |  74 +++++++++++++++++
>  kernel/trace/ftrace.c  | 179 ++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 242 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 9023bf69f675..bfacf608de9c 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -98,6 +98,18 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
>  }
>  #endif
>  
> +/*
> + * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes
> + * to a ftrace_ops.
> + *
> + * ENABLE_SHARE_IPMODIFY - enable FTRACE_OPS_FL_SHARE_IPMODIFY.
> + * DISABLE_SHARE_IPMODIFY - disable FTRACE_OPS_FL_SHARE_IPMODIFY.
> + */
> +enum ftrace_ops_cmd {
> +	FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY,
> +	FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY,
> +};
> +
>  #ifdef CONFIG_FUNCTION_TRACER
>  
>  extern int ftrace_enabled;
> @@ -189,6 +201,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
>   *             ftrace_enabled.
>   * DIRECT - Used by the direct ftrace_ops helper for direct functions
>   *            (internal ftrace only, should not be used by others)
> + * SHARE_IPMODIFY - For direct ftrace_ops only. Set when the direct function
> + *            is ready to share same kernel function with IPMODIFY function
> + *            (live patch, etc.).
>   */
>  enum {
>  	FTRACE_OPS_FL_ENABLED			= BIT(0),
> @@ -209,8 +224,66 @@ enum {
>  	FTRACE_OPS_FL_TRACE_ARRAY		= BIT(15),
>  	FTRACE_OPS_FL_PERMANENT                 = BIT(16),
>  	FTRACE_OPS_FL_DIRECT			= BIT(17),
> +	FTRACE_OPS_FL_SHARE_IPMODIFY		= BIT(18),
>  };
>  
> +/*
> + * IPMODIFY, DIRECT, and SHARE_IPMODIFY.
> + *
> + * ftrace provides IPMODIFY flag for users to replace existing kernel
> + * function with a different version. This is achieved by setting regs->ip.
> + * The top user of IPMODIFY is live patch.
> + *
> + * DIRECT allows user to load custom trampoline on top of ftrace. DIRECT
> + * ftrace does not overwrite regs->ip. Instead, the custom trampoline is
> + * saved separately (for example, orig_ax on x86). The top user of DIRECT
> + * is bpf trampoline.
> + *
> + * It is not super rare to have both live patch and bpf trampoline on the
> + * same kernel function. Therefore, it is necessary to allow the two work
> + * with each other. Given that IPMODIFY and DIRECT target addressese are
> + * saved separately, this is feasible, but we need to be careful.
> + *
> + * The policy between IPMODIFY and DIRECT is:
> + *
> + *  1. Each kernel function can only have one IPMODIFY ftrace_ops;
> + *  2. Each kernel function can only have one DIRECT ftrace_ops;
> + *  3. DIRECT ftrace_ops may have IPMODIFY or not;
> + *  4. Each kernel function may have one non-DIRECT IPMODIFY ftrace_ops,
> + *     and one non-IPMODIFY DIRECT ftrace_ops at the same time. This
> + *     requires support from the DIRECT ftrace_ops. Specifically, the
> + *     DIRECT trampoline should call the kernel function at regs->ip.
> + *     If the DIRECT ftrace_ops supports sharing a function with ftrace_ops
> + *     with IPMODIFY, it should set flag SHARE_IPMODIFY.
> + *
> + * Some DIRECT ftrace_ops has an option to enable SHARE_IPMODIFY or not.
> + * Usually, the non-SHARE_IPMODIFY option gives better performance. To take
> + * advantage of this performance benefit, is necessary to only enable
> + * SHARE_IPMODIFY only when it is on the same function as an IPMODIFY
> + * ftrace_ops. There are two cases to consider:
> + *
> + *  1. IPMODIFY ftrace_ops is registered first. When the (non-IPMODIFY, and
> + *     non-SHARE_IPMODIFY) DIRECT ftrace_ops is registered later,
> + *     register_ftrace_direct_multi() returns -EAGAIN. If the user of
> + *     the DIRECT ftrace_ops can support SHARE_IPMODIFY, it should enable
> + *     SHARE_IPMODIFY and retry.
> + *  2. (non-IPMODIFY, and non-SHARE_IPMODIFY) DIRECT ftrace_ops is
> + *     registered first. When the IPMODIFY ftrace_ops is registered later,
> + *     it is necessary to ask the direct ftrace_ops to enable
> + *     SHARE_IPMODIFY support. This is achieved via ftrace_ops->ops_func
> + *     cmd=FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY. For more details on this
> + *     condition, check out prepare_direct_functions_for_ipmodify().
> + */
> +
> +/*
> + * For most ftrace_ops_cmd,
> + * Returns:
> + *        0 - Success.
> + *        -EBUSY - The operation cannot process
> + *        -EAGAIN - The operation cannot process tempoorarily.
> + */
> +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
> +
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  /* The hash used to know what functions callbacks trace */
>  struct ftrace_ops_hash {
> @@ -253,6 +326,7 @@ struct ftrace_ops {
>  	unsigned long			trampoline;
>  	unsigned long			trampoline_size;
>  	struct list_head		list;
> +	ftrace_ops_func_t		ops_func;
>  #endif
>  };
>  
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 6a419f6bbbf0..868bbc753803 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -1865,7 +1865,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
>  /*
>   * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
>   * or no-needed to update, -EBUSY if it detects a conflict of the flag
> - * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
> + * on a ftrace_rec, -EINVAL if the new_hash tries to trace all recs, and
> + * -EAGAIN if the ftrace_ops need to enable SHARE_IPMODIFY.
>   * Note that old_hash and new_hash has below meanings
>   *  - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
>   *  - If the hash is EMPTY_HASH, it hits nothing
> @@ -1875,6 +1876,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  					 struct ftrace_hash *old_hash,
>  					 struct ftrace_hash *new_hash)
>  {
> +	bool is_ipmodify, is_direct, share_ipmodify;
>  	struct ftrace_page *pg;
>  	struct dyn_ftrace *rec, *end = NULL;
>  	int in_old, in_new;
> @@ -1883,7 +1885,24 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
>  		return 0;
>  
> -	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
> +	/*
> +	 * The following are all the valid combinations of is_ipmodify,
> +	 * is_direct, and share_ipmodify
> +	 *
> +	 *             is_ipmodify     is_direct     share_ipmodify
> +	 *  #1              0               0                0
> +	 *  #2              1               0                0
> +	 *  #3              1               1                0
> +	 *  #4              0               1                0
> +	 *  #5              0               1                1
> +	 */
> +
> +
> +	is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
> +	is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
> +
> +	/* either ipmodify nor direct, skip */
> +	if (!is_ipmodify && !is_direct)   /* combinations #1 */
>  		return 0;
>  
>  	/*
> @@ -1893,6 +1912,30 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  	if (!new_hash || !old_hash)
>  		return -EINVAL;
>  
> +	share_ipmodify = ops->flags & FTRACE_OPS_FL_SHARE_IPMODIFY;
> +
> +	/*
> +	 * This ops itself doesn't do ip_modify and it can share a fentry
> +	 * with other ops with ipmodify, nothing to do.
> +	 */
> +	if (!is_ipmodify && share_ipmodify)   /* combinations #5 */
> +		return 0;
> +
> +	/*
> +	 * Only three combinations of is_ipmodify, is_direct, and
> +	 * share_ipmodify for the logic below:
> +	 * #2 live patch
> +	 * #3 direct with ipmodify
> +	 * #4 direct without ipmodify
> +	 *
> +	 *             is_ipmodify     is_direct     share_ipmodify
> +	 *  #2              1               0                0
> +	 *  #3              1               1                0
> +	 *  #4              0               1                0
> +	 *
> +	 * Only update/rollback rec->flags for is_ipmodify == 1 (#2 and #3)
> +	 */
> +
>  	/* Update rec->flags */
>  	do_for_each_ftrace_rec(pg, rec) {
>  
> @@ -1906,12 +1949,18 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  			continue;
>  
>  		if (in_new) {
> -			/* New entries must ensure no others are using it */
> -			if (rec->flags & FTRACE_FL_IPMODIFY)
> -				goto rollback;
> -			rec->flags |= FTRACE_FL_IPMODIFY;
> -		} else /* Removed entry */
> +			if (rec->flags & FTRACE_FL_IPMODIFY) {
> +				/* cannot have two ipmodify on same rec */
> +				if (is_ipmodify)  /* combination #2 and #3 */
> +					goto rollback;
> +				/* let user enable share_ipmodify and retry */
> +				return  -EAGAIN;  /* combination #4 */
> +			} else if (is_ipmodify) {
> +				rec->flags |= FTRACE_FL_IPMODIFY;
> +			}
> +		} else if (is_ipmodify) {/* Removed entry */
>  			rec->flags &= ~FTRACE_FL_IPMODIFY;
> +		}
>  	} while_for_each_ftrace_rec();
>  
>  	return 0;
> @@ -3115,14 +3164,14 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
>  }
>  
>  /*
> - * Check if the current ops references the record.
> + * Check if the current ops references the given ip.
>   *
>   * If the ops traces all functions, then it was already accounted for.
>   * If the ops does not trace the current record function, skip it.
>   * If the ops ignores the function via notrace filter, skip it.
>   */
>  static inline bool
> -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
> +ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
>  {
>  	/* If ops isn't enabled, ignore it */
>  	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
> @@ -3134,16 +3183,29 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
>  
>  	/* The function must be in the filter */
>  	if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
> -	    !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
> +	    !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
>  		return false;
>  
>  	/* If in notrace hash, we ignore it too */
> -	if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
> +	if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
>  		return false;
>  
>  	return true;
>  }
>  
> +/*
> + * Check if the current ops references the record.
> + *
> + * If the ops traces all functions, then it was already accounted for.
> + * If the ops does not trace the current record function, skip it.
> + * If the ops ignores the function via notrace filter, skip it.
> + */
> +static inline bool
> +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
> +{
> +	return ops_references_ip(ops, rec->ip);
> +}
> +
>  static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
>  {
>  	bool init_nop = ftrace_need_init_nop();
> @@ -5519,6 +5581,14 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
>  	if (ops->flags & FTRACE_OPS_FL_ENABLED)
>  		return -EINVAL;
>  
> +	/*
> +	 * if the ops does ipmodify, it cannot share the same fentry with
> +	 * other functions with ipmodify.
> +	 */
> +	if ((ops->flags & FTRACE_OPS_FL_IPMODIFY) &&
> +	    (ops->flags & FTRACE_OPS_FL_SHARE_IPMODIFY))
> +		return -EINVAL;
> +
>  	hash = ops->func_hash->filter_hash;
>  	if (ftrace_hash_empty(hash))
>  		return -EINVAL;
> @@ -7901,6 +7971,83 @@ int ftrace_is_dead(void)
>  	return ftrace_disabled;
>  }
>  
> +/*
> + * When registering ftrace_ops with IPMODIFY (not direct), it is necessary
> + * to make sure it doesn't conflict with any direct ftrace_ops. If there is
> + * existing direct ftrace_ops on a kernel function being patched, call
> + * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY on it to enable sharing.
> + *
> + * @ops:     ftrace_ops being registered.
> + *
> + * Returns:
> + *         0 - @ops does have IPMODIFY or @ops itself is DIRECT, no change
> + *             needed;
> + *         1 - @ops has IPMODIFY, hold direct_mutex;
> + *         -EBUSY - currently registered DIRECT ftrace_ops does not support
> + *                  SHARE_IPMODIFY, we need to abort the register.
> + *         -EAGAIN - cannot make changes to currently registered DIRECT
> + *                   ftrace_ops at the moment, but we can retry later. This
> + *                   is needed to avoid potential deadlocks.
> + */
> +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
> +	__acquires(&direct_mutex)
> +{
> +	struct ftrace_func_entry *entry;
> +	struct ftrace_hash *hash;
> +	struct ftrace_ops *op;
> +	int size, i, ret;
> +
> +	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY) ||
> +	    (ops->flags & FTRACE_OPS_FL_DIRECT))
> +		return 0;
> +
> +	mutex_lock(&direct_mutex);
> +
> +	hash = ops->func_hash->filter_hash;
> +	size = 1 << hash->size_bits;
> +	for (i = 0; i < size; i++) {
> +		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
> +			unsigned long ip = entry->ip;
> +			bool found_op = false;
> +
> +			mutex_lock(&ftrace_lock);
> +			do_for_each_ftrace_op(op, ftrace_ops_list) {

would it be better to iterate direct_functions hash instead?
all the registered direct functions should be there

hm maybe you would not have the 'op' then..

> +				if (!(op->flags & FTRACE_OPS_FL_DIRECT))
> +					continue;
> +				if (op->flags & FTRACE_OPS_FL_SHARE_IPMODIFY)
> +					break;
> +				if (ops_references_ip(op, ip)) {
> +					found_op = true;
> +					break;
> +				}
> +			} while_for_each_ftrace_op(op);
> +			mutex_unlock(&ftrace_lock);

so the 'op' can't go away because it's direct and we hold direct_mutex
even though we unlocked ftrace_lock, right?

> +
> +			if (found_op) {
> +				if (!op->ops_func) {
> +					ret = -EBUSY;
> +					goto err_out;
> +				}
> +				ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY);

I did not find call with FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY flag

jirka

> +				if (ret)
> +					goto err_out;
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Didn't find any overlap with any direct function, or the direct
> +	 * function can share with ipmodify. Hold direct_mutex to make sure
> +	 * this doesn't change until we are done.
> +	 */
> +	return 1;
> +
> +err_out:
> +	mutex_unlock(&direct_mutex);
> +	return ret;
> +
> +}
> +
>  /**
>   * register_ftrace_function - register a function for profiling
>   * @ops:	ops structure that holds the function for profiling.
> @@ -7913,17 +8060,27 @@ int ftrace_is_dead(void)
>   *       recursive loop.
>   */
>  int register_ftrace_function(struct ftrace_ops *ops)
> +	__releases(&direct_mutex)
>  {
> +	bool direct_mutex_locked;
>  	int ret;
>  
>  	ftrace_ops_init(ops);
>  
> +	ret = prepare_direct_functions_for_ipmodify(ops);
> +	if (ret < 0)
> +		return ret;
> +
> +	direct_mutex_locked = ret == 1;
> +
>  	mutex_lock(&ftrace_lock);
>  
>  	ret = ftrace_startup(ops, 0);
>  
>  	mutex_unlock(&ftrace_lock);
>  
> +	if (direct_mutex_locked)
> +		mutex_unlock(&direct_mutex);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(register_ftrace_function);
> -- 
> 2.30.2
>
Song Liu June 6, 2022, 3:35 p.m. UTC | #2
> On Jun 6, 2022, at 1:20 AM, Jiri Olsa <olsajiri@gmail.com> wrote:
> 
> On Thu, Jun 02, 2022 at 12:37:04PM -0700, Song Liu wrote:
>> live patch and BPF trampoline (kfunc/kretfunc in bpftrace) are important
>> features for modern systems. Currently, it is not possible to use live
>> patch and BPF trampoline on the same kernel function at the same time.
>> This is because of the resitriction that only one ftrace_ops with flag
>> FTRACE_OPS_FL_IPMODIFY on the same kernel function.
> 
> is it hard to make live patch test? would be great to have
> selftest for this, or at least sample module that does that,
> there are already sample modules for direct interface

It is possible, but a little tricky. I can add some selftests or
sample modules in a later version. 

> 
>> 
>> BPF trampoline uses direct ftrace_ops, which assumes IPMODIFY. However,
>> not all direct ftrace_ops would overwrite the actual function. This means
>> it is possible to have a non-IPMODIFY direct ftrace_ops to share the same
>> kernel function with an IPMODIFY ftrace_ops.
>> 
>> Introduce FTRACE_OPS_FL_SHARE_IPMODIFY, which allows the direct ftrace_ops
>> to share with IPMODIFY ftrace_ops. With FTRACE_OPS_FL_SHARE_IPMODIFY flag
>> set, the direct ftrace_ops would call the target function picked by the
>> IPMODIFY ftrace_ops.
>> 
>> Comment "IPMODIFY, DIRECT, and SHARE_IPMODIFY" in include/linux/ftrace.h
>> contains more information about how SHARE_IPMODIFY interacts with IPMODIFY
>> and DIRECT flags.
>> 
>> Signed-off-by: Song Liu <song@kernel.org>
>> 

[...]

>> +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
>> +	__acquires(&direct_mutex)
>> +{
>> +	struct ftrace_func_entry *entry;
>> +	struct ftrace_hash *hash;
>> +	struct ftrace_ops *op;
>> +	int size, i, ret;
>> +
>> +	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY) ||
>> +	    (ops->flags & FTRACE_OPS_FL_DIRECT))
>> +		return 0;
>> +
>> +	mutex_lock(&direct_mutex);
>> +
>> +	hash = ops->func_hash->filter_hash;
>> +	size = 1 << hash->size_bits;
>> +	for (i = 0; i < size; i++) {
>> +		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
>> +			unsigned long ip = entry->ip;
>> +			bool found_op = false;
>> +
>> +			mutex_lock(&ftrace_lock);
>> +			do_for_each_ftrace_op(op, ftrace_ops_list) {
> 
> would it be better to iterate direct_functions hash instead?
> all the registered direct functions should be there
> 
> hm maybe you would not have the 'op' then..

Yeah, we need ftrace_ops here. 

> 
>> +				if (!(op->flags & FTRACE_OPS_FL_DIRECT))
>> +					continue;
>> +				if (op->flags & FTRACE_OPS_FL_SHARE_IPMODIFY)
>> +					break;
>> +				if (ops_references_ip(op, ip)) {
>> +					found_op = true;
>> +					break;
>> +				}
>> +			} while_for_each_ftrace_op(op);
>> +			mutex_unlock(&ftrace_lock);
> 
> so the 'op' can't go away because it's direct and we hold direct_mutex
> even though we unlocked ftrace_lock, right?

Yep, we need to hold direct_mutex here. 

> 
>> +
>> +			if (found_op) {
>> +				if (!op->ops_func) {
>> +					ret = -EBUSY;
>> +					goto err_out;
>> +				}
>> +				ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY);
> 
> I did not find call with FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY flag

We don't have it yet, and I think we probably don't really need it. 
AFAICT, unloading a live patch is not a common operation, so not 
recovering the performance of !SHARE_IPMODIFY should be acceptable
in those cases. That said, I can add that path if we think it is
important. 

Thanks,
Song

[...]
Steven Rostedt July 14, 2022, 12:33 a.m. UTC | #3
On Thu, 2 Jun 2022 12:37:04 -0700
Song Liu <song@kernel.org> wrote:

> live patch and BPF trampoline (kfunc/kretfunc in bpftrace) are important
> features for modern systems. Currently, it is not possible to use live
> patch and BPF trampoline on the same kernel function at the same time.
> This is because of the resitriction that only one ftrace_ops with flag
> FTRACE_OPS_FL_IPMODIFY on the same kernel function.
> 
> BPF trampoline uses direct ftrace_ops, which assumes IPMODIFY. However,
> not all direct ftrace_ops would overwrite the actual function. This means
> it is possible to have a non-IPMODIFY direct ftrace_ops to share the same
> kernel function with an IPMODIFY ftrace_ops.
> 
> Introduce FTRACE_OPS_FL_SHARE_IPMODIFY, which allows the direct ftrace_ops
> to share with IPMODIFY ftrace_ops. With FTRACE_OPS_FL_SHARE_IPMODIFY flag
> set, the direct ftrace_ops would call the target function picked by the
> IPMODIFY ftrace_ops.
> 
> Comment "IPMODIFY, DIRECT, and SHARE_IPMODIFY" in include/linux/ftrace.h
> contains more information about how SHARE_IPMODIFY interacts with IPMODIFY
> and DIRECT flags.
> 
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  include/linux/ftrace.h |  74 +++++++++++++++++
>  kernel/trace/ftrace.c  | 179 ++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 242 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 9023bf69f675..bfacf608de9c 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -98,6 +98,18 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
>  }
>  #endif
>  
> +/*
> + * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes
> + * to a ftrace_ops.
> + *
> + * ENABLE_SHARE_IPMODIFY - enable FTRACE_OPS_FL_SHARE_IPMODIFY.
> + * DISABLE_SHARE_IPMODIFY - disable FTRACE_OPS_FL_SHARE_IPMODIFY.

The above comment is basically:

	/* Set x to 1 */
	x = 1;

Probably something like this:

 * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request
   changes
 * to a ftrace_ops. Note, the requests may fail.
 *
 *	ENABLE_SHARE_IPMODIFY - Request setting the ftrace ops
 *				SHARE_IPMODIFY flag.
 *	DISABLE_SHARE_IPMODIFY - Request disabling the ftrace ops
 *				SHARE_IPMODIFY flag.


> + */
> +enum ftrace_ops_cmd {
> +	FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY,
> +	FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY,
> +};
> +
>  #ifdef CONFIG_FUNCTION_TRACER
>  
>  extern int ftrace_enabled;
> @@ -189,6 +201,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
>   *             ftrace_enabled.
>   * DIRECT - Used by the direct ftrace_ops helper for direct functions
>   *            (internal ftrace only, should not be used by others)
> + * SHARE_IPMODIFY - For direct ftrace_ops only. Set when the direct function
> + *            is ready to share same kernel function with IPMODIFY function
> + *            (live patch, etc.).
>   */
>  enum {
>  	FTRACE_OPS_FL_ENABLED			= BIT(0),
> @@ -209,8 +224,66 @@ enum {
>  	FTRACE_OPS_FL_TRACE_ARRAY		= BIT(15),
>  	FTRACE_OPS_FL_PERMANENT                 = BIT(16),
>  	FTRACE_OPS_FL_DIRECT			= BIT(17),
> +	FTRACE_OPS_FL_SHARE_IPMODIFY		= BIT(18),
>  };
>  
> +/*
> + * IPMODIFY, DIRECT, and SHARE_IPMODIFY.
> + *
> + * ftrace provides IPMODIFY flag for users to replace existing kernel
> + * function with a different version. This is achieved by setting regs->ip.
> + * The top user of IPMODIFY is live patch.
> + *
> + * DIRECT allows user to load custom trampoline on top of ftrace. DIRECT
> + * ftrace does not overwrite regs->ip. Instead, the custom trampoline is

No need to state if DIRECT modifies regs->ip or not. ftrace must assume
that it does (more below).

> + * saved separately (for example, orig_ax on x86). The top user of DIRECT
> + * is bpf trampoline.
> + *
> + * It is not super rare to have both live patch and bpf trampoline on the
> + * same kernel function. Therefore, it is necessary to allow the two work

					"the two to work"

> + * with each other. Given that IPMODIFY and DIRECT target addressese are

						"addresses"

> + * saved separately, this is feasible, but we need to be careful.
> + *
> + * The policy between IPMODIFY and DIRECT is:
> + *
> + *  1. Each kernel function can only have one IPMODIFY ftrace_ops;
> + *  2. Each kernel function can only have one DIRECT ftrace_ops;
> + *  3. DIRECT ftrace_ops may have IPMODIFY or not;

I was thinking about this more, and I think by default we should
consider all DIRECT ftrace_ops as the same as IPMODIFY. So perhaps the
first patch is to just remove the IPMODIFY from direct (as you did) but
then make all checks for multiple IPMODIFY also check DIRECT as well.

That is because there's no way that ftrace can verify that a direct
trampoline modifies the IP or not. Thus, it must assume that all do.

> + *  4. Each kernel function may have one non-DIRECT IPMODIFY ftrace_ops,
> + *     and one non-IPMODIFY DIRECT ftrace_ops at the same time. This
> + *     requires support from the DIRECT ftrace_ops. Specifically, the
> + *     DIRECT trampoline should call the kernel function at regs->ip.
> + *     If the DIRECT ftrace_ops supports sharing a function with ftrace_ops
> + *     with IPMODIFY, it should set flag SHARE_IPMODIFY.
> + *
> + * Some DIRECT ftrace_ops has an option to enable SHARE_IPMODIFY or not.
> + * Usually, the non-SHARE_IPMODIFY option gives better performance. To take
> + * advantage of this performance benefit, is necessary to only enable

The performance part of this comment should not be in ftrace. It's an
implementation detail of the direct trampoline and may not even be
accurate with other implementations.

> + * SHARE_IPMODIFY only when it is on the same function as an IPMODIFY
> + * ftrace_ops. There are two cases to consider:
> + *
> + *  1. IPMODIFY ftrace_ops is registered first. When the (non-IPMODIFY, and
> + *     non-SHARE_IPMODIFY) DIRECT ftrace_ops is registered later,
> + *     register_ftrace_direct_multi() returns -EAGAIN. If the user of
> + *     the DIRECT ftrace_ops can support SHARE_IPMODIFY, it should enable
> + *     SHARE_IPMODIFY and retry.

If this ftrace_ops being registered can support SHARE_IPMODIFY, then it
should have the ops_func defined, in which case, why not have it just
call that instead of having to return -EAGAIN?


> + *  2. (non-IPMODIFY, and non-SHARE_IPMODIFY) DIRECT ftrace_ops is
> + *     registered first. When the IPMODIFY ftrace_ops is registered later,
> + *     it is necessary to ask the direct ftrace_ops to enable
> + *     SHARE_IPMODIFY support. This is achieved via ftrace_ops->ops_func
> + *     cmd=FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY. For more details on this
> + *     condition, check out prepare_direct_functions_for_ipmodify().
> + */
> +
> +/*
> + * For most ftrace_ops_cmd,
> + * Returns:
> + *        0 - Success.
> + *        -EBUSY - The operation cannot process
> + *        -EAGAIN - The operation cannot process tempoorarily.
> + */
> +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
> +
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  /* The hash used to know what functions callbacks trace */
>  struct ftrace_ops_hash {
> @@ -253,6 +326,7 @@ struct ftrace_ops {
>  	unsigned long			trampoline;
>  	unsigned long			trampoline_size;
>  	struct list_head		list;
> +	ftrace_ops_func_t		ops_func;
>  #endif
>  };
>  
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 6a419f6bbbf0..868bbc753803 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -1865,7 +1865,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
>  /*
>   * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
>   * or no-needed to update, -EBUSY if it detects a conflict of the flag
> - * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
> + * on a ftrace_rec, -EINVAL if the new_hash tries to trace all recs, and
> + * -EAGAIN if the ftrace_ops need to enable SHARE_IPMODIFY.

It should just call the ftrace_ops() with the command to set it. If you
want, we could add another CMD enum that can be passed for this case.

>   * Note that old_hash and new_hash has below meanings
>   *  - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
>   *  - If the hash is EMPTY_HASH, it hits nothing
> @@ -1875,6 +1876,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  					 struct ftrace_hash *old_hash,
>  					 struct ftrace_hash *new_hash)
>  {
> +	bool is_ipmodify, is_direct, share_ipmodify;
>  	struct ftrace_page *pg;
>  	struct dyn_ftrace *rec, *end = NULL;
>  	int in_old, in_new;
> @@ -1883,7 +1885,24 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
>  		return 0;
>  
> -	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
> +	/*
> +	 * The following are all the valid combinations of is_ipmodify,
> +	 * is_direct, and share_ipmodify
> +	 *
> +	 *             is_ipmodify     is_direct     share_ipmodify
> +	 *  #1              0               0                0
> +	 *  #2              1               0                0
> +	 *  #3              1               1                0

I still think that DIRECT should automatically be considered IPMODIFY
(at least in the view of ftrace, whether or not the direct function
modifies the IP).

> +	 *  #4              0               1                0
> +	 *  #5              0               1                1
> +	 */
> +
> +
> +	is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
> +	is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
> +
> +	/* either ipmodify nor direct, skip */
> +	if (!is_ipmodify && !is_direct)   /* combinations #1 */
>  		return 0;
>  
>  	/*
> @@ -1893,6 +1912,30 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  	if (!new_hash || !old_hash)
>  		return -EINVAL;
>  
> +	share_ipmodify = ops->flags & FTRACE_OPS_FL_SHARE_IPMODIFY;
> +
> +	/*
> +	 * This ops itself doesn't do ip_modify and it can share a fentry
> +	 * with other ops with ipmodify, nothing to do.
> +	 */
> +	if (!is_ipmodify && share_ipmodify)   /* combinations #5 */
> +		return 0;
> +

Really, if connecting to a function that already has IPMODIFY, then the
ops_func() needs to be called, and if the ops supports SHARED_IPMODIFY
then it should get set and then continue. 

Make sense?

-- Steve

> +	/*
> +	 * Only three combinations of is_ipmodify, is_direct, and
> +	 * share_ipmodify for the logic below:
> +	 * #2 live patch
> +	 * #3 direct with ipmodify
> +	 * #4 direct without ipmodify
> +	 *
> +	 *             is_ipmodify     is_direct     share_ipmodify
> +	 *  #2              1               0                0
> +	 *  #3              1               1                0
> +	 *  #4              0               1                0
> +	 *
> +	 * Only update/rollback rec->flags for is_ipmodify == 1 (#2 and #3)
> +	 */
> +
>  	/* Update rec->flags */
>  	do_for_each_ftrace_rec(pg, rec) {
>  
> @@ -1906,12 +1949,18 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>  			continue;
>  
>  		if (in_new) {
> -			/* New entries must ensure no others are using it */
> -			if (rec->flags & FTRACE_FL_IPMODIFY)
> -				goto rollback;
> -			rec->flags |= FTRACE_FL_IPMODIFY;
> -		} else /* Removed entry */
> +			if (rec->flags & FTRACE_FL_IPMODIFY) {
> +				/* cannot have two ipmodify on same rec */
> +				if (is_ipmodify)  /* combination #2 and #3 */
> +					goto rollback;
> +				/* let user enable share_ipmodify and retry */
> +				return  -EAGAIN;  /* combination #4 */
> +			} else if (is_ipmodify) {
> +				rec->flags |= FTRACE_FL_IPMODIFY;
> +			}
> +		} else if (is_ipmodify) {/* Removed entry */
>  			rec->flags &= ~FTRACE_FL_IPMODIFY;
> +		}
>  	} while_for_each_ftrace_rec();
>  
>  	return 0;
Song Liu July 15, 2022, 12:13 a.m. UTC | #4
> On Jul 13, 2022, at 5:33 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Thu, 2 Jun 2022 12:37:04 -0700
> Song Liu <song@kernel.org> wrote:
> 
>> live patch and BPF trampoline (kfunc/kretfunc in bpftrace) are important
>> features for modern systems. Currently, it is not possible to use live
>> patch and BPF trampoline on the same kernel function at the same time.
>> This is because of the resitriction that only one ftrace_ops with flag
>> FTRACE_OPS_FL_IPMODIFY on the same kernel function.
>> 
>> BPF trampoline uses direct ftrace_ops, which assumes IPMODIFY. However,
>> not all direct ftrace_ops would overwrite the actual function. This means
>> it is possible to have a non-IPMODIFY direct ftrace_ops to share the same
>> kernel function with an IPMODIFY ftrace_ops.
>> 
>> Introduce FTRACE_OPS_FL_SHARE_IPMODIFY, which allows the direct ftrace_ops
>> to share with IPMODIFY ftrace_ops. With FTRACE_OPS_FL_SHARE_IPMODIFY flag
>> set, the direct ftrace_ops would call the target function picked by the
>> IPMODIFY ftrace_ops.
>> 
>> Comment "IPMODIFY, DIRECT, and SHARE_IPMODIFY" in include/linux/ftrace.h
>> contains more information about how SHARE_IPMODIFY interacts with IPMODIFY
>> and DIRECT flags.
>> 
>> Signed-off-by: Song Liu <song@kernel.org>
>> ---
>> include/linux/ftrace.h | 74 +++++++++++++++++
>> kernel/trace/ftrace.c | 179 ++++++++++++++++++++++++++++++++++++++---
>> 2 files changed, 242 insertions(+), 11 deletions(-)
>> 
>> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
>> index 9023bf69f675..bfacf608de9c 100644
>> --- a/include/linux/ftrace.h
>> +++ b/include/linux/ftrace.h
>> @@ -98,6 +98,18 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
>> }
>> #endif
>> 
>> +/*
>> + * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes
>> + * to a ftrace_ops.
>> + *
>> + * ENABLE_SHARE_IPMODIFY - enable FTRACE_OPS_FL_SHARE_IPMODIFY.
>> + * DISABLE_SHARE_IPMODIFY - disable FTRACE_OPS_FL_SHARE_IPMODIFY.
> 
> The above comment is basically:
> 
> 	/* Set x to 1 */
> 	x = 1;
> 
> Probably something like this:
> 
> * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request
> changes
> * to a ftrace_ops. Note, the requests may fail.
> *
> *	ENABLE_SHARE_IPMODIFY - Request setting the ftrace ops
> *				SHARE_IPMODIFY flag.
> *	DISABLE_SHARE_IPMODIFY - Request disabling the ftrace ops
> *				SHARE_IPMODIFY flag.
> 
> 
>> + */
>> +enum ftrace_ops_cmd {
>> +	FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY,
>> +	FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY,
>> +};
>> +
>> #ifdef CONFIG_FUNCTION_TRACER
>> 
>> extern int ftrace_enabled;
>> @@ -189,6 +201,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
>> * ftrace_enabled.
>> * DIRECT - Used by the direct ftrace_ops helper for direct functions
>> * (internal ftrace only, should not be used by others)
>> + * SHARE_IPMODIFY - For direct ftrace_ops only. Set when the direct function
>> + * is ready to share same kernel function with IPMODIFY function
>> + * (live patch, etc.).
>> */
>> enum {
>> 	FTRACE_OPS_FL_ENABLED			= BIT(0),
>> @@ -209,8 +224,66 @@ enum {
>> 	FTRACE_OPS_FL_TRACE_ARRAY		= BIT(15),
>> 	FTRACE_OPS_FL_PERMANENT = BIT(16),
>> 	FTRACE_OPS_FL_DIRECT			= BIT(17),
>> +	FTRACE_OPS_FL_SHARE_IPMODIFY		= BIT(18),
>> };
>> 
>> +/*
>> + * IPMODIFY, DIRECT, and SHARE_IPMODIFY.
>> + *
>> + * ftrace provides IPMODIFY flag for users to replace existing kernel
>> + * function with a different version. This is achieved by setting regs->ip.
>> + * The top user of IPMODIFY is live patch.
>> + *
>> + * DIRECT allows user to load custom trampoline on top of ftrace. DIRECT
>> + * ftrace does not overwrite regs->ip. Instead, the custom trampoline is
> 
> No need to state if DIRECT modifies regs->ip or not. ftrace must assume
> that it does (more below).
> 
>> + * saved separately (for example, orig_ax on x86). The top user of DIRECT
>> + * is bpf trampoline.
>> + *
>> + * It is not super rare to have both live patch and bpf trampoline on the
>> + * same kernel function. Therefore, it is necessary to allow the two work
> 
> 					"the two to work"
> 
>> + * with each other. Given that IPMODIFY and DIRECT target addressese are
> 
> 						"addresses"
> 
>> + * saved separately, this is feasible, but we need to be careful.
>> + *
>> + * The policy between IPMODIFY and DIRECT is:
>> + *
>> + * 1. Each kernel function can only have one IPMODIFY ftrace_ops;
>> + * 2. Each kernel function can only have one DIRECT ftrace_ops;
>> + * 3. DIRECT ftrace_ops may have IPMODIFY or not;
> 
> I was thinking about this more, and I think by default we should
> consider all DIRECT ftrace_ops as the same as IPMODIFY. So perhaps the
> first patch is to just remove the IPMODIFY from direct (as you did) but
> then make all checks for multiple IPMODIFY also check DIRECT as well.
> 
> That is because there's no way that ftrace can verify that a direct
> trampoline modifies the IP or not. Thus, it must assume that all do.
> 
>> + * 4. Each kernel function may have one non-DIRECT IPMODIFY ftrace_ops,
>> + * and one non-IPMODIFY DIRECT ftrace_ops at the same time. This
>> + * requires support from the DIRECT ftrace_ops. Specifically, the
>> + * DIRECT trampoline should call the kernel function at regs->ip.
>> + * If the DIRECT ftrace_ops supports sharing a function with ftrace_ops
>> + * with IPMODIFY, it should set flag SHARE_IPMODIFY.
>> + *
>> + * Some DIRECT ftrace_ops has an option to enable SHARE_IPMODIFY or not.
>> + * Usually, the non-SHARE_IPMODIFY option gives better performance. To take
>> + * advantage of this performance benefit, is necessary to only enable
> 
> The performance part of this comment should not be in ftrace. It's an
> implementation detail of the direct trampoline and may not even be
> accurate with other implementations.
> 
>> + * SHARE_IPMODIFY only when it is on the same function as an IPMODIFY
>> + * ftrace_ops. There are two cases to consider:
>> + *
>> + * 1. IPMODIFY ftrace_ops is registered first. When the (non-IPMODIFY, and
>> + * non-SHARE_IPMODIFY) DIRECT ftrace_ops is registered later,
>> + * register_ftrace_direct_multi() returns -EAGAIN. If the user of
>> + * the DIRECT ftrace_ops can support SHARE_IPMODIFY, it should enable
>> + * SHARE_IPMODIFY and retry.
> 
> If this ftrace_ops being registered can support SHARE_IPMODIFY, then it
> should have the ops_func defined, in which case, why not have it just
> call that instead of having to return -EAGAIN?
> 
> 
>> + * 2. (non-IPMODIFY, and non-SHARE_IPMODIFY) DIRECT ftrace_ops is
>> + * registered first. When the IPMODIFY ftrace_ops is registered later,
>> + * it is necessary to ask the direct ftrace_ops to enable
>> + * SHARE_IPMODIFY support. This is achieved via ftrace_ops->ops_func
>> + * cmd=FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY. For more details on this
>> + * condition, check out prepare_direct_functions_for_ipmodify().
>> + */
>> +
>> +/*
>> + * For most ftrace_ops_cmd,
>> + * Returns:
>> + * 0 - Success.
>> + * -EBUSY - The operation cannot process
>> + * -EAGAIN - The operation cannot process tempoorarily.
>> + */
>> +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
>> +
>> #ifdef CONFIG_DYNAMIC_FTRACE
>> /* The hash used to know what functions callbacks trace */
>> struct ftrace_ops_hash {
>> @@ -253,6 +326,7 @@ struct ftrace_ops {
>> 	unsigned long			trampoline;
>> 	unsigned long			trampoline_size;
>> 	struct list_head		list;
>> +	ftrace_ops_func_t		ops_func;
>> #endif
>> };
>> 
>> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
>> index 6a419f6bbbf0..868bbc753803 100644
>> --- a/kernel/trace/ftrace.c
>> +++ b/kernel/trace/ftrace.c
>> @@ -1865,7 +1865,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
>> /*
>> * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
>> * or no-needed to update, -EBUSY if it detects a conflict of the flag
>> - * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
>> + * on a ftrace_rec, -EINVAL if the new_hash tries to trace all recs, and
>> + * -EAGAIN if the ftrace_ops need to enable SHARE_IPMODIFY.
> 
> It should just call the ftrace_ops() with the command to set it. If you
> want, we could add another CMD enum that can be passed for this case.
> 
>> * Note that old_hash and new_hash has below meanings
>> * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
>> * - If the hash is EMPTY_HASH, it hits nothing
>> @@ -1875,6 +1876,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>> 					 struct ftrace_hash *old_hash,
>> 					 struct ftrace_hash *new_hash)
>> {
>> +	bool is_ipmodify, is_direct, share_ipmodify;
>> 	struct ftrace_page *pg;
>> 	struct dyn_ftrace *rec, *end = NULL;
>> 	int in_old, in_new;
>> @@ -1883,7 +1885,24 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>> 	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
>> 		return 0;
>> 
>> -	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
>> +	/*
>> +	 * The following are all the valid combinations of is_ipmodify,
>> +	 * is_direct, and share_ipmodify
>> +	 *
>> +	 * is_ipmodify is_direct share_ipmodify
>> +	 * #1 0 0 0
>> +	 * #2 1 0 0
>> +	 * #3 1 1 0
> 
> I still think that DIRECT should automatically be considered IPMODIFY
> (at least in the view of ftrace, whether or not the direct function
> modifies the IP).
> 
>> +	 * #4 0 1 0
>> +	 * #5 0 1 1
>> +	 */
>> +
>> +
>> +	is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
>> +	is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
>> +
>> +	/* either ipmodify nor direct, skip */
>> +	if (!is_ipmodify && !is_direct) /* combinations #1 */
>> 		return 0;
>> 
>> 	/*
>> @@ -1893,6 +1912,30 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>> 	if (!new_hash || !old_hash)
>> 		return -EINVAL;
>> 
>> +	share_ipmodify = ops->flags & FTRACE_OPS_FL_SHARE_IPMODIFY;
>> +
>> +	/*
>> +	 * This ops itself doesn't do ip_modify and it can share a fentry
>> +	 * with other ops with ipmodify, nothing to do.
>> +	 */
>> +	if (!is_ipmodify && share_ipmodify) /* combinations #5 */
>> +		return 0;
>> +
> 
> Really, if connecting to a function that already has IPMODIFY, then the
> ops_func() needs to be called, and if the ops supports SHARED_IPMODIFY
> then it should get set and then continue. 
> 
> Make sense?
> 
> -- Steve
> 

I think there is one more problem here. If we force all direct trampolines
to set IPMODIFY and remove the SHARE_IPMODIFY flag, it may cause confusion 
and/or extra work here (__ftrace_hash_update_ipmodify). 

Say __ftrace_hash_update_ipmodify() tries to attach an ops with IPMODIFY, 
and finds the rec already has IPMODIFY. At this point, we have to iterate
over all ftrace ops (do_for_each_ftrace_op) to confirm whether the IPMODIFY 
is from 

1) a direct ops that can share IPMODIFY, or 
2) a direct ops that cannot share IPMODIFY, or 
3) another non-direct ops with IPMODIFY. 

For the 1), this attach works, while for 2) and 3), the attach doesn't work. 

OTOH, with current version (v2), we trust the direct ops to set or clear 
IPMODIFY. rec with IPMODIFY makes it clear that it cannot share with another
ops with IPMODIFY. Then we don't have to iterate over all ftrace ops here. 

Does this make sense? Did I miss some better solutions?

Thanks,
Song

>> +	/*
>> +	 * Only three combinations of is_ipmodify, is_direct, and
>> +	 * share_ipmodify for the logic below:
>> +	 * #2 live patch
>> +	 * #3 direct with ipmodify
>> +	 * #4 direct without ipmodify
>> +	 *
>> +	 * is_ipmodify is_direct share_ipmodify
>> +	 * #2 1 0 0
>> +	 * #3 1 1 0
>> +	 * #4 0 1 0
>> +	 *
>> +	 * Only update/rollback rec->flags for is_ipmodify == 1 (#2 and #3)
>> +	 */
>> +
>> 	/* Update rec->flags */
>> 	do_for_each_ftrace_rec(pg, rec) {
>> 
>> @@ -1906,12 +1949,18 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
>> 			continue;
>> 
>> 		if (in_new) {
>> -			/* New entries must ensure no others are using it */
>> -			if (rec->flags & FTRACE_FL_IPMODIFY)
>> -				goto rollback;
>> -			rec->flags |= FTRACE_FL_IPMODIFY;
>> -		} else /* Removed entry */
>> +			if (rec->flags & FTRACE_FL_IPMODIFY) {
>> +				/* cannot have two ipmodify on same rec */
>> +				if (is_ipmodify) /* combination #2 and #3 */
>> +					goto rollback;
>> +				/* let user enable share_ipmodify and retry */
>> +				return -EAGAIN; /* combination #4 */
>> +			} else if (is_ipmodify) {
>> +				rec->flags |= FTRACE_FL_IPMODIFY;
>> +			}
>> +		} else if (is_ipmodify) {/* Removed entry */
>> 			rec->flags &= ~FTRACE_FL_IPMODIFY;
>> +		}
>> 	} while_for_each_ftrace_rec();
>> 
>> 	return 0;
Steven Rostedt July 15, 2022, 12:48 a.m. UTC | #5
On Fri, 15 Jul 2022 00:13:51 +0000
Song Liu <songliubraving@fb.com> wrote:

> I think there is one more problem here. If we force all direct trampoline
> set IPMODIFY, and remove the SHARE_IPMODIFY flag. It may cause confusion 
> and/or extra work here (__ftrace_hash_update_ipmodify). 

I'm saying that the caller (BPF) does not need to set IPMODIFY. Just
change the ftrace.c to treat IPMODIFY the same as direct. In fact, the
two should be mutually exclusive (from a ftrace point of view). That
is, if DIRECT is set, then IPMODIFY must not be.

Again, ftrace will take care of the accounting of if a rec has both
IPMODIFY and DIRECT on it.

> 
> Say __ftrace_hash_update_ipmodify() tries to attach an ops with IPMODIFY, 
> and found the rec already has IPMODIFY. At this point, we have to iterate
> all ftrace ops (do_for_each_ftrace_op) to confirm whether the IPMODIFY is 
> from 

What I'm suggesting is that a DIRECT ops will never set IPMODIFY. Like
I mentioned before, ftrace doesn't care if the direct trampoline
modifies IP or not. All ftrace will do is ask the owner of the direct
ops if it is safe to have an IP modify attached to it or not. And at
the same time will tell the direct ops owner that it is adding an
IPMODIFY ops such that it can take the appropriate actions.

In other words, IPMODIFY on the rec means that it can not attach
anything else with an IPMODIFY on it.

The direct ops should only set the DIRECT flag. If an IPMODIFY ops is
being added, if it already has an IPMODIFY then it will fail right there.

If direct is set, then a loop for the direct ops will be done and the
callback is made. If the direct ops is OK with the IPMODIFY then it
will adjust itself to work with the IPMODIFY and the IPMODIFY will
continue to be added (and then set the rec IPMODIFY flag).

> 
> 1) a direct ops that can share IPMODIFY, or 
> 2) a direct ops that cannot share IPMODIFY, or 
> 3) another non-direct ops with IPMODIFY. 
> 
> For the 1), this attach works, while for 2) and 3), the attach doesn't work. 
> 
> OTOH, with current version (v2), we trust the direct ops to set or clear 
> IPMODIFY. rec with IPMODIFY makes it clear that it cannot share with another
> ops with IPMODIFY. Then we don't have to iterate over all ftrace ops here. 

The only time an iterate should be done is if rec->flags is DIRECT and !IPMODIFY.

> 
> Does this make sense? Did I miss some better solutions?

Did I fill in the holes? ;-)

-- Steve
Song Liu July 15, 2022, 2:04 a.m. UTC | #6
> On Jul 14, 2022, at 5:48 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Fri, 15 Jul 2022 00:13:51 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>> I think there is one more problem here. If we force all direct trampoline
>> set IPMODIFY, and remove the SHARE_IPMODIFY flag. It may cause confusion 
>> and/or extra work here (__ftrace_hash_update_ipmodify). 
> 
> I'm saying that the caller (BPF) does not need to set IPMODIFY. Just
> change the ftrace.c to treat IPMODIFY the same as direct. In fact, the
> two should be mutually exclusive (from a ftrace point of view). That
> is, if DIRECT is set, then IPMODIFY must not be.
> 
> Again, ftrace will take care of the accounting of if a rec has both
> IPMODIFY and DIRECT on it.
> 
>> 
>> Say __ftrace_hash_update_ipmodify() tries to attach an ops with IPMODIFY, 
>> and found the rec already has IPMODIFY. At this point, we have to iterate
>> all ftrace ops (do_for_each_ftrace_op) to confirm whether the IPMODIFY is 
>> from 
> 
> What I'm suggesting is that a DIRECT ops will never set IPMODIFY.

Aha, this is the point I misunderstood. I thought DIRECT ops would always
set IPMODIFY (as it does now). 

> Like
> I mentioned before, ftrace doesn't care if the direct trampoline
> modifies IP or not. All ftrace will do is ask the owner of the direct
> ops if it is safe to have an IP modify attached to it or not. And at
> the same time will tell the direct ops owner that it is adding an
> IPMODIFY ops such that it can take the appropriate actions.
> 
> In other words, IPMODIFY on the rec means that it can not attach
> anything else with an IPMODIFY on it.
> 
> The direct ops should only set the DIRECT flag. If an IPMODIFY ops is
> being added, if it already has an IPMODIFY then it will fail right there.
> 
> If direct is set, then a loop for the direct ops will be done and the
> callback is made. If the direct ops is OK with the IPMODIFY then it
> will adjust itself to work with the IPMODIFY and the IPMODIFY will
> continue to be added (and then set the rec IPMODIFY flag).
> 
>> 
>> 1) a direct ops that can share IPMODIFY, or 
>> 2) a direct ops that cannot share IPMODIFY, or 
>> 3) another non-direct ops with IPMODIFY. 
>> 
>> For the 1), this attach works, while for 2) and 3), the attach doesn't work. 
>> 
>> OTOH, with current version (v2), we trust the direct ops to set or clear 
>> IPMODIFY. rec with IPMODIFY makes it clear that it cannot share with another
>> ops with IPMODIFY. Then we don't have to iterate over all ftrace ops here. 
> 
> The only time an iterate should be done is if rec->flags is DIRECT and !IPMODIFY.

Yeah, this makes sense. And this shouldn't be too expensive.

> 
>> 
>> Does this make sense? Did I miss some better solutions?
> 
> Did I fill in the holes? ;-)

You sure did. :)

Thanks,
Song
Steven Rostedt July 15, 2022, 2:46 a.m. UTC | #7
On Fri, 15 Jul 2022 02:04:33 +0000
Song Liu <songliubraving@fb.com> wrote:

> > What I'm suggesting is that a DIRECT ops will never set IPMODIFY.  
> 
> Aha, this the point I misunderstood. I thought DIRECT ops would always
> set IPMODIFY (as it does now). 

My fault. I was probably not being clear when I was suggesting that
DIRECT should *act* like an IPMODIFY, but never explicitly stated that
it should not set the IPMODIFY flag.

The only reason it does today was to make it easy to act like an
IPMODIFY (because it set the flag). But I'm now suggesting to get rid
of that and just make DIRECT act like an IPMODIFY as there can only be
one of them on a function, but now we have some cases where DIRECT can
work with IPMODIFY via the callbacks.

-- Steve
Song Liu July 15, 2022, 2:50 a.m. UTC | #8
> On Jul 14, 2022, at 7:46 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Fri, 15 Jul 2022 02:04:33 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>>> What I'm suggesting is that a DIRECT ops will never set IPMODIFY.  
>> 
>> Aha, this the point I misunderstood. I thought DIRECT ops would always
>> set IPMODIFY (as it does now). 
> 
> My fault. I was probably not being clear when I was suggesting that
> DIRECT should *act* like an IPMODIFY, but never explicitly stated that
> it should not set the IPMODIFY flag.
> 
> The only reason it does today was to make it easy to act like an
> IPMODIFY (because it set the flag). But I'm now suggesting to get rid
> of that and just make DIRECT act like an IPMDOFIY as there can only be
> one of them on a function, but now we have some cases where DIRECT can
> work with IPMODIFY via the callbacks.

Thanks for the clarification. I think we are finally on the same page on
this. :)

Song
Song Liu July 15, 2022, 5:42 p.m. UTC | #9
Hi Steven,

> On Jul 14, 2022, at 7:50 PM, Song Liu <songliubraving@fb.com> wrote:
> 
> 
> 
>> On Jul 14, 2022, at 7:46 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>> 
>> On Fri, 15 Jul 2022 02:04:33 +0000
>> Song Liu <songliubraving@fb.com> wrote:
>> 
>>>> What I'm suggesting is that a DIRECT ops will never set IPMODIFY.  
>>> 
>>> Aha, this the point I misunderstood. I thought DIRECT ops would always
>>> set IPMODIFY (as it does now). 
>> 
>> My fault. I was probably not being clear when I was suggesting that
>> DIRECT should *act* like an IPMODIFY, but never explicitly stated that
>> it should not set the IPMODIFY flag.
>> 
>> The only reason it does today was to make it easy to act like an
>> IPMODIFY (because it set the flag). But I'm now suggesting to get rid
>> of that and just make DIRECT act like an IPMDOFIY as there can only be
>> one of them on a function, but now we have some cases where DIRECT can
>> work with IPMODIFY via the callbacks.
> 
> Thanks for the clarification. I think we are finally on the same page on
> this. :)

A quick update and ask for feedback/clarification.

Based on my understanding, you recommended calling ops_func() from 
__ftrace_hash_update_ipmodify() and in ops_func() the direct trampoline
may make changes to the trampoline. Did I get this right?


I am going towards this direction, but hit some issue. Specifically, in 
__ftrace_hash_update_ipmodify(), ftrace_lock is already locked, so the 
direct trampoline cannot easily make changes with 
modify_ftrace_direct_multi(), which locks both direct_mutex and 
ftrace_mutex. 

One solution would be have no-lock version of all the functions called
by modify_ftrace_direct_multi(), but that's a lot of functions and the
code will be pretty ugly. The alternative would be the logic in v2: 
__ftrace_hash_update_ipmodify() returns -EAGAIN, and we make changes to 
the direct trampoline in other places: 

1) if DIRECT ops attached first, the trampoline is updated in 
   prepare_direct_functions_for_ipmodify(), see 3/5 of v2;

2) if IPMODIFY ops attached first, the trampoline is updated in
   bpf_trampoline_update(), see "goto again" path in 5/5 of v2. 

Overall, I think this way is still cleaner. What do you think about this?

Thanks,
Song
Steven Rostedt July 15, 2022, 7:12 p.m. UTC | #10
On Fri, 15 Jul 2022 17:42:55 +0000
Song Liu <songliubraving@fb.com> wrote:


> A quick update and ask for feedback/clarification.
> 
> Based on my understanding, you recommended calling ops_func() from 
> __ftrace_hash_update_ipmodify() and in ops_func() the direct trampoline
> may make changes to the trampoline. Did I get this right?
> 
> 
> I am going towards this direction, but hit some issue. Specifically, in 
> __ftrace_hash_update_ipmodify(), ftrace_lock is already locked, so the 
> direct trampoline cannot easily make changes with 
> modify_ftrace_direct_multi(), which locks both direct_mutex and 
> ftrace_mutex. 
> 
> One solution would be have no-lock version of all the functions called
> by modify_ftrace_direct_multi(), but that's a lot of functions and the
> code will be pretty ugly. The alternative would be the logic in v2: 
> __ftrace_hash_update_ipmodify() returns -EAGAIN, and we make changes to 
> the direct trampoline in other places: 
> 
> 1) if DIRECT ops attached first, the trampoline is updated in 
>    prepare_direct_functions_for_ipmodify(), see 3/5 of v2;
> 
> 2) if IPMODIFY ops attached first, the trampoline is updated in
>    bpf_trampoline_update(), see "goto again" path in 5/5 of v2. 
> 
> Overall, I think this way is still cleaner. What do you think about this?

What about if we release the lock when doing the callback?

Then we just need to make sure things are the same after reacquiring the
lock, and if they are different, we release the lock again and do the
callback with the new update. Wash, rinse, repeat, until the state is the
same before and after the callback with locks acquired?

This is a common way to handle callbacks that need to do something that
takes the lock held before doing a callback.

The reason I say this, is because the more we can keep the accounting
inside of ftrace the better.

Wouldn't this need to be done anyway if BPF was first and live kernel
patching needed the update? An -EAGAIN would not suffice.

-- Steve
Song Liu July 15, 2022, 7:49 p.m. UTC | #11
> On Jul 15, 2022, at 12:12 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Fri, 15 Jul 2022 17:42:55 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
> 
>> A quick update and ask for feedback/clarification.
>> 
>> Based on my understanding, you recommended calling ops_func() from 
>> __ftrace_hash_update_ipmodify() and in ops_func() the direct trampoline
>> may make changes to the trampoline. Did I get this right?
>> 
>> 
>> I am going towards this direction, but hit some issue. Specifically, in 
>> __ftrace_hash_update_ipmodify(), ftrace_lock is already locked, so the 
>> direct trampoline cannot easily make changes with 
>> modify_ftrace_direct_multi(), which locks both direct_mutex and 
>> ftrace_mutex. 
>> 
>> One solution would be have no-lock version of all the functions called
>> by modify_ftrace_direct_multi(), but that's a lot of functions and the
>> code will be pretty ugly. The alternative would be the logic in v2: 
>> __ftrace_hash_update_ipmodify() returns -EAGAIN, and we make changes to 
>> the direct trampoline in other places: 
>> 
>> 1) if DIRECT ops attached first, the trampoline is updated in 
>>   prepare_direct_functions_for_ipmodify(), see 3/5 of v2;
>> 
>> 2) if IPMODIFY ops attached first, the trampoline is updated in
>>   bpf_trampoline_update(), see "goto again" path in 5/5 of v2. 
>> 
>> Overall, I think this way is still cleaner. What do you think about this?
> 
> What about if we release the lock when doing the callback?

We can probably unlock ftrace_lock here. But we may break locking order 
with direct mutex (see below).

> 
> Then we just need to make sure things are the same after reacquiring the
> lock, and if they are different, we release the lock again and do the
> callback with the new update. Wash, rinse, repeat, until the state is the
> same before and after the callback with locks acquired?

Personally, I would like to avoid wash-rinse-repeat here.

> 
> This is a common way to handle callbacks that need to do something that
> takes the lock held before doing a callback.
> 
> The reason I say this, is because the more we can keep the accounting
> inside of ftrace the better.
> 
> Wouldn't this need to be done anyway if BPF was first and live kernel
> patching needed the update? An -EAGAIN would not suffice.

prepare_direct_functions_for_ipmodify handles BPF-first-livepatch-later
case. The benefit of prepare_direct_functions_for_ipmodify() is that it 
holds direct_mutex before ftrace_lock, and keeps holding it if necessary. 
This is enough to make sure we don't need the wash-rinse-repeat. 

OTOH, if we wait until __ftrace_hash_update_ipmodify(), we already hold
ftrace_lock, but not direct_mutex. To make changes to bpf trampoline, we
have to unlock ftrace_lock and lock direct_mutex to avoid deadlock. 
However, this means we will need the wash-rinse-repeat. 


For livepatch-first-BPF-later case, we can probably handle this in 
__ftrace_hash_update_ipmodify(), since we hold both direct_mutex and 
ftrace_lock. We can unlock ftrace_lock and update the BPF trampoline. 
It is safe against changes to direct ops, because we are still holding 
direct_mutex. But, is this safe against another IPMODIFY ops? I am not 
sure yet... Also, this is pretty weird because we are updating a 
direct trampoline before we finish registering it for the first time. 
IOW, we are calling modify_ftrace_direct_multi_nolock for the same 
trampoline before register_ftrace_direct_multi() returns.

The approach in v2 propagates the -EAGAIN to BPF side, so these are two
independent calls of register_ftrace_direct_multi(). This does require
some protocol between ftrace core and its user, but I still think this 
is a cleaner approach. 

Does this make sense?

Thanks,
Song
Steven Rostedt July 15, 2022, 7:59 p.m. UTC | #12
On Fri, 15 Jul 2022 19:49:00 +0000
Song Liu <songliubraving@fb.com> wrote:

> > 
> > What about if we release the lock when doing the callback?  
> 
> We can probably unlock ftrace_lock here. But we may break locking order 
> with direct mutex (see below).

You're talking about the multi registering case, right?

> 
> > 
> > Then we just need to make sure things are the same after reacquiring the
> > lock, and if they are different, we release the lock again and do the
> > callback with the new update. Wash, rinse, repeat, until the state is the
> > same before and after the callback with locks acquired?  
> 
> Personally, I would like to avoid wash-rinse-repeat here.

But it's common to do. Keeps your hair cleaner that way ;-)

> 
> > 
> > This is a common way to handle callbacks that need to do something that
> > takes the lock held before doing a callback.
> > 
> > The reason I say this, is because the more we can keep the accounting
> > inside of ftrace the better.
> > 
> > Wouldn't this need to be done anyway if BPF was first and live kernel
> > patching needed the update? An -EAGAIN would not suffice.  
> 
> prepare_direct_functions_for_ipmodify handles BPF-first-livepatch-later
> case. The benefit of prepare_direct_functions_for_ipmodify() is that it 
> holds direct_mutex before ftrace_lock, and keeps holding it if necessary. 
> This is enough to make sure we don't need the wash-rinse-repeat. 
> 
> OTOH, if we wait until __ftrace_hash_update_ipmodify(), we already hold
> ftrace_lock, but not direct_mutex. To make changes to bpf trampoline, we
> have to unlock ftrace_lock and lock direct_mutex to avoid deadlock. 
> However, this means we will need the wash-rinse-repeat. 
> 
> 
> For livepatch-first-BPF-later case, we can probably handle this in 
> __ftrace_hash_update_ipmodify(), since we hold both direct_mutex and 
> ftrace_lock. We can unlock ftrace_lock and update the BPF trampoline. 
> It is safe against changes to direct ops, because we are still holding 
> direct_mutex. But, is this safe against another IPMODIFY ops? I am not 
> sure yet... Also, this is pretty weird because, we are updating a 
> direct trampoline before we finish registering it for the first time. 
> IOW, we are calling modify_ftrace_direct_multi_nolock for the same 
> trampoline before register_ftrace_direct_multi() returns.
> 
> The approach in v2 propagates the -EAGAIN to BPF side, so these are two
> independent calls of register_ftrace_direct_multi(). This does require
> some protocol between ftrace core and its user, but I still think this 
> is a cleaner approach. 

The issue I have with this approach is it couples BPF and ftrace a bit too
much.

But there is a way with my approach you can still do your approach. That
is, have ops_func() return zero if everything is fine, and otherwise returns
a negative value. Then have the register function fail and return whatever
value that gets returned by the ops_func()

Then have the bpf ops_func() check (does this direct caller handle
IPMODIFY? if yes, return 0, else return -EAGAIN). Then the registering of
ftrace fails with your -EAGAIN, and then you can change the direct
trampoline to handle IPMODIFY and try again. This time when ops_func() is
called, it sees that the direct trampoline can handle the IPMODIFY and
returns 0.

Basically, it's a way to still implement my suggestion, but let BPF decide
to use -EAGAIN to try again. And then BPF and ftrace don't need to have
these special flags to change the behavior of each other.

-- Steve
Song Liu July 15, 2022, 8:21 p.m. UTC | #13
> On Jul 15, 2022, at 12:59 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Fri, 15 Jul 2022 19:49:00 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>>> 
>>> What about if we release the lock when doing the callback?  
>> 
>> We can probably unlock ftrace_lock here. But we may break locking order 
>> with direct mutex (see below).
> 
> You're talking about the multi registering case, right?

We are using the *_ftrace_direct_multi() API here, to be able to specify
ops_func. The direct single API just uses the shared direct_ops. 

> 
>> 
>>> 
>>> Then we just need to make sure things are the same after reacquiring the
>>> lock, and if they are different, we release the lock again and do the
>>> callback with the new update. Wash, rinse, repeat, until the state is the
>>> same before and after the callback with locks acquired?  
>> 
>> Personally, I would like to avoid wash-rinse-repeat here.
> 
> But it's common to do. Keeps your hair cleaner that way ;-)
> 
>> 
>>> 
>>> This is a common way to handle callbacks that need to do something that
>>> takes the lock held before doing a callback.
>>> 
>>> The reason I say this, is because the more we can keep the accounting
>>> inside of ftrace the better.
>>> 
>>> Wouldn't this need to be done anyway if BPF was first and live kernel
>>> patching needed the update? An -EAGAIN would not suffice.  
>> 
>> prepare_direct_functions_for_ipmodify handles BPF-first-livepatch-later
>> case. The benefit of prepare_direct_functions_for_ipmodify() is that it 
>> holds direct_mutex before ftrace_lock, and keeps holding it if necessary. 
>> This is enough to make sure we don't need the wash-rinse-repeat. 
>> 
>> OTOH, if we wait until __ftrace_hash_update_ipmodify(), we already hold
>> ftrace_lock, but not direct_mutex. To make changes to bpf trampoline, we
>> have to unlock ftrace_lock and lock direct_mutex to avoid deadlock. 
>> However, this means we will need the wash-rinse-repeat. 

What do you think about the prepare_direct_functions_for_ipmodify() 
approach? If this is not ideal, maybe we can simplify it so that it only
holds direct_mutex (when necessary). The benefit is that we are sure
direct_mutex is already held in __ftrace_hash_update_ipmodify(). However, 
I think it is not safe to unlock ftrace_lock in __ftrace_hash_update_ipmodify(). 
We can get parallel do_for_each_ftrace_rec(), which is dangerous, no? 

>> 
>> 
>> For livepatch-first-BPF-later case, we can probably handle this in 
>> __ftrace_hash_update_ipmodify(), since we hold both direct_mutex and 
>> ftrace_lock. We can unlock ftrace_lock and update the BPF trampoline. 
>> It is safe against changes to direct ops, because we are still holding 
>> direct_mutex. But, is this safe against another IPMODIFY ops? I am not 
>> sure yet... Also, this is pretty weird because, we are updating a 
>> direct trampoline before we finish registering it for the first time. 
>> IOW, we are calling modify_ftrace_direct_multi_nolock for the same 
>> trampoline before register_ftrace_direct_multi() returns.
>> 
>> The approach in v2 propagates the -EAGAIN to BPF side, so these are two
>> independent calls of register_ftrace_direct_multi(). This does require
>> some protocol between ftrace core and its user, but I still think this 
>> is a cleaner approach. 
> 
> The issue I have with this approach is it couples BPF and ftrace a bit too
> much.
> 
> But there is a way with my approach you can still do your approach. That
> is, have ops_func() return zero if everything is fine, and otherwise returns
> a negative value. Then have the register function fail and return whatever
> value that gets returned by the ops_func()
> 
> Then have the bpf ops_func() check (does this direct caller handle
> IPMODIFY? if yes, return 0, else return -EAGAIN). Then the registering of
> ftrace fails with your -EAGAIN, and then you can change the direct
> trampoline to handle IPMODIFY and try again. This time when ops_func() is
> called, it sees that the direct trampoline can handle the IPMODIFY and
> returns 0.
> 
> Basically, it's a way to still implement my suggestion, but let BPF decide
> to use -EAGAIN to try again. And then BPF and ftrace don't need to have
> these special flags to change the behavior of each other.

I like this one. So there is no protocol about the return value here. 

Thanks,
Song
Steven Rostedt July 15, 2022, 9:29 p.m. UTC | #14
On Fri, 15 Jul 2022 20:21:49 +0000
Song Liu <songliubraving@fb.com> wrote:

> >>> Wouldn't this need to be done anyway if BPF was first and live kernel
> >>> patching needed the update? An -EAGAIN would not suffice.    
> >> 
> >> prepare_direct_functions_for_ipmodify handles BPF-first-livepatch-later
> >> case. The benefit of prepare_direct_functions_for_ipmodify() is that it 
> >> holds direct_mutex before ftrace_lock, and keeps holding it if necessary. 
> >> This is enough to make sure we don't need the wash-rinse-repeat. 
> >> 
> >> OTOH, if we wait until __ftrace_hash_update_ipmodify(), we already hold
> >> ftrace_lock, but not direct_mutex. To make changes to bpf trampoline, we
> >> have to unlock ftrace_lock and lock direct_mutex to avoid deadlock. 
> >> However, this means we will need the wash-rinse-repeat.   
> 
> What do you think about the prepare_direct_functions_for_ipmodify() 
> approach? If this is not ideal, maybe we can simplify it so that it only
> holds direct_mutex (when necessary). The benefit is that we are sure
> direct_mutex is already held in __ftrace_hash_update_ipmodify(). However, 
> I think it is not safe to unlock ftrace_lock in __ftrace_hash_update_ipmodify(). 
> We can get parallel do_for_each_ftrace_rec(), which is dangerous, no? 

I'm fine with it. But one nit on the logic:

>  int register_ftrace_function(struct ftrace_ops *ops)
> +	__releases(&direct_mutex)
>  {
> +	bool direct_mutex_locked;
>  	int ret;
>  
>  	ftrace_ops_init(ops);
>  
> +	ret = prepare_direct_functions_for_ipmodify(ops);
> +	if (ret < 0)
> +		return ret;
> +
> +	direct_mutex_locked = ret == 1;
> +

Please make the above:

	if (ret < 0)
		return ret;
	else if (ret == 1)
		direct_mutex_locked = true;

It's much easier to read that way.

-- Steve

>  	mutex_lock(&ftrace_lock);
>  
>  	ret = ftrace_startup(ops, 0);
>  
>  	mutex_unlock(&ftrace_lock);
>  
> +	if (direct_mutex_locked)
> +		mutex_unlock(&direct_mutex);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(register_ftrace_function);
> --
Song Liu July 15, 2022, 9:48 p.m. UTC | #15
> On Jul 15, 2022, at 2:29 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Fri, 15 Jul 2022 20:21:49 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>>>>> Wouldn't this need to be done anyway if BPF was first and live kernel
>>>>> patching needed the update? An -EAGAIN would not suffice.    
>>>> 
>>>> prepare_direct_functions_for_ipmodify handles BPF-first-livepatch-later
>>>> case. The benefit of prepare_direct_functions_for_ipmodify() is that it 
>>>> holds direct_mutex before ftrace_lock, and keeps holding it if necessary. 
>>>> This is enough to make sure we don't need the wash-rinse-repeat. 
>>>> 
>>>> OTOH, if we wait until __ftrace_hash_update_ipmodify(), we already hold
>>>> ftrace_lock, but not direct_mutex. To make changes to bpf trampoline, we
>>>> have to unlock ftrace_lock and lock direct_mutex to avoid deadlock. 
>>>> However, this means we will need the wash-rinse-repeat.   
>> 
>> What do you think about the prepare_direct_functions_for_ipmodify() 
>> approach? If this is not ideal, maybe we can simplify it so that it only
>> holds direct_mutex (when necessary). The benefit is that we are sure
>> direct_mutex is already held in __ftrace_hash_update_ipmodify(). However, 
>> I think it is not safe to unlock ftrace_lock in __ftrace_hash_update_ipmodify(). 
>> We can get parallel do_for_each_ftrace_rec(), which is dangerous, no? 
> 
> I'm fine with it. But one nit on the logic:
> 
>> int register_ftrace_function(struct ftrace_ops *ops)
>> +	__releases(&direct_mutex)
>> {
>> +	bool direct_mutex_locked;
>> 	int ret;
>> 
>> 	ftrace_ops_init(ops);
>> 
>> +	ret = prepare_direct_functions_for_ipmodify(ops);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	direct_mutex_locked = ret == 1;
>> +
> 
> Please make the above:
> 
> 	if (ret < 0)
> 		return ret;
> 	else if (ret == 1)
> 		direct_mutex_locked = true;
> 
> It's much easier to read that way.

Thanks for the clarification! 

Song
Steven Rostedt July 15, 2022, 9:50 p.m. UTC | #16
On Fri, 15 Jul 2022 21:48:21 +0000
Song Liu <songliubraving@fb.com> wrote:

> >> int register_ftrace_function(struct ftrace_ops *ops)
> >> +	__releases(&direct_mutex)
> >> {
> >> +	bool direct_mutex_locked;

You'll need:

	bool direct_mutex_locked = false;

obviously ;-)

-- Steve

> >> 	int ret;
> >> 
> >> 	ftrace_ops_init(ops);
> >> 
> >> +	ret = prepare_direct_functions_for_ipmodify(ops);
> >> +	if (ret < 0)
> >> +		return ret;
> >> +
> >> +	direct_mutex_locked = ret == 1;
> >> +  
> > 
> > Please make the above:
> > 
> > 	if (ret < 0)
> > 		return ret;
> > 	else if (ret == 1)
> > 		direct_mutex_locked = true;
> > 
> > It's much easier to read that way.  
> 
> Thanks for the clarification!
diff mbox series

Patch

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9023bf69f675..bfacf608de9c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -98,6 +98,18 @@  static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
 }
 #endif
 
+/*
+ * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes
+ * to a ftrace_ops.
+ *
+ * ENABLE_SHARE_IPMODIFY - enable FTRACE_OPS_FL_SHARE_IPMODIFY.
+ * DISABLE_SHARE_IPMODIFY - disable FTRACE_OPS_FL_SHARE_IPMODIFY.
+ */
+enum ftrace_ops_cmd {
+	FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY,
+	FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY,
+};
+
 #ifdef CONFIG_FUNCTION_TRACER
 
 extern int ftrace_enabled;
@@ -189,6 +201,9 @@  ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
  *             ftrace_enabled.
  * DIRECT - Used by the direct ftrace_ops helper for direct functions
  *            (internal ftrace only, should not be used by others)
+ * SHARE_IPMODIFY - For direct ftrace_ops only. Set when the direct function
+ *            is ready to share same kernel function with IPMODIFY function
+ *            (live patch, etc.).
  */
 enum {
 	FTRACE_OPS_FL_ENABLED			= BIT(0),
@@ -209,8 +224,66 @@  enum {
 	FTRACE_OPS_FL_TRACE_ARRAY		= BIT(15),
 	FTRACE_OPS_FL_PERMANENT                 = BIT(16),
 	FTRACE_OPS_FL_DIRECT			= BIT(17),
+	FTRACE_OPS_FL_SHARE_IPMODIFY		= BIT(18),
 };
 
+/*
+ * IPMODIFY, DIRECT, and SHARE_IPMODIFY.
+ *
+ * ftrace provides IPMODIFY flag for users to replace existing kernel
+ * function with a different version. This is achieved by setting regs->ip.
+ * The top user of IPMODIFY is live patch.
+ *
+ * DIRECT allows user to load custom trampoline on top of ftrace. DIRECT
+ * ftrace does not overwrite regs->ip. Instead, the custom trampoline is
+ * saved separately (for example, orig_ax on x86). The top user of DIRECT
+ * is bpf trampoline.
+ *
+ * It is not super rare to have both live patch and bpf trampoline on the
+ * same kernel function. Therefore, it is necessary to allow the two work
+ * with each other. Given that IPMODIFY and DIRECT target addressese are
+ * saved separately, this is feasible, but we need to be careful.
+ *
+ * The policy between IPMODIFY and DIRECT is:
+ *
+ *  1. Each kernel function can only have one IPMODIFY ftrace_ops;
+ *  2. Each kernel function can only have one DIRECT ftrace_ops;
+ *  3. DIRECT ftrace_ops may have IPMODIFY or not;
+ *  4. Each kernel function may have one non-DIRECT IPMODIFY ftrace_ops,
+ *     and one non-IPMODIFY DIRECT ftrace_ops at the same time. This
+ *     requires support from the DIRECT ftrace_ops. Specifically, the
+ *     DIRECT trampoline should call the kernel function at regs->ip.
+ *     If the DIRECT ftrace_ops supports sharing a function with ftrace_ops
+ *     with IPMODIFY, it should set flag SHARE_IPMODIFY.
+ *
+ * Some DIRECT ftrace_ops has an option to enable SHARE_IPMODIFY or not.
+ * Usually, the non-SHARE_IPMODIFY option gives better performance. To take
+ * advantage of this performance benefit, it is necessary to enable
+ * SHARE_IPMODIFY only when it is on the same function as an IPMODIFY
+ * ftrace_ops. There are two cases to consider:
+ *
+ *  1. IPMODIFY ftrace_ops is registered first. When the (non-IPMODIFY, and
+ *     non-SHARE_IPMODIFY) DIRECT ftrace_ops is registered later,
+ *     register_ftrace_direct_multi() returns -EAGAIN. If the user of
+ *     the DIRECT ftrace_ops can support SHARE_IPMODIFY, it should enable
+ *     SHARE_IPMODIFY and retry.
+ *  2. (non-IPMODIFY, and non-SHARE_IPMODIFY) DIRECT ftrace_ops is
+ *     registered first. When the IPMODIFY ftrace_ops is registered later,
+ *     it is necessary to ask the direct ftrace_ops to enable
+ *     SHARE_IPMODIFY support. This is achieved via ftrace_ops->ops_func
+ *     cmd=FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY. For more details on this
+ *     condition, check out prepare_direct_functions_for_ipmodify().
+ */
+
+/*
+ * For most ftrace_ops_cmd,
+ * Returns:
+ *        0 - Success.
+ *        -EBUSY - The operation cannot proceed.
+ *        -EAGAIN - The operation cannot proceed temporarily.
+ */
+typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* The hash used to know what functions callbacks trace */
 struct ftrace_ops_hash {
@@ -253,6 +326,7 @@  struct ftrace_ops {
 	unsigned long			trampoline;
 	unsigned long			trampoline_size;
 	struct list_head		list;
+	ftrace_ops_func_t		ops_func;
 #endif
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6a419f6bbbf0..868bbc753803 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1865,7 +1865,8 @@  static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
 /*
  * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
  * or no-needed to update, -EBUSY if it detects a conflict of the flag
- * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
+ * on a ftrace_rec, -EINVAL if the new_hash tries to trace all recs, and
+ * -EAGAIN if the ftrace_ops need to enable SHARE_IPMODIFY.
  * Note that old_hash and new_hash has below meanings
  *  - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
  *  - If the hash is EMPTY_HASH, it hits nothing
@@ -1875,6 +1876,7 @@  static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 					 struct ftrace_hash *old_hash,
 					 struct ftrace_hash *new_hash)
 {
+	bool is_ipmodify, is_direct, share_ipmodify;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec, *end = NULL;
 	int in_old, in_new;
@@ -1883,7 +1885,24 @@  static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
 		return 0;
 
-	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+	/*
+	 * The following are all the valid combinations of is_ipmodify,
+	 * is_direct, and share_ipmodify
+	 *
+	 *             is_ipmodify     is_direct     share_ipmodify
+	 *  #1              0               0                0
+	 *  #2              1               0                0
+	 *  #3              1               1                0
+	 *  #4              0               1                0
+	 *  #5              0               1                1
+	 */
+
+
+	is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
+	is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
+
+	/* neither ipmodify nor direct, skip */
+	if (!is_ipmodify && !is_direct)   /* combinations #1 */
 		return 0;
 
 	/*
@@ -1893,6 +1912,30 @@  static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 	if (!new_hash || !old_hash)
 		return -EINVAL;
 
+	share_ipmodify = ops->flags & FTRACE_OPS_FL_SHARE_IPMODIFY;
+
+	/*
+	 * This ops itself doesn't do ip_modify and it can share a fentry
+	 * with other ops with ipmodify, nothing to do.
+	 */
+	if (!is_ipmodify && share_ipmodify)   /* combinations #5 */
+		return 0;
+
+	/*
+	 * Only three combinations of is_ipmodify, is_direct, and
+	 * share_ipmodify for the logic below:
+	 * #2 live patch
+	 * #3 direct with ipmodify
+	 * #4 direct without ipmodify
+	 *
+	 *             is_ipmodify     is_direct     share_ipmodify
+	 *  #2              1               0                0
+	 *  #3              1               1                0
+	 *  #4              0               1                0
+	 *
+	 * Only update/rollback rec->flags for is_ipmodify == 1 (#2 and #3)
+	 */
+
 	/* Update rec->flags */
 	do_for_each_ftrace_rec(pg, rec) {
 
@@ -1906,12 +1949,18 @@  static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 			continue;
 
 		if (in_new) {
-			/* New entries must ensure no others are using it */
-			if (rec->flags & FTRACE_FL_IPMODIFY)
-				goto rollback;
-			rec->flags |= FTRACE_FL_IPMODIFY;
-		} else /* Removed entry */
+			if (rec->flags & FTRACE_FL_IPMODIFY) {
+				/* cannot have two ipmodify on same rec */
+				if (is_ipmodify)  /* combination #2 and #3 */
+					goto rollback;
+				/* let user enable share_ipmodify and retry */
+				return  -EAGAIN;  /* combination #4 */
+			} else if (is_ipmodify) {
+				rec->flags |= FTRACE_FL_IPMODIFY;
+			}
+		} else if (is_ipmodify) {/* Removed entry */
 			rec->flags &= ~FTRACE_FL_IPMODIFY;
+		}
 	} while_for_each_ftrace_rec();
 
 	return 0;
@@ -3115,14 +3164,14 @@  static inline int ops_traces_mod(struct ftrace_ops *ops)
 }
 
 /*
- * Check if the current ops references the record.
+ * Check if the current ops references the given ip.
  *
  * If the ops traces all functions, then it was already accounted for.
  * If the ops does not trace the current record function, skip it.
  * If the ops ignores the function via notrace filter, skip it.
  */
 static inline bool
-ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
 {
 	/* If ops isn't enabled, ignore it */
 	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -3134,16 +3183,29 @@  ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
 
 	/* The function must be in the filter */
 	if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
-	    !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+	    !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
 		return false;
 
 	/* If in notrace hash, we ignore it too */
-	if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
+	if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
 		return false;
 
 	return true;
 }
 
+/*
+ * Check if the current ops references the record.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static inline bool
+ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+	return ops_references_ip(ops, rec->ip);
+}
+
 static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
 {
 	bool init_nop = ftrace_need_init_nop();
@@ -5519,6 +5581,14 @@  int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 	if (ops->flags & FTRACE_OPS_FL_ENABLED)
 		return -EINVAL;
 
+	/*
+	 * if the ops does ipmodify, it cannot share the same fentry with
+	 * other functions with ipmodify.
+	 */
+	if ((ops->flags & FTRACE_OPS_FL_IPMODIFY) &&
+	    (ops->flags & FTRACE_OPS_FL_SHARE_IPMODIFY))
+		return -EINVAL;
+
 	hash = ops->func_hash->filter_hash;
 	if (ftrace_hash_empty(hash))
 		return -EINVAL;
@@ -7901,6 +7971,83 @@  int ftrace_is_dead(void)
 	return ftrace_disabled;
 }
 
+/*
+ * When registering ftrace_ops with IPMODIFY (not direct), it is necessary
+ * to make sure it doesn't conflict with any direct ftrace_ops. If there is
+ * existing direct ftrace_ops on a kernel function being patched, call
+ * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY on it to enable sharing.
+ *
+ * @ops:     ftrace_ops being registered.
+ *
+ * Returns:
+ *         0 - @ops does not have IPMODIFY or @ops itself is DIRECT, no change
+ *             needed;
+ *         1 - @ops has IPMODIFY, hold direct_mutex;
+ *         -EBUSY - currently registered DIRECT ftrace_ops does not support
+ *                  SHARE_IPMODIFY, we need to abort the register.
+ *         -EAGAIN - cannot make changes to currently registered DIRECT
+ *                   ftrace_ops at the moment, but we can retry later. This
+ *                   is needed to avoid potential deadlocks.
+ */
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+	__acquires(&direct_mutex)
+{
+	struct ftrace_func_entry *entry;
+	struct ftrace_hash *hash;
+	struct ftrace_ops *op;
+	int size, i, ret;
+
+	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY) ||
+	    (ops->flags & FTRACE_OPS_FL_DIRECT))
+		return 0;
+
+	mutex_lock(&direct_mutex);
+
+	hash = ops->func_hash->filter_hash;
+	size = 1 << hash->size_bits;
+	for (i = 0; i < size; i++) {
+		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+			unsigned long ip = entry->ip;
+			bool found_op = false;
+
+			mutex_lock(&ftrace_lock);
+			do_for_each_ftrace_op(op, ftrace_ops_list) {
+				if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+					continue;
+				if (op->flags & FTRACE_OPS_FL_SHARE_IPMODIFY)
+					break;
+				if (ops_references_ip(op, ip)) {
+					found_op = true;
+					break;
+				}
+			} while_for_each_ftrace_op(op);
+			mutex_unlock(&ftrace_lock);
+
+			if (found_op) {
+				if (!op->ops_func) {
+					ret = -EBUSY;
+					goto err_out;
+				}
+				ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY);
+				if (ret)
+					goto err_out;
+			}
+		}
+	}
+
+	/*
+	 * Didn't find any overlap with any direct function, or the direct
+	 * function can share with ipmodify. Hold direct_mutex to make sure
+	 * this doesn't change until we are done.
+	 */
+	return 1;
+
+err_out:
+	mutex_unlock(&direct_mutex);
+	return ret;
+
+}
+
 /**
  * register_ftrace_function - register a function for profiling
  * @ops:	ops structure that holds the function for profiling.
@@ -7913,17 +8060,27 @@  int ftrace_is_dead(void)
  *       recursive loop.
  */
 int register_ftrace_function(struct ftrace_ops *ops)
+	__releases(&direct_mutex)
 {
+	bool direct_mutex_locked;
 	int ret;
 
 	ftrace_ops_init(ops);
 
+	ret = prepare_direct_functions_for_ipmodify(ops);
+	if (ret < 0)
+		return ret;
+
+	direct_mutex_locked = ret == 1;
+
 	mutex_lock(&ftrace_lock);
 
 	ret = ftrace_startup(ops, 0);
 
 	mutex_unlock(&ftrace_lock);
 
+	if (direct_mutex_locked)
+		mutex_unlock(&direct_mutex);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_function);