diff mbox series

[v8,31/35] fprobe: Rewrite fprobe on function-graph tracer

Message ID 170887444346.564249.9630774181398892961.stgit@devnote2 (mailing list archive)
State Not Applicable
Headers show
Series tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
netdev/series_format fail Series longer than 15 patches (and no cover letter)
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 5530 this patch: 8500
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 24 maintainers not CCed: mpe@ellerman.id.au mingo@redhat.com linux-arm-kernel@lists.infradead.org npiggin@gmail.com x86@kernel.org hca@linux.ibm.com naveen.n.rao@linux.ibm.com linux-s390@vger.kernel.org christophe.leroy@csgroup.eu bp@alien8.de chenhuacai@kernel.org hpa@zytor.com catalin.marinas@arm.com will@kernel.org gor@linux.ibm.com mathieu.desnoyers@efficios.com linuxppc-dev@lists.ozlabs.org kernel@xen0n.name akpm@linux-foundation.org aneesh.kumar@kernel.org dave.hansen@linux.intel.com borntraeger@linux.ibm.com loongarch@lists.linux.dev agordeev@linux.ibm.com
netdev/build_clang success Errors and warnings before: 1148 this patch: 1148
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6242 this patch: 6242
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Please use a blank line after function/struct/union/enum declarations CHECK: Prefer using the BIT macro CHECK: multiple assignments should be avoided WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 3 this patch: 4
netdev/source_inline fail Was 1 now: 4

Commit Message

Masami Hiramatsu (Google) Feb. 25, 2024, 3:20 p.m. UTC
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Rewrite fprobe implementation on function-graph tracer.
Major API changes are:
 -  'nr_maxactive' field is deprecated.
 -  This depends on CONFIG_DYNAMIC_FTRACE_WITH_ARGS or
    !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS, and
    CONFIG_HAVE_FUNCTION_GRAPH_FREGS. So currently works only
    on x86_64.
 -  Currently the entry size is limited in 15 * sizeof(long).
 -  If there is too many fprobe exit handler set on the same
    function, it will fail to probe.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v8:
  - Use trace_func_graph_ret/ent_t for fgraph_ops.
  - Update CONFIG_FPROBE dependencies.
  - Add ftrace_regs_get_return_address() for each arch.
 Changes in v3:
  - Update for new reserve_data/retrieve_data API.
  - Fix internal push/pop on fgraph data logic so that it can
    correctly save/restore the returning fprobes.
 Changes in v2:
  - Add more lockdep_assert_held(fprobe_mutex)
  - Use READ_ONCE() and WRITE_ONCE() for fprobe_hlist_node::fp.
  - Add NOKPROBE_SYMBOL() for the functions which is called from
    entry/exit callback.
---
 arch/arm64/include/asm/ftrace.h     |    6 
 arch/loongarch/include/asm/ftrace.h |    6 
 arch/powerpc/include/asm/ftrace.h   |    6 
 arch/s390/include/asm/ftrace.h      |    6 
 arch/x86/include/asm/ftrace.h       |    6 
 include/linux/fprobe.h              |   54 ++-
 include/linux/ftrace.h              |    7 
 kernel/trace/Kconfig                |    8 
 kernel/trace/fprobe.c               |  638 +++++++++++++++++++++++++----------
 lib/test_fprobe.c                   |   45 --
 10 files changed, 536 insertions(+), 246 deletions(-)

Comments

Jiri Olsa Feb. 26, 2024, 12:56 p.m. UTC | #1
On Mon, Feb 26, 2024 at 12:20:43AM +0900, Masami Hiramatsu (Google) wrote:
> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> Rewrite fprobe implementation on function-graph tracer.
> Major API changes are:
>  -  'nr_maxactive' field is deprecated.
>  -  This depends on CONFIG_DYNAMIC_FTRACE_WITH_ARGS or
>     !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS, and
>     CONFIG_HAVE_FUNCTION_GRAPH_FREGS. So currently works only
>     on x86_64.
>  -  Currently the entry size is limited in 15 * sizeof(long).
>  -  If there is too many fprobe exit handler set on the same
>     function, it will fail to probe.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  Changes in v8:
>   - Use trace_func_graph_ret/ent_t for fgraph_ops.
>   - Update CONFIG_FPROBE dependencies.
>   - Add ftrace_regs_get_return_address() for each arch.
>  Changes in v3:
>   - Update for new reserve_data/retrieve_data API.
>   - Fix internal push/pop on fgraph data logic so that it can
>     correctly save/restore the returning fprobes.
>  Changes in v2:
>   - Add more lockdep_assert_held(fprobe_mutex)
>   - Use READ_ONCE() and WRITE_ONCE() for fprobe_hlist_node::fp.
>   - Add NOKPROBE_SYMBOL() for the functions which is called from
>     entry/exit callback.
> ---
>  arch/arm64/include/asm/ftrace.h     |    6 
>  arch/loongarch/include/asm/ftrace.h |    6 
>  arch/powerpc/include/asm/ftrace.h   |    6 
>  arch/s390/include/asm/ftrace.h      |    6 
>  arch/x86/include/asm/ftrace.h       |    6 
>  include/linux/fprobe.h              |   54 ++-
>  include/linux/ftrace.h              |    7 
>  kernel/trace/Kconfig                |    8 
>  kernel/trace/fprobe.c               |  638 +++++++++++++++++++++++++----------
>  lib/test_fprobe.c                   |   45 --
>  10 files changed, 536 insertions(+), 246 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
> index 95a8f349f871..800c75f46a13 100644
> --- a/arch/arm64/include/asm/ftrace.h
> +++ b/arch/arm64/include/asm/ftrace.h
> @@ -143,6 +143,12 @@ ftrace_regs_get_frame_pointer(const struct ftrace_regs *fregs)
>  	return fregs->fp;
>  }
>  
> +static __always_inline unsigned long
> +ftrace_regs_get_return_address(const struct ftrace_regs *fregs)
> +{
> +	return fregs->lr;
> +}
> +
>  static __always_inline struct pt_regs *
>  ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
>  {
> diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
> index 14a1576bf948..b8432b7cc9d4 100644
> --- a/arch/loongarch/include/asm/ftrace.h
> +++ b/arch/loongarch/include/asm/ftrace.h
> @@ -81,6 +81,12 @@ ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
>  #define ftrace_regs_get_frame_pointer(fregs) \
>  	((fregs)->regs.regs[22])
>  
> +static __always_inline unsigned long
> +ftrace_regs_get_return_address(struct ftrace_regs *fregs)
> +{
> +	return *(unsigned long *)(fregs->regs.regs[1]);
> +}
> +
>  #define ftrace_graph_func ftrace_graph_func
>  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>  		       struct ftrace_ops *op, struct ftrace_regs *fregs);
> diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
> index 773e9011cff1..7ac1bc3e7196 100644
> --- a/arch/powerpc/include/asm/ftrace.h
> +++ b/arch/powerpc/include/asm/ftrace.h
> @@ -85,6 +85,12 @@ ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
>  #define ftrace_regs_query_register_offset(name) \
>  	regs_query_register_offset(name)
>  
> +static __always_inline unsigned long
> +ftrace_regs_get_return_address(struct ftrace_regs *fregs)
> +{
> +	return fregs->regs.link;
> +}
> +
>  struct ftrace_ops;
>  
>  #define ftrace_graph_func ftrace_graph_func
> diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
> index e910924eee59..d434a0497683 100644
> --- a/arch/s390/include/asm/ftrace.h
> +++ b/arch/s390/include/asm/ftrace.h
> @@ -89,6 +89,12 @@ ftrace_regs_get_frame_pointer(struct ftrace_regs *fregs)
>  	return sp[0];	/* return backchain */
>  }
>  
> +static __always_inline unsigned long
> +ftrace_regs_get_return_address(const struct ftrace_regs *fregs)
> +{
> +	return fregs->regs.gprs[14];
> +}
> +
>  #define arch_ftrace_fill_perf_regs(fregs, _regs)	 do {		\
>  		(_regs)->psw.addr = (fregs)->regs.psw.addr;		\
>  		(_regs)->gprs[15] = (fregs)->regs.gprs[15];		\
> diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
> index 7625887fc49b..979d3458a328 100644
> --- a/arch/x86/include/asm/ftrace.h
> +++ b/arch/x86/include/asm/ftrace.h
> @@ -82,6 +82,12 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
>  #define ftrace_regs_get_frame_pointer(fregs) \
>  	frame_pointer(&(fregs)->regs)
>  
> +static __always_inline unsigned long
> +ftrace_regs_get_return_address(struct ftrace_regs *fregs)
> +{
> +	return *(unsigned long *)ftrace_regs_get_stack_pointer(fregs);
> +}
> +
>  struct ftrace_ops;
>  #define ftrace_graph_func ftrace_graph_func
>  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
> diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
> index 879a30956009..08b37b0d1d05 100644
> --- a/include/linux/fprobe.h
> +++ b/include/linux/fprobe.h
> @@ -5,32 +5,56 @@
>  
>  #include <linux/compiler.h>
>  #include <linux/ftrace.h>
> -#include <linux/rethook.h>
> +#include <linux/rcupdate.h>
> +#include <linux/refcount.h>
> +#include <linux/slab.h>
> +
> +struct fprobe;
> +
> +/**
> + * strcut fprobe_hlist_node - address based hash list node for fprobe.
> + *
> + * @hlist: The hlist node for address search hash table.
> + * @addr: The address represented by this.
> + * @fp: The fprobe which owns this.
> + */
> +struct fprobe_hlist_node {
> +	struct hlist_node	hlist;
> +	unsigned long		addr;
> +	struct fprobe		*fp;
> +};
> +
> +/**
> + * struct fprobe_hlist - hash list nodes for fprobe.
> + *
> + * @hlist: The hlist node for existence checking hash table.
> + * @rcu: rcu_head for RCU deferred release.
> + * @fp: The fprobe which owns this fprobe_hlist.
> + * @size: The size of @array.
> + * @array: The fprobe_hlist_node for each address to probe.
> + */
> +struct fprobe_hlist {
> +	struct hlist_node		hlist;
> +	struct rcu_head			rcu;
> +	struct fprobe			*fp;
> +	int				size;
> +	struct fprobe_hlist_node	array[];
> +};
>  
>  /**
>   * struct fprobe - ftrace based probe.
> - * @ops: The ftrace_ops.
> + *
>   * @nmissed: The counter for missing events.
>   * @flags: The status flag.
> - * @rethook: The rethook data structure. (internal data)
>   * @entry_data_size: The private data storage size.
> - * @nr_maxactive: The max number of active functions.
> + * @nr_maxactive: The max number of active functions. (*deprecated)
>   * @entry_handler: The callback function for function entry.
>   * @exit_handler: The callback function for function exit.
> + * @hlist_array: The fprobe_hlist for fprobe search from IP hash table.
>   */
>  struct fprobe {
> -#ifdef CONFIG_FUNCTION_TRACER
> -	/*
> -	 * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too.
> -	 * But user of fprobe may keep embedding the struct fprobe on their own
> -	 * code. To avoid build error, this will keep the fprobe data structure
> -	 * defined here, but remove ftrace_ops data structure.
> -	 */
> -	struct ftrace_ops	ops;
> -#endif
>  	unsigned long		nmissed;
>  	unsigned int		flags;
> -	struct rethook		*rethook;
>  	size_t			entry_data_size;
>  	int			nr_maxactive;
>  
> @@ -40,6 +64,8 @@ struct fprobe {
>  	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
>  			     unsigned long ret_ip, struct ftrace_regs *fregs,
>  			     void *entry_data);
> +
> +	struct fprobe_hlist	*hlist_array;
>  };
>  
>  /* This fprobe is soft-disabled. */
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index d895767dcb93..2bb819f08725 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -254,6 +254,13 @@ static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs)
>  	regs_query_register_offset(name)
>  #define ftrace_regs_get_frame_pointer(fregs) \
>  	frame_pointer(&(fregs)->regs)
> +
> +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
> +/* This function works correctly in ftrace entry function. */
> +static __always_inline unsigned long
> +ftrace_regs_get_return_address(struct ftrace_regs *fregs);
> +#endif
> +
>  #endif
>  
>  #ifdef CONFIG_HAVE_REGS_AND_STACK_ACCESS_API
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index 9056315d56ed..d2ee9e6f1561 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -298,11 +298,9 @@ config DYNAMIC_FTRACE_WITH_ARGS
>  
>  config FPROBE
>  	bool "Kernel Function Probe (fprobe)"
> -	depends on FUNCTION_TRACER
> -	depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
> -	depends on HAVE_PT_REGS_TO_FTRACE_REGS_CAST || !HAVE_DYNAMIC_FTRACE_WITH_ARGS
> -	depends on HAVE_RETHOOK
> -	select RETHOOK
> +	depends on FUNCTION_GRAPH_TRACER
> +	depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
> +	depends on DYNAMIC_FTRACE_WITH_ARGS
>  	default n
>  	help
>  	  This option enables kernel function probe (fprobe) based on ftrace.
> diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
> index 31210423efc3..845ac5af4aeb 100644
> --- a/kernel/trace/fprobe.c
> +++ b/kernel/trace/fprobe.c
> @@ -8,98 +8,193 @@
>  #include <linux/fprobe.h>
>  #include <linux/kallsyms.h>
>  #include <linux/kprobes.h>
> -#include <linux/rethook.h>
> +#include <linux/list.h>
> +#include <linux/mutex.h>
>  #include <linux/slab.h>
>  #include <linux/sort.h>
>  
>  #include "trace.h"
>  
> -struct fprobe_rethook_node {
> -	struct rethook_node node;
> -	unsigned long entry_ip;
> -	unsigned long entry_parent_ip;
> -	char data[];
> -};
> +#define FPROBE_IP_HASH_BITS 8
> +#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
>  
> -static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
> -			struct ftrace_ops *ops, struct ftrace_regs *fregs)
> -{
> -	struct fprobe_rethook_node *fpr;
> -	struct rethook_node *rh = NULL;
> -	struct fprobe *fp;
> -	void *entry_data = NULL;
> -	int ret = 0;
> +#define FPROBE_HASH_BITS 6
> +#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
>  
> -	fp = container_of(ops, struct fprobe, ops);
> +/*
> + * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
> + *   exists. The key is the address of fprobe instance.
> + * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
> + *   instance related to the funciton address. The key is the ftrace IP
> + *   address.
> + *
> + * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
> + * are set NULL and delete those from both hash tables (by hlist_del_rcu).
> + * After an RCU grace period, the fprobe_hlist itself will be released.
> + *
> + * fprobe_table and fprobe_ip_table can be accessed from either
> + *  - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held.
> + *  - RCU hlist traversal under disabling preempt
> + */
> +static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
> +static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
> +static DEFINE_MUTEX(fprobe_mutex);
>  
> -	if (fp->exit_handler) {
> -		rh = rethook_try_get(fp->rethook);
> -		if (!rh) {
> -			fp->nmissed++;
> -			return;
> -		}
> -		fpr = container_of(rh, struct fprobe_rethook_node, node);
> -		fpr->entry_ip = ip;
> -		fpr->entry_parent_ip = parent_ip;
> -		if (fp->entry_data_size)
> -			entry_data = fpr->data;
> +/*
> + * Find first fprobe in the hlist. It will be iterated twice in the entry
> + * probe, once for correcting the total required size, the second time is
> + * calling back the user handlers.
> + * Thus the hlist in the fprobe_table must be sorted and new probe needs to
> + * be added *before* the first fprobe.
> + */
> +static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
> +{
> +	struct fprobe_hlist_node *node;
> +	struct hlist_head *head;
> +
> +	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> +	hlist_for_each_entry_rcu(node, head, hlist,
> +				 lockdep_is_held(&fprobe_mutex)) {
> +		if (node->addr == ip)
> +			return node;
>  	}
> +	return NULL;
> +}
> +NOKPROBE_SYMBOL(find_first_fprobe_node);
>  
> -	if (fp->entry_handler)
> -		ret = fp->entry_handler(fp, ip, parent_ip, fregs, entry_data);
> +/* Node insertion and deletion requires the fprobe_mutex */
> +static void insert_fprobe_node(struct fprobe_hlist_node *node)
> +{
> +	unsigned long ip = node->addr;
> +	struct fprobe_hlist_node *next;
> +	struct hlist_head *head;
>  
> -	/* If entry_handler returns !0, nmissed is not counted. */
> -	if (rh) {
> -		if (ret)
> -			rethook_recycle(rh);
> -		else
> -			rethook_hook(rh, ftrace_get_regs(fregs), true);
> +	lockdep_assert_held(&fprobe_mutex);
> +
> +	next = find_first_fprobe_node(ip);
> +	if (next) {
> +		hlist_add_before_rcu(&node->hlist, &next->hlist);
> +		return;
>  	}
> +	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> +	hlist_add_head_rcu(&node->hlist, head);
>  }
>  
> -static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
> -		struct ftrace_ops *ops, struct ftrace_regs *fregs)
> +/* Return true if there are synonims */
> +static bool delete_fprobe_node(struct fprobe_hlist_node *node)
>  {
> -	struct fprobe *fp;
> -	int bit;
> +	lockdep_assert_held(&fprobe_mutex);
>  
> -	fp = container_of(ops, struct fprobe, ops);
> -	if (fprobe_disabled(fp))
> -		return;
> +	WRITE_ONCE(node->fp, NULL);
> +	hlist_del_rcu(&node->hlist);
> +	return !!find_first_fprobe_node(node->addr);
> +}
>  
> -	/* recursion detection has to go before any traceable function and
> -	 * all functions before this point should be marked as notrace
> -	 */
> -	bit = ftrace_test_recursion_trylock(ip, parent_ip);
> -	if (bit < 0) {
> -		fp->nmissed++;
> -		return;
> +/* Check existence of the fprobe */
> +static bool is_fprobe_still_exist(struct fprobe *fp)
> +{
> +	struct hlist_head *head;
> +	struct fprobe_hlist *fph;
> +
> +	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
> +	hlist_for_each_entry_rcu(fph, head, hlist,
> +				 lockdep_is_held(&fprobe_mutex)) {
> +		if (fph->fp == fp)
> +			return true;
>  	}
> -	__fprobe_handler(ip, parent_ip, ops, fregs);
> -	ftrace_test_recursion_unlock(bit);
> +	return false;
> +}
> +NOKPROBE_SYMBOL(is_fprobe_still_exist);
> +
> +static int add_fprobe_hash(struct fprobe *fp)
> +{
> +	struct fprobe_hlist *fph = fp->hlist_array;
> +	struct hlist_head *head;
> +
> +	lockdep_assert_held(&fprobe_mutex);
>  
> +	if (WARN_ON_ONCE(!fph))
> +		return -EINVAL;
> +
> +	if (is_fprobe_still_exist(fp))
> +		return -EEXIST;
> +
> +	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
> +	hlist_add_head_rcu(&fp->hlist_array->hlist, head);
> +	return 0;
>  }
> -NOKPROBE_SYMBOL(fprobe_handler);
>  
> -static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
> -				  struct ftrace_ops *ops, struct ftrace_regs *fregs)
> +static int del_fprobe_hash(struct fprobe *fp)
>  {
> -	struct fprobe *fp;
> -	int bit;
> +	struct fprobe_hlist *fph = fp->hlist_array;
>  
> -	fp = container_of(ops, struct fprobe, ops);
> -	if (fprobe_disabled(fp))
> -		return;
> +	lockdep_assert_held(&fprobe_mutex);
>  
> -	/* recursion detection has to go before any traceable function and
> -	 * all functions called before this point should be marked as notrace
> -	 */
> -	bit = ftrace_test_recursion_trylock(ip, parent_ip);
> -	if (bit < 0) {
> -		fp->nmissed++;
> -		return;
> +	if (WARN_ON_ONCE(!fph))
> +		return -EINVAL;
> +
> +	if (!is_fprobe_still_exist(fp))
> +		return -ENOENT;
> +
> +	fph->fp = NULL;
> +	hlist_del_rcu(&fph->hlist);
> +	return 0;
> +}
> +
> +/* The entry data size is 4 bits (=16) * sizeof(long) in maximum */
> +#define FPROBE_HEADER_SIZE_BITS		4
> +#define MAX_FPROBE_DATA_SIZE_WORD	((1L << FPROBE_HEADER_SIZE_BITS) - 1)
> +#define MAX_FPROBE_DATA_SIZE		(MAX_FPROBE_DATA_SIZE_WORD * sizeof(long))
> +#define FPROBE_HEADER_PTR_BITS		(BITS_PER_LONG - FPROBE_HEADER_SIZE_BITS)
> +#define FPROBE_HEADER_PTR_MASK		GENMASK(FPROBE_HEADER_PTR_BITS - 1, 0)
> +#define FPROBE_HEADER_SIZE		sizeof(unsigned long)
> +
> +static inline unsigned long encode_fprobe_header(struct fprobe *fp, int size_words)
> +{
> +	if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
> +	    ((unsigned long)fp & ~FPROBE_HEADER_PTR_MASK) !=
> +	    ~FPROBE_HEADER_PTR_MASK)) {
> +		return 0;
>  	}
> +	return ((unsigned long)size_words << FPROBE_HEADER_PTR_BITS) |
> +		((unsigned long)fp & FPROBE_HEADER_PTR_MASK);
> +}
> +
> +/* Return reserved data size in words */
> +static inline int decode_fprobe_header(unsigned long val, struct fprobe **fp)
> +{
> +	unsigned long ptr;
> +
> +	ptr = (val & FPROBE_HEADER_PTR_MASK) | ~FPROBE_HEADER_PTR_MASK;
> +	if (fp)
> +		*fp = (struct fprobe *)ptr;
> +	return val >> FPROBE_HEADER_PTR_BITS;
> +}
> +
> +/*
> + * fprobe shadow stack management:
> + * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
> + * among the probes on the same function exit. Note that a new probe can be
> + * registered before a target function is returning, we can not use the hash
> + * table to find the corresponding probes. Thus the probe address is stored on
> + * the shadow stack with its entry data size.
> + *
> + */
> +static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
> +				   struct fprobe *fp, struct ftrace_regs *fregs,
> +				   void *data)
> +{
> +	if (!fp->entry_handler)
> +		return 0;
> +
> +	return fp->entry_handler(fp, ip, parent_ip, fregs, data);
> +}
>  
> +static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
> +					  struct fprobe *fp, struct ftrace_regs *fregs,
> +					  void *data)
> +{
> +	int ret;
>  	/*
>  	 * This user handler is shared with other kprobes and is not expected to be
>  	 * called recursively. So if any other kprobe handler is running, this will
> @@ -108,45 +203,185 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
>  	 */
>  	if (unlikely(kprobe_running())) {
>  		fp->nmissed++;
> -		goto recursion_unlock;
> +		return 0;
>  	}
>  
>  	kprobe_busy_begin();
> -	__fprobe_handler(ip, parent_ip, ops, fregs);
> +	ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
>  	kprobe_busy_end();
> -
> -recursion_unlock:
> -	ftrace_test_recursion_unlock(bit);
> +	return ret;
>  }
>  
> -static void fprobe_exit_handler(struct rethook_node *rh, void *data,
> -				unsigned long ret_ip, struct pt_regs *regs)
> +static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
> +			struct ftrace_regs *fregs)
>  {
> -	struct fprobe *fp = (struct fprobe *)data;
> -	struct fprobe_rethook_node *fpr;
> -	struct ftrace_regs *fregs = (struct ftrace_regs *)regs;
> -	int bit;
> +	struct fprobe_hlist_node *node, *first;
> +	unsigned long *fgraph_data = NULL;
> +	unsigned long func = trace->func;
> +	unsigned long header, ret_ip;
> +	int reserved_words;
> +	struct fprobe *fp;
> +	int used, ret;
>  
> -	if (!fp || fprobe_disabled(fp))
> -		return;
> +	if (WARN_ON_ONCE(!fregs))
> +		return 0;
>  
> -	fpr = container_of(rh, struct fprobe_rethook_node, node);
> +	first = node = find_first_fprobe_node(func);
> +	if (unlikely(!first))
> +		return 0;
> +
> +	reserved_words = 0;
> +	hlist_for_each_entry_from_rcu(node, hlist) {
> +		if (node->addr != func)
> +			break;
> +		fp = READ_ONCE(node->fp);
> +		if (!fp || !fp->exit_handler)
> +			continue;
> +		/*
> +		 * Since fprobe can be enabled until the next loop, we ignore the
> +		 * fprobe's disabled flag in this loop.
> +		 */
> +		reserved_words +=
> +			DIV_ROUND_UP(fp->entry_data_size, sizeof(long)) + 1;
> +	}
> +	node = first;
> +	if (reserved_words) {
> +		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
> +		if (unlikely(!fgraph_data)) {
> +			hlist_for_each_entry_from_rcu(node, hlist) {
> +				if (node->addr != func)
> +					break;
> +				fp = READ_ONCE(node->fp);
> +				if (fp && !fprobe_disabled(fp))
> +					fp->nmissed++;
> +			}
> +			return 0;
> +		}
> +	}
>  
>  	/*
> -	 * we need to assure no calls to traceable functions in-between the
> -	 * end of fprobe_handler and the beginning of fprobe_exit_handler.
> +	 * TODO: recursion detection has been done in the fgraph. Thus we need
> +	 * to add a callback to increment missed counter.
>  	 */
> -	bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
> -	if (bit < 0) {
> -		fp->nmissed++;
> +	ret_ip = ftrace_regs_get_return_address(fregs);
> +	used = 0;
> +	hlist_for_each_entry_from_rcu(node, hlist) {
> +		void *data;
> +
> +		if (node->addr != func)
> +			break;
> +		fp = READ_ONCE(node->fp);
> +		if (!fp || fprobe_disabled(fp))
> +			continue;
> +
> +		if (fp->entry_data_size && fp->exit_handler)
> +			data = fgraph_data + used + 1;
> +		else
> +			data = NULL;
> +
> +		if (fprobe_shared_with_kprobes(fp))
> +			ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
> +		else
> +			ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
> +		/* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
> +		if (!ret && fp->exit_handler) {
> +			int size_words = DIV_ROUND_UP(fp->entry_data_size, sizeof(long));
> +
> +			header = encode_fprobe_header(fp, size_words);
> +			if (likely(header)) {
> +				fgraph_data[used] = header;
> +				used += size_words + 1;
> +			}
> +		}
> +	}
> +	if (used < reserved_words)
> +		memset(fgraph_data + used, 0, reserved_words - used);
> +
> +	/* If any exit_handler is set, data must be used. */
> +	return used != 0;
> +}
> +NOKPROBE_SYMBOL(fprobe_entry);
> +
> +static void fprobe_return(struct ftrace_graph_ret *trace,
> +			  struct fgraph_ops *gops,
> +			  struct ftrace_regs *fregs)
> +{
> +	unsigned long *fgraph_data = NULL;
> +	unsigned long ret_ip;
> +	unsigned long val;
> +	struct fprobe *fp;
> +	int size, curr;
> +	int size_words;
> +
> +	fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
> +	if (WARN_ON_ONCE(!fgraph_data))
>  		return;
> +	size_words = DIV_ROUND_UP(size, sizeof(long));
> +	ret_ip = ftrace_regs_get_instruction_pointer(fregs);
> +
> +	preempt_disable();
> +
> +	curr = 0;
> +	while (size_words > curr) {
> +		val = fgraph_data[curr++];

hi,
the ++ in here -----------------------^^ screws the logic below,
I think we need the change below to properly access the stack data

I wrote bpf test on top of this that adds multiple fprobes attached
on single function and makes sure the fprobe gets its own data on
fentry/fexit callbacks and with the change below it seems to work
properly

jirka

> +		if (!val)
> +			break;
> +
> +		size = decode_fprobe_header(val, &fp);
> +		if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
> +			if (WARN_ON_ONCE(curr + size > size_words))
> +				break;
> +			fp->exit_handler(fp, trace->func, ret_ip, fregs,
> +					 size ? fgraph_data + curr : NULL);
> +		}
> +		curr += size + 1;
>  	}
> +	preempt_enable();
> +}


---
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 845ac5af4aeb..bfe255e12a13 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -323,16 +323,16 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
 
 	curr = 0;
 	while (size_words > curr) {
-		val = fgraph_data[curr++];
+		val = fgraph_data[curr];
 		if (!val)
 			break;
 
 		size = decode_fprobe_header(val, &fp);
 		if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
-			if (WARN_ON_ONCE(curr + size > size_words))
+			if (WARN_ON_ONCE(curr + size + 1 > size_words))
 				break;
 			fp->exit_handler(fp, trace->func, ret_ip, fregs,
-					 size ? fgraph_data + curr : NULL);
+					 size ? fgraph_data + curr + 1 : NULL);
 		}
 		curr += size + 1;
 	}
Masami Hiramatsu (Google) Feb. 27, 2024, 10:23 p.m. UTC | #2
On Mon, 26 Feb 2024 13:56:09 +0100
Jiri Olsa <olsajiri@gmail.com> wrote:

> On Mon, Feb 26, 2024 at 12:20:43AM +0900, Masami Hiramatsu (Google) wrote:
> > From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > 
> > Rewrite fprobe implementation on function-graph tracer.
> > Major API changes are:
> >  -  'nr_maxactive' field is deprecated.
> >  -  This depends on CONFIG_DYNAMIC_FTRACE_WITH_ARGS or
> >     !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS, and
> >     CONFIG_HAVE_FUNCTION_GRAPH_FREGS. So currently works only
> >     on x86_64.
> >  -  Currently the entry size is limited in 15 * sizeof(long).
> >  -  If there is too many fprobe exit handler set on the same
> >     function, it will fail to probe.
> > 
> > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > ---
> >  Changes in v8:
> >   - Use trace_func_graph_ret/ent_t for fgraph_ops.
> >   - Update CONFIG_FPROBE dependencies.
> >   - Add ftrace_regs_get_return_address() for each arch.
> >  Changes in v3:
> >   - Update for new reserve_data/retrieve_data API.
> >   - Fix internal push/pop on fgraph data logic so that it can
> >     correctly save/restore the returning fprobes.
> >  Changes in v2:
> >   - Add more lockdep_assert_held(fprobe_mutex)
> >   - Use READ_ONCE() and WRITE_ONCE() for fprobe_hlist_node::fp.
> >   - Add NOKPROBE_SYMBOL() for the functions which is called from
> >     entry/exit callback.
> > ---
> >  arch/arm64/include/asm/ftrace.h     |    6 
> >  arch/loongarch/include/asm/ftrace.h |    6 
> >  arch/powerpc/include/asm/ftrace.h   |    6 
> >  arch/s390/include/asm/ftrace.h      |    6 
> >  arch/x86/include/asm/ftrace.h       |    6 
> >  include/linux/fprobe.h              |   54 ++-
> >  include/linux/ftrace.h              |    7 
> >  kernel/trace/Kconfig                |    8 
> >  kernel/trace/fprobe.c               |  638 +++++++++++++++++++++++++----------
> >  lib/test_fprobe.c                   |   45 --
> >  10 files changed, 536 insertions(+), 246 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
> > index 95a8f349f871..800c75f46a13 100644
> > --- a/arch/arm64/include/asm/ftrace.h
> > +++ b/arch/arm64/include/asm/ftrace.h
> > @@ -143,6 +143,12 @@ ftrace_regs_get_frame_pointer(const struct ftrace_regs *fregs)
> >  	return fregs->fp;
> >  }
> >  
> > +static __always_inline unsigned long
> > +ftrace_regs_get_return_address(const struct ftrace_regs *fregs)
> > +{
> > +	return fregs->lr;
> > +}
> > +
> >  static __always_inline struct pt_regs *
> >  ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
> >  {
> > diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
> > index 14a1576bf948..b8432b7cc9d4 100644
> > --- a/arch/loongarch/include/asm/ftrace.h
> > +++ b/arch/loongarch/include/asm/ftrace.h
> > @@ -81,6 +81,12 @@ ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
> >  #define ftrace_regs_get_frame_pointer(fregs) \
> >  	((fregs)->regs.regs[22])
> >  
> > +static __always_inline unsigned long
> > +ftrace_regs_get_return_address(struct ftrace_regs *fregs)
> > +{
> > +	return *(unsigned long *)(fregs->regs.regs[1]);
> > +}
> > +
> >  #define ftrace_graph_func ftrace_graph_func
> >  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
> >  		       struct ftrace_ops *op, struct ftrace_regs *fregs);
> > diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
> > index 773e9011cff1..7ac1bc3e7196 100644
> > --- a/arch/powerpc/include/asm/ftrace.h
> > +++ b/arch/powerpc/include/asm/ftrace.h
> > @@ -85,6 +85,12 @@ ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
> >  #define ftrace_regs_query_register_offset(name) \
> >  	regs_query_register_offset(name)
> >  
> > +static __always_inline unsigned long
> > +ftrace_regs_get_return_address(struct ftrace_regs *fregs)
> > +{
> > +	return fregs->regs.link;
> > +}
> > +
> >  struct ftrace_ops;
> >  
> >  #define ftrace_graph_func ftrace_graph_func
> > diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
> > index e910924eee59..d434a0497683 100644
> > --- a/arch/s390/include/asm/ftrace.h
> > +++ b/arch/s390/include/asm/ftrace.h
> > @@ -89,6 +89,12 @@ ftrace_regs_get_frame_pointer(struct ftrace_regs *fregs)
> >  	return sp[0];	/* return backchain */
> >  }
> >  
> > +static __always_inline unsigned long
> > +ftrace_regs_get_return_address(const struct ftrace_regs *fregs)
> > +{
> > +	return fregs->regs.gprs[14];
> > +}
> > +
> >  #define arch_ftrace_fill_perf_regs(fregs, _regs)	 do {		\
> >  		(_regs)->psw.addr = (fregs)->regs.psw.addr;		\
> >  		(_regs)->gprs[15] = (fregs)->regs.gprs[15];		\
> > diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
> > index 7625887fc49b..979d3458a328 100644
> > --- a/arch/x86/include/asm/ftrace.h
> > +++ b/arch/x86/include/asm/ftrace.h
> > @@ -82,6 +82,12 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
> >  #define ftrace_regs_get_frame_pointer(fregs) \
> >  	frame_pointer(&(fregs)->regs)
> >  
> > +static __always_inline unsigned long
> > +ftrace_regs_get_return_address(struct ftrace_regs *fregs)
> > +{
> > +	return *(unsigned long *)ftrace_regs_get_stack_pointer(fregs);
> > +}
> > +
> >  struct ftrace_ops;
> >  #define ftrace_graph_func ftrace_graph_func
> >  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
> > diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
> > index 879a30956009..08b37b0d1d05 100644
> > --- a/include/linux/fprobe.h
> > +++ b/include/linux/fprobe.h
> > @@ -5,32 +5,56 @@
> >  
> >  #include <linux/compiler.h>
> >  #include <linux/ftrace.h>
> > -#include <linux/rethook.h>
> > +#include <linux/rcupdate.h>
> > +#include <linux/refcount.h>
> > +#include <linux/slab.h>
> > +
> > +struct fprobe;
> > +
> > +/**
> > + * strcut fprobe_hlist_node - address based hash list node for fprobe.
> > + *
> > + * @hlist: The hlist node for address search hash table.
> > + * @addr: The address represented by this.
> > + * @fp: The fprobe which owns this.
> > + */
> > +struct fprobe_hlist_node {
> > +	struct hlist_node	hlist;
> > +	unsigned long		addr;
> > +	struct fprobe		*fp;
> > +};
> > +
> > +/**
> > + * struct fprobe_hlist - hash list nodes for fprobe.
> > + *
> > + * @hlist: The hlist node for existence checking hash table.
> > + * @rcu: rcu_head for RCU deferred release.
> > + * @fp: The fprobe which owns this fprobe_hlist.
> > + * @size: The size of @array.
> > + * @array: The fprobe_hlist_node for each address to probe.
> > + */
> > +struct fprobe_hlist {
> > +	struct hlist_node		hlist;
> > +	struct rcu_head			rcu;
> > +	struct fprobe			*fp;
> > +	int				size;
> > +	struct fprobe_hlist_node	array[];
> > +};
> >  
> >  /**
> >   * struct fprobe - ftrace based probe.
> > - * @ops: The ftrace_ops.
> > + *
> >   * @nmissed: The counter for missing events.
> >   * @flags: The status flag.
> > - * @rethook: The rethook data structure. (internal data)
> >   * @entry_data_size: The private data storage size.
> > - * @nr_maxactive: The max number of active functions.
> > + * @nr_maxactive: The max number of active functions. (*deprecated)
> >   * @entry_handler: The callback function for function entry.
> >   * @exit_handler: The callback function for function exit.
> > + * @hlist_array: The fprobe_hlist for fprobe search from IP hash table.
> >   */
> >  struct fprobe {
> > -#ifdef CONFIG_FUNCTION_TRACER
> > -	/*
> > -	 * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too.
> > -	 * But user of fprobe may keep embedding the struct fprobe on their own
> > -	 * code. To avoid build error, this will keep the fprobe data structure
> > -	 * defined here, but remove ftrace_ops data structure.
> > -	 */
> > -	struct ftrace_ops	ops;
> > -#endif
> >  	unsigned long		nmissed;
> >  	unsigned int		flags;
> > -	struct rethook		*rethook;
> >  	size_t			entry_data_size;
> >  	int			nr_maxactive;
> >  
> > @@ -40,6 +64,8 @@ struct fprobe {
> >  	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
> >  			     unsigned long ret_ip, struct ftrace_regs *fregs,
> >  			     void *entry_data);
> > +
> > +	struct fprobe_hlist	*hlist_array;
> >  };
> >  
> >  /* This fprobe is soft-disabled. */
> > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> > index d895767dcb93..2bb819f08725 100644
> > --- a/include/linux/ftrace.h
> > +++ b/include/linux/ftrace.h
> > @@ -254,6 +254,13 @@ static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs)
> >  	regs_query_register_offset(name)
> >  #define ftrace_regs_get_frame_pointer(fregs) \
> >  	frame_pointer(&(fregs)->regs)
> > +
> > +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
> > +/* This function works correctly in ftrace entry function. */
> > +static __always_inline unsigned long
> > +ftrace_regs_get_return_address(struct ftrace_regs *fregs);
> > +#endif
> > +
> >  #endif
> >  
> >  #ifdef CONFIG_HAVE_REGS_AND_STACK_ACCESS_API
> > diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> > index 9056315d56ed..d2ee9e6f1561 100644
> > --- a/kernel/trace/Kconfig
> > +++ b/kernel/trace/Kconfig
> > @@ -298,11 +298,9 @@ config DYNAMIC_FTRACE_WITH_ARGS
> >  
> >  config FPROBE
> >  	bool "Kernel Function Probe (fprobe)"
> > -	depends on FUNCTION_TRACER
> > -	depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
> > -	depends on HAVE_PT_REGS_TO_FTRACE_REGS_CAST || !HAVE_DYNAMIC_FTRACE_WITH_ARGS
> > -	depends on HAVE_RETHOOK
> > -	select RETHOOK
> > +	depends on FUNCTION_GRAPH_TRACER
> > +	depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
> > +	depends on DYNAMIC_FTRACE_WITH_ARGS
> >  	default n
> >  	help
> >  	  This option enables kernel function probe (fprobe) based on ftrace.
> > diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
> > index 31210423efc3..845ac5af4aeb 100644
> > --- a/kernel/trace/fprobe.c
> > +++ b/kernel/trace/fprobe.c
> > @@ -8,98 +8,193 @@
> >  #include <linux/fprobe.h>
> >  #include <linux/kallsyms.h>
> >  #include <linux/kprobes.h>
> > -#include <linux/rethook.h>
> > +#include <linux/list.h>
> > +#include <linux/mutex.h>
> >  #include <linux/slab.h>
> >  #include <linux/sort.h>
> >  
> >  #include "trace.h"
> >  
> > -struct fprobe_rethook_node {
> > -	struct rethook_node node;
> > -	unsigned long entry_ip;
> > -	unsigned long entry_parent_ip;
> > -	char data[];
> > -};
> > +#define FPROBE_IP_HASH_BITS 8
> > +#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
> >  
> > -static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
> > -			struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > -{
> > -	struct fprobe_rethook_node *fpr;
> > -	struct rethook_node *rh = NULL;
> > -	struct fprobe *fp;
> > -	void *entry_data = NULL;
> > -	int ret = 0;
> > +#define FPROBE_HASH_BITS 6
> > +#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
> >  
> > -	fp = container_of(ops, struct fprobe, ops);
> > +/*
> > + * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
> > + *   exists. The key is the address of fprobe instance.
> > + * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
> > + *   instance related to the funciton address. The key is the ftrace IP
> > + *   address.
> > + *
> > + * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
> > + * are set NULL and delete those from both hash tables (by hlist_del_rcu).
> > + * After an RCU grace period, the fprobe_hlist itself will be released.
> > + *
> > + * fprobe_table and fprobe_ip_table can be accessed from either
> > + *  - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held.
> > + *  - RCU hlist traversal under disabling preempt
> > + */
> > +static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
> > +static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
> > +static DEFINE_MUTEX(fprobe_mutex);
> >  
> > -	if (fp->exit_handler) {
> > -		rh = rethook_try_get(fp->rethook);
> > -		if (!rh) {
> > -			fp->nmissed++;
> > -			return;
> > -		}
> > -		fpr = container_of(rh, struct fprobe_rethook_node, node);
> > -		fpr->entry_ip = ip;
> > -		fpr->entry_parent_ip = parent_ip;
> > -		if (fp->entry_data_size)
> > -			entry_data = fpr->data;
> > +/*
> > + * Find first fprobe in the hlist. It will be iterated twice in the entry
> > + * probe, once for correcting the total required size, the second time is
> > + * calling back the user handlers.
> > + * Thus the hlist in the fprobe_table must be sorted and new probe needs to
> > + * be added *before* the first fprobe.
> > + */
> > +static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
> > +{
> > +	struct fprobe_hlist_node *node;
> > +	struct hlist_head *head;
> > +
> > +	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> > +	hlist_for_each_entry_rcu(node, head, hlist,
> > +				 lockdep_is_held(&fprobe_mutex)) {
> > +		if (node->addr == ip)
> > +			return node;
> >  	}
> > +	return NULL;
> > +}
> > +NOKPROBE_SYMBOL(find_first_fprobe_node);
> >  
> > -	if (fp->entry_handler)
> > -		ret = fp->entry_handler(fp, ip, parent_ip, fregs, entry_data);
> > +/* Node insertion and deletion requires the fprobe_mutex */
> > +static void insert_fprobe_node(struct fprobe_hlist_node *node)
> > +{
> > +	unsigned long ip = node->addr;
> > +	struct fprobe_hlist_node *next;
> > +	struct hlist_head *head;
> >  
> > -	/* If entry_handler returns !0, nmissed is not counted. */
> > -	if (rh) {
> > -		if (ret)
> > -			rethook_recycle(rh);
> > -		else
> > -			rethook_hook(rh, ftrace_get_regs(fregs), true);
> > +	lockdep_assert_held(&fprobe_mutex);
> > +
> > +	next = find_first_fprobe_node(ip);
> > +	if (next) {
> > +		hlist_add_before_rcu(&node->hlist, &next->hlist);
> > +		return;
> >  	}
> > +	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> > +	hlist_add_head_rcu(&node->hlist, head);
> >  }
> >  
> > -static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
> > -		struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > +/* Return true if there are synonims */
> > +static bool delete_fprobe_node(struct fprobe_hlist_node *node)
> >  {
> > -	struct fprobe *fp;
> > -	int bit;
> > +	lockdep_assert_held(&fprobe_mutex);
> >  
> > -	fp = container_of(ops, struct fprobe, ops);
> > -	if (fprobe_disabled(fp))
> > -		return;
> > +	WRITE_ONCE(node->fp, NULL);
> > +	hlist_del_rcu(&node->hlist);
> > +	return !!find_first_fprobe_node(node->addr);
> > +}
> >  
> > -	/* recursion detection has to go before any traceable function and
> > -	 * all functions before this point should be marked as notrace
> > -	 */
> > -	bit = ftrace_test_recursion_trylock(ip, parent_ip);
> > -	if (bit < 0) {
> > -		fp->nmissed++;
> > -		return;
> > +/* Check existence of the fprobe */
> > +static bool is_fprobe_still_exist(struct fprobe *fp)
> > +{
> > +	struct hlist_head *head;
> > +	struct fprobe_hlist *fph;
> > +
> > +	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
> > +	hlist_for_each_entry_rcu(fph, head, hlist,
> > +				 lockdep_is_held(&fprobe_mutex)) {
> > +		if (fph->fp == fp)
> > +			return true;
> >  	}
> > -	__fprobe_handler(ip, parent_ip, ops, fregs);
> > -	ftrace_test_recursion_unlock(bit);
> > +	return false;
> > +}
> > +NOKPROBE_SYMBOL(is_fprobe_still_exist);
> > +
> > +static int add_fprobe_hash(struct fprobe *fp)
> > +{
> > +	struct fprobe_hlist *fph = fp->hlist_array;
> > +	struct hlist_head *head;
> > +
> > +	lockdep_assert_held(&fprobe_mutex);
> >  
> > +	if (WARN_ON_ONCE(!fph))
> > +		return -EINVAL;
> > +
> > +	if (is_fprobe_still_exist(fp))
> > +		return -EEXIST;
> > +
> > +	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
> > +	hlist_add_head_rcu(&fp->hlist_array->hlist, head);
> > +	return 0;
> >  }
> > -NOKPROBE_SYMBOL(fprobe_handler);
> >  
> > -static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
> > -				  struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > +static int del_fprobe_hash(struct fprobe *fp)
> >  {
> > -	struct fprobe *fp;
> > -	int bit;
> > +	struct fprobe_hlist *fph = fp->hlist_array;
> >  
> > -	fp = container_of(ops, struct fprobe, ops);
> > -	if (fprobe_disabled(fp))
> > -		return;
> > +	lockdep_assert_held(&fprobe_mutex);
> >  
> > -	/* recursion detection has to go before any traceable function and
> > -	 * all functions called before this point should be marked as notrace
> > -	 */
> > -	bit = ftrace_test_recursion_trylock(ip, parent_ip);
> > -	if (bit < 0) {
> > -		fp->nmissed++;
> > -		return;
> > +	if (WARN_ON_ONCE(!fph))
> > +		return -EINVAL;
> > +
> > +	if (!is_fprobe_still_exist(fp))
> > +		return -ENOENT;
> > +
> > +	fph->fp = NULL;
> > +	hlist_del_rcu(&fph->hlist);
> > +	return 0;
> > +}
> > +
> > +/* The entry data size is 4 bits (=16) * sizeof(long) in maximum */
> > +#define FPROBE_HEADER_SIZE_BITS		4
> > +#define MAX_FPROBE_DATA_SIZE_WORD	((1L << FPROBE_HEADER_SIZE_BITS) - 1)
> > +#define MAX_FPROBE_DATA_SIZE		(MAX_FPROBE_DATA_SIZE_WORD * sizeof(long))
> > +#define FPROBE_HEADER_PTR_BITS		(BITS_PER_LONG - FPROBE_HEADER_SIZE_BITS)
> > +#define FPROBE_HEADER_PTR_MASK		GENMASK(FPROBE_HEADER_PTR_BITS - 1, 0)
> > +#define FPROBE_HEADER_SIZE		sizeof(unsigned long)
> > +
> > +static inline unsigned long encode_fprobe_header(struct fprobe *fp, int size_words)
> > +{
> > +	if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
> > +	    ((unsigned long)fp & ~FPROBE_HEADER_PTR_MASK) !=
> > +	    ~FPROBE_HEADER_PTR_MASK)) {
> > +		return 0;
> >  	}
> > +	return ((unsigned long)size_words << FPROBE_HEADER_PTR_BITS) |
> > +		((unsigned long)fp & FPROBE_HEADER_PTR_MASK);
> > +}
> > +
> > +/* Return reserved data size in words */
> > +static inline int decode_fprobe_header(unsigned long val, struct fprobe **fp)
> > +{
> > +	unsigned long ptr;
> > +
> > +	ptr = (val & FPROBE_HEADER_PTR_MASK) | ~FPROBE_HEADER_PTR_MASK;
> > +	if (fp)
> > +		*fp = (struct fprobe *)ptr;
> > +	return val >> FPROBE_HEADER_PTR_BITS;
> > +}
> > +
> > +/*
> > + * fprobe shadow stack management:
> > + * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
> > + * among the probes on the same function exit. Note that a new probe can be
> > + * registered before a target function is returning, we can not use the hash
> > + * table to find the corresponding probes. Thus the probe address is stored on
> > + * the shadow stack with its entry data size.
> > + *
> > + */
> > +static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
> > +				   struct fprobe *fp, struct ftrace_regs *fregs,
> > +				   void *data)
> > +{
> > +	if (!fp->entry_handler)
> > +		return 0;
> > +
> > +	return fp->entry_handler(fp, ip, parent_ip, fregs, data);
> > +}
> >  
> > +static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
> > +					  struct fprobe *fp, struct ftrace_regs *fregs,
> > +					  void *data)
> > +{
> > +	int ret;
> >  	/*
> >  	 * This user handler is shared with other kprobes and is not expected to be
> >  	 * called recursively. So if any other kprobe handler is running, this will
> > @@ -108,45 +203,185 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
> >  	 */
> >  	if (unlikely(kprobe_running())) {
> >  		fp->nmissed++;
> > -		goto recursion_unlock;
> > +		return 0;
> >  	}
> >  
> >  	kprobe_busy_begin();
> > -	__fprobe_handler(ip, parent_ip, ops, fregs);
> > +	ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
> >  	kprobe_busy_end();
> > -
> > -recursion_unlock:
> > -	ftrace_test_recursion_unlock(bit);
> > +	return ret;
> >  }
> >  
> > -static void fprobe_exit_handler(struct rethook_node *rh, void *data,
> > -				unsigned long ret_ip, struct pt_regs *regs)
> > +static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
> > +			struct ftrace_regs *fregs)
> >  {
> > -	struct fprobe *fp = (struct fprobe *)data;
> > -	struct fprobe_rethook_node *fpr;
> > -	struct ftrace_regs *fregs = (struct ftrace_regs *)regs;
> > -	int bit;
> > +	struct fprobe_hlist_node *node, *first;
> > +	unsigned long *fgraph_data = NULL;
> > +	unsigned long func = trace->func;
> > +	unsigned long header, ret_ip;
> > +	int reserved_words;
> > +	struct fprobe *fp;
> > +	int used, ret;
> >  
> > -	if (!fp || fprobe_disabled(fp))
> > -		return;
> > +	if (WARN_ON_ONCE(!fregs))
> > +		return 0;
> >  
> > -	fpr = container_of(rh, struct fprobe_rethook_node, node);
> > +	first = node = find_first_fprobe_node(func);
> > +	if (unlikely(!first))
> > +		return 0;
> > +
> > +	reserved_words = 0;
> > +	hlist_for_each_entry_from_rcu(node, hlist) {
> > +		if (node->addr != func)
> > +			break;
> > +		fp = READ_ONCE(node->fp);
> > +		if (!fp || !fp->exit_handler)
> > +			continue;
> > +		/*
> > +		 * Since fprobe can be enabled until the next loop, we ignore the
> > +		 * fprobe's disabled flag in this loop.
> > +		 */
> > +		reserved_words +=
> > +			DIV_ROUND_UP(fp->entry_data_size, sizeof(long)) + 1;
> > +	}
> > +	node = first;
> > +	if (reserved_words) {
> > +		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
> > +		if (unlikely(!fgraph_data)) {
> > +			hlist_for_each_entry_from_rcu(node, hlist) {
> > +				if (node->addr != func)
> > +					break;
> > +				fp = READ_ONCE(node->fp);
> > +				if (fp && !fprobe_disabled(fp))
> > +					fp->nmissed++;
> > +			}
> > +			return 0;
> > +		}
> > +	}
> >  
> >  	/*
> > -	 * we need to assure no calls to traceable functions in-between the
> > -	 * end of fprobe_handler and the beginning of fprobe_exit_handler.
> > +	 * TODO: recursion detection has been done in the fgraph. Thus we need
> > +	 * to add a callback to increment missed counter.
> >  	 */
> > -	bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
> > -	if (bit < 0) {
> > -		fp->nmissed++;
> > +	ret_ip = ftrace_regs_get_return_address(fregs);
> > +	used = 0;
> > +	hlist_for_each_entry_from_rcu(node, hlist) {
> > +		void *data;
> > +
> > +		if (node->addr != func)
> > +			break;
> > +		fp = READ_ONCE(node->fp);
> > +		if (!fp || fprobe_disabled(fp))
> > +			continue;
> > +
> > +		if (fp->entry_data_size && fp->exit_handler)
> > +			data = fgraph_data + used + 1;
> > +		else
> > +			data = NULL;
> > +
> > +		if (fprobe_shared_with_kprobes(fp))
> > +			ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
> > +		else
> > +			ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
> > +		/* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
> > +		if (!ret && fp->exit_handler) {
> > +			int size_words = DIV_ROUND_UP(fp->entry_data_size, sizeof(long));
> > +
> > +			header = encode_fprobe_header(fp, size_words);
> > +			if (likely(header)) {
> > +				fgraph_data[used] = header;
> > +				used += size_words + 1;
> > +			}
> > +		}
> > +	}
> > +	if (used < reserved_words)
> > +		memset(fgraph_data + used, 0, reserved_words - used);
> > +
> > +	/* If any exit_handler is set, data must be used. */
> > +	return used != 0;
> > +}
> > +NOKPROBE_SYMBOL(fprobe_entry);
> > +
> > +static void fprobe_return(struct ftrace_graph_ret *trace,
> > +			  struct fgraph_ops *gops,
> > +			  struct ftrace_regs *fregs)
> > +{
> > +	unsigned long *fgraph_data = NULL;
> > +	unsigned long ret_ip;
> > +	unsigned long val;
> > +	struct fprobe *fp;
> > +	int size, curr;
> > +	int size_words;
> > +
> > +	fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
> > +	if (WARN_ON_ONCE(!fgraph_data))
> >  		return;
> > +	size_words = DIV_ROUND_UP(size, sizeof(long));
> > +	ret_ip = ftrace_regs_get_instruction_pointer(fregs);
> > +
> > +	preempt_disable();
> > +
> > +	curr = 0;
> > +	while (size_words > curr) {
> > +		val = fgraph_data[curr++];
> 
> hi,
> the ++ in here -----------------------^^ screws the logic below,
> I think we need the change below to properly access the stack data
> 
> I wrote bpf test on top of this that adds multiple fprobes attached
> on single function and makes sure the fprobe gets its own data on
> fentry/fexit callbacks and with the change below it seems to work
> properly

Thanks for finding the bug!

> 
> jirka
> 
> > +		if (!val)
> > +			break;
> > +
> > +		size = decode_fprobe_header(val, &fp);
> > +		if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
> > +			if (WARN_ON_ONCE(curr + size > size_words))
> > +				break;
> > +			fp->exit_handler(fp, trace->func, ret_ip, fregs,
> > +					 size ? fgraph_data + curr : NULL);
> > +		}
> > +		curr += size + 1;

But I think the problem is here. We've already decoded header, so "cur += size" is enough.

BTW, I also found that I used DIV_ROUND_UP(..., sizeof(long)) on hotpath.
It should be changed.

Thank you,

> >  	}
> > +	preempt_enable();
> > +}
> 
> 
> ---
> diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
> index 845ac5af4aeb..bfe255e12a13 100644
> --- a/kernel/trace/fprobe.c
> +++ b/kernel/trace/fprobe.c
> @@ -323,16 +323,16 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
>  
>  	curr = 0;
>  	while (size_words > curr) {
> -		val = fgraph_data[curr++];
> +		val = fgraph_data[curr];
>  		if (!val)
>  			break;
>  
>  		size = decode_fprobe_header(val, &fp);
>  		if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
> -			if (WARN_ON_ONCE(curr + size > size_words))
> +			if (WARN_ON_ONCE(curr + size + 1 > size_words))
>  				break;
>  			fp->exit_handler(fp, trace->func, ret_ip, fregs,
> -					 size ? fgraph_data + curr : NULL);
> +					 size ? fgraph_data + curr + 1 : NULL);
>  		}
>  		curr += size + 1;
>  	}
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index 95a8f349f871..800c75f46a13 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -143,6 +143,12 @@  ftrace_regs_get_frame_pointer(const struct ftrace_regs *fregs)
 	return fregs->fp;
 }
 
+static __always_inline unsigned long
+ftrace_regs_get_return_address(const struct ftrace_regs *fregs)
+{
+	return fregs->lr;
+}
+
 static __always_inline struct pt_regs *
 ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
 {
diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
index 14a1576bf948..b8432b7cc9d4 100644
--- a/arch/loongarch/include/asm/ftrace.h
+++ b/arch/loongarch/include/asm/ftrace.h
@@ -81,6 +81,12 @@  ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
 #define ftrace_regs_get_frame_pointer(fregs) \
 	((fregs)->regs.regs[22])
 
+static __always_inline unsigned long
+ftrace_regs_get_return_address(struct ftrace_regs *fregs)
+{
+	return *(unsigned long *)(fregs->regs.regs[1]);
+}
+
 #define ftrace_graph_func ftrace_graph_func
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 773e9011cff1..7ac1bc3e7196 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -85,6 +85,12 @@  ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
 #define ftrace_regs_query_register_offset(name) \
 	regs_query_register_offset(name)
 
+static __always_inline unsigned long
+ftrace_regs_get_return_address(struct ftrace_regs *fregs)
+{
+	return fregs->regs.link;
+}
+
 struct ftrace_ops;
 
 #define ftrace_graph_func ftrace_graph_func
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index e910924eee59..d434a0497683 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -89,6 +89,12 @@  ftrace_regs_get_frame_pointer(struct ftrace_regs *fregs)
 	return sp[0];	/* return backchain */
 }
 
+static __always_inline unsigned long
+ftrace_regs_get_return_address(const struct ftrace_regs *fregs)
+{
+	return fregs->regs.gprs[14];
+}
+
 #define arch_ftrace_fill_perf_regs(fregs, _regs)	 do {		\
 		(_regs)->psw.addr = (fregs)->regs.psw.addr;		\
 		(_regs)->gprs[15] = (fregs)->regs.gprs[15];		\
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 7625887fc49b..979d3458a328 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -82,6 +82,12 @@  arch_ftrace_get_regs(struct ftrace_regs *fregs)
 #define ftrace_regs_get_frame_pointer(fregs) \
 	frame_pointer(&(fregs)->regs)
 
+static __always_inline unsigned long
+ftrace_regs_get_return_address(struct ftrace_regs *fregs)
+{
+	return *(unsigned long *)ftrace_regs_get_stack_pointer(fregs);
+}
+
 struct ftrace_ops;
 #define ftrace_graph_func ftrace_graph_func
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 879a30956009..08b37b0d1d05 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -5,32 +5,56 @@ 
 
 #include <linux/compiler.h>
 #include <linux/ftrace.h>
-#include <linux/rethook.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
+
+struct fprobe;
+
+/**
+ * strcut fprobe_hlist_node - address based hash list node for fprobe.
+ *
+ * @hlist: The hlist node for address search hash table.
+ * @addr: The address represented by this.
+ * @fp: The fprobe which owns this.
+ */
+struct fprobe_hlist_node {
+	struct hlist_node	hlist;
+	unsigned long		addr;
+	struct fprobe		*fp;
+};
+
+/**
+ * struct fprobe_hlist - hash list nodes for fprobe.
+ *
+ * @hlist: The hlist node for existence checking hash table.
+ * @rcu: rcu_head for RCU deferred release.
+ * @fp: The fprobe which owns this fprobe_hlist.
+ * @size: The size of @array.
+ * @array: The fprobe_hlist_node for each address to probe.
+ */
+struct fprobe_hlist {
+	struct hlist_node		hlist;
+	struct rcu_head			rcu;
+	struct fprobe			*fp;
+	int				size;
+	struct fprobe_hlist_node	array[];
+};
 
 /**
  * struct fprobe - ftrace based probe.
- * @ops: The ftrace_ops.
+ *
  * @nmissed: The counter for missing events.
  * @flags: The status flag.
- * @rethook: The rethook data structure. (internal data)
  * @entry_data_size: The private data storage size.
- * @nr_maxactive: The max number of active functions.
+ * @nr_maxactive: The max number of active functions. (*deprecated)
  * @entry_handler: The callback function for function entry.
  * @exit_handler: The callback function for function exit.
+ * @hlist_array: The fprobe_hlist for fprobe search from IP hash table.
  */
 struct fprobe {
-#ifdef CONFIG_FUNCTION_TRACER
-	/*
-	 * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too.
-	 * But user of fprobe may keep embedding the struct fprobe on their own
-	 * code. To avoid build error, this will keep the fprobe data structure
-	 * defined here, but remove ftrace_ops data structure.
-	 */
-	struct ftrace_ops	ops;
-#endif
 	unsigned long		nmissed;
 	unsigned int		flags;
-	struct rethook		*rethook;
 	size_t			entry_data_size;
 	int			nr_maxactive;
 
@@ -40,6 +64,8 @@  struct fprobe {
 	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
 			     unsigned long ret_ip, struct ftrace_regs *fregs,
 			     void *entry_data);
+
+	struct fprobe_hlist	*hlist_array;
 };
 
 /* This fprobe is soft-disabled. */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index d895767dcb93..2bb819f08725 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -254,6 +254,13 @@  static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs)
 	regs_query_register_offset(name)
 #define ftrace_regs_get_frame_pointer(fregs) \
 	frame_pointer(&(fregs)->regs)
+
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
+/* This function works correctly in ftrace entry function. */
+static __always_inline unsigned long
+ftrace_regs_get_return_address(struct ftrace_regs *fregs);
+#endif
+
 #endif
 
 #ifdef CONFIG_HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9056315d56ed..d2ee9e6f1561 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -298,11 +298,9 @@  config DYNAMIC_FTRACE_WITH_ARGS
 
 config FPROBE
 	bool "Kernel Function Probe (fprobe)"
-	depends on FUNCTION_TRACER
-	depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
-	depends on HAVE_PT_REGS_TO_FTRACE_REGS_CAST || !HAVE_DYNAMIC_FTRACE_WITH_ARGS
-	depends on HAVE_RETHOOK
-	select RETHOOK
+	depends on FUNCTION_GRAPH_TRACER
+	depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
+	depends on DYNAMIC_FTRACE_WITH_ARGS
 	default n
 	help
 	  This option enables kernel function probe (fprobe) based on ftrace.
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 31210423efc3..845ac5af4aeb 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -8,98 +8,193 @@ 
 #include <linux/fprobe.h>
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
-#include <linux/rethook.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
 
 #include "trace.h"
 
-struct fprobe_rethook_node {
-	struct rethook_node node;
-	unsigned long entry_ip;
-	unsigned long entry_parent_ip;
-	char data[];
-};
+#define FPROBE_IP_HASH_BITS 8
+#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
 
-static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
-			struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
-	struct fprobe_rethook_node *fpr;
-	struct rethook_node *rh = NULL;
-	struct fprobe *fp;
-	void *entry_data = NULL;
-	int ret = 0;
+#define FPROBE_HASH_BITS 6
+#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
 
-	fp = container_of(ops, struct fprobe, ops);
+/*
+ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
+ *   exists. The key is the address of fprobe instance.
+ * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
+ *   instance related to the funciton address. The key is the ftrace IP
+ *   address.
+ *
+ * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
+ * are set NULL and delete those from both hash tables (by hlist_del_rcu).
+ * After an RCU grace period, the fprobe_hlist itself will be released.
+ *
+ * fprobe_table and fprobe_ip_table can be accessed from either
+ *  - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held.
+ *  - RCU hlist traversal under disabling preempt
+ */
+static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
+static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static DEFINE_MUTEX(fprobe_mutex);
 
-	if (fp->exit_handler) {
-		rh = rethook_try_get(fp->rethook);
-		if (!rh) {
-			fp->nmissed++;
-			return;
-		}
-		fpr = container_of(rh, struct fprobe_rethook_node, node);
-		fpr->entry_ip = ip;
-		fpr->entry_parent_ip = parent_ip;
-		if (fp->entry_data_size)
-			entry_data = fpr->data;
+/*
+ * Find first fprobe in the hlist. It will be iterated twice in the entry
+ * probe, once for correcting the total required size, the second time is
+ * calling back the user handlers.
+ * Thus the hlist in the fprobe_table must be sorted and new probe needs to
+ * be added *before* the first fprobe.
+ */
+static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+{
+	struct fprobe_hlist_node *node;
+	struct hlist_head *head;
+
+	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+	hlist_for_each_entry_rcu(node, head, hlist,
+				 lockdep_is_held(&fprobe_mutex)) {
+		if (node->addr == ip)
+			return node;
 	}
+	return NULL;
+}
+NOKPROBE_SYMBOL(find_first_fprobe_node);
 
-	if (fp->entry_handler)
-		ret = fp->entry_handler(fp, ip, parent_ip, fregs, entry_data);
+/* Node insertion and deletion requires the fprobe_mutex */
+static void insert_fprobe_node(struct fprobe_hlist_node *node)
+{
+	unsigned long ip = node->addr;
+	struct fprobe_hlist_node *next;
+	struct hlist_head *head;
 
-	/* If entry_handler returns !0, nmissed is not counted. */
-	if (rh) {
-		if (ret)
-			rethook_recycle(rh);
-		else
-			rethook_hook(rh, ftrace_get_regs(fregs), true);
+	lockdep_assert_held(&fprobe_mutex);
+
+	next = find_first_fprobe_node(ip);
+	if (next) {
+		hlist_add_before_rcu(&node->hlist, &next->hlist);
+		return;
 	}
+	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+	hlist_add_head_rcu(&node->hlist, head);
 }
 
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
-		struct ftrace_ops *ops, struct ftrace_regs *fregs)
+/* Return true if there are synonims */
+static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 {
-	struct fprobe *fp;
-	int bit;
+	lockdep_assert_held(&fprobe_mutex);
 
-	fp = container_of(ops, struct fprobe, ops);
-	if (fprobe_disabled(fp))
-		return;
+	WRITE_ONCE(node->fp, NULL);
+	hlist_del_rcu(&node->hlist);
+	return !!find_first_fprobe_node(node->addr);
+}
 
-	/* recursion detection has to go before any traceable function and
-	 * all functions before this point should be marked as notrace
-	 */
-	bit = ftrace_test_recursion_trylock(ip, parent_ip);
-	if (bit < 0) {
-		fp->nmissed++;
-		return;
+/* Check existence of the fprobe */
+static bool is_fprobe_still_exist(struct fprobe *fp)
+{
+	struct hlist_head *head;
+	struct fprobe_hlist *fph;
+
+	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+	hlist_for_each_entry_rcu(fph, head, hlist,
+				 lockdep_is_held(&fprobe_mutex)) {
+		if (fph->fp == fp)
+			return true;
 	}
-	__fprobe_handler(ip, parent_ip, ops, fregs);
-	ftrace_test_recursion_unlock(bit);
+	return false;
+}
+NOKPROBE_SYMBOL(is_fprobe_still_exist);
+
+static int add_fprobe_hash(struct fprobe *fp)
+{
+	struct fprobe_hlist *fph = fp->hlist_array;
+	struct hlist_head *head;
+
+	lockdep_assert_held(&fprobe_mutex);
 
+	if (WARN_ON_ONCE(!fph))
+		return -EINVAL;
+
+	if (is_fprobe_still_exist(fp))
+		return -EEXIST;
+
+	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+	hlist_add_head_rcu(&fp->hlist_array->hlist, head);
+	return 0;
 }
-NOKPROBE_SYMBOL(fprobe_handler);
 
-static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
-				  struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static int del_fprobe_hash(struct fprobe *fp)
 {
-	struct fprobe *fp;
-	int bit;
+	struct fprobe_hlist *fph = fp->hlist_array;
 
-	fp = container_of(ops, struct fprobe, ops);
-	if (fprobe_disabled(fp))
-		return;
+	lockdep_assert_held(&fprobe_mutex);
 
-	/* recursion detection has to go before any traceable function and
-	 * all functions called before this point should be marked as notrace
-	 */
-	bit = ftrace_test_recursion_trylock(ip, parent_ip);
-	if (bit < 0) {
-		fp->nmissed++;
-		return;
+	if (WARN_ON_ONCE(!fph))
+		return -EINVAL;
+
+	if (!is_fprobe_still_exist(fp))
+		return -ENOENT;
+
+	fph->fp = NULL;
+	hlist_del_rcu(&fph->hlist);
+	return 0;
+}
+
+/* The entry data size is 4 bits (=16) * sizeof(long) in maximum */
+#define FPROBE_HEADER_SIZE_BITS		4
+#define MAX_FPROBE_DATA_SIZE_WORD	((1L << FPROBE_HEADER_SIZE_BITS) - 1)
+#define MAX_FPROBE_DATA_SIZE		(MAX_FPROBE_DATA_SIZE_WORD * sizeof(long))
+#define FPROBE_HEADER_PTR_BITS		(BITS_PER_LONG - FPROBE_HEADER_SIZE_BITS)
+#define FPROBE_HEADER_PTR_MASK		GENMASK(FPROBE_HEADER_PTR_BITS - 1, 0)
+#define FPROBE_HEADER_SIZE		sizeof(unsigned long)
+
+static inline unsigned long encode_fprobe_header(struct fprobe *fp, int size_words)
+{
+	if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
+	    ((unsigned long)fp & ~FPROBE_HEADER_PTR_MASK) !=
+	    ~FPROBE_HEADER_PTR_MASK)) {
+		return 0;
 	}
+	return ((unsigned long)size_words << FPROBE_HEADER_PTR_BITS) |
+		((unsigned long)fp & FPROBE_HEADER_PTR_MASK);
+}
+
+/* Return reserved data size in words */
+static inline int decode_fprobe_header(unsigned long val, struct fprobe **fp)
+{
+	unsigned long ptr;
+
+	ptr = (val & FPROBE_HEADER_PTR_MASK) | ~FPROBE_HEADER_PTR_MASK;
+	if (fp)
+		*fp = (struct fprobe *)ptr;
+	return val >> FPROBE_HEADER_PTR_BITS;
+}
+
+/*
+ * fprobe shadow stack management:
+ * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
+ * among the probes on the same function exit. Note that a new probe can be
+ * registered before a target function is returning, we can not use the hash
+ * table to find the corresponding probes. Thus the probe address is stored on
+ * the shadow stack with its entry data size.
+ *
+ */
+static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+				   struct fprobe *fp, struct ftrace_regs *fregs,
+				   void *data)
+{
+	if (!fp->entry_handler)
+		return 0;
+
+	return fp->entry_handler(fp, ip, parent_ip, fregs, data);
+}
 
+static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+					  struct fprobe *fp, struct ftrace_regs *fregs,
+					  void *data)
+{
+	int ret;
 	/*
 	 * This user handler is shared with other kprobes and is not expected to be
 	 * called recursively. So if any other kprobe handler is running, this will
@@ -108,45 +203,185 @@  static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
 	 */
 	if (unlikely(kprobe_running())) {
 		fp->nmissed++;
-		goto recursion_unlock;
+		return 0;
 	}
 
 	kprobe_busy_begin();
-	__fprobe_handler(ip, parent_ip, ops, fregs);
+	ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
 	kprobe_busy_end();
-
-recursion_unlock:
-	ftrace_test_recursion_unlock(bit);
+	return ret;
 }
 
-static void fprobe_exit_handler(struct rethook_node *rh, void *data,
-				unsigned long ret_ip, struct pt_regs *regs)
+static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+			struct ftrace_regs *fregs)
 {
-	struct fprobe *fp = (struct fprobe *)data;
-	struct fprobe_rethook_node *fpr;
-	struct ftrace_regs *fregs = (struct ftrace_regs *)regs;
-	int bit;
+	struct fprobe_hlist_node *node, *first;
+	unsigned long *fgraph_data = NULL;
+	unsigned long func = trace->func;
+	unsigned long header, ret_ip;
+	int reserved_words;
+	struct fprobe *fp;
+	int used, ret;
 
-	if (!fp || fprobe_disabled(fp))
-		return;
+	if (WARN_ON_ONCE(!fregs))
+		return 0;
 
-	fpr = container_of(rh, struct fprobe_rethook_node, node);
+	first = node = find_first_fprobe_node(func);
+	if (unlikely(!first))
+		return 0;
+
+	reserved_words = 0;
+	hlist_for_each_entry_from_rcu(node, hlist) {
+		if (node->addr != func)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (!fp || !fp->exit_handler)
+			continue;
+		/*
+		 * Since fprobe can be enabled until the next loop, we ignore the
+		 * fprobe's disabled flag in this loop.
+		 */
+		reserved_words +=
+			DIV_ROUND_UP(fp->entry_data_size, sizeof(long)) + 1;
+	}
+	node = first;
+	if (reserved_words) {
+		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
+		if (unlikely(!fgraph_data)) {
+			hlist_for_each_entry_from_rcu(node, hlist) {
+				if (node->addr != func)
+					break;
+				fp = READ_ONCE(node->fp);
+				if (fp && !fprobe_disabled(fp))
+					fp->nmissed++;
+			}
+			return 0;
+		}
+	}
 
 	/*
-	 * we need to assure no calls to traceable functions in-between the
-	 * end of fprobe_handler and the beginning of fprobe_exit_handler.
+	 * TODO: recursion detection has been done in the fgraph. Thus we need
+	 * to add a callback to increment missed counter.
 	 */
-	bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
-	if (bit < 0) {
-		fp->nmissed++;
+	ret_ip = ftrace_regs_get_return_address(fregs);
+	used = 0;
+	hlist_for_each_entry_from_rcu(node, hlist) {
+		void *data;
+
+		if (node->addr != func)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (!fp || fprobe_disabled(fp))
+			continue;
+
+		if (fp->entry_data_size && fp->exit_handler)
+			data = fgraph_data + used + 1;
+		else
+			data = NULL;
+
+		if (fprobe_shared_with_kprobes(fp))
+			ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
+		else
+			ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
+		/* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
+		if (!ret && fp->exit_handler) {
+			int size_words = DIV_ROUND_UP(fp->entry_data_size, sizeof(long));
+
+			header = encode_fprobe_header(fp, size_words);
+			if (likely(header)) {
+				fgraph_data[used] = header;
+				used += size_words + 1;
+			}
+		}
+	}
+	if (used < reserved_words)
+		memset(fgraph_data + used, 0, reserved_words - used);
+
+	/* If any exit_handler is set, data must be used. */
+	return used != 0;
+}
+NOKPROBE_SYMBOL(fprobe_entry);
+
+static void fprobe_return(struct ftrace_graph_ret *trace,
+			  struct fgraph_ops *gops,
+			  struct ftrace_regs *fregs)
+{
+	unsigned long *fgraph_data = NULL;
+	unsigned long ret_ip;
+	unsigned long val;
+	struct fprobe *fp;
+	int size, curr;
+	int size_words;
+
+	fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
+	if (WARN_ON_ONCE(!fgraph_data))
 		return;
+	size_words = DIV_ROUND_UP(size, sizeof(long));
+	ret_ip = ftrace_regs_get_instruction_pointer(fregs);
+
+	preempt_disable();
+
+	curr = 0;
+	while (size_words > curr) {
+		val = fgraph_data[curr++];
+		if (!val)
+			break;
+
+		size = decode_fprobe_header(val, &fp);
+		if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+			if (WARN_ON_ONCE(curr + size > size_words))
+				break;
+			fp->exit_handler(fp, trace->func, ret_ip, fregs,
+					 size ? fgraph_data + curr : NULL);
+		}
+		curr += size + 1;
 	}
+	preempt_enable();
+}
+NOKPROBE_SYMBOL(fprobe_return);
 
-	fp->exit_handler(fp, fpr->entry_ip, ret_ip, fregs,
-			 fp->entry_data_size ? (void *)fpr->data : NULL);
-	ftrace_test_recursion_unlock(bit);
+static struct fgraph_ops fprobe_graph_ops = {
+	.entryfunc	= fprobe_entry,
+	.retfunc	= fprobe_return,
+};
+static int fprobe_graph_active;
+
+/* Add @addrs to the ftrace filter and register fgraph if needed. */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+	int ret;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+	if (ret)
+		return ret;
+
+	if (!fprobe_graph_active) {
+		ret = register_ftrace_graph(&fprobe_graph_ops);
+		if (WARN_ON_ONCE(ret)) {
+			ftrace_free_filter(&fprobe_graph_ops.ops);
+			return ret;
+		}
+	}
+	fprobe_graph_active++;
+	return 0;
+}
+
+/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+	lockdep_assert_held(&fprobe_mutex);
+
+	fprobe_graph_active--;
+	if (!fprobe_graph_active) {
+		/* Q: should we unregister it ? */
+		unregister_ftrace_graph(&fprobe_graph_ops);
+		return;
+	}
+
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
 }
-NOKPROBE_SYMBOL(fprobe_exit_handler);
 
 static int symbols_cmp(const void *a, const void *b)
 {
@@ -176,56 +411,97 @@  static unsigned long *get_ftrace_locations(const char **syms, int num)
 	return ERR_PTR(-ENOENT);
 }
 
-static void fprobe_init(struct fprobe *fp)
-{
-	fp->nmissed = 0;
-	if (fprobe_shared_with_kprobes(fp))
-		fp->ops.func = fprobe_kprobe_handler;
-	else
-		fp->ops.func = fprobe_handler;
-
-	fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
-}
+struct filter_match_data {
+	const char *filter;
+	const char *notfilter;
+	size_t index;
+	size_t size;
+	unsigned long *addrs;
+};
 
-static int fprobe_init_rethook(struct fprobe *fp, int num)
+static int filter_match_callback(void *data, const char *name, unsigned long addr)
 {
-	int size;
+	struct filter_match_data *match = data;
 
-	if (num <= 0)
-		return -EINVAL;
+	if (!glob_match(match->filter, name) ||
+	    (match->notfilter && glob_match(match->notfilter, name)))
+		return 0;
 
-	if (!fp->exit_handler) {
-		fp->rethook = NULL;
+	if (!ftrace_location(addr))
 		return 0;
-	}
 
-	/* Initialize rethook if needed */
-	if (fp->nr_maxactive)
-		size = fp->nr_maxactive;
-	else
-		size = num * num_possible_cpus() * 2;
-	if (size <= 0)
-		return -EINVAL;
+	if (match->addrs)
+		match->addrs[match->index] = addr;
 
-	/* Initialize rethook */
-	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
-				sizeof(struct fprobe_rethook_node), size);
-	if (IS_ERR(fp->rethook))
-		return PTR_ERR(fp->rethook);
+	match->index++;
+	return match->index == match->size;
+}
 
-	return 0;
+/*
+ * Make IP list from the filter/no-filter glob patterns.
+ * Return the number of matched symbols, or -ENOENT.
+ */
+static int ip_list_from_filter(const char *filter, const char *notfilter,
+			       unsigned long *addrs, size_t size)
+{
+	struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
+		.index = 0, .size = size, .addrs = addrs};
+	int ret;
+
+	ret = kallsyms_on_each_symbol(filter_match_callback, &match);
+	if (ret < 0)
+		return ret;
+	ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
+	if (ret < 0)
+		return ret;
+
+	return match.index ?: -ENOENT;
 }
 
 static void fprobe_fail_cleanup(struct fprobe *fp)
 {
-	if (!IS_ERR_OR_NULL(fp->rethook)) {
-		/* Don't need to cleanup rethook->handler because this is not used. */
-		rethook_free(fp->rethook);
-		fp->rethook = NULL;
+	kfree(fp->hlist_array);
+	fp->hlist_array = NULL;
+}
+
+/* Initialize the fprobe data structure. */
+static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
+{
+	struct fprobe_hlist *hlist_array;
+	unsigned long addr;
+	int size, i;
+
+	if (!fp || !addrs || num <= 0)
+		return -EINVAL;
+
+	size = ALIGN(fp->entry_data_size, sizeof(long));
+	if (size > MAX_FPROBE_DATA_SIZE)
+		return -E2BIG;
+	fp->entry_data_size = size;
+
+	hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL);
+	if (!hlist_array)
+		return -ENOMEM;
+
+	fp->nmissed = 0;
+
+	hlist_array->size = num;
+	fp->hlist_array = hlist_array;
+	hlist_array->fp = fp;
+	for (i = 0; i < num; i++) {
+		hlist_array->array[i].fp = fp;
+		addr = ftrace_location(addrs[i]);
+		if (!addr) {
+			fprobe_fail_cleanup(fp);
+			return -ENOENT;
+		}
+		hlist_array->array[i].addr = addr;
 	}
-	ftrace_free_filter(&fp->ops);
+	return 0;
 }
 
+#define FPROBE_IPS_MAX	INT_MAX
+
 /**
  * register_fprobe() - Register fprobe to ftrace by pattern.
  * @fp: A fprobe data structure to be registered.
@@ -239,46 +515,24 @@  static void fprobe_fail_cleanup(struct fprobe *fp)
  */
 int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
 {
-	struct ftrace_hash *hash;
-	unsigned char *str;
-	int ret, len;
+	unsigned long *addrs;
+	int ret;
 
 	if (!fp || !filter)
 		return -EINVAL;
 
-	fprobe_init(fp);
-
-	len = strlen(filter);
-	str = kstrdup(filter, GFP_KERNEL);
-	ret = ftrace_set_filter(&fp->ops, str, len, 0);
-	kfree(str);
-	if (ret)
+	ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX);
+	if (ret < 0)
 		return ret;
 
-	if (notfilter) {
-		len = strlen(notfilter);
-		str = kstrdup(notfilter, GFP_KERNEL);
-		ret = ftrace_set_notrace(&fp->ops, str, len, 0);
-		kfree(str);
-		if (ret)
-			goto out;
-	}
-
-	/* TODO:
-	 * correctly calculate the total number of filtered symbols
-	 * from both filter and notfilter.
-	 */
-	hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
-	if (WARN_ON_ONCE(!hash))
-		goto out;
-
-	ret = fprobe_init_rethook(fp, (int)hash->count);
-	if (!ret)
-		ret = register_ftrace_function(&fp->ops);
+	addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL);
+	if (!addrs)
+		return -ENOMEM;
+	ret = ip_list_from_filter(filter, notfilter, addrs, ret);
+	if (ret > 0)
+		ret = register_fprobe_ips(fp, addrs, ret);
 
-out:
-	if (ret)
-		fprobe_fail_cleanup(fp);
+	kfree(addrs);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_fprobe);
@@ -286,7 +540,7 @@  EXPORT_SYMBOL_GPL(register_fprobe);
 /**
  * register_fprobe_ips() - Register fprobe to ftrace by address.
  * @fp: A fprobe data structure to be registered.
- * @addrs: An array of target ftrace location addresses.
+ * @addrs: An array of target function address.
  * @num: The number of entries of @addrs.
  *
  * Register @fp to ftrace for enabling the probe on the address given by @addrs.
@@ -298,23 +552,27 @@  EXPORT_SYMBOL_GPL(register_fprobe);
  */
 int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 {
-	int ret;
-
-	if (!fp || !addrs || num <= 0)
-		return -EINVAL;
-
-	fprobe_init(fp);
+	struct fprobe_hlist *hlist_array;
+	int ret, i;
 
-	ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+	ret = fprobe_init(fp, addrs, num);
 	if (ret)
 		return ret;
 
-	ret = fprobe_init_rethook(fp, num);
-	if (!ret)
-		ret = register_ftrace_function(&fp->ops);
+	mutex_lock(&fprobe_mutex);
+
+	hlist_array = fp->hlist_array;
+	ret = fprobe_graph_add_ips(addrs, num);
+	if (!ret) {
+		add_fprobe_hash(fp);
+		for (i = 0; i < hlist_array->size; i++)
+			insert_fprobe_node(&hlist_array->array[i]);
+	}
+	mutex_unlock(&fprobe_mutex);
 
 	if (ret)
 		fprobe_fail_cleanup(fp);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_fprobe_ips);
@@ -352,14 +610,13 @@  EXPORT_SYMBOL_GPL(register_fprobe_syms);
 
 bool fprobe_is_registered(struct fprobe *fp)
 {
-	if (!fp || (fp->ops.saved_func != fprobe_handler &&
-		    fp->ops.saved_func != fprobe_kprobe_handler))
+	if (!fp || !fp->hlist_array)
 		return false;
 	return true;
 }
 
 /**
- * unregister_fprobe() - Unregister fprobe from ftrace
+ * unregister_fprobe() - Unregister fprobe.
  * @fp: A fprobe data structure to be unregistered.
  *
  * Unregister fprobe (and remove ftrace hooks from the function entries).
@@ -368,23 +625,40 @@  bool fprobe_is_registered(struct fprobe *fp)
  */
 int unregister_fprobe(struct fprobe *fp)
 {
-	int ret;
+	struct fprobe_hlist *hlist_array;
+	unsigned long *addrs = NULL;
+	int ret = 0, i, count;
 
-	if (!fprobe_is_registered(fp))
-		return -EINVAL;
+	mutex_lock(&fprobe_mutex);
+	if (!fp || !is_fprobe_still_exist(fp)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
-	if (!IS_ERR_OR_NULL(fp->rethook))
-		rethook_stop(fp->rethook);
+	hlist_array = fp->hlist_array;
+	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
+	if (!addrs) {
+		ret = -ENOMEM;	/* TODO: Fallback to one-by-one loop */
+		goto out;
+	}
 
-	ret = unregister_ftrace_function(&fp->ops);
-	if (ret < 0)
-		return ret;
+	/* Remove non-synonim ips from table and hash */
+	count = 0;
+	for (i = 0; i < hlist_array->size; i++) {
+		if (!delete_fprobe_node(&hlist_array->array[i]))
+			addrs[count++] = hlist_array->array[i].addr;
+	}
+	del_fprobe_hash(fp);
 
-	if (!IS_ERR_OR_NULL(fp->rethook))
-		rethook_free(fp->rethook);
+	fprobe_graph_remove_ips(addrs, count);
 
-	ftrace_free_filter(&fp->ops);
+	kfree_rcu(hlist_array, rcu);
+	fp->hlist_array = NULL;
 
+out:
+	mutex_unlock(&fprobe_mutex);
+
+	kfree(addrs);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c
index 271ce0caeec0..cf92111b5c79 100644
--- a/lib/test_fprobe.c
+++ b/lib/test_fprobe.c
@@ -17,10 +17,8 @@  static u32 rand1, entry_val, exit_val;
 /* Use indirect calls to avoid inlining the target functions */
 static u32 (*target)(u32 value);
 static u32 (*target2)(u32 value);
-static u32 (*target_nest)(u32 value, u32 (*nest)(u32));
 static unsigned long target_ip;
 static unsigned long target2_ip;
-static unsigned long target_nest_ip;
 static int entry_return_value;
 
 static noinline u32 fprobe_selftest_target(u32 value)
@@ -33,11 +31,6 @@  static noinline u32 fprobe_selftest_target2(u32 value)
 	return (value / div_factor) + 1;
 }
 
-static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
-{
-	return nest(value + 2);
-}
-
 static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
 				    unsigned long ret_ip,
 				    struct ftrace_regs *fregs, void *data)
@@ -79,22 +72,6 @@  static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
 		KUNIT_EXPECT_NULL(current_test, data);
 }
 
-static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
-				      unsigned long ret_ip,
-				      struct ftrace_regs *fregs, void *data)
-{
-	KUNIT_EXPECT_FALSE(current_test, preemptible());
-	return 0;
-}
-
-static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
-				      unsigned long ret_ip,
-				      struct ftrace_regs *fregs, void *data)
-{
-	KUNIT_EXPECT_FALSE(current_test, preemptible());
-	KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip);
-}
-
 /* Test entry only (no rethook) */
 static void test_fprobe_entry(struct kunit *test)
 {
@@ -191,25 +168,6 @@  static void test_fprobe_data(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
 }
 
-/* Test nr_maxactive */
-static void test_fprobe_nest(struct kunit *test)
-{
-	static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"};
-	struct fprobe fp = {
-		.entry_handler = nest_entry_handler,
-		.exit_handler = nest_exit_handler,
-		.nr_maxactive = 1,
-	};
-
-	current_test = test;
-	KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2));
-
-	target_nest(rand1, target);
-	KUNIT_EXPECT_EQ(test, 1, fp.nmissed);
-
-	KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
-}
-
 static void test_fprobe_skip(struct kunit *test)
 {
 	struct fprobe fp = {
@@ -247,10 +205,8 @@  static int fprobe_test_init(struct kunit *test)
 	rand1 = get_random_u32_above(div_factor);
 	target = fprobe_selftest_target;
 	target2 = fprobe_selftest_target2;
-	target_nest = fprobe_selftest_nest_target;
 	target_ip = get_ftrace_location(target);
 	target2_ip = get_ftrace_location(target2);
-	target_nest_ip = get_ftrace_location(target_nest);
 
 	return 0;
 }
@@ -260,7 +216,6 @@  static struct kunit_case fprobe_testcases[] = {
 	KUNIT_CASE(test_fprobe),
 	KUNIT_CASE(test_fprobe_syms),
 	KUNIT_CASE(test_fprobe_data),
-	KUNIT_CASE(test_fprobe_nest),
 	KUNIT_CASE(test_fprobe_skip),
 	{}
 };