[RFC] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

Message ID 20240306165904.108141-1-puranjay12@gmail.com (mailing list archive)
State RFC
Series [RFC] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

Checks

Context Check Description
conchuod/vmtest-for-next-PR success PR summary
conchuod/patch-1-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-1-test-2 success .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-1-test-3 success .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-1-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-1-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-1-test-6 success .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-1-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-1-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-1-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-1-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-1-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-1-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Puranjay Mohan March 6, 2024, 4:59 p.m. UTC
This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
This allows each ftrace callsite to provide an ftrace_ops to the common
ftrace trampoline, allowing each callsite to invoke distinct tracer
functions without the need to fall back to list processing or to
allocate custom trampolines for each callsite. This significantly speeds
up cases where multiple distinct trace functions are used and callsites
are mostly traced by a single tracer.

The idea and most of the implementation are taken from arm64's
implementation of the same feature: place a pointer to the ftrace_ops
as a literal at a fixed offset from the function entry point, where it
can be recovered by the common ftrace trampoline.

We use -fpatchable-function-entry to reserve 8 bytes above the function
entry by emitting two 4-byte nops, or four 2-byte nops when
CONFIG_RISCV_ISA_C is enabled. These 8 bytes are patched at runtime
with a pointer to the associated ftrace_ops for that callsite.
Functions are aligned to 8 bytes to make sure that accesses to this
literal are atomic.
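
Concretely, the patched callsite layout is roughly the following (a
sketch; see also the comment added to mcount-dyn.S below):

   func-8:  <pointer to the callsite's ftrace_ops>  patched at runtime,
                                                    8-byte aligned
   func+0:  auipc  t0, ...      (nop while tracing is disabled)
   func+4:  jalr   t0, ...(t0)  (nop while tracing is disabled)
   func+8:  <function body>

After the jalr, t0 holds func+8, so the trampoline loads the ops
pointer from t0-16 and calls ops->func directly.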

This approach allows for directly invoking ftrace_ops::func even for
ftrace_ops which are dynamically-allocated (or part of a module),
without going via ftrace_ops_list_func.

I've benchmarked this with the ftrace_ops sample module on QEMU; with
the next version, I will provide benchmarks on real hardware:

Without this patch:

+-----------------------+-----------------+----------------------------+
|  Number of tracers    | Total time (ns) | Per-call average time      |
|-----------------------+-----------------+----------------------------|
| Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
|----------+------------+-----------------+------------+---------------|
|        0 |          0 |        15615700 |        156 |             - |
|        0 |          1 |        15917600 |        159 |             - |
|        0 |          2 |        15668000 |        156 |             - |
|        0 |         10 |        14971500 |        149 |             - |
|        0 |        100 |        15417600 |        154 |             - |
|        0 |        200 |        15387000 |        153 |             - |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |       119906800 |       1199 |          1043 |
|        1 |          1 |       137428600 |       1374 |          1218 |
|        1 |          2 |       159562400 |       1374 |          1218 |
|        1 |         10 |       302099900 |       3020 |          2864 |
|        1 |        100 |      2008785500 |      20087 |         19931 |
|        1 |        200 |      3965221900 |      39652 |         39496 |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |       119166700 |       1191 |          1035 |
|        2 |          0 |       157999900 |       1579 |          1423 |
|       10 |          0 |       425370100 |       4253 |          4097 |
|      100 |          0 |      3595252100 |      35952 |         35796 |
|      200 |          0 |      7023485700 |      70234 |         70078 |
+----------+------------+-----------------+------------+---------------+

Note: per-call overhead is estimated relative to the baseline case with
0 relevant tracers and 0 irrelevant tracers.
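For example, with 1 relevant and 0 irrelevant tracers: 1199 ns/call -
156 ns/call (baseline) = 1043 ns of overhead per call.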

With this patch:

+-----------------------+-----------------+----------------------------+
|   Number of tracers   | Total time (ns) | Per-call average time      |
|-----------------------+-----------------+----------------------------|
| Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
|----------+------------+-----------------+------------+---------------|
|        0 |          0 |        15254600 |        152 |             - |
|        0 |          1 |        16136700 |        161 |             - |
|        0 |          2 |        15329500 |        153 |             - |
|        0 |         10 |        15148800 |        151 |             - |
|        0 |        100 |        15746900 |        157 |             - |
|        0 |        200 |        15737400 |        157 |             - |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |        47909000 |        479 |           327 |
|        1 |          1 |        48297400 |        482 |           330 |
|        1 |          2 |        47314100 |        473 |           321 |
|        1 |         10 |        47844900 |        478 |           326 |
|        1 |        100 |        46591900 |        465 |           313 |
|        1 |        200 |        47178900 |        471 |           319 |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |        46715800 |        467 |           315 |
|        2 |          0 |       155134500 |       1551 |          1399 |
|       10 |          0 |       442672800 |       4426 |          4274 |
|      100 |          0 |      4092353900 |      40923 |         40771 |
|      200 |          0 |      7135796400 |      71357 |         71205 |
+----------+------------+-----------------+------------+---------------+

Note: per-call overhead is estimated relative to the baseline case with
0 relevant tracers and 0 irrelevant tracers.
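For example, with 1 relevant and 0 irrelevant tracers: 479 ns/call -
152 ns/call (baseline) = 327 ns of overhead per call.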

As can be seen from the above:

 a) Whenever there is a single relevant tracer function associated with a
    tracee, the overhead of invoking the tracer is constant, and does not
    scale with the number of tracers which are *not* associated with that
    tracee.

 b) The overhead for a single relevant tracer has dropped to ~1/3 of the
    overhead prior to this series (from 1035ns to 315ns). This is largely
    due to permitting calls to dynamically-allocated ftrace_ops without
    going through ftrace_ops_list_func.

Why this patch is an RFC:
 1. I saw some RCU stalls on QEMU and need to debug them to see whether
    they were introduced by this patch.
 2. This needs to be tested thoroughly on real hardware.
 3. Seeking reviews to fix any fundamental problems with this patch that I
    might have missed due to my lack of RISC-V architecture knowledge.
 4. I would like to benchmark this on real hardware and put the results in
    the commit message.

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
---
 arch/riscv/Kconfig              |  2 ++
 arch/riscv/Makefile             |  8 +++++
 arch/riscv/include/asm/ftrace.h |  3 ++
 arch/riscv/kernel/asm-offsets.c |  3 ++
 arch/riscv/kernel/ftrace.c      | 59 +++++++++++++++++++++++++++++++++
 arch/riscv/kernel/mcount-dyn.S  | 42 ++++++++++++++++++++---
 6 files changed, 112 insertions(+), 5 deletions(-)

Comments

Alexandre Ghiti March 6, 2024, 8:35 p.m. UTC | #1
Hi Puranjay,

On 06/03/2024 17:59, Puranjay Mohan wrote:
> This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> This allows each ftrace callsite to provide an ftrace_ops to the common
> ftrace trampoline, allowing each callsite to invoke distinct tracer
> functions without the need to fall back to list processing or to
> allocate custom trampolines for each callsite. This significantly speeds
> up cases where multiple distinct trace functions are used and callsites
> are mostly traced by a single tracer.
>
> The idea and most of the implementation is taken from the ARM64's
> implementation of the same feature. The idea is to place a pointer to
> the ftrace_ops as a literal at a fixed offset from the function entry
> point, which can be recovered by the common ftrace trampoline.
>
> We use -fpatchable-function-entry to reserve 8 bytes above the function
> entry by emitting 2 4 byte or 4 2 byte  nops depending on the presence of
> CONFIG_RISCV_ISA_C. These 8 bytes are patched at runtime with a pointer
> to the associated ftrace_ops for that callsite. Functions are aligned to
> 8 bytes to make sure that the accesses to this literal are atomic.
>
> This approach allows for directly invoking ftrace_ops::func even for
> ftrace_ops which are dynamically-allocated (or part of a module),
> without going via ftrace_ops_list_func.
>
> I've benchamrked this with the ftrace_ops sample module on Qemu, with
> the next version, I will provide benchmarks on real hardware:
>
> Without this patch:
>
> +-----------------------+-----------------+----------------------------+
> |  Number of tracers    | Total time (ns) | Per-call average time      |
> |-----------------------+-----------------+----------------------------|
> | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
> |----------+------------+-----------------+------------+---------------|
> |        0 |          0 |        15615700 |        156 |             - |
> |        0 |          1 |        15917600 |        159 |             - |
> |        0 |          2 |        15668000 |        156 |             - |
> |        0 |         10 |        14971500 |        149 |             - |
> |        0 |        100 |        15417600 |        154 |             - |
> |        0 |        200 |        15387000 |        153 |             - |
> |----------+------------+-----------------+------------+---------------|
> |        1 |          0 |       119906800 |       1199 |          1043 |
> |        1 |          1 |       137428600 |       1374 |          1218 |
> |        1 |          2 |       159562400 |       1374 |          1218 |
> |        1 |         10 |       302099900 |       3020 |          2864 |
> |        1 |        100 |      2008785500 |      20087 |         19931 |
> |        1 |        200 |      3965221900 |      39652 |         39496 |
> |----------+------------+-----------------+------------+---------------|
> |        1 |          0 |       119166700 |       1191 |          1035 |
> |        2 |          0 |       157999900 |       1579 |          1423 |
> |       10 |          0 |       425370100 |       4253 |          4097 |
> |      100 |          0 |      3595252100 |      35952 |         35796 |
> |      200 |          0 |      7023485700 |      70234 |         70078 |
> +----------+------------+-----------------+------------+---------------+
>
> Note: per-call overhead is estimated relative to the baseline case with
> 0 relevant tracers and 0 irrelevant tracers.
>
> With this patch:
>
> +-----------------------+-----------------+----------------------------+
> |   Number of tracers   | Total time (ns) | Per-call average time      |
> |-----------------------+-----------------+----------------------------|
> | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
> |----------+------------+-----------------+------------+---------------|
> |        0 |          0 |        15254600 |        152 |             - |
> |        0 |          1 |        16136700 |        161 |             - |
> |        0 |          2 |        15329500 |        153 |             - |
> |        0 |         10 |        15148800 |        151 |             - |
> |        0 |        100 |        15746900 |        157 |             - |
> |        0 |        200 |        15737400 |        157 |             - |
> |----------+------------+-----------------+------------+---------------|
> |        1 |          0 |        47909000 |        479 |           327 |
> |        1 |          1 |        48297400 |        482 |           330 |
> |        1 |          2 |        47314100 |        473 |           321 |
> |        1 |         10 |        47844900 |        478 |           326 |
> |        1 |        100 |        46591900 |        465 |           313 |
> |        1 |        200 |        47178900 |        471 |           319 |
> |----------+------------+-----------------+------------+---------------|
> |        1 |          0 |        46715800 |        467 |           315 |
> |        2 |          0 |       155134500 |       1551 |          1399 |
> |       10 |          0 |       442672800 |       4426 |          4274 |
> |      100 |          0 |      4092353900 |      40923 |         40771 |
> |      200 |          0 |      7135796400 |      71357 |         71205 |
> +----------+------------+-----------------+------------+---------------+
>
> Note: per-call overhead is estimated relative to the baseline case with
> 0 relevant tracers and 0 irrelevant tracers.
>
> As can be seen from the above:
>
>   a) Whenever there is a single relevant tracer function associated with a
>      tracee, the overhead of invoking the tracer is constant, and does not
>      scale with the number of tracers which are *not* associated with that
>      tracee.
>
>   b) The overhead for a single relevant tracer has dropped to ~1/3 of the
>      overhead prior to this series (from 1035ns to 315ns). This is largely
>      due to permitting calls to dynamically-allocated ftrace_ops without
>      going through ftrace_ops_list_func.
>
> Why is this patch a RFC patch:
>   1. I saw some rcu stalls on Qemu and need to debug them and see if they
>      were introduced by this patch.


FYI, I'm currently working on debugging such issues (and others) with the
*current* ftrace implementation, so they are probably not caused by your
patchset. But keep debugging too: maybe this introduces other issues or,
even better, you'll find the root cause :)


>   2. This needs to be tested thoroughly on real hardware.
>   3. Seeking reviews to fix any fundamental problems with this patch that I
>      might have missed due to my lack of RISC-V architecture knowledge.
>   4. I would like to benchmark this on real hardware and put the results in
>      the commit message.
>
> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
> ---
>   arch/riscv/Kconfig              |  2 ++
>   arch/riscv/Makefile             |  8 +++++
>   arch/riscv/include/asm/ftrace.h |  3 ++
>   arch/riscv/kernel/asm-offsets.c |  3 ++
>   arch/riscv/kernel/ftrace.c      | 59 +++++++++++++++++++++++++++++++++
>   arch/riscv/kernel/mcount-dyn.S  | 42 ++++++++++++++++++++---
>   6 files changed, 112 insertions(+), 5 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 0bfcfec67ed5..e474742e23b2 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -78,6 +78,7 @@ config RISCV
>   	select EDAC_SUPPORT
>   	select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)
>   	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
> +	select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS


A recent discussion [1] states that -falign-functions cannot guarantee 
this alignment for all code and that gcc developers came up with a new 
option [2]: WDYT? I have added Andy and Evgenii in +cc to help on that.

[1] 
https://lore.kernel.org/linux-riscv/4fe4567b-96be-4102-952b-7d39109b2186@yadro.com/
[2] 
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=0f5a9a00e3ab1fe96142f304cfbcf3f63b15f326


>   	select GENERIC_ARCH_TOPOLOGY
>   	select GENERIC_ATOMIC64 if !64BIT
>   	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
> @@ -127,6 +128,7 @@ config RISCV
>   	select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
>   	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>   	select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
> +	select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_REGS && !CFI_CLANG)
>   	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
>   	select HAVE_FUNCTION_GRAPH_TRACER
>   	select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
> diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
> index 252d63942f34..875ad5dc3d32 100644
> --- a/arch/riscv/Makefile
> +++ b/arch/riscv/Makefile
> @@ -14,12 +14,20 @@ endif
>   ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
>   	LDFLAGS_vmlinux += --no-relax
>   	KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
> +ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS), y)
> +ifeq ($(CONFIG_RISCV_ISA_C),y)
> +	CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4
> +else
> +	CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2
> +endif
> +else
>   ifeq ($(CONFIG_RISCV_ISA_C),y)
>   	CC_FLAGS_FTRACE := -fpatchable-function-entry=4
>   else
>   	CC_FLAGS_FTRACE := -fpatchable-function-entry=2
>   endif
>   endif
> +endif
>   
>   ifeq ($(CONFIG_CMODEL_MEDLOW),y)
>   KBUILD_CFLAGS_MODULE += -mcmodel=medany
> diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
> index 329172122952..c9a84222c9ea 100644
> --- a/arch/riscv/include/asm/ftrace.h
> +++ b/arch/riscv/include/asm/ftrace.h
> @@ -28,6 +28,9 @@
>   void MCOUNT_NAME(void);
>   static inline unsigned long ftrace_call_adjust(unsigned long addr)
>   {
> +	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
> +		return addr + 8;
> +
>   	return addr;
>   }
>   
> diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
> index a03129f40c46..7d7c4b486852 100644
> --- a/arch/riscv/kernel/asm-offsets.c
> +++ b/arch/riscv/kernel/asm-offsets.c
> @@ -488,4 +488,7 @@ void asm_offsets(void)
>   	DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN));
>   	OFFSET(STACKFRAME_FP, stackframe, fp);
>   	OFFSET(STACKFRAME_RA, stackframe, ra);
> +#ifdef CONFIG_FUNCTION_TRACER
> +	DEFINE(FTRACE_OPS_FUNC,		offsetof(struct ftrace_ops, func));
> +#endif
>   }
> diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
> index f5aa24d9e1c1..e2e75e15d32e 100644
> --- a/arch/riscv/kernel/ftrace.c
> +++ b/arch/riscv/kernel/ftrace.c
> @@ -82,9 +82,52 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
>   	return 0;
>   }
>   
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> +static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec)
> +{
> +	const struct ftrace_ops *ops = NULL;
> +
> +	if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
> +		ops = ftrace_find_unique_ops(rec);
> +		WARN_ON_ONCE(!ops);
> +	}
> +
> +	if (!ops)
> +		ops = &ftrace_list_ops;
> +
> +	return ops;
> +}
> +
> +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
> +			      const struct ftrace_ops *ops)
> +{
> +	unsigned long literal = rec->ip - 8;
> +
> +	return patch_text_nosync((void *)literal, &ops, sizeof(ops));
> +}
> +
> +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
> +{
> +	return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
> +}
> +
> +static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
> +{
> +	return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec));
> +}
> +#else
> +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
> +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
> +#endif
> +
>   int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>   {
>   	unsigned int call[2];
> +	int ret;
> +
> +	ret = ftrace_rec_update_ops(rec);
> +	if (ret)
> +		return ret;
>   
>   	make_call_t0(rec->ip, addr, call);
>   
> @@ -98,6 +141,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
>   		    unsigned long addr)
>   {
>   	unsigned int nops[2] = {NOP4, NOP4};
> +	int ret;
> +
> +	ret = ftrace_rec_set_nop_ops(rec);
> +	if (ret)
> +		return ret;
>   
>   	if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
>   		return -EPERM;
> @@ -125,6 +173,13 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
>   
>   int ftrace_update_ftrace_func(ftrace_func_t func)
>   {
> +	/*
> +	 * When using CALL_OPS, the function to call is associated with the
> +	 * call site, and we don't have a global function pointer to update.
> +	 */
> +	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
> +		return 0;
> +
>   	int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
>   				       (unsigned long)func, true, true);
>   	if (!ret) {
> @@ -147,6 +202,10 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
>   	make_call_t0(caller, old_addr, call);
>   	ret = ftrace_check_current_call(caller, call);
>   
> +	if (ret)
> +		return ret;
> +
> +	ret = ftrace_rec_update_ops(rec);
>   	if (ret)
>   		return ret;
>   
> diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
> index b7561288e8da..cb241e36e514 100644
> --- a/arch/riscv/kernel/mcount-dyn.S
> +++ b/arch/riscv/kernel/mcount-dyn.S
> @@ -191,11 +191,35 @@
>   	.endm
>   
>   	.macro PREPARE_ARGS
> -	addi	a0, t0, -FENTRY_RA_OFFSET
> +	addi	a0, t0, -FENTRY_RA_OFFSET	// ip (callsite's auipc insn)
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> +	/*
> +	 * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the
> +	 * function entry, these are later overwritten with the pointer to the
> +	 * associated struct ftrace_ops.
> +	 *
> +	 * -8: &ftrace_ops of the associated tracer function.
> +	 *<ftrace enable>:
> +	 *  0: auipc  t0/ra, 0x?
> +	 *  4: jalr   t0/ra, ?(t0/ra)
> +	 *
> +	 * -8: &ftrace_nop_ops
> +	 *<ftrace disable>:
> +	 *  0: nop
> +	 *  4: nop
> +	 *
> +	 * t0 is set to ip+8 after the jalr is executed at the callsite,
> +	 * so we find the associated op at t0-16.
> +	 */
> +	mv 	a1, ra				// parent_ip
> +	REG_L   a2, -16(t0)			// op
> +	REG_L   ra, FTRACE_OPS_FUNC(a2)		// op->func
> +#else
>   	la	a1, function_trace_op
> -	REG_L	a2, 0(a1)
> -	mv	a1, ra
> -	mv	a3, sp
> +	REG_L	a2, 0(a1)			// op
> +	mv	a1, ra				// parent_ip
> +#endif
> +	mv	a3, sp				// regs
>   	.endm
>   
>   #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
> @@ -233,8 +257,12 @@ SYM_FUNC_START(ftrace_regs_caller)
>   	SAVE_ABI_REGS 1
>   	PREPARE_ARGS
>   
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> +	jalr ra
> +#else
>   SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
>   	call	ftrace_stub
> +#endif
>   
>   	RESTORE_ABI_REGS 1
>   	bnez	t1, .Ldirect
> @@ -247,9 +275,13 @@ SYM_FUNC_START(ftrace_caller)
>   	SAVE_ABI_REGS 0
>   	PREPARE_ARGS
>   
> -SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> +	jalr ra
> +#else
> +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
>   	call	ftrace_stub
>   
> +#endif
>   	RESTORE_ABI_REGS 0
>   	jr	t0
>   SYM_FUNC_END(ftrace_caller)


As I'm diving into ftrace right now, I'll give a proper review soon. But 
as a note, I noticed that the function_graph tracer, when enabled, makes 
the whole system unresponsive (but still up, just very slow). A fix I 
sent recently seems to really improve that if you're interested in 
testing it (I am :)). You can find it here: 
https://lore.kernel.org/linux-riscv/20240229121056.203419-1-alexghiti@rivosinc.com/

Thanks,

Alex
Alexandre Ghiti March 6, 2024, 8:38 p.m. UTC | #2
+cc Andy and Evgenii

On 06/03/2024 21:35, Alexandre Ghiti wrote:
> Hi Puranjay,
>
> On 06/03/2024 17:59, Puranjay Mohan wrote:
>> This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
>> This allows each ftrace callsite to provide an ftrace_ops to the common
>> ftrace trampoline, allowing each callsite to invoke distinct tracer
>> functions without the need to fall back to list processing or to
>> allocate custom trampolines for each callsite. This significantly speeds
>> up cases where multiple distinct trace functions are used and callsites
>> are mostly traced by a single tracer.
>>
>> The idea and most of the implementation is taken from the ARM64's
>> implementation of the same feature. The idea is to place a pointer to
>> the ftrace_ops as a literal at a fixed offset from the function entry
>> point, which can be recovered by the common ftrace trampoline.
>>
>> We use -fpatchable-function-entry to reserve 8 bytes above the function
>> entry by emitting 2 4 byte or 4 2 byte  nops depending on the 
>> presence of
>> CONFIG_RISCV_ISA_C. These 8 bytes are patched at runtime with a pointer
>> to the associated ftrace_ops for that callsite. Functions are aligned to
>> 8 bytes to make sure that the accesses to this literal are atomic.
>>
>> This approach allows for directly invoking ftrace_ops::func even for
>> ftrace_ops which are dynamically-allocated (or part of a module),
>> without going via ftrace_ops_list_func.
>>
>> I've benchamrked this with the ftrace_ops sample module on Qemu, with
>> the next version, I will provide benchmarks on real hardware:
>>
>> Without this patch:
>>
>> +-----------------------+-----------------+----------------------------+
>> |  Number of tracers    | Total time (ns) | Per-call average time      |
>> |-----------------------+-----------------+----------------------------|
>> | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
>> |----------+------------+-----------------+------------+---------------|
>> |        0 |          0 |        15615700 |        156 |             - |
>> |        0 |          1 |        15917600 |        159 |             - |
>> |        0 |          2 |        15668000 |        156 |             - |
>> |        0 |         10 |        14971500 |        149 |             - |
>> |        0 |        100 |        15417600 |        154 |             - |
>> |        0 |        200 |        15387000 |        153 |             - |
>> |----------+------------+-----------------+------------+---------------|
>> |        1 |          0 |       119906800 |       1199 |          1043 |
>> |        1 |          1 |       137428600 |       1374 |          1218 |
>> |        1 |          2 |       159562400 |       1374 |          1218 |
>> |        1 |         10 |       302099900 |       3020 |          2864 |
>> |        1 |        100 |      2008785500 |      20087 | 19931 |
>> |        1 |        200 |      3965221900 |      39652 | 39496 |
>> |----------+------------+-----------------+------------+---------------|
>> |        1 |          0 |       119166700 |       1191 |          1035 |
>> |        2 |          0 |       157999900 |       1579 |          1423 |
>> |       10 |          0 |       425370100 |       4253 |          4097 |
>> |      100 |          0 |      3595252100 |      35952 | 35796 |
>> |      200 |          0 |      7023485700 |      70234 | 70078 |
>> +----------+------------+-----------------+------------+---------------+
>>
>> Note: per-call overhead is estimated relative to the baseline case with
>> 0 relevant tracers and 0 irrelevant tracers.
>>
>> With this patch:
>>
>> +-----------------------+-----------------+----------------------------+
>> |   Number of tracers   | Total time (ns) | Per-call average time      |
>> |-----------------------+-----------------+----------------------------|
>> | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
>> |----------+------------+-----------------+------------+---------------|
>> |        0 |          0 |        15254600 |        152 |             - |
>> |        0 |          1 |        16136700 |        161 |             - |
>> |        0 |          2 |        15329500 |        153 |             - |
>> |        0 |         10 |        15148800 |        151 |             - |
>> |        0 |        100 |        15746900 |        157 |             - |
>> |        0 |        200 |        15737400 |        157 |             - |
>> |----------+------------+-----------------+------------+---------------|
>> |        1 |          0 |        47909000 |        479 |           327 |
>> |        1 |          1 |        48297400 |        482 |           330 |
>> |        1 |          2 |        47314100 |        473 |           321 |
>> |        1 |         10 |        47844900 |        478 |           326 |
>> |        1 |        100 |        46591900 |        465 |           313 |
>> |        1 |        200 |        47178900 |        471 |           319 |
>> |----------+------------+-----------------+------------+---------------|
>> |        1 |          0 |        46715800 |        467 |           315 |
>> |        2 |          0 |       155134500 |       1551 |          1399 |
>> |       10 |          0 |       442672800 |       4426 |          4274 |
>> |      100 |          0 |      4092353900 |      40923 | 40771 |
>> |      200 |          0 |      7135796400 |      71357 | 71205 |
>> +----------+------------+-----------------+------------+---------------+
>>
>> Note: per-call overhead is estimated relative to the baseline case with
>> 0 relevant tracers and 0 irrelevant tracers.
>>
>> As can be seen from the above:
>>
>>   a) Whenever there is a single relevant tracer function associated 
>> with a
>>      tracee, the overhead of invoking the tracer is constant, and 
>> does not
>>      scale with the number of tracers which are *not* associated with 
>> that
>>      tracee.
>>
>>   b) The overhead for a single relevant tracer has dropped to ~1/3 of 
>> the
>>      overhead prior to this series (from 1035ns to 315ns). This is 
>> largely
>>      due to permitting calls to dynamically-allocated ftrace_ops without
>>      going through ftrace_ops_list_func.
>>
>> Why is this patch a RFC patch:
>>   1. I saw some rcu stalls on Qemu and need to debug them and see if 
>> they
>>      were introduced by this patch.
>
>
> FYI, I'm currently working on debugging such issues (and other) with 
> the *current* ftrace implementation, so probably not caused by your 
> patchset. But keep debugging too, maybe this introduces other issues 
> or even better, you'll find the root cause :)
>
>
>>   2. This needs to be tested thoroughly on real hardware.
>>   3. Seeking reviews to fix any fundamental problems with this patch 
>> that I
>>      might have missed due to my lack of RISC-V architecture knowledge.
>>   4. I would like to benchmark this on real hardware and put the 
>> results in
>>      the commit message.
>>
>> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
>> ---
>>   arch/riscv/Kconfig              |  2 ++
>>   arch/riscv/Makefile             |  8 +++++
>>   arch/riscv/include/asm/ftrace.h |  3 ++
>>   arch/riscv/kernel/asm-offsets.c |  3 ++
>>   arch/riscv/kernel/ftrace.c      | 59 +++++++++++++++++++++++++++++++++
>>   arch/riscv/kernel/mcount-dyn.S  | 42 ++++++++++++++++++++---
>>   6 files changed, 112 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 0bfcfec67ed5..e474742e23b2 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -78,6 +78,7 @@ config RISCV
>>       select EDAC_SUPPORT
>>       select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && 
>> !DYNAMIC_FTRACE)
>>       select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if 
>> DYNAMIC_FTRACE
>> +    select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS
>
>
> A recent discussion [1] states that -falign-functions cannot guarantee 
> this alignment for all code and that gcc developers came up with a new 
> option [2]: WDYT? I have added Andy and Evgenii in +cc to help on that.
>
> [1] 
> https://lore.kernel.org/linux-riscv/4fe4567b-96be-4102-952b-7d39109b2186@yadro.com/
> [2] 
> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=0f5a9a00e3ab1fe96142f304cfbcf3f63b15f326
>
>
>>       select GENERIC_ARCH_TOPOLOGY
>>       select GENERIC_ATOMIC64 if !64BIT
>>       select GENERIC_CLOCKEVENTS_BROADCAST if SMP
>> @@ -127,6 +128,7 @@ config RISCV
>>       select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && 
>> (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
>>       select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>>       select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
>> +    select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if 
>> (DYNAMIC_FTRACE_WITH_REGS && !CFI_CLANG)
>>       select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
>>       select HAVE_FUNCTION_GRAPH_TRACER
>>       select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
>> diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
>> index 252d63942f34..875ad5dc3d32 100644
>> --- a/arch/riscv/Makefile
>> +++ b/arch/riscv/Makefile
>> @@ -14,12 +14,20 @@ endif
>>   ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
>>       LDFLAGS_vmlinux += --no-relax
>>       KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
>> +ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS), y)
>> +ifeq ($(CONFIG_RISCV_ISA_C),y)
>> +    CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4
>> +else
>> +    CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2
>> +endif
>> +else
>>   ifeq ($(CONFIG_RISCV_ISA_C),y)
>>       CC_FLAGS_FTRACE := -fpatchable-function-entry=4
>>   else
>>       CC_FLAGS_FTRACE := -fpatchable-function-entry=2
>>   endif
>>   endif
>> +endif
>>     ifeq ($(CONFIG_CMODEL_MEDLOW),y)
>>   KBUILD_CFLAGS_MODULE += -mcmodel=medany
>> diff --git a/arch/riscv/include/asm/ftrace.h 
>> b/arch/riscv/include/asm/ftrace.h
>> index 329172122952..c9a84222c9ea 100644
>> --- a/arch/riscv/include/asm/ftrace.h
>> +++ b/arch/riscv/include/asm/ftrace.h
>> @@ -28,6 +28,9 @@
>>   void MCOUNT_NAME(void);
>>   static inline unsigned long ftrace_call_adjust(unsigned long addr)
>>   {
>> +    if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
>> +        return addr + 8;
>> +
>>       return addr;
>>   }
>>   diff --git a/arch/riscv/kernel/asm-offsets.c 
>> b/arch/riscv/kernel/asm-offsets.c
>> index a03129f40c46..7d7c4b486852 100644
>> --- a/arch/riscv/kernel/asm-offsets.c
>> +++ b/arch/riscv/kernel/asm-offsets.c
>> @@ -488,4 +488,7 @@ void asm_offsets(void)
>>       DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct 
>> stackframe), STACK_ALIGN));
>>       OFFSET(STACKFRAME_FP, stackframe, fp);
>>       OFFSET(STACKFRAME_RA, stackframe, ra);
>> +#ifdef CONFIG_FUNCTION_TRACER
>> +    DEFINE(FTRACE_OPS_FUNC,        offsetof(struct ftrace_ops, func));
>> +#endif
>>   }
>> diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
>> index f5aa24d9e1c1..e2e75e15d32e 100644
>> --- a/arch/riscv/kernel/ftrace.c
>> +++ b/arch/riscv/kernel/ftrace.c
>> @@ -82,9 +82,52 @@ static int __ftrace_modify_call(unsigned long 
>> hook_pos, unsigned long target,
>>       return 0;
>>   }
>>   +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>> +static const struct ftrace_ops *riscv64_rec_get_ops(struct 
>> dyn_ftrace *rec)
>> +{
>> +    const struct ftrace_ops *ops = NULL;
>> +
>> +    if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
>> +        ops = ftrace_find_unique_ops(rec);
>> +        WARN_ON_ONCE(!ops);
>> +    }
>> +
>> +    if (!ops)
>> +        ops = &ftrace_list_ops;
>> +
>> +    return ops;
>> +}
>> +
>> +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
>> +                  const struct ftrace_ops *ops)
>> +{
>> +    unsigned long literal = rec->ip - 8;
>> +
>> +    return patch_text_nosync((void *)literal, &ops, sizeof(ops));
>> +}
>> +
>> +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
>> +{
>> +    return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
>> +}
>> +
>> +static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
>> +{
>> +    return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec));
>> +}
>> +#else
>> +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
>> +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
>> +#endif
>> +
>>   int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>>   {
>>       unsigned int call[2];
>> +    int ret;
>> +
>> +    ret = ftrace_rec_update_ops(rec);
>> +    if (ret)
>> +        return ret;
>>         make_call_t0(rec->ip, addr, call);
>>   @@ -98,6 +141,11 @@ int ftrace_make_nop(struct module *mod, struct 
>> dyn_ftrace *rec,
>>               unsigned long addr)
>>   {
>>       unsigned int nops[2] = {NOP4, NOP4};
>> +    int ret;
>> +
>> +    ret = ftrace_rec_set_nop_ops(rec);
>> +    if (ret)
>> +        return ret;
>>         if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
>>           return -EPERM;
>> @@ -125,6 +173,13 @@ int ftrace_init_nop(struct module *mod, struct 
>> dyn_ftrace *rec)
>>     int ftrace_update_ftrace_func(ftrace_func_t func)
>>   {
>> +    /*
>> +     * When using CALL_OPS, the function to call is associated with the
>> +     * call site, and we don't have a global function pointer to 
>> update.
>> +     */
>> +    if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
>> +        return 0;
>> +
>>       int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
>>                          (unsigned long)func, true, true);
>>       if (!ret) {
>> @@ -147,6 +202,10 @@ int ftrace_modify_call(struct dyn_ftrace *rec, 
>> unsigned long old_addr,
>>       make_call_t0(caller, old_addr, call);
>>       ret = ftrace_check_current_call(caller, call);
>>   +    if (ret)
>> +        return ret;
>> +
>> +    ret = ftrace_rec_update_ops(rec);
>>       if (ret)
>>           return ret;
>>   diff --git a/arch/riscv/kernel/mcount-dyn.S 
>> b/arch/riscv/kernel/mcount-dyn.S
>> index b7561288e8da..cb241e36e514 100644
>> --- a/arch/riscv/kernel/mcount-dyn.S
>> +++ b/arch/riscv/kernel/mcount-dyn.S
>> @@ -191,11 +191,35 @@
>>       .endm
>>         .macro PREPARE_ARGS
>> -    addi    a0, t0, -FENTRY_RA_OFFSET
>> +    addi    a0, t0, -FENTRY_RA_OFFSET    // ip (callsite's auipc insn)
>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>> +    /*
>> +     * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before 
>> the
>> +     * function entry, these are later overwritten with the pointer 
>> to the
>> +     * associated struct ftrace_ops.
>> +     *
>> +     * -8: &ftrace_ops of the associated tracer function.
>> +     *<ftrace enable>:
>> +     *  0: auipc  t0/ra, 0x?
>> +     *  4: jalr   t0/ra, ?(t0/ra)
>> +     *
>> +     * -8: &ftrace_nop_ops
>> +     *<ftrace disable>:
>> +     *  0: nop
>> +     *  4: nop
>> +     *
>> +     * t0 is set to ip+8 after the jalr is executed at the callsite,
>> +     * so we find the associated op at t0-16.
>> +     */
>> +    mv     a1, ra                // parent_ip
>> +    REG_L   a2, -16(t0)            // op
>> +    REG_L   ra, FTRACE_OPS_FUNC(a2)        // op->func
>> +#else
>>       la    a1, function_trace_op
>> -    REG_L    a2, 0(a1)
>> -    mv    a1, ra
>> -    mv    a3, sp
>> +    REG_L    a2, 0(a1)            // op
>> +    mv    a1, ra                // parent_ip
>> +#endif
>> +    mv    a3, sp                // regs
>>       .endm
>>     #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
>> @@ -233,8 +257,12 @@ SYM_FUNC_START(ftrace_regs_caller)
>>       SAVE_ABI_REGS 1
>>       PREPARE_ARGS
>>   +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>> +    jalr ra
>> +#else
>>   SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
>>       call    ftrace_stub
>> +#endif
>>         RESTORE_ABI_REGS 1
>>       bnez    t1, .Ldirect
>> @@ -247,9 +275,13 @@ SYM_FUNC_START(ftrace_caller)
>>       SAVE_ABI_REGS 0
>>       PREPARE_ARGS
>>   -SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>> +    jalr ra
>> +#else
>> +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
>>       call    ftrace_stub
>>   +#endif
>>       RESTORE_ABI_REGS 0
>>       jr    t0
>>   SYM_FUNC_END(ftrace_caller)
>
>
> As I'm diving into ftrace right now, I'll give a proper review soon. 
> But as a note, I noticed that the function_graph tracer, when enabled, 
> makes the whole system unresponsive (but still up, just very slow). A 
> fix I sent recently seems to really improve that if you're interested 
> in testing it (I am :)). You can find it here: 
> https://lore.kernel.org/linux-riscv/20240229121056.203419-1-alexghiti@rivosinc.com/
>
> Thanks,
>
> Alex
Puranjay Mohan March 7, 2024, 12:17 a.m. UTC | #3
Hi Alex,

On Wed, Mar 6, 2024 at 9:35 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> Hi Puranjay,
>
> On 06/03/2024 17:59, Puranjay Mohan wrote:
> > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> > This allows each ftrace callsite to provide an ftrace_ops to the common
> > ftrace trampoline, allowing each callsite to invoke distinct tracer
> > functions without the need to fall back to list processing or to
> > allocate custom trampolines for each callsite. This significantly speeds
> > up cases where multiple distinct trace functions are used and callsites
> > are mostly traced by a single tracer.
> >
> > The idea and most of the implementation is taken from the ARM64's
> > implementation of the same feature. The idea is to place a pointer to
> > the ftrace_ops as a literal at a fixed offset from the function entry
> > point, which can be recovered by the common ftrace trampoline.
> >
> > We use -fpatchable-function-entry to reserve 8 bytes above the function
> > entry by emitting 2 4 byte or 4 2 byte  nops depending on the presence of
> > CONFIG_RISCV_ISA_C. These 8 bytes are patched at runtime with a pointer
> > to the associated ftrace_ops for that callsite. Functions are aligned to
> > 8 bytes to make sure that the accesses to this literal are atomic.
> >
> > This approach allows for directly invoking ftrace_ops::func even for
> > ftrace_ops which are dynamically-allocated (or part of a module),
> > without going via ftrace_ops_list_func.
> >
> > I've benchamrked this with the ftrace_ops sample module on Qemu, with
> > the next version, I will provide benchmarks on real hardware:
> >
> > Without this patch:
> >
> > +-----------------------+-----------------+----------------------------+
> > |  Number of tracers    | Total time (ns) | Per-call average time      |
> > |-----------------------+-----------------+----------------------------|
> > | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
> > |----------+------------+-----------------+------------+---------------|
> > |        0 |          0 |        15615700 |        156 |             - |
> > |        0 |          1 |        15917600 |        159 |             - |
> > |        0 |          2 |        15668000 |        156 |             - |
> > |        0 |         10 |        14971500 |        149 |             - |
> > |        0 |        100 |        15417600 |        154 |             - |
> > |        0 |        200 |        15387000 |        153 |             - |
> > |----------+------------+-----------------+------------+---------------|
> > |        1 |          0 |       119906800 |       1199 |          1043 |
> > |        1 |          1 |       137428600 |       1374 |          1218 |
> > |        1 |          2 |       159562400 |       1374 |          1218 |
> > |        1 |         10 |       302099900 |       3020 |          2864 |
> > |        1 |        100 |      2008785500 |      20087 |         19931 |
> > |        1 |        200 |      3965221900 |      39652 |         39496 |
> > |----------+------------+-----------------+------------+---------------|
> > |        1 |          0 |       119166700 |       1191 |          1035 |
> > |        2 |          0 |       157999900 |       1579 |          1423 |
> > |       10 |          0 |       425370100 |       4253 |          4097 |
> > |      100 |          0 |      3595252100 |      35952 |         35796 |
> > |      200 |          0 |      7023485700 |      70234 |         70078 |
> > +----------+------------+-----------------+------------+---------------+
> >
> > Note: per-call overhead is estimated relative to the baseline case with
> > 0 relevant tracers and 0 irrelevant tracers.
> >
> > With this patch:
> >
> > +-----------------------+-----------------+----------------------------+
> > |   Number of tracers   | Total time (ns) | Per-call average time      |
> > |-----------------------+-----------------+----------------------------|
> > | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
> > |----------+------------+-----------------+------------+---------------|
> > |        0 |          0 |        15254600 |        152 |             - |
> > |        0 |          1 |        16136700 |        161 |             - |
> > |        0 |          2 |        15329500 |        153 |             - |
> > |        0 |         10 |        15148800 |        151 |             - |
> > |        0 |        100 |        15746900 |        157 |             - |
> > |        0 |        200 |        15737400 |        157 |             - |
> > |----------+------------+-----------------+------------+---------------|
> > |        1 |          0 |        47909000 |        479 |           327 |
> > |        1 |          1 |        48297400 |        482 |           330 |
> > |        1 |          2 |        47314100 |        473 |           321 |
> > |        1 |         10 |        47844900 |        478 |           326 |
> > |        1 |        100 |        46591900 |        465 |           313 |
> > |        1 |        200 |        47178900 |        471 |           319 |
> > |----------+------------+-----------------+------------+---------------|
> > |        1 |          0 |        46715800 |        467 |           315 |
> > |        2 |          0 |       155134500 |       1551 |          1399 |
> > |       10 |          0 |       442672800 |       4426 |          4274 |
> > |      100 |          0 |      4092353900 |      40923 |         40771 |
> > |      200 |          0 |      7135796400 |      71357 |         71205 |
> > +----------+------------+-----------------+------------+---------------+
> >
> > Note: per-call overhead is estimated relative to the baseline case with
> > 0 relevant tracers and 0 irrelevant tracers.
> >
> > As can be seen from the above:
> >
> >   a) Whenever there is a single relevant tracer function associated with a
> >      tracee, the overhead of invoking the tracer is constant, and does not
> >      scale with the number of tracers which are *not* associated with that
> >      tracee.
> >
> >   b) The overhead for a single relevant tracer has dropped to ~1/3 of the
> >      overhead prior to this series (from 1035ns to 315ns). This is largely
> >      due to permitting calls to dynamically-allocated ftrace_ops without
> >      going through ftrace_ops_list_func.
> >
> > Why is this patch a RFC patch:
> >   1. I saw some rcu stalls on Qemu and need to debug them and see if they
> >      were introduced by this patch.
>
>
> FYI, I'm currently working on debugging such issues (and other) with the
> *current* ftrace implementation, so probably not caused by your
> patchset. But keep debugging too, maybe this introduces other issues or
> even better, you'll find the root cause :)
>
>
> >   2. This needs to be tested thoroughly on real hardware.
> >   3. Seeking reviews to fix any fundamental problems with this patch that I
> >      might have missed due to my lack of RISC-V architecture knowledge.
> >   4. I would like to benchmark this on real hardware and put the results in
> >      the commit message.
> >
> > Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
> > ---
> >   arch/riscv/Kconfig              |  2 ++
> >   arch/riscv/Makefile             |  8 +++++
> >   arch/riscv/include/asm/ftrace.h |  3 ++
> >   arch/riscv/kernel/asm-offsets.c |  3 ++
> >   arch/riscv/kernel/ftrace.c      | 59 +++++++++++++++++++++++++++++++++
> >   arch/riscv/kernel/mcount-dyn.S  | 42 ++++++++++++++++++++---
> >   6 files changed, 112 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 0bfcfec67ed5..e474742e23b2 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -78,6 +78,7 @@ config RISCV
> >       select EDAC_SUPPORT
> >       select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)
> >       select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
> > +     select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS
>
>
> A recent discussion [1] states that -falign-functions cannot guarantee
> this alignment for all code and that gcc developers came up with a new
> option [2]: WDYT? I have added Andy and Evgenii in +cc to help on that.

I saw that arm64 uses the same option and assumes this guarantee, so maybe
it is broken there too? I will look into the discussion and try to use the
other option. I am currently using Clang to build the kernel.

>
> [1]
> https://lore.kernel.org/linux-riscv/4fe4567b-96be-4102-952b-7d39109b2186@yadro.com/
> [2]
> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=0f5a9a00e3ab1fe96142f304cfbcf3f63b15f326
>
>
> >       select GENERIC_ARCH_TOPOLOGY
> >       select GENERIC_ATOMIC64 if !64BIT
> >       select GENERIC_CLOCKEVENTS_BROADCAST if SMP
> > @@ -127,6 +128,7 @@ config RISCV
> >       select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
> >       select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> >       select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
> > +     select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_REGS && !CFI_CLANG)
> >       select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
> >       select HAVE_FUNCTION_GRAPH_TRACER
> >       select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
> > diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
> > index 252d63942f34..875ad5dc3d32 100644
> > --- a/arch/riscv/Makefile
> > +++ b/arch/riscv/Makefile
> > @@ -14,12 +14,20 @@ endif
> >   ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
> >       LDFLAGS_vmlinux += --no-relax
> >       KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
> > +ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS), y)
> > +ifeq ($(CONFIG_RISCV_ISA_C),y)
> > +     CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4
> > +else
> > +     CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2
> > +endif
> > +else
> >   ifeq ($(CONFIG_RISCV_ISA_C),y)
> >       CC_FLAGS_FTRACE := -fpatchable-function-entry=4
> >   else
> >       CC_FLAGS_FTRACE := -fpatchable-function-entry=2
> >   endif
> >   endif
> > +endif
> >
> >   ifeq ($(CONFIG_CMODEL_MEDLOW),y)
> >   KBUILD_CFLAGS_MODULE += -mcmodel=medany
> > diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
> > index 329172122952..c9a84222c9ea 100644
> > --- a/arch/riscv/include/asm/ftrace.h
> > +++ b/arch/riscv/include/asm/ftrace.h
> > @@ -28,6 +28,9 @@
> >   void MCOUNT_NAME(void);
> >   static inline unsigned long ftrace_call_adjust(unsigned long addr)
> >   {
> > +     if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
> > +             return addr + 8;
> > +
> >       return addr;
> >   }
> >
> > diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
> > index a03129f40c46..7d7c4b486852 100644
> > --- a/arch/riscv/kernel/asm-offsets.c
> > +++ b/arch/riscv/kernel/asm-offsets.c
> > @@ -488,4 +488,7 @@ void asm_offsets(void)
> >       DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN));
> >       OFFSET(STACKFRAME_FP, stackframe, fp);
> >       OFFSET(STACKFRAME_RA, stackframe, ra);
> > +#ifdef CONFIG_FUNCTION_TRACER
> > +     DEFINE(FTRACE_OPS_FUNC,         offsetof(struct ftrace_ops, func));
> > +#endif
> >   }
> > diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
> > index f5aa24d9e1c1..e2e75e15d32e 100644
> > --- a/arch/riscv/kernel/ftrace.c
> > +++ b/arch/riscv/kernel/ftrace.c
> > @@ -82,9 +82,52 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
> >       return 0;
> >   }
> >
> > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > +static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec)
> > +{
> > +     const struct ftrace_ops *ops = NULL;
> > +
> > +     if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
> > +             ops = ftrace_find_unique_ops(rec);
> > +             WARN_ON_ONCE(!ops);
> > +     }
> > +
> > +     if (!ops)
> > +             ops = &ftrace_list_ops;
> > +
> > +     return ops;
> > +}
> > +
> > +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
> > +                           const struct ftrace_ops *ops)
> > +{
> > +     unsigned long literal = rec->ip - 8;
> > +
> > +     return patch_text_nosync((void *)literal, &ops, sizeof(ops));

^^
I will change this to use a new function that does a 64-bit write and
doesn't call flush_icache_range(), as naturally aligned writes are
atomic and therefore no synchronization is required here.
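
Roughly along these lines (just a sketch; patch_literal_64() is a
hypothetical name for the new helper, which would still have to write
through the text-patching alias since the literal lives in read-only
kernel text, but would skip the icache flush):

    static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
                                  const struct ftrace_ops *ops)
    {
            unsigned long literal = rec->ip - 8;

            /*
             * The literal is naturally aligned (functions are 8-byte
             * aligned) and is only ever loaded as data by the
             * trampoline, never executed, so a single aligned 64-bit
             * store is enough and no flush_icache_range() is needed.
             */
            return patch_literal_64((void *)literal, (unsigned long)ops);
    }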

> > +}
> > +
> > +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
> > +{
> > +     return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
> > +}
> > +
> > +static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
> > +{
> > +     return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec));
> > +}
> > +#else
> > +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
> > +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
> > +#endif
> > +
> >   int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
> >   {
> >       unsigned int call[2];
> > +     int ret;
> > +
> > +     ret = ftrace_rec_update_ops(rec);
> > +     if (ret)
> > +             return ret;
> >
> >       make_call_t0(rec->ip, addr, call);
> >
> > @@ -98,6 +141,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
> >                   unsigned long addr)
> >   {
> >       unsigned int nops[2] = {NOP4, NOP4};
> > +     int ret;
> > +
> > +     ret = ftrace_rec_set_nop_ops(rec);
> > +     if (ret)
> > +             return ret;
> >
> >       if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
> >               return -EPERM;
> > @@ -125,6 +173,13 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
> >
> >   int ftrace_update_ftrace_func(ftrace_func_t func)
> >   {
> > +     /*
> > +      * When using CALL_OPS, the function to call is associated with the
> > +      * call site, and we don't have a global function pointer to update.
> > +      */
> > +     if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
> > +             return 0;
> > +
> >       int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
> >                                      (unsigned long)func, true, true);
> >       if (!ret) {
> > @@ -147,6 +202,10 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
> >       make_call_t0(caller, old_addr, call);
> >       ret = ftrace_check_current_call(caller, call);
> >
> > +     if (ret)
> > +             return ret;
> > +
> > +     ret = ftrace_rec_update_ops(rec);
> >       if (ret)
> >               return ret;
> >
> > diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
> > index b7561288e8da..cb241e36e514 100644
> > --- a/arch/riscv/kernel/mcount-dyn.S
> > +++ b/arch/riscv/kernel/mcount-dyn.S
> > @@ -191,11 +191,35 @@
> >       .endm
> >
> >       .macro PREPARE_ARGS
> > -     addi    a0, t0, -FENTRY_RA_OFFSET
> > +     addi    a0, t0, -FENTRY_RA_OFFSET       // ip (callsite's auipc insn)
> > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > +     /*
> > +      * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the
> > +      * function entry, these are later overwritten with the pointer to the
> > +      * associated struct ftrace_ops.
> > +      *
> > +      * -8: &ftrace_ops of the associated tracer function.
> > +      *<ftrace enable>:
> > +      *  0: auipc  t0/ra, 0x?
> > +      *  4: jalr   t0/ra, ?(t0/ra)
> > +      *
> > +      * -8: &ftrace_nop_ops
> > +      *<ftrace disable>:
> > +      *  0: nop
> > +      *  4: nop
> > +      *
> > +      * t0 is set to ip+8 after the jalr is executed at the callsite,
> > +      * so we find the associated op at t0-16.
> > +      */
> > +     mv      a1, ra                          // parent_ip
> > +     REG_L   a2, -16(t0)                     // op
> > +     REG_L   ra, FTRACE_OPS_FUNC(a2)         // op->func
> > +#else
> >       la      a1, function_trace_op
> > -     REG_L   a2, 0(a1)
> > -     mv      a1, ra
> > -     mv      a3, sp
> > +     REG_L   a2, 0(a1)                       // op
> > +     mv      a1, ra                          // parent_ip
> > +#endif
> > +     mv      a3, sp                          // regs
> >       .endm
> >
> >   #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
> > @@ -233,8 +257,12 @@ SYM_FUNC_START(ftrace_regs_caller)
> >       SAVE_ABI_REGS 1
> >       PREPARE_ARGS
> >
> > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > +     jalr ra
> > +#else
> >   SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
> >       call    ftrace_stub
> > +#endif
> >
> >       RESTORE_ABI_REGS 1
> >       bnez    t1, .Ldirect
> > @@ -247,9 +275,13 @@ SYM_FUNC_START(ftrace_caller)
> >       SAVE_ABI_REGS 0
> >       PREPARE_ARGS
> >
> > -SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)

^^ this hunk is a mistake; I will fix it in the next version.

> > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > +     jalr ra
> > +#else
> > +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
> >       call    ftrace_stub
> >
> > +#endif
> >       RESTORE_ABI_REGS 0
> >       jr      t0
> >   SYM_FUNC_END(ftrace_caller)
>
>
> As I'm diving into ftrace right now, I'll give a proper review soon. But
> as a note, I noticed that the function_graph tracer, when enabled, makes
> the whole system unresponsive (but still up, just very slow). A fix I
> sent recently seems to really improve that if you're interested in
> testing it (I am :)). You can find it here:
> https://lore.kernel.org/linux-riscv/20240229121056.203419-1-alexghiti@rivosinc.com/

I saw the same issue where function_graph was making the system slow on Qemu.
What hardware do you use for testing? Or are you testing on Qemu as well?

I tested your patch; it speeds up the process of patching the instructions,
so the following command completes ~2.5 seconds faster than it does without
your patch:
$ time echo function_graph > current_tracer

But at runtime the system is still slow and laggy with function_graph,
I guess because my Qemu setup is not powerful enough to run function_graph.

Thanks,
Puranjay
Björn Töpel March 7, 2024, 7:27 p.m. UTC | #4
Puranjay!

Puranjay Mohan <puranjay12@gmail.com> writes:

> This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> This allows each ftrace callsite to provide an ftrace_ops to the common
> ftrace trampoline, allowing each callsite to invoke distinct tracer
> functions without the need to fall back to list processing or to
> allocate custom trampolines for each callsite. This significantly speeds
> up cases where multiple distinct trace functions are used and callsites
> are mostly traced by a single tracer.
>
> The idea and most of the implementation is taken from the ARM64's
> implementation of the same feature. The idea is to place a pointer to
> the ftrace_ops as a literal at a fixed offset from the function entry
> point, which can be recovered by the common ftrace trampoline.

Not really a review, but some more background: another rationale (on top
of the improved per-call performance!) for CALL_OPS was to use it to
build ftrace direct call support (which BPF uses a lot!). Mark, please
correct me if I'm lying here!

On Arm64, CALL_OPS makes it possible to implement direct calls, while
only patching one BL instruction -- nice!

On RISC-V we cannot use the same ideas as Arm64 straight off,
because the range of jal (compared to BL) is simply too short (+/-1M).
So, on RISC-V we need to use a full auipc/jal pair (the text patching
story is another chapter, but let's leave that aside for now). Since we
have to patch multiple instructions, the cmodx situation doesn't really
improve with CALL_OPS.

Let's say that we continue building on your patch and implement direct
calls on CALL_OPS for RISC-V as well.

From Florent's commit message for direct calls:

  |    There are a few cases to distinguish:
  |    - If a direct call ops is the only one tracing a function:
  |      - If the direct called trampoline is within the reach of a BL
  |        instruction
  |         -> the ftrace patchsite jumps to the trampoline
  |      - Else
  |         -> the ftrace patchsite jumps to the ftrace_caller trampoline which
  |            reads the ops pointer in the patchsite and jumps to the direct
  |            call address stored in the ops
  |    - Else
  |      -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
  |         ops literal points to ftrace_list_ops so it iterates over all
  |         registered ftrace ops, including the direct call ops and calls its
  |         call_direct_funcs handler which stores the direct called
  |         trampoline's address in the ftrace_regs and the ftrace_caller
  |         trampoline will return to that address instead of returning to the
  |         traced function

On RISC-V, where auipc/jalr is used, the direct called trampoline would
always be reachable, and then the first Else-clause would never be entered.
This means the performance for direct calls would be the same as the
one we have today (i.e. no regression!).

RISC-V does what x86 does (-ish) -- patch multiple instructions, long
reach.
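
For reference, a minimal sketch of that immediate split -- it mirrors the
kernel's make_call_t0() helper; the function name here is just for
illustration, and the opcode constants assume the t0-clobbering callsite
that RISC-V ftrace uses:

#include <stdint.h>

/*
 * Split a 32-bit displacement into the auipc/jalr immediates, the same way
 * the kernel's make_call_t0() does.  0x297 encodes "auipc t0, 0" and
 * 0x282e7 encodes "jalr t0, 0(t0)"; adding 0x800 before masking compensates
 * for the sign-extension of the low 12 bits, giving roughly +/- 2 GiB of
 * reach from the callsite.
 */
static void make_auipc_jalr_t0(uintptr_t callsite, uintptr_t target,
			       uint32_t insn[2])
{
	uint32_t off  = (uint32_t)(target - callsite);
	uint32_t hi20 = (off + 0x800) & 0xfffff000;
	uint32_t lo12 = off & 0xfff;

	insn[0] = hi20 | 0x00000297;		/* auipc t0, hi20     */
	insn[1] = (lo12 << 20) | 0x000282e7;	/* jalr  t0, lo12(t0) */
}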

Arm64 uses CALL_OPS and patches one BL instruction.

Now, with this background in mind, compared to what we have today,
CALL_OPS would give us (again assuming we're using it for direct calls):

* Better performance for tracer per-call (faster ops lookup) GOOD
* Larger text size (function alignment + extra nops) BAD
* Same direct call performance NEUTRAL
* Same complicated text patching required NEUTRAL

It would be interesting to see how the per-call performance would
improve on x86 with CALL_OPS! ;-)

I'm trying to wrap my head around whether it makes sense to have it on RISC-V, given
that we're a bit different from Arm64. Does the scale tip to the GOOD
side?

Oh, and we really need to see performance numbers on real HW! I have a
VF2 that I could try this series on.


Björn
Puranjay Mohan March 7, 2024, 7:51 p.m. UTC | #5
Hi Björn,

On Thu, Mar 7, 2024 at 8:27 PM Björn Töpel <bjorn@kernel.org> wrote:
>
> Puranjay!
>
> Puranjay Mohan <puranjay12@gmail.com> writes:
>
> > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> > This allows each ftrace callsite to provide an ftrace_ops to the common
> > ftrace trampoline, allowing each callsite to invoke distinct tracer
> > functions without the need to fall back to list processing or to
> > allocate custom trampolines for each callsite. This significantly speeds
> > up cases where multiple distinct trace functions are used and callsites
> > are mostly traced by a single tracer.
> >
> > The idea and most of the implementation is taken from the ARM64's
> > implementation of the same feature. The idea is to place a pointer to
> > the ftrace_ops as a literal at a fixed offset from the function entry
> > point, which can be recovered by the common ftrace trampoline.
>
> Not really a review, but some more background; Another rationale (on-top
> of the improved per-call performance!) for CALL_OPS was to use it to
> build ftrace direct call support (which BPF uses a lot!). Mark, please
> correct me if I'm lying here!
>
> On Arm64, CALL_OPS makes it possible to implement direct calls, while
> only patching one BL instruction -- nice!
>
> On RISC-V we cannot use use the same ideas as Arm64 straight off,
> because the range of jal (compare to BL) is simply too short (+/-1M).
> So, on RISC-V we need to use a full auipc/jal pair (the text patching
> story is another chapter, but let's leave that aside for now). Since we
> have to patch multiple instructions, the cmodx situation doesn't really
> improve with CALL_OPS.
>
> Let's say that we continue building on your patch and implement direct
> calls on CALL_OPS for RISC-V as well.
>
> From Florent's commit message for direct calls:
>
>   |    There are a few cases to distinguish:
>   |    - If a direct call ops is the only one tracing a function:
>   |      - If the direct called trampoline is within the reach of a BL
>   |        instruction
>   |         -> the ftrace patchsite jumps to the trampoline
>   |      - Else
>   |         -> the ftrace patchsite jumps to the ftrace_caller trampoline which
>   |            reads the ops pointer in the patchsite and jumps to the direct
>   |            call address stored in the ops
>   |    - Else
>   |      -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
>   |         ops literal points to ftrace_list_ops so it iterates over all
>   |         registered ftrace ops, including the direct call ops and calls its
>   |         call_direct_funcs handler which stores the direct called
>   |         trampoline's address in the ftrace_regs and the ftrace_caller
>   |         trampoline will return to that address instead of returning to the
>   |         traced function
>
> On RISC-V, where auipc/jalr is used, the direct called trampoline would
> always be reachable, and then first Else-clause would never be entered.
> This means the the performance for direct calls would be the same as the
> one we have today (i.e. no regression!).
>
> RISC-V does like x86 does (-ish) -- patch multiple instructions, long
> reach.
>
> Arm64 uses CALL_OPS and patch one instruction BL.
>
> Now, with this background in mind, compared to what we have today,
> CALL_OPS would give us (again assuming we're using it for direct calls):
>
> * Better performance for tracer per-call (faster ops lookup) GOOD

^ this was the only motivation for me to implement this patch.

I don't think implementing direct calls over CALL_OPS is fruitful for
RISC-V because, once the auipc/jalr pair can be patched atomically, the
direct call trampoline is always reachable.
Solving the atomic text patching problem would be fun!! I am eager to see
how it will be solved.

> * Larger text size (function alignment + extra nops) BAD
> * Same direct call performance NEUTRAL
> * Same complicated text patching required NEUTRAL
>
> It would be interesting to see how the per-call performance would
> improve on x86 with CALL_OPS! ;-)

If I remember correctly from Steven's talk, x86 uses dynamically allocated
trampolines for per-callsite tracers; would CALL_OPS provide better
performance than that?

>
> I'm trying to wrap my head if it makes sense to have it on RISC-V, given
> that we're a bit different from Arm64. Does the scale tip to the GOOD
> side?
>
> Oh, and we really need to see performance numbers on real HW! I have a
> VF2 that I could try this series on.

It would be great if you could do it :D.

Thanks,
Puranjay
Andy Chiu March 8, 2024, 8:48 a.m. UTC | #6
On Thu, Mar 7, 2024 at 8:19 AM Puranjay Mohan <puranjay12@gmail.com> wrote:
>
> Hi Alex,
>
> On Wed, Mar 6, 2024 at 9:35 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
> >
> > Hi Puranjay,
> >
> > On 06/03/2024 17:59, Puranjay Mohan wrote:
> > > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> > > This allows each ftrace callsite to provide an ftrace_ops to the common
> > > ftrace trampoline, allowing each callsite to invoke distinct tracer
> > > functions without the need to fall back to list processing or to
> > > allocate custom trampolines for each callsite. This significantly speeds
> > > up cases where multiple distinct trace functions are used and callsites
> > > are mostly traced by a single tracer.
> > >
> > > The idea and most of the implementation is taken from the ARM64's
> > > implementation of the same feature. The idea is to place a pointer to
> > > the ftrace_ops as a literal at a fixed offset from the function entry
> > > point, which can be recovered by the common ftrace trampoline.
> > >
> > > We use -fpatchable-function-entry to reserve 8 bytes above the function
> > > entry by emitting 2 4 byte or 4 2 byte  nops depending on the presence of
> > > CONFIG_RISCV_ISA_C. These 8 bytes are patched at runtime with a pointer
> > > to the associated ftrace_ops for that callsite. Functions are aligned to
> > > 8 bytes to make sure that the accesses to this literal are atomic.
> > >
> > > This approach allows for directly invoking ftrace_ops::func even for
> > > ftrace_ops which are dynamically-allocated (or part of a module),
> > > without going via ftrace_ops_list_func.
> > >
> > > I've benchamrked this with the ftrace_ops sample module on Qemu, with
> > > the next version, I will provide benchmarks on real hardware:
> > >
> > > Without this patch:
> > >
> > > +-----------------------+-----------------+----------------------------+
> > > |  Number of tracers    | Total time (ns) | Per-call average time      |
> > > |-----------------------+-----------------+----------------------------|
> > > | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
> > > |----------+------------+-----------------+------------+---------------|
> > > |        0 |          0 |        15615700 |        156 |             - |
> > > |        0 |          1 |        15917600 |        159 |             - |
> > > |        0 |          2 |        15668000 |        156 |             - |
> > > |        0 |         10 |        14971500 |        149 |             - |
> > > |        0 |        100 |        15417600 |        154 |             - |
> > > |        0 |        200 |        15387000 |        153 |             - |
> > > |----------+------------+-----------------+------------+---------------|
> > > |        1 |          0 |       119906800 |       1199 |          1043 |
> > > |        1 |          1 |       137428600 |       1374 |          1218 |
> > > |        1 |          2 |       159562400 |       1374 |          1218 |
> > > |        1 |         10 |       302099900 |       3020 |          2864 |
> > > |        1 |        100 |      2008785500 |      20087 |         19931 |
> > > |        1 |        200 |      3965221900 |      39652 |         39496 |
> > > |----------+------------+-----------------+------------+---------------|
> > > |        1 |          0 |       119166700 |       1191 |          1035 |
> > > |        2 |          0 |       157999900 |       1579 |          1423 |
> > > |       10 |          0 |       425370100 |       4253 |          4097 |
> > > |      100 |          0 |      3595252100 |      35952 |         35796 |
> > > |      200 |          0 |      7023485700 |      70234 |         70078 |
> > > +----------+------------+-----------------+------------+---------------+
> > >
> > > Note: per-call overhead is estimated relative to the baseline case with
> > > 0 relevant tracers and 0 irrelevant tracers.
> > >
> > > With this patch:
> > >
> > > +-----------------------+-----------------+----------------------------+
> > > |   Number of tracers   | Total time (ns) | Per-call average time      |
> > > |-----------------------+-----------------+----------------------------|
> > > | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
> > > |----------+------------+-----------------+------------+---------------|
> > > |        0 |          0 |        15254600 |        152 |             - |
> > > |        0 |          1 |        16136700 |        161 |             - |
> > > |        0 |          2 |        15329500 |        153 |             - |
> > > |        0 |         10 |        15148800 |        151 |             - |
> > > |        0 |        100 |        15746900 |        157 |             - |
> > > |        0 |        200 |        15737400 |        157 |             - |
> > > |----------+------------+-----------------+------------+---------------|
> > > |        1 |          0 |        47909000 |        479 |           327 |
> > > |        1 |          1 |        48297400 |        482 |           330 |
> > > |        1 |          2 |        47314100 |        473 |           321 |
> > > |        1 |         10 |        47844900 |        478 |           326 |
> > > |        1 |        100 |        46591900 |        465 |           313 |
> > > |        1 |        200 |        47178900 |        471 |           319 |
> > > |----------+------------+-----------------+------------+---------------|
> > > |        1 |          0 |        46715800 |        467 |           315 |
> > > |        2 |          0 |       155134500 |       1551 |          1399 |
> > > |       10 |          0 |       442672800 |       4426 |          4274 |
> > > |      100 |          0 |      4092353900 |      40923 |         40771 |
> > > |      200 |          0 |      7135796400 |      71357 |         71205 |
> > > +----------+------------+-----------------+------------+---------------+
> > >
> > > Note: per-call overhead is estimated relative to the baseline case with
> > > 0 relevant tracers and 0 irrelevant tracers.
> > >
> > > As can be seen from the above:
> > >
> > >   a) Whenever there is a single relevant tracer function associated with a
> > >      tracee, the overhead of invoking the tracer is constant, and does not
> > >      scale with the number of tracers which are *not* associated with that
> > >      tracee.
> > >
> > >   b) The overhead for a single relevant tracer has dropped to ~1/3 of the
> > >      overhead prior to this series (from 1035ns to 315ns). This is largely
> > >      due to permitting calls to dynamically-allocated ftrace_ops without
> > >      going through ftrace_ops_list_func.
> > >
> > > Why is this patch a RFC patch:
> > >   1. I saw some rcu stalls on Qemu and need to debug them and see if they
> > >      were introduced by this patch.
> >
> >
> > FYI, I'm currently working on debugging such issues (and other) with the
> > *current* ftrace implementation, so probably not caused by your
> > patchset. But keep debugging too, maybe this introduces other issues or
> > even better, you'll find the root cause :)
> >
> >
> > >   2. This needs to be tested thoroughly on real hardware.
> > >   3. Seeking reviews to fix any fundamental problems with this patch that I
> > >      might have missed due to my lack of RISC-V architecture knowledge.
> > >   4. I would like to benchmark this on real hardware and put the results in
> > >      the commit message.
> > >
> > > Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
> > > ---
> > >   arch/riscv/Kconfig              |  2 ++
> > >   arch/riscv/Makefile             |  8 +++++
> > >   arch/riscv/include/asm/ftrace.h |  3 ++
> > >   arch/riscv/kernel/asm-offsets.c |  3 ++
> > >   arch/riscv/kernel/ftrace.c      | 59 +++++++++++++++++++++++++++++++++
> > >   arch/riscv/kernel/mcount-dyn.S  | 42 ++++++++++++++++++++---
> > >   6 files changed, 112 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index 0bfcfec67ed5..e474742e23b2 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -78,6 +78,7 @@ config RISCV
> > >       select EDAC_SUPPORT
> > >       select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)
> > >       select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
> > > +     select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS
> >
> >
> > A recent discussion [1] states that -falign-functions cannot guarantee
> > this alignment for all code and that gcc developers came up with a new
> > option [2]: WDYT? I have added Andy and Evgenii in +cc to help on that.
>
> I saw arm64 uses the same and assumes this guarantee, maybe it is broken too?
> Will look into the discussion and try to use the other option. I am
> currently using Clang to build the kernel.

IIRC it should be fine if you're building with Clang. GCC's
"-falign-functions" flag does not align cold functions, so I think we'd
need some Kconfig checks for the difference.

>
> >
> > [1]
> > https://lore.kernel.org/linux-riscv/4fe4567b-96be-4102-952b-7d39109b2186@yadro.com/
> > [2]
> > https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=0f5a9a00e3ab1fe96142f304cfbcf3f63b15f326
> >
> >
> > >       select GENERIC_ARCH_TOPOLOGY
> > >       select GENERIC_ATOMIC64 if !64BIT
> > >       select GENERIC_CLOCKEVENTS_BROADCAST if SMP
> > > @@ -127,6 +128,7 @@ config RISCV
> > >       select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
> > >       select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> > >       select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
> > > +     select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_REGS && !CFI_CLANG)
> > >       select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
> > >       select HAVE_FUNCTION_GRAPH_TRACER
> > >       select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
> > > diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
> > > index 252d63942f34..875ad5dc3d32 100644
> > > --- a/arch/riscv/Makefile
> > > +++ b/arch/riscv/Makefile
> > > @@ -14,12 +14,20 @@ endif
> > >   ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
> > >       LDFLAGS_vmlinux += --no-relax
> > >       KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
> > > +ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS), y)
> > > +ifeq ($(CONFIG_RISCV_ISA_C),y)
> > > +     CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4
> > > +else
> > > +     CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2
> > > +endif
> > > +else
> > >   ifeq ($(CONFIG_RISCV_ISA_C),y)
> > >       CC_FLAGS_FTRACE := -fpatchable-function-entry=4
> > >   else
> > >       CC_FLAGS_FTRACE := -fpatchable-function-entry=2
> > >   endif
> > >   endif
> > > +endif
> > >
> > >   ifeq ($(CONFIG_CMODEL_MEDLOW),y)
> > >   KBUILD_CFLAGS_MODULE += -mcmodel=medany
> > > diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
> > > index 329172122952..c9a84222c9ea 100644
> > > --- a/arch/riscv/include/asm/ftrace.h
> > > +++ b/arch/riscv/include/asm/ftrace.h
> > > @@ -28,6 +28,9 @@
> > >   void MCOUNT_NAME(void);
> > >   static inline unsigned long ftrace_call_adjust(unsigned long addr)
> > >   {
> > > +     if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
> > > +             return addr + 8;
> > > +
> > >       return addr;
> > >   }
> > >
> > > diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
> > > index a03129f40c46..7d7c4b486852 100644
> > > --- a/arch/riscv/kernel/asm-offsets.c
> > > +++ b/arch/riscv/kernel/asm-offsets.c
> > > @@ -488,4 +488,7 @@ void asm_offsets(void)
> > >       DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN));
> > >       OFFSET(STACKFRAME_FP, stackframe, fp);
> > >       OFFSET(STACKFRAME_RA, stackframe, ra);
> > > +#ifdef CONFIG_FUNCTION_TRACER
> > > +     DEFINE(FTRACE_OPS_FUNC,         offsetof(struct ftrace_ops, func));
> > > +#endif
> > >   }
> > > diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
> > > index f5aa24d9e1c1..e2e75e15d32e 100644
> > > --- a/arch/riscv/kernel/ftrace.c
> > > +++ b/arch/riscv/kernel/ftrace.c
> > > @@ -82,9 +82,52 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
> > >       return 0;
> > >   }
> > >
> > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > +static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec)
> > > +{
> > > +     const struct ftrace_ops *ops = NULL;
> > > +
> > > +     if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
> > > +             ops = ftrace_find_unique_ops(rec);
> > > +             WARN_ON_ONCE(!ops);
> > > +     }
> > > +
> > > +     if (!ops)
> > > +             ops = &ftrace_list_ops;
> > > +
> > > +     return ops;
> > > +}
> > > +
> > > +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
> > > +                           const struct ftrace_ops *ops)
> > > +{
> > > +     unsigned long literal = rec->ip - 8;
> > > +
> > > +     return patch_text_nosync((void *)literal, &ops, sizeof(ops));
>
> ^^
> I will change this to use a new function that does a 64 bit write and
> doesn't do flush_icache_range() as naturally aligned writes are
> atomic and therefore synchronization is not required here.

Yes, it's atomic. But it doesn't guarantee ordering if fence.i is not
executed, IIUC. I am not sure if CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
assumes seeing the updated ftrace_rec argument right after the patch
completes. If that is the case, then a batched fence.i is probably
needed on local + remote cores.
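
For illustration, a minimal sketch -- an assumption, not the posted patch --
of what such a single 64-bit update could look like.  It presumes the
literal is writable at patch time (in practice the store would go through
the arch's text-patching path), and it deliberately leaves the fence.i /
cross-hart synchronisation question above open:

#include <linux/ftrace.h>

/*
 * Overwrite the ftrace_ops literal that sits 8 bytes above the patched
 * callsite with one naturally aligned 64-bit store, so the trampoline can
 * never observe a torn pointer.  Assumes the caller has made the text
 * writable; icache/fence.i policy is intentionally omitted here.
 */
static int ftrace_rec_write_ops_literal(struct dyn_ftrace *rec,
					const struct ftrace_ops *ops)
{
	unsigned long *literal = (unsigned long *)(rec->ip - 8);

	WRITE_ONCE(*literal, (unsigned long)ops);
	return 0;
}
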

>
> > > +}
> > > +
> > > +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
> > > +{
> > > +     return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
> > > +}
> > > +
> > > +static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
> > > +{
> > > +     return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec));
> > > +}
> > > +#else
> > > +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
> > > +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
> > > +#endif
> > > +
> > >   int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
> > >   {
> > >       unsigned int call[2];
> > > +     int ret;
> > > +
> > > +     ret = ftrace_rec_update_ops(rec);
> > > +     if (ret)
> > > +             return ret;
> > >
> > >       make_call_t0(rec->ip, addr, call);
> > >
> > > @@ -98,6 +141,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
> > >                   unsigned long addr)
> > >   {
> > >       unsigned int nops[2] = {NOP4, NOP4};
> > > +     int ret;
> > > +
> > > +     ret = ftrace_rec_set_nop_ops(rec);
> > > +     if (ret)
> > > +             return ret;
> > >
> > >       if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
> > >               return -EPERM;
> > > @@ -125,6 +173,13 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
> > >
> > >   int ftrace_update_ftrace_func(ftrace_func_t func)
> > >   {
> > > +     /*
> > > +      * When using CALL_OPS, the function to call is associated with the
> > > +      * call site, and we don't have a global function pointer to update.
> > > +      */
> > > +     if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
> > > +             return 0;
> > > +
> > >       int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
> > >                                      (unsigned long)func, true, true);
> > >       if (!ret) {
> > > @@ -147,6 +202,10 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
> > >       make_call_t0(caller, old_addr, call);
> > >       ret = ftrace_check_current_call(caller, call);
> > >
> > > +     if (ret)
> > > +             return ret;
> > > +
> > > +     ret = ftrace_rec_update_ops(rec);
> > >       if (ret)
> > >               return ret;
> > >
> > > diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
> > > index b7561288e8da..cb241e36e514 100644
> > > --- a/arch/riscv/kernel/mcount-dyn.S
> > > +++ b/arch/riscv/kernel/mcount-dyn.S
> > > @@ -191,11 +191,35 @@
> > >       .endm
> > >
> > >       .macro PREPARE_ARGS
> > > -     addi    a0, t0, -FENTRY_RA_OFFSET
> > > +     addi    a0, t0, -FENTRY_RA_OFFSET       // ip (callsite's auipc insn)
> > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > +     /*
> > > +      * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the
> > > +      * function entry, these are later overwritten with the pointer to the
> > > +      * associated struct ftrace_ops.
> > > +      *
> > > +      * -8: &ftrace_ops of the associated tracer function.
> > > +      *<ftrace enable>:
> > > +      *  0: auipc  t0/ra, 0x?
> > > +      *  4: jalr   t0/ra, ?(t0/ra)
> > > +      *
> > > +      * -8: &ftrace_nop_ops
> > > +      *<ftrace disable>:
> > > +      *  0: nop
> > > +      *  4: nop
> > > +      *
> > > +      * t0 is set to ip+8 after the jalr is executed at the callsite,
> > > +      * so we find the associated op at t0-16.
> > > +      */
> > > +     mv      a1, ra                          // parent_ip
> > > +     REG_L   a2, -16(t0)                     // op
> > > +     REG_L   ra, FTRACE_OPS_FUNC(a2)         // op->func
> > > +#else
> > >       la      a1, function_trace_op
> > > -     REG_L   a2, 0(a1)
> > > -     mv      a1, ra
> > > -     mv      a3, sp
> > > +     REG_L   a2, 0(a1)                       // op
> > > +     mv      a1, ra                          // parent_ip
> > > +#endif
> > > +     mv      a3, sp                          // regs
> > >       .endm
> > >
> > >   #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
> > > @@ -233,8 +257,12 @@ SYM_FUNC_START(ftrace_regs_caller)
> > >       SAVE_ABI_REGS 1
> > >       PREPARE_ARGS
> > >
> > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > +     jalr ra
> > > +#else
> > >   SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
> > >       call    ftrace_stub
> > > +#endif
> > >
> > >       RESTORE_ABI_REGS 1
> > >       bnez    t1, .Ldirect
> > > @@ -247,9 +275,13 @@ SYM_FUNC_START(ftrace_caller)
> > >       SAVE_ABI_REGS 0
> > >       PREPARE_ARGS
> > >
> > > -SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
>
> ^^ this hunk is a mistake, I will fix it in the next version.
>
> > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > +     jalr ra
> > > +#else
> > > +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
> > >       call    ftrace_stub
> > >
> > > +#endif
> > >       RESTORE_ABI_REGS 0
> > >       jr      t0
> > >   SYM_FUNC_END(ftrace_caller)
> >
> >
> > As I'm diving into ftrace right now, I'll give a proper review soon. But
> > as a note, I noticed that the function_graph tracer, when enabled, makes
> > the whole system unresponsive (but still up, just very slow). A fix I
> > sent recently seems to really improve that if you're interested in
> > testing it (I am :)). You can find it here:
> > https://lore.kernel.org/linux-riscv/20240229121056.203419-1-alexghiti@rivosinc.com/
>
> I saw the same issue where function_graph was making the system slow on Qemu.
> What hardware do you use for testing? or are you testing on Qemu as well?
>
> I tested your patch, it speeds up the process of patching the
> instructions so the following
> command completes ~2.5 seconds faster compared to without your patch.
> $ time echo function_graph > current_tracer
>
> But at runtime the system is still slow and laggy with function_graph,
> I guess because my
> Qemu setup is not powerful enough to run function_graph.
>
> Thanks,
> Puranjay
>
Andy Chiu March 8, 2024, 9:18 a.m. UTC | #7
Hi Puranjay,

On Fri, Mar 8, 2024 at 3:53 AM Puranjay Mohan <puranjay12@gmail.com> wrote:
>
> Hi Björn,
>
> On Thu, Mar 7, 2024 at 8:27 PM Björn Töpel <bjorn@kernel.org> wrote:
> >
> > Puranjay!
> >
> > Puranjay Mohan <puranjay12@gmail.com> writes:
> >
> > > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> > > This allows each ftrace callsite to provide an ftrace_ops to the common
> > > ftrace trampoline, allowing each callsite to invoke distinct tracer
> > > functions without the need to fall back to list processing or to
> > > allocate custom trampolines for each callsite. This significantly speeds
> > > up cases where multiple distinct trace functions are used and callsites
> > > are mostly traced by a single tracer.
> > >
> > > The idea and most of the implementation is taken from the ARM64's
> > > implementation of the same feature. The idea is to place a pointer to
> > > the ftrace_ops as a literal at a fixed offset from the function entry
> > > point, which can be recovered by the common ftrace trampoline.
> >
> > Not really a review, but some more background; Another rationale (on-top
> > of the improved per-call performance!) for CALL_OPS was to use it to
> > build ftrace direct call support (which BPF uses a lot!). Mark, please
> > correct me if I'm lying here!
> >
> > On Arm64, CALL_OPS makes it possible to implement direct calls, while
> > only patching one BL instruction -- nice!
> >
> > On RISC-V we cannot use use the same ideas as Arm64 straight off,
> > because the range of jal (compare to BL) is simply too short (+/-1M).
> > So, on RISC-V we need to use a full auipc/jal pair (the text patching
> > story is another chapter, but let's leave that aside for now). Since we
> > have to patch multiple instructions, the cmodx situation doesn't really
> > improve with CALL_OPS.
> >
> > Let's say that we continue building on your patch and implement direct
> > calls on CALL_OPS for RISC-V as well.
> >
> > From Florent's commit message for direct calls:
> >
> >   |    There are a few cases to distinguish:
> >   |    - If a direct call ops is the only one tracing a function:
> >   |      - If the direct called trampoline is within the reach of a BL
> >   |        instruction
> >   |         -> the ftrace patchsite jumps to the trampoline
> >   |      - Else
> >   |         -> the ftrace patchsite jumps to the ftrace_caller trampoline which
> >   |            reads the ops pointer in the patchsite and jumps to the direct
> >   |            call address stored in the ops
> >   |    - Else
> >   |      -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
> >   |         ops literal points to ftrace_list_ops so it iterates over all
> >   |         registered ftrace ops, including the direct call ops and calls its
> >   |         call_direct_funcs handler which stores the direct called
> >   |         trampoline's address in the ftrace_regs and the ftrace_caller
> >   |         trampoline will return to that address instead of returning to the
> >   |         traced function
> >
> > On RISC-V, where auipc/jalr is used, the direct called trampoline would
> > always be reachable, and then first Else-clause would never be entered.
> > This means the the performance for direct calls would be the same as the
> > one we have today (i.e. no regression!).
> >
> > RISC-V does like x86 does (-ish) -- patch multiple instructions, long
> > reach.
> >
> > Arm64 uses CALL_OPS and patch one instruction BL.
> >
> > Now, with this background in mind, compared to what we have today,
> > CALL_OPS would give us (again assuming we're using it for direct calls):
> >
> > * Better performance for tracer per-call (faster ops lookup) GOOD
>
> ^ this was the only motivation for me to implement this patch.
>
> I don't think implementing direct calls over call ops is fruitful for
> RISC-V because once
> the auipc/jalr can be patched atomically, the direct call trampoline
> is always reachable.

Yes, the auipc/jalr instruction pair can be patched atomically, just as
long as it is naturally aligned. However, we cannot guarantee that the
2 instructions are fetched atomically :P

> Solving the atomic text patching problem would be fun!! I am eager to
> see how it will be
> solved.

I have a patch series to solve the atomic code patching issue, which I
am about to respin [1]. The idea is to solve it with yet another layer
of indirection. We add an 8-B aligned space at each function entry. The
space holds a pointer to the ftrace entry. During boot, each function's
entry code is updated to perform a load and then an indirect jump through
the 8-B space. When ftrace is disabled, we patch the first 4B-aligned
instruction to a jump so as to skip the ftrace entry.

We are still discussing with Alex to see if we have a better way to do
it. If not, then I'll update the patchset and re-send it. There's a
pending improvement in the series to reduce complexity: the 8-B
aligned space can be added before the function entry (just like in your
patch).

>
> > * Larger text size (function alignment + extra nops) BAD
> > * Same direct call performance NEUTRAL
> > * Same complicated text patching required NEUTRAL
> >
> > It would be interesting to see how the per-call performance would
> > improve on x86 with CALL_OPS! ;-)
>
> If I remember from Steven's talk, x86 uses dynamically allocated trampolines
> for per callsite tracers, would CALL_OPS provide better performance than that?
>
> >
> > I'm trying to wrap my head if it makes sense to have it on RISC-V, given
> > that we're a bit different from Arm64. Does the scale tip to the GOOD
> > side?
> >
> > Oh, and we really need to see performance numbers on real HW! I have a
> > VF2 that I could try this series on.
>
> It would be great if you can do it :D.
>
> Thanks,
> Puranjay
>

- [1] https://yhbt.net/lore/all/CAJF2gTSn3_cDYsF9D8dt-br2Wf_M8y02A09xgRq8kXi91sN3Aw@mail.gmail.com/T/

Regards,
Andy
Björn Töpel March 8, 2024, 10:16 a.m. UTC | #8
Puranjay Mohan <puranjay12@gmail.com> writes:

> Hi Björn,
>
> On Thu, Mar 7, 2024 at 8:27 PM Björn Töpel <bjorn@kernel.org> wrote:
>>
>> Puranjay!
>>
>> Puranjay Mohan <puranjay12@gmail.com> writes:
>>
>> > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
>> > This allows each ftrace callsite to provide an ftrace_ops to the common
>> > ftrace trampoline, allowing each callsite to invoke distinct tracer
>> > functions without the need to fall back to list processing or to
>> > allocate custom trampolines for each callsite. This significantly speeds
>> > up cases where multiple distinct trace functions are used and callsites
>> > are mostly traced by a single tracer.
>> >
>> > The idea and most of the implementation is taken from the ARM64's
>> > implementation of the same feature. The idea is to place a pointer to
>> > the ftrace_ops as a literal at a fixed offset from the function entry
>> > point, which can be recovered by the common ftrace trampoline.
>>
>> Not really a review, but some more background; Another rationale (on-top
>> of the improved per-call performance!) for CALL_OPS was to use it to
>> build ftrace direct call support (which BPF uses a lot!). Mark, please
>> correct me if I'm lying here!
>>
>> On Arm64, CALL_OPS makes it possible to implement direct calls, while
>> only patching one BL instruction -- nice!
>>
>> On RISC-V we cannot use use the same ideas as Arm64 straight off,
>> because the range of jal (compare to BL) is simply too short (+/-1M).
>> So, on RISC-V we need to use a full auipc/jal pair (the text patching
>> story is another chapter, but let's leave that aside for now). Since we
>> have to patch multiple instructions, the cmodx situation doesn't really
>> improve with CALL_OPS.
>>
>> Let's say that we continue building on your patch and implement direct
>> calls on CALL_OPS for RISC-V as well.
>>
>> From Florent's commit message for direct calls:
>>
>>   |    There are a few cases to distinguish:
>>   |    - If a direct call ops is the only one tracing a function:
>>   |      - If the direct called trampoline is within the reach of a BL
>>   |        instruction
>>   |         -> the ftrace patchsite jumps to the trampoline
>>   |      - Else
>>   |         -> the ftrace patchsite jumps to the ftrace_caller trampoline which
>>   |            reads the ops pointer in the patchsite and jumps to the direct
>>   |            call address stored in the ops
>>   |    - Else
>>   |      -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
>>   |         ops literal points to ftrace_list_ops so it iterates over all
>>   |         registered ftrace ops, including the direct call ops and calls its
>>   |         call_direct_funcs handler which stores the direct called
>>   |         trampoline's address in the ftrace_regs and the ftrace_caller
>>   |         trampoline will return to that address instead of returning to the
>>   |         traced function
>>
>> On RISC-V, where auipc/jalr is used, the direct called trampoline would
>> always be reachable, and then first Else-clause would never be entered.
>> This means the the performance for direct calls would be the same as the
>> one we have today (i.e. no regression!).
>>
>> RISC-V does like x86 does (-ish) -- patch multiple instructions, long
>> reach.
>>
>> Arm64 uses CALL_OPS and patch one instruction BL.
>>
>> Now, with this background in mind, compared to what we have today,
>> CALL_OPS would give us (again assuming we're using it for direct calls):
>>
>> * Better performance for tracer per-call (faster ops lookup) GOOD
>
> ^ this was the only motivation for me to implement this patch.
>
> I don't think implementing direct calls over call ops is fruitful for
> RISC-V because once
> the auipc/jalr can be patched atomically, the direct call trampoline
> is always reachable.
> Solving the atomic text patching problem would be fun!! I am eager to
> see how it will be
> solved.

Given the upcoming Zjid spec, we'll soon be in a much better place where
we can reason about cmodx.

>> * Larger text size (function alignment + extra nops) BAD
>> * Same direct call performance NEUTRAL
>> * Same complicated text patching required NEUTRAL
>>
>> It would be interesting to see how the per-call performance would
>> improve on x86 with CALL_OPS! ;-)
>
> If I remember from Steven's talk, x86 uses dynamically allocated trampolines
> for per callsite tracers, would CALL_OPS provide better performance than that?

Probably not, and it was really a tongue-in-cheek comment -- nothing I
encourage you to do!

Now, I think a better approach for RISC-V would be implementing what x86
has (arch_ftrace_update_trampoline()), rather than CALL_OPS.
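
For a sense of what that could look like, a rough sketch loosely following
the x86 flow; the weak hook, ops->trampoline and FTRACE_OPS_FL_ALLOC_TRAMP
are real, but create_riscv_trampoline() is a hypothetical placeholder for
the allocate-and-copy work a real series would have to implement:

#include <linux/ftrace.h>

/* Hypothetical helper: allocate a per-ops copy of the common trampoline
 * that calls ops->func directly and return its address (0 on failure). */
static unsigned long create_riscv_trampoline(struct ftrace_ops *ops);

void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
	unsigned long addr;

	if (ops->trampoline)
		return;		/* x86 re-patches the existing trampoline here */

	addr = create_riscv_trampoline(ops);
	if (!addr)
		return;

	ops->trampoline = addr;
	ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
}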

Thoughts?


Björn
Puranjay Mohan March 8, 2024, 2:13 p.m. UTC | #9
Hi Andy,


> >
> > I don't think implementing direct calls over call ops is fruitful for
> > RISC-V because once
> > the auipc/jalr can be patched atomically, the direct call trampoline
> > is always reachable.
>
> Yes, the auipc/jalr instruction pair can be patched atomically just as
> long as their size is naturally aligned on. However, we cannot prevent
> 2 instructions from being fetched atomically :P
>
> > Solving the atomic text patching problem would be fun!! I am eager to
> > see how it will be
> > solved.
>
> I have a patch series to solve the atomic code patching issue, which I
> am about to respin [1]. The idea is to solve it with yet another layer
> of indirection. We add a 8-B aligned space at each function entry. The
> space is a pointer to the ftrace entry. During boot, each function
> entry code is updated to perform a load and then take the jump from
> the 8-B space. When ftrace is disabled, we patch the first 4B-aligned
> instruction to a jump so as to skip the ftrace entry.
>
> We are still discussing with Alex to see if we have a better way to do
> it. If not then I'd update the patchset and re-send it. There's a
> pending improvement in the series to reduce complexity. The 8-B
> aligned space can be added before the function entry (just like your
> patch).

I briefly looked at the series and it looks promising. It looks similar to
how BPF programs jump to trampolines on ARM64. If a choice has to be made
between keeping the 8-B aligned space below or above the function entry,
and both choices are viable, then I would prefer keeping the 8-B space
below the function entry.

I am saying this because, when nops are added above the function entry,
clang's kcfi [1] doesn't work properly.

Thanks,
Puranjay

[1] https://clang.llvm.org/docs/ControlFlowIntegrity.html#fsanitize-kcfi
Puranjay Mohan March 8, 2024, 2:22 p.m. UTC | #10
Hi Björn,

On Fri, Mar 8, 2024 at 11:16 AM Björn Töpel <bjorn@kernel.org> wrote:
> >
> > If I remember from Steven's talk, x86 uses dynamically allocated trampolines
> > for per callsite tracers, would CALL_OPS provide better performance than that?
>
> Probably not, and it was really a tongue-in-cheek comment -- nothing I
> encourage you to do!
>
> Now, I think a better approach for RISC-V would be implementing what x86
> has (arch_ftrace_update_trampoline()), rather than CALL_OPS for RISC-V.
>
> Thoughts?

I am going to spin some patches implementing
arch_ftrace_update_trampoline() for RISC-V; then we can compare the two
approaches and see which is better. But I agree that
arch_ftrace_update_trampoline() is the better approach, given that we can
jump anywhere with auipc/jalr.

Thanks,
Puranjay
Björn Töpel March 8, 2024, 3:15 p.m. UTC | #11
Puranjay Mohan <puranjay12@gmail.com> writes:

>> Now, I think a better approach for RISC-V would be implementing what x86
>> has (arch_ftrace_update_trampoline()), rather than CALL_OPS for RISC-V.
>>
>> Thoughts?
>
> I am going to spin some patches for implementing
> arch_ftrace_update_trampoline() for
> RISC-V, then we can compare the two approaches and see which is
> better. But I agree
> that arch_ftrace_update_trampoline() is a better approach given that
> we can jump anywhere
> with auipc/jalr.

Yup, and the text size won't blow up.


Cheers,
Björn
Guo Ren March 10, 2024, 1:37 a.m. UTC | #12
On Fri, Mar 08, 2024 at 05:18:21PM +0800, Andy Chiu wrote:
> Hi Puranjay,
> 
> On Fri, Mar 8, 2024 at 3:53 AM Puranjay Mohan <puranjay12@gmail.com> wrote:
> >
> > Hi Björn,
> >
> > On Thu, Mar 7, 2024 at 8:27 PM Björn Töpel <bjorn@kernel.org> wrote:
> > >
> > > Puranjay!
> > >
> > > Puranjay Mohan <puranjay12@gmail.com> writes:
> > >
> > > > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> > > > This allows each ftrace callsite to provide an ftrace_ops to the common
> > > > ftrace trampoline, allowing each callsite to invoke distinct tracer
> > > > functions without the need to fall back to list processing or to
> > > > allocate custom trampolines for each callsite. This significantly speeds
> > > > up cases where multiple distinct trace functions are used and callsites
> > > > are mostly traced by a single tracer.
> > > >
> > > > The idea and most of the implementation is taken from the ARM64's
> > > > implementation of the same feature. The idea is to place a pointer to
> > > > the ftrace_ops as a literal at a fixed offset from the function entry
> > > > point, which can be recovered by the common ftrace trampoline.
> > >
> > > Not really a review, but some more background; Another rationale (on-top
> > > of the improved per-call performance!) for CALL_OPS was to use it to
> > > build ftrace direct call support (which BPF uses a lot!). Mark, please
> > > correct me if I'm lying here!
> > >
> > > On Arm64, CALL_OPS makes it possible to implement direct calls, while
> > > only patching one BL instruction -- nice!
> > >
> > > On RISC-V we cannot use use the same ideas as Arm64 straight off,
> > > because the range of jal (compare to BL) is simply too short (+/-1M).
> > > So, on RISC-V we need to use a full auipc/jal pair (the text patching
> > > story is another chapter, but let's leave that aside for now). Since we
> > > have to patch multiple instructions, the cmodx situation doesn't really
> > > improve with CALL_OPS.
> > >
> > > Let's say that we continue building on your patch and implement direct
> > > calls on CALL_OPS for RISC-V as well.
> > >
> > > From Florent's commit message for direct calls:
> > >
> > >   |    There are a few cases to distinguish:
> > >   |    - If a direct call ops is the only one tracing a function:
> > >   |      - If the direct called trampoline is within the reach of a BL
> > >   |        instruction
> > >   |         -> the ftrace patchsite jumps to the trampoline
> > >   |      - Else
> > >   |         -> the ftrace patchsite jumps to the ftrace_caller trampoline which
> > >   |            reads the ops pointer in the patchsite and jumps to the direct
> > >   |            call address stored in the ops
> > >   |    - Else
> > >   |      -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
> > >   |         ops literal points to ftrace_list_ops so it iterates over all
> > >   |         registered ftrace ops, including the direct call ops and calls its
> > >   |         call_direct_funcs handler which stores the direct called
> > >   |         trampoline's address in the ftrace_regs and the ftrace_caller
> > >   |         trampoline will return to that address instead of returning to the
> > >   |         traced function
> > >
> > > On RISC-V, where auipc/jalr is used, the direct called trampoline would
> > > always be reachable, and then first Else-clause would never be entered.
> > > This means the the performance for direct calls would be the same as the
> > > one we have today (i.e. no regression!).
> > >
> > > RISC-V does like x86 does (-ish) -- patch multiple instructions, long
> > > reach.
> > >
> > > Arm64 uses CALL_OPS and patch one instruction BL.
> > >
> > > Now, with this background in mind, compared to what we have today,
> > > CALL_OPS would give us (again assuming we're using it for direct calls):
> > >
> > > * Better performance for tracer per-call (faster ops lookup) GOOD
> >
> > ^ this was the only motivation for me to implement this patch.
> >
> > I don't think implementing direct calls over call ops is fruitful for
> > RISC-V because once
> > the auipc/jalr can be patched atomically, the direct call trampoline
> > is always reachable.
> 
> Yes, the auipc/jalr instruction pair can be patched atomically just as
> long as their size is naturally aligned on. However, we cannot prevent
> 2 instructions from being fetched atomically :P
There are some micro-arch optimization methods here, such as:
 - Disable interrupts when an auipc retires.
 - When an auipc is followed by another auipc, the second one could still
   cause an interruption.

> 
> > Solving the atomic text patching problem would be fun!! I am eager to
> > see how it will be
> > solved.
> 
> I have a patch series to solve the atomic code patching issue, which I
> am about to respin [1]. The idea is to solve it with yet another layer
> of indirection. We add a 8-B aligned space at each function entry. The
> space is a pointer to the ftrace entry. During boot, each function
> entry code is updated to perform a load and then take the jump from
> the 8-B space. When ftrace is disabled, we patch the first 4B-aligned
> instruction to a jump so as to skip the ftrace entry.
> 
> We are still discussing with Alex to see if we have a better way to do
> it. If not then I'd update the patchset and re-send it. There's a
> pending improvement in the series to reduce complexity. The 8-B
> aligned space can be added before the function entry (just like your
> patch).
> 
> >
> > > * Larger text size (function alignment + extra nops) BAD
> > > * Same direct call performance NEUTRAL
> > > * Same complicated text patching required NEUTRAL
> > >
> > > It would be interesting to see how the per-call performance would
> > > improve on x86 with CALL_OPS! ;-)
> >
> > If I remember from Steven's talk, x86 uses dynamically allocated trampolines
> > for per callsite tracers, would CALL_OPS provide better performance than that?
> >
> > >
> > > I'm trying to wrap my head if it makes sense to have it on RISC-V, given
> > > that we're a bit different from Arm64. Does the scale tip to the GOOD
> > > side?
> > >
> > > Oh, and we really need to see performance numbers on real HW! I have a
> > > VF2 that I could try this series on.
> >
> > It would be great if you can do it :D.
> >
> > Thanks,
> > Puranjay
> >
> 
> - [1] https://yhbt.net/lore/all/CAJF2gTSn3_cDYsF9D8dt-br2Wf_M8y02A09xgRq8kXi91sN3Aw@mail.gmail.com/T/
> 
> Regards,
> Andy
> 
Evgenii Shatokhin March 11, 2024, 1:56 p.m. UTC | #13
Hi,

On 07.03.2024 03:17, Puranjay Mohan wrote:
> 
> Hi Alex,
> 
> On Wed, Mar 6, 2024 at 9:35 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>>
>> Hi Puranjay,
>>
>> On 06/03/2024 17:59, Puranjay Mohan wrote:
>>> This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
>>> This allows each ftrace callsite to provide an ftrace_ops to the common
>>> ftrace trampoline, allowing each callsite to invoke distinct tracer
>>> functions without the need to fall back to list processing or to
>>> allocate custom trampolines for each callsite. This significantly speeds
>>> up cases where multiple distinct trace functions are used and callsites
>>> are mostly traced by a single tracer.
>>>
>>> The idea and most of the implementation is taken from the ARM64's
>>> implementation of the same feature. The idea is to place a pointer to
>>> the ftrace_ops as a literal at a fixed offset from the function entry
>>> point, which can be recovered by the common ftrace trampoline.
>>>
>>> We use -fpatchable-function-entry to reserve 8 bytes above the function
>>> entry by emitting 2 4 byte or 4 2 byte  nops depending on the presence of
>>> CONFIG_RISCV_ISA_C. These 8 bytes are patched at runtime with a pointer
>>> to the associated ftrace_ops for that callsite. Functions are aligned to
>>> 8 bytes to make sure that the accesses to this literal are atomic.
>>>
>>> This approach allows for directly invoking ftrace_ops::func even for
>>> ftrace_ops which are dynamically-allocated (or part of a module),
>>> without going via ftrace_ops_list_func.
>>>
>>> I've benchamrked this with the ftrace_ops sample module on Qemu, with
>>> the next version, I will provide benchmarks on real hardware:
>>>
>>> Without this patch:
>>>
>>> +-----------------------+-----------------+----------------------------+
>>> |  Number of tracers    | Total time (ns) | Per-call average time      |
>>> |-----------------------+-----------------+----------------------------|
>>> | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
>>> |----------+------------+-----------------+------------+---------------|
>>> |        0 |          0 |        15615700 |        156 |             - |
>>> |        0 |          1 |        15917600 |        159 |             - |
>>> |        0 |          2 |        15668000 |        156 |             - |
>>> |        0 |         10 |        14971500 |        149 |             - |
>>> |        0 |        100 |        15417600 |        154 |             - |
>>> |        0 |        200 |        15387000 |        153 |             - |
>>> |----------+------------+-----------------+------------+---------------|
>>> |        1 |          0 |       119906800 |       1199 |          1043 |
>>> |        1 |          1 |       137428600 |       1374 |          1218 |
>>> |        1 |          2 |       159562400 |       1374 |          1218 |
>>> |        1 |         10 |       302099900 |       3020 |          2864 |
>>> |        1 |        100 |      2008785500 |      20087 |         19931 |
>>> |        1 |        200 |      3965221900 |      39652 |         39496 |
>>> |----------+------------+-----------------+------------+---------------|
>>> |        1 |          0 |       119166700 |       1191 |          1035 |
>>> |        2 |          0 |       157999900 |       1579 |          1423 |
>>> |       10 |          0 |       425370100 |       4253 |          4097 |
>>> |      100 |          0 |      3595252100 |      35952 |         35796 |
>>> |      200 |          0 |      7023485700 |      70234 |         70078 |
>>> +----------+------------+-----------------+------------+---------------+
>>>
>>> Note: per-call overhead is estimated relative to the baseline case with
>>> 0 relevant tracers and 0 irrelevant tracers.
>>>
>>> With this patch:
>>>
>>> +-----------------------+-----------------+----------------------------+
>>> |   Number of tracers   | Total time (ns) | Per-call average time      |
>>> |-----------------------+-----------------+----------------------------|
>>> | Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
>>> |----------+------------+-----------------+------------+---------------|
>>> |        0 |          0 |        15254600 |        152 |             - |
>>> |        0 |          1 |        16136700 |        161 |             - |
>>> |        0 |          2 |        15329500 |        153 |             - |
>>> |        0 |         10 |        15148800 |        151 |             - |
>>> |        0 |        100 |        15746900 |        157 |             - |
>>> |        0 |        200 |        15737400 |        157 |             - |
>>> |----------+------------+-----------------+------------+---------------|
>>> |        1 |          0 |        47909000 |        479 |           327 |
>>> |        1 |          1 |        48297400 |        482 |           330 |
>>> |        1 |          2 |        47314100 |        473 |           321 |
>>> |        1 |         10 |        47844900 |        478 |           326 |
>>> |        1 |        100 |        46591900 |        465 |           313 |
>>> |        1 |        200 |        47178900 |        471 |           319 |
>>> |----------+------------+-----------------+------------+---------------|
>>> |        1 |          0 |        46715800 |        467 |           315 |
>>> |        2 |          0 |       155134500 |       1551 |          1399 |
>>> |       10 |          0 |       442672800 |       4426 |          4274 |
>>> |      100 |          0 |      4092353900 |      40923 |         40771 |
>>> |      200 |          0 |      7135796400 |      71357 |         71205 |
>>> +----------+------------+-----------------+------------+---------------+
>>>
>>> Note: per-call overhead is estimated relative to the baseline case with
>>> 0 relevant tracers and 0 irrelevant tracers.
>>>
>>> As can be seen from the above:
>>>
>>>    a) Whenever there is a single relevant tracer function associated with a
>>>       tracee, the overhead of invoking the tracer is constant, and does not
>>>       scale with the number of tracers which are *not* associated with that
>>>       tracee.
>>>
>>>    b) The overhead for a single relevant tracer has dropped to ~1/3 of the
>>>       overhead prior to this series (from 1035ns to 315ns). This is largely
>>>       due to permitting calls to dynamically-allocated ftrace_ops without
>>>       going through ftrace_ops_list_func.
>>>
>>> Why is this patch an RFC:
>>>    1. I saw some rcu stalls on Qemu and need to debug them and see if they
>>>       were introduced by this patch.
>>
>>
>> FYI, I'm currently working on debugging such issues (and others) with the
>> *current* ftrace implementation, so probably not caused by your
>> patchset. But keep debugging too, maybe this introduces other issues or
>> even better, you'll find the root cause :)
>>
>>
>>>    2. This needs to be tested thoroughly on real hardware.
>>>    3. Seeking reviews to fix any fundamental problems with this patch that I
>>>       might have missed due to my lack of RISC-V architecture knowledge.
>>>    4. I would like to benchmark this on real hardware and put the results in
>>>       the commit message.
>>>
>>> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
>>> ---
>>>    arch/riscv/Kconfig              |  2 ++
>>>    arch/riscv/Makefile             |  8 +++++
>>>    arch/riscv/include/asm/ftrace.h |  3 ++
>>>    arch/riscv/kernel/asm-offsets.c |  3 ++
>>>    arch/riscv/kernel/ftrace.c      | 59 +++++++++++++++++++++++++++++++++
>>>    arch/riscv/kernel/mcount-dyn.S  | 42 ++++++++++++++++++++---
>>>    6 files changed, 112 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index 0bfcfec67ed5..e474742e23b2 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -78,6 +78,7 @@ config RISCV
>>>        select EDAC_SUPPORT
>>>        select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)
>>>        select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
>>> +     select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS
>>
>>
>> A recent discussion [1] states that -falign-functions cannot guarantee
>> this alignment for all code and that gcc developers came up with a new
>> option [2]: WDYT? I have added Andy and Evgenii in +cc to help on that.
> 

Yes, recent GCC supports the -fmin-function-alignment option, which makes 
sure all functions are properly aligned.

I used the following patch on top of Andy's patchset [3]:
-----------
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index b33b787c8b07..b831ae62672d 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -14,10 +14,15 @@ endif
  ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
  	LDFLAGS_vmlinux += --no-relax
  	KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
+ifeq ($(CONFIG_CC_IS_GCC),y)
+	CC_FLAGS_FTRACE := -fmin-function-alignment=4
+else
+	CC_FLAGS_FTRACE := -falign-functions=4
+endif
  ifeq ($(CONFIG_RISCV_ISA_C),y)
-	CC_FLAGS_FTRACE := -fpatchable-function-entry=12 -falign-functions=4
+	CC_FLAGS_FTRACE += -fpatchable-function-entry=12
  else
-	CC_FLAGS_FTRACE := -fpatchable-function-entry=6 -falign-functions=4
+	CC_FLAGS_FTRACE += -fpatchable-function-entry=6
  endif
  endif

-----------

We probably need something like this in arch/riscv/Kconfig as well, to 
allow dynamic Ftrace only if GCC supports -fmin-function-alignment:
-----------
  config GCC_SUPPORTS_DYNAMIC_FTRACE
  	def_bool CC_IS_GCC
-	depends on $(cc-option,-fpatchable-function-entry=8)
+	depends on $(cc-option,-fpatchable-function-entry=6)
+	depends on $(cc-option,-fmin-function-alignment=4)
-----------


> I saw arm64 uses the same and assumes this guarantee, maybe it is broken too?
> Will look into the discussion and try to use the other option. I am
> currently using Clang to build the kernel.

I tried dynamic Ftrace with clang too (LLVM 17.0.6 built from the 
official RISC-V toolchain). It seems clang has no problems with 
alignment of functions, unlike GCC, and '-falign-functions=4' is enough 
to align everything properly.

However, there is a serious issue with clang and dynamic Ftrace: [4].

In short: for static functions with 9 or more arguments, like 
__find_rr_leaf 
(https://elixir.bootlin.com/linux/v6.8-rc4/source/net/ipv6/route.c#L786), 
clang may switch to the "fastcall" calling convention. This way, the 9th 
argument will be passed in register t2, the 10th in t3, etc., rather 
than on the stack as the RISC-V ABI requires.

This causes kernel crashes when trying to enable "function", 
"function_graph" and other tracers for such functions. The 
implementation of Ftrace assumes that t0 - t3 can be safely clobbered at 
the beginning of the function; clang assumes otherwise.

Unless a compiler option or something to disable switching to fastcall 
convention is added (there is currently no such option in clang, it 
seems), dynamic Ftrace cannot be used safely with clang-built kernels.
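
To make the failure mode concrete, here is a minimal hypothetical
example (the function and its arguments are made up; only the
calling-convention point matters):
-----------
/*
 * A static function with 9 integer arguments.  Under the standard
 * RISC-V calling convention the 9th argument is passed on the stack;
 * if clang promotes the function to its internal fastcc convention,
 * the 9th argument may instead be passed in t2, which the ftrace
 * entry code assumes it is free to clobber.
 */
static int sum9(int a1, int a2, int a3, int a4, int a5,
		int a6, int a7, int a8, int a9)
{
	return a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9;
}
-----------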

> 
>>
>> [1]
>> https://lore.kernel.org/linux-riscv/4fe4567b-96be-4102-952b-7d39109b2186@yadro.com/
>> [2]
>> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=0f5a9a00e3ab1fe96142f304cfbcf3f63b15f326
>>
>>
>>>        select GENERIC_ARCH_TOPOLOGY
>>>        select GENERIC_ATOMIC64 if !64BIT
>>>        select GENERIC_CLOCKEVENTS_BROADCAST if SMP
>>> @@ -127,6 +128,7 @@ config RISCV
>>>        select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
>>>        select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>>>        select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
>>> +     select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_REGS && !CFI_CLANG)
>>>        select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
>>>        select HAVE_FUNCTION_GRAPH_TRACER
>>>        select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
>>> diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
>>> index 252d63942f34..875ad5dc3d32 100644
>>> --- a/arch/riscv/Makefile
>>> +++ b/arch/riscv/Makefile
>>> @@ -14,12 +14,20 @@ endif
>>>    ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
>>>        LDFLAGS_vmlinux += --no-relax
>>>        KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
>>> +ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS), y)
>>> +ifeq ($(CONFIG_RISCV_ISA_C),y)
>>> +     CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4
>>> +else
>>> +     CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2
>>> +endif
>>> +else
>>>    ifeq ($(CONFIG_RISCV_ISA_C),y)
>>>        CC_FLAGS_FTRACE := -fpatchable-function-entry=4
>>>    else
>>>        CC_FLAGS_FTRACE := -fpatchable-function-entry=2
>>>    endif
>>>    endif
>>> +endif
>>>
>>>    ifeq ($(CONFIG_CMODEL_MEDLOW),y)
>>>    KBUILD_CFLAGS_MODULE += -mcmodel=medany
>>> diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
>>> index 329172122952..c9a84222c9ea 100644
>>> --- a/arch/riscv/include/asm/ftrace.h
>>> +++ b/arch/riscv/include/asm/ftrace.h
>>> @@ -28,6 +28,9 @@
>>>    void MCOUNT_NAME(void);
>>>    static inline unsigned long ftrace_call_adjust(unsigned long addr)
>>>    {
>>> +     if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
>>> +             return addr + 8;
>>> +
>>>        return addr;
>>>    }
>>>
>>> diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
>>> index a03129f40c46..7d7c4b486852 100644
>>> --- a/arch/riscv/kernel/asm-offsets.c
>>> +++ b/arch/riscv/kernel/asm-offsets.c
>>> @@ -488,4 +488,7 @@ void asm_offsets(void)
>>>        DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN));
>>>        OFFSET(STACKFRAME_FP, stackframe, fp);
>>>        OFFSET(STACKFRAME_RA, stackframe, ra);
>>> +#ifdef CONFIG_FUNCTION_TRACER
>>> +     DEFINE(FTRACE_OPS_FUNC,         offsetof(struct ftrace_ops, func));
>>> +#endif
>>>    }
>>> diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
>>> index f5aa24d9e1c1..e2e75e15d32e 100644
>>> --- a/arch/riscv/kernel/ftrace.c
>>> +++ b/arch/riscv/kernel/ftrace.c
>>> @@ -82,9 +82,52 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
>>>        return 0;
>>>    }
>>>
>>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>>> +static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec)
>>> +{
>>> +     const struct ftrace_ops *ops = NULL;
>>> +
>>> +     if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
>>> +             ops = ftrace_find_unique_ops(rec);
>>> +             WARN_ON_ONCE(!ops);
>>> +     }
>>> +
>>> +     if (!ops)
>>> +             ops = &ftrace_list_ops;
>>> +
>>> +     return ops;
>>> +}
>>> +
>>> +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
>>> +                           const struct ftrace_ops *ops)
>>> +{
>>> +     unsigned long literal = rec->ip - 8;
>>> +
>>> +     return patch_text_nosync((void *)literal, &ops, sizeof(ops));
> 
> ^^
> I will change this to use a new function that does a 64 bit write and
> doesn't do flush_icache_range() as naturally aligned writes are
> atomic and therefore synchronization is not required here.
> 
>>> +}
>>> +
>>> +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
>>> +{
>>> +     return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
>>> +}
>>> +
>>> +static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
>>> +{
>>> +     return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec));
>>> +}
>>> +#else
>>> +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
>>> +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
>>> +#endif
>>> +
>>>    int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>>>    {
>>>        unsigned int call[2];
>>> +     int ret;
>>> +
>>> +     ret = ftrace_rec_update_ops(rec);
>>> +     if (ret)
>>> +             return ret;
>>>
>>>        make_call_t0(rec->ip, addr, call);
>>>
>>> @@ -98,6 +141,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
>>>                    unsigned long addr)
>>>    {
>>>        unsigned int nops[2] = {NOP4, NOP4};
>>> +     int ret;
>>> +
>>> +     ret = ftrace_rec_set_nop_ops(rec);
>>> +     if (ret)
>>> +             return ret;
>>>
>>>        if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
>>>                return -EPERM;
>>> @@ -125,6 +173,13 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
>>>
>>>    int ftrace_update_ftrace_func(ftrace_func_t func)
>>>    {
>>> +     /*
>>> +      * When using CALL_OPS, the function to call is associated with the
>>> +      * call site, and we don't have a global function pointer to update.
>>> +      */
>>> +     if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
>>> +             return 0;
>>> +
>>>        int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
>>>                                       (unsigned long)func, true, true);
>>>        if (!ret) {
>>> @@ -147,6 +202,10 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
>>>        make_call_t0(caller, old_addr, call);
>>>        ret = ftrace_check_current_call(caller, call);
>>>
>>> +     if (ret)
>>> +             return ret;
>>> +
>>> +     ret = ftrace_rec_update_ops(rec);
>>>        if (ret)
>>>                return ret;
>>>
>>> diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
>>> index b7561288e8da..cb241e36e514 100644
>>> --- a/arch/riscv/kernel/mcount-dyn.S
>>> +++ b/arch/riscv/kernel/mcount-dyn.S
>>> @@ -191,11 +191,35 @@
>>>        .endm
>>>
>>>        .macro PREPARE_ARGS
>>> -     addi    a0, t0, -FENTRY_RA_OFFSET
>>> +     addi    a0, t0, -FENTRY_RA_OFFSET       // ip (callsite's auipc insn)
>>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>>> +     /*
>>> +      * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the
>>> +      * function entry, these are later overwritten with the pointer to the
>>> +      * associated struct ftrace_ops.
>>> +      *
>>> +      * -8: &ftrace_ops of the associated tracer function.
>>> +      *<ftrace enable>:
>>> +      *  0: auipc  t0/ra, 0x?
>>> +      *  4: jalr   t0/ra, ?(t0/ra)
>>> +      *
>>> +      * -8: &ftrace_nop_ops
>>> +      *<ftrace disable>:
>>> +      *  0: nop
>>> +      *  4: nop
>>> +      *
>>> +      * t0 is set to ip+8 after the jalr is executed at the callsite,
>>> +      * so we find the associated op at t0-16.
>>> +      */
>>> +     mv      a1, ra                          // parent_ip
>>> +     REG_L   a2, -16(t0)                     // op
>>> +     REG_L   ra, FTRACE_OPS_FUNC(a2)         // op->func
>>> +#else
>>>        la      a1, function_trace_op
>>> -     REG_L   a2, 0(a1)
>>> -     mv      a1, ra
>>> -     mv      a3, sp
>>> +     REG_L   a2, 0(a1)                       // op
>>> +     mv      a1, ra                          // parent_ip
>>> +#endif
>>> +     mv      a3, sp                          // regs
>>>        .endm
>>>
>>>    #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
>>> @@ -233,8 +257,12 @@ SYM_FUNC_START(ftrace_regs_caller)
>>>        SAVE_ABI_REGS 1
>>>        PREPARE_ARGS
>>>
>>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>>> +     jalr ra
>>> +#else
>>>    SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
>>>        call    ftrace_stub
>>> +#endif
>>>
>>>        RESTORE_ABI_REGS 1
>>>        bnez    t1, .Ldirect
>>> @@ -247,9 +275,13 @@ SYM_FUNC_START(ftrace_caller)
>>>        SAVE_ABI_REGS 0
>>>        PREPARE_ARGS
>>>
>>> -SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
> 
> ^^ this hunk is a mistake, I will fix it in the next version.
> 
>>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
>>> +     jalr ra
>>> +#else
>>> +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
>>>        call    ftrace_stub
>>>
>>> +#endif
>>>        RESTORE_ABI_REGS 0
>>>        jr      t0
>>>    SYM_FUNC_END(ftrace_caller)
>>
>>
>> As I'm diving into ftrace right now, I'll give a proper review soon. But
>> as a note, I noticed that the function_graph tracer, when enabled, makes
>> the whole system unresponsive (but still up, just very slow). A fix I
>> sent recently seems to really improve that if you're interested in
>> testing it (I am :)). You can find it here:
>> https://lore.kernel.org/linux-riscv/20240229121056.203419-1-alexghiti@rivosinc.com/
> 
> I saw the same issue where function_graph was making the system slow on Qemu.
> What hardware do you use for testing? Or are you testing on Qemu as well?
> 
> I tested your patch; it speeds up patching the instructions, so the following
> command completes ~2.5 seconds faster than without it.
> $ time echo function_graph > current_tracer
> 
> But at runtime the system is still slow and laggy with function_graph,
> I guess because my
> Qemu setup is not powerful enough to run function_graph.
> 
> Thanks,
> Puranjay
> 

[3] 
https://lore.kernel.org/all/CAJF2gTRc487qEme5tQw4ich5xuf8_DYhTB8fp=TXav3Gs4KWnQ@mail.gmail.com/T/#mdba180cfd0c9a6df3da49e9c330649ce5151a24a

[4] https://github.com/llvm/llvm-project/issues/83111 - "RISC-V ABI 
break: x7/t2 register is used to pass the 9th argument to a function in 
certain cases"

Regards,
Evgenii
Mark Rutland March 12, 2024, 1:42 p.m. UTC | #14
Hi Bjorn

(apologies, my corporate mail server has butchered your name here).

There's a big info dump below; I realise this sounds like a sales pitch for
CALL_OPS, but my intent is more to say "here are some dragons you may not have
spotted".

On Thu, Mar 07, 2024 at 08:27:40PM +0100, Björn Töpel wrote:
> Puranjay!
> 
> Puranjay Mohan <puranjay12@gmail.com> writes:
> 
> > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> > This allows each ftrace callsite to provide an ftrace_ops to the common
> > ftrace trampoline, allowing each callsite to invoke distinct tracer
> > functions without the need to fall back to list processing or to
> > allocate custom trampolines for each callsite. This significantly speeds
> > up cases where multiple distinct trace functions are used and callsites
> > are mostly traced by a single tracer.
> >
> > The idea and most of the implementation is taken from the ARM64's
> > implementation of the same feature. The idea is to place a pointer to
> > the ftrace_ops as a literal at a fixed offset from the function entry
> > point, which can be recovered by the common ftrace trampoline.
> 
> Not really a review, but some more background; Another rationale (on-top
> of the improved per-call performance!) for CALL_OPS was to use it to
> build ftrace direct call support (which BPF uses a lot!). Mark, please
> correct me if I'm lying here!

Yep; it gives us the ability to do a number of per-callsite things, including
direct calls.

> On Arm64, CALL_OPS makes it possible to implement direct calls, while
> only patching one BL instruction -- nice!

The key thing here isn't that we patch a single instruction (since we have to
patch the ops pointer too!); it's that we can safely patch either of the ops
pointer or BL/NOP at any time while threads are concurrently executing.

If you have a multi-instruction sequence, then threads can be preempted
mid-sequence, and it's very painful/complex to handle all of the races that
entails.

For example, if your callsites use a sequence:

	AUIPC <tmp>, <funcptr>
	JALR <tmp2>, <funcptr>(<tmp>)

Using stop_machine() won't allow you to patch that safely as some threads
could be stuck mid-sequence, e.g.

	AUIPC <tmp>, <funcptr>
	[ preempted here ]
	JALR <tmp2>, <funcptr>(<tmp>)

... and you can't update the JALR to use a new funcptr immediate until those
have completed the sequence.

There are ways around that, but they're complicated and/or expensive, e.g.

* Use a sequence of multiple patches, starting with replacing the JALR with an
  exception-generating instruction with a fixup handler, which is sort-of what
  x86 does with UD2. This may require multiple passes with
  synchronize_rcu_tasks() to make sure all threads have seen the latest
  instructions, and that cannot be done under stop_machine(), so if you need
  stop_machine() for CMODx reasons, you may need to use that several times with
  intervening calls to synchronize_rcu_tasks().

* Have the patching logic manually go over each thread and fix up the pt_regs
  for the interrupted thread. This is pretty horrid since you could have nested
  exceptions and a task could have several pt_regs which might require
  updating.

The CALL_OPS approach is a bit easier to deal with as we can patch the
per-callsite pointer atomically, then we can (possibly) enable/disable the
callsite's branch, then wait for threads to drain once. 
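
Roughly, in C, the update sequence for a single callsite would look like
the below (sketch only; the two patching helpers are illustrative
placeholders, not existing kernel API):

/* Sketch of the ordering described above. */
static void retarget_callsite(struct dyn_ftrace *rec,
			      const struct ftrace_ops *ops)
{
	/*
	 * 1) The ops literal is a naturally-aligned 8-byte slot, so one
	 *    store atomically switches which ops the common trampoline
	 *    will load.
	 */
	write_ops_literal(rec->ip - 8, ops);			/* placeholder */

	/*
	 * 2) Optionally flip the single branch instruction at the
	 *    callsite between NOP and the call into the trampoline.
	 */
	toggle_callsite_branch(rec->ip, ops != &ftrace_nop_ops);	/* placeholder */

	/*
	 * 3) Wait once for every thread to pass a quiescent state, so
	 *    nothing is still running with the old ops/target.
	 */
	synchronize_rcu_tasks();
}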

As a heads-up, there are some latent/generic issues with DYNAMIC_FTRACE
generally in this area (CALL_OPS happens to side-step those, but trampoline
usage is currently affected):

  https://lore.kernel.org/lkml/Zenx_Q0UiwMbSAdP@FVFF77S0Q05N/

... I'm looking into fixing that at the moment, and it looks like that's likely
to require some per-architecture changes.

>> On RISC-V we cannot use the same ideas as Arm64 straight off,
> because the range of jal (compare to BL) is simply too short (+/-1M).
> So, on RISC-V we need to use a full auipc/jal pair (the text patching
> story is another chapter, but let's leave that aside for now). Since we
> have to patch multiple instructions, the cmodx situation doesn't really
> improve with CALL_OPS.

The branch range thing is annoying, but I think this boils down to the same
problem as arm64 has with needing a "MOV <tmp>, LR" instruction that we have to
patch in once at boot time. You could do the same and patch in the AUIPC once,
e.g. have

| 	NOP
| 	NOP 
| func:
| 	AUIPC <tmp>, <common_ftrace_caller>
| 	JALR <tmp2>, <common_ftrace_caller>(<tmp>) // patched with NOP

... which'd look very similar to arm64's sequence:

| 	NOP
| 	NOP
| func:
| 	MOV X9, LR
| 	BL ftrace_caller // patched with NOP

... which I think means it *might* be better from a cmodx perspective?
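
For the init step, a rough sketch with the helpers from this patch
(illustrative only, not a drop-in ftrace_init_nop(); the ops-literal
initialisation is left out):

/*
 * Install the AUIPC once at init and leave the JALR slot as a NOP;
 * enabling/disabling a callsite later only ever touches the second slot.
 */
static int init_patch_site(struct dyn_ftrace *rec)
{
	unsigned int insns[2];
	unsigned int nop = NOP4;

	/* Build the auipc/jalr pair targeting the common trampoline. */
	make_call_t0(rec->ip, (unsigned long)&ftrace_caller, insns);

	/* Write only the auipc; it is never touched again. */
	if (patch_text_nosync((void *)rec->ip, &insns[0], 4))
		return -EPERM;

	/* The call slot starts out disabled. */
	return patch_text_nosync((void *)(rec->ip + 4), &nop, 4);
}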

> Let's say that we continue building on your patch and implement direct
> calls on CALL_OPS for RISC-V as well.
> 
> From Florent's commit message for direct calls:
> 
>   |    There are a few cases to distinguish:
>   |    - If a direct call ops is the only one tracing a function:
>   |      - If the direct called trampoline is within the reach of a BL
>   |        instruction
>   |         -> the ftrace patchsite jumps to the trampoline
>   |      - Else
>   |         -> the ftrace patchsite jumps to the ftrace_caller trampoline which
>   |            reads the ops pointer in the patchsite and jumps to the direct
>   |            call address stored in the ops
>   |    - Else
>   |      -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
>   |         ops literal points to ftrace_list_ops so it iterates over all
>   |         registered ftrace ops, including the direct call ops and calls its
>   |         call_direct_funcs handler which stores the direct called
>   |         trampoline's address in the ftrace_regs and the ftrace_caller
>   |         trampoline will return to that address instead of returning to the
>   |         traced function
> 
> On RISC-V, where auipc/jalr is used, the direct called trampoline would
>> always be reachable, and then the first Else-clause would never be entered.
>> This means the performance for direct calls would be the same as the
> one we have today (i.e. no regression!).
> 
> RISC-V does like x86 does (-ish) -- patch multiple instructions, long
> reach.
> 
> Arm64 uses CALL_OPS and patch one instruction BL.
> 
> Now, with this background in mind, compared to what we have today,
> CALL_OPS would give us (again assuming we're using it for direct calls):
> 
> * Better performance for tracer per-call (faster ops lookup) GOOD
> * Larger text size (function alignment + extra nops) BAD
> * Same direct call performance NEUTRAL
> * Same complicated text patching required NEUTRAL

Is your current sequence safe for preemptible kernels (i.e. with PREEMPT_FULL=y
or PREEMPT_DYNAMIC=y + "preempt=full" on the kernel cmdline) ?

Looking at v6.8, IIUC you have:

	// unpatched		//patched
	NOP			AUIPC
	NOP			JALR

What prevents a thread being preempted mid-sequence such that it executes:

	NOP
	[ preempted ]
	[ stop_machine() used to patch text ]
	[ restored ]
	JALR

... ?

... or when changing the call:

	AUIPC		// old funcptr
	[ preempted ]
	[ stop_machine() used to patch text ]
	[ restored ]
	JALR		// new funcptr
... ?

I suspect those might both be broken, but it's difficult to hit the race and so
testing hasn't revealed that so far.

> It would be interesting to see how the per-call performance would
> improve on x86 with CALL_OPS! ;-)

Heh. ;)

> I'm trying to wrap my head around whether it makes sense to have it on RISC-V, given
> that we're a bit different from Arm64. Does the scale tip to the GOOD
> side?
>
> Oh, and we really need to see performance numbers on real HW! I have a
> VF2 that I could try this series on.

I'll have to leave those two to you. I suspect the preempt/stop_machine()
might just tip that from NEUTRAL to GOOD.

Mark.
Björn Töpel March 13, 2024, 11:23 a.m. UTC | #15
Mark Rutland <mark.rutland@arm.com> writes:

> Hi Bjorn
>
> (apologies, my corporate mail server has butchered your name here).

Ha! That's the price I have to pay for carrying double-umlauts
everywhere. Thanks for getting back with a really useful answer!

>> On Arm64, CALL_OPS makes it possible to implement direct calls, while
>> only patching one BL instruction -- nice!
>
> The key thing here isn't that we patch a single instruction (since we have ot
> patch the ops pointer too!); it's that we can safely patch either of the ops
> pointer or BL/NOP at any time while threads are concurrently executing.

...which indeed is a very nice property!

> If you have a multi-instruction sequence, then threads can be preempted
> mid-sequence, and it's very painful/complex to handle all of the races that
> entails.

Oh yes. RISC-V is currently using auipc/jalr with stop_machine(), and
it also requires that preemption is off. Unusable, to put it bluntly.

> For example, if your callsites use a sequence:
>
> 	AUIPC <tmp>, <funcptr>
> 	JALR <tmp2>, <funcptr>(<tmp>)
>
> Using stop_machine() won't allow you to patch that safely as some threads
> could be stuck mid-sequence, e.g.
>
> 	AUIPC <tmp>, <funcptr>
> 	[ preempted here ]
> 	JALR <tmp2>, <funcptr>(<tmp>)
>
> ... and you can't update the JALR to use a new funcptr immediate until those
> have completed the sequence.
>
> There are ways around that, but they're complicated and/or expensive, e.g.
>
> * Use a sequence of multiple patches, starting with replacing the JALR with an
>   exception-generating instruction with a fixup handler, which is sort-of what
>   x86 does with UD2. This may require multiple passes with
>   synchronize_rcu_tasks() to make sure all threads have seen the latest
>   instructions, and that cannot be done under stop_machine(), so if you need
>   stop_machine() for CMODx reasons, you may need to use that several times with
>   intervening calls to synchronize_rcu_tasks().
>
> * Have the patching logic manually go over each thread and fix up the pt_regs
>   for the interrupted thread. This is pretty horrid since you could have nested
>   exceptions and a task could have several pt_regs which might require
>   updating.

Yup, and both of these have rather unpleasant overhead.

> The CALL_OPS approach is a bit easier to deal with as we can patch the
> per-callsite pointer atomically, then we can (possibly) enable/disable the
> callsite's branch, then wait for threads to drain once. 
>
> As a heads-up, there are some latent/generic issues with DYNAMIC_FTRACE
> generally in this area (CALL_OPs happens to side-step those, but trampoline
> usage is currently affected):
>
>   https://lore.kernel.org/lkml/Zenx_Q0UiwMbSAdP@FVFF77S0Q05N/
>
> ... I'm looking into fixing that at the moment, and it looks like that's likely
> to require some per-architecture changes.
>
>> On RISC-V we cannot use use the same ideas as Arm64 straight off,
>> because the range of jal (compare to BL) is simply too short (+/-1M).
>> So, on RISC-V we need to use a full auipc/jal pair (the text patching
>> story is another chapter, but let's leave that aside for now). Since we
>> have to patch multiple instructions, the cmodx situation doesn't really
>> improve with CALL_OPS.
>
> The branch range thing is annoying, but I think this boils down to the same
> problem as arm64 has with needing a "MOV <tmp>, LR" instruction that we have to
> patch in once at boot time. You could do the same and patch in the AUIPC once,
> e.g. have
>
> | 	NOP
> | 	NOP 
> | func:
> | 	AUIPC <tmp>, <common_ftrace_caller>
> | 	JALR <tmp2>, <common_ftrace_caller>(<tmp>) // patched with NOP
>
> ... which'd look very similar to arm64's sequence:
>
> | 	NOP
> | 	NOP
> | func:
> | 	MOV X9, LR
> | 	BL ftrace_caller // patched with NOP
>
> ... which I think means it *might* be better from a cmodx perspective?
>
>> Let's say that we continue building on your patch and implement direct
>> calls on CALL_OPS for RISC-V as well.
>> 
>> From Florent's commit message for direct calls:
>> 
>>   |    There are a few cases to distinguish:
>>   |    - If a direct call ops is the only one tracing a function:
>>   |      - If the direct called trampoline is within the reach of a BL
>>   |        instruction
>>   |         -> the ftrace patchsite jumps to the trampoline
>>   |      - Else
>>   |         -> the ftrace patchsite jumps to the ftrace_caller trampoline which
>>   |            reads the ops pointer in the patchsite and jumps to the direct
>>   |            call address stored in the ops
>>   |    - Else
>>   |      -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
>>   |         ops literal points to ftrace_list_ops so it iterates over all
>>   |         registered ftrace ops, including the direct call ops and calls its
>>   |         call_direct_funcs handler which stores the direct called
>>   |         trampoline's address in the ftrace_regs and the ftrace_caller
>>   |         trampoline will return to that address instead of returning to the
>>   |         traced function
>> 
>> On RISC-V, where auipc/jalr is used, the direct called trampoline would
>> always be reachable, and then first Else-clause would never be entered.
>> This means the the performance for direct calls would be the same as the
>> one we have today (i.e. no regression!).
>> 
>> RISC-V does like x86 does (-ish) -- patch multiple instructions, long
>> reach.
>> 
>> Arm64 uses CALL_OPS and patch one instruction BL.
>> 
>> Now, with this background in mind, compared to what we have today,
>> CALL_OPS would give us (again assuming we're using it for direct calls):
>> 
>> * Better performance for tracer per-call (faster ops lookup) GOOD
>> * Larger text size (function alignment + extra nops) BAD
>> * Same direct call performance NEUTRAL
>> * Same complicated text patching required NEUTRAL
>
> Is your current sequence safe for preemptible kernels (i.e. with PREEMPT_FULL=y
> or PREEMPT_DYNAMIC=y + "preempt=full" on the kernel cmdline) ?

It's very much not, and was in fact presented by Andy (Cc) and discussed
at length at Plumbers two years back.

Hmm, depending on RISC-V's CMODX path, the pros/cons of CALL_OPS vs dynamic
trampolines change quite a bit.

The more I look at the pains of patching two instructions ("split
immediates"), the better "patch data" + one insn patching looks.

I wish we had longer instructions that could fit a 48b address or
more! ;-)
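
For reference, a small sketch of the "split immediate" arithmetic that
makes two-instruction patching so painful (the helper is illustrative):

/*
 * The jalr immediate is sign-extended, so the auipc part must be
 * rounded up whenever the sign-extended low 12 bits are negative --
 * which is why the two instructions always have to be rewritten as a
 * consistent pair.
 */
static void split_call_offset(unsigned long pc, unsigned long target,
			      long *hi20, long *lo12)
{
	long offset = (long)(target - pc);

	*hi20 = (offset + 0x800) >> 12;	/* immediate for auipc */
	*lo12 = offset - (*hi20 << 12);	/* immediate for jalr  */
}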


Again, thanks for a thought provoking reply!
Björn
Puranjay Mohan March 14, 2024, 2:16 p.m. UTC | #16
Björn Töpel <bjorn@kernel.org> writes:

>
> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic
> trampolines changes quite a bit.
>
> The more I look at the pains of patching two instruction ("split
> immediates"), the better "patch data" + one insn patching look.

I was looking at how dynamic trampolines would be implemented for RISC-V.

With CALL_OPS we need to patch the auipc+jalr at function entry only; the
ops pointer above the function can be patched atomically.

With a dynamic trampoline we need an auipc+jalr pair at function entry to jump
to the trampoline and then another auipc+jalr pair to jump from the trampoline
to ops->func. When ops->func is modified, we would need to update the
auipc+jalr pair in the trampoline.

So, I am not sure how to move forward here: CALL_OPS or dynamic trampolines?

Thanks,
Puranjay
Björn Töpel March 14, 2024, 3:07 p.m. UTC | #17
Puranjay Mohan <puranjay12@gmail.com> writes:

> Björn Töpel <bjorn@kernel.org> writes:
>
>>
>> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic
>> trampolines changes quite a bit.
>>
>> The more I look at the pains of patching two instruction ("split
>> immediates"), the better "patch data" + one insn patching look.
>
> I was looking at how dynamic trampolines would be implemented for RISC-V.
>
> With CALL-OPS we need to patch the auipc+jalr at function entry only, the
> ops pointer above the function can be patched atomically.
>
> With a dynamic trampoline we need a auipc+jalr pair at function entry to jump
> to the trampoline and then another auipc+jalr pair to jump from trampoline to
> ops->func. When the ops->func is modified, we would need to update the
> auipc+jalr at in the trampoline.
>
> So, I am not sure how to move forward here, CALL-OPS or Dynamic trampolines?

Yeah. Honestly, we need to figure out the patching story prior to
choosing the path, so let's start there.

After reading Mark's reply, and discussing with OpenJDK folks (who do
the most crazy text patching on all platforms), having to patch multiple
instructions (where the address materialization is split over multiple
instructions) is a no-go. It's just a too big can of worms. So, if we
can only patch one insn, it's CALL_OPS.

A couple of options (in addition to Andy's), and all require a
per-function landing address (a la CALL_OPS), tweaking what Mark is doing
on Arm (given the poor branch range).

...and maybe we'll get RISC-V rainbows/unicorns in the future getting
better reach (full 64b! ;-)).

A) Use auipc/jalr, only patch jalr to take us to a common
   dispatcher/trampoline
  
 | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
 | ...
 | func:
 |   ...make sure ra isn't messed up...
 |   auipc
 |   nop <=> jalr # Text patch point -> common_dispatch
 |   ACTUAL_FUNC
 | 
 | common_dispatch:
 |   load <func_trace_target_data_8B> based on ra
 |   jalr
 |   ...

The auipc is never touched, and will be overhead. Also, we need a mv to
store ra in a scratch register as well -- like Arm. We'll have two insn
per-caller overhead for a disabled caller.

B) Use jal, which can only take us +/-1M, and requires multiple
   dispatchers (and tracking which one to use, and properly distribute
   them. Ick.)

 | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
 | ...
 | func:
 |   ...make sure ra isn't messed up...
 |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
 |   ACTUAL_FUNC
 | 
 | within_1M_to_func_dispatch:
 |   load <func_trace_target_data_8B> based on ra
 |   jalr

C) Use jal, which can only take us +/-1M, and use a per-function
   trampoline, which requires multiple dispatchers (and tracking which
   one to use). Blows up text size A LOT.

 | <func_trace_target_data_8B> # somewhere, but probably on a different cacheline than the .text to avoid ping-pongs
 | ...
 | per_func_dispatch
 |   load <func_trace_target_data_8B> based on ra
 |   jalr
 | func:
 |   ...make sure ra isn't messed up...
 |   nop <=> jal # Text patch point -> per_func_dispatch
 |   ACTUAL_FUNC

It's a bit sad that we'll always have to have a dispatcher/trampoline,
but it's still better than stop_machine(). (And we'll need a fence.i IPI
as well, but only one. ;-))

Today, I'm leaning towards A (which is what Mark suggested, and also
Robbin). Any other options?

[Now how do we implement OPTPROBES? I'm kidding. ;-)]


Björn
Björn Töpel March 14, 2024, 8:50 p.m. UTC | #18
Björn Töpel <bjorn@kernel.org> writes:

> Puranjay Mohan <puranjay12@gmail.com> writes:
>
>> Björn Töpel <bjorn@kernel.org> writes:
>>
>>>
>>> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic
>>> trampolines changes quite a bit.
>>>
>>> The more I look at the pains of patching two instruction ("split
>>> immediates"), the better "patch data" + one insn patching look.
>>
>> I was looking at how dynamic trampolines would be implemented for RISC-V.
>>
>> With CALL-OPS we need to patch the auipc+jalr at function entry only, the
>> ops pointer above the function can be patched atomically.
>>
>> With a dynamic trampoline we need a auipc+jalr pair at function entry to jump
>> to the trampoline and then another auipc+jalr pair to jump from trampoline to
>> ops->func. When the ops->func is modified, we would need to update the
>> auipc+jalr at in the trampoline.
>>
>> So, I am not sure how to move forward here, CALL-OPS or Dynamic trampolines?
>
> Yeah. Honestly, we need to figure out the patching story prior
> choosing the path, so let's start there.
>
> After reading Mark's reply, and discussing with OpenJDK folks (who does
> the most crazy text patching on all platforms), having to patch multiple
> instructions (where the address materialization is split over multiple
> instructions) is a no-go. It's just a too big can of worms. So, if we
> can only patch one insn, it's CALL_OPS.
>
> A couple of options (in addition to Andy's), and all require a
> per-function landing address ala CALL_OPS) tweaking what Mark is doing
> on Arm (given the poor branch range).
>
> ...and maybe we'll get RISC-V rainbows/unicorns in the future getting
> better reach (full 64b! ;-)).
>
> A) Use auipc/jalr, only patch jalr to take us to a common
>    dispatcher/trampoline
>   
>  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>  | ...
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   aupic
>  |   nop <=> jalr # Text patch point -> common_dispatch
>  |   ACTUAL_FUNC
>  | 
>  | common_dispatch:
>  |   load <func_trace_target_data_8B> based on ra
>  |   jalr
>  |   ...
>
> The auipc is never touched, and will be overhead. Also, we need a mv to
> store ra in a scratch register as well -- like Arm. We'll have two insn
> per-caller overhead for a disabled caller.
>
> B) Use jal, which can only take us +/-1M, and requires multiple
>    dispatchers (and tracking which one to use, and properly distribute
>    them. Ick.)
>
>  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>  | ...
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
>  |   ACTUAL_FUNC
>  | 
>  | within_1M_to_func_dispatch:
>  |   load <func_trace_target_data_8B> based on ra
>  |   jalr
>
> C) Use jal, which can only take us +/-1M, and use a per-function
>    trampoline requires multiple dispatchers (and tracking which one to
>    use). Blows up text size A LOT.
>
>  | <func_trace_target_data_8B> # somewhere, but probably on a different cacheline than the .text to avoid ping-ongs
>  | ...
>  | per_func_dispatch
>  |   load <func_trace_target_data_8B> based on ra
>  |   jalr
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   nop <=> jal # Text patch point -> per_func_dispatch
>  |   ACTUAL_FUNC

Brendan proposed yet another option, "in-function dispatch":

D) 
  | <func_trace_target_data_8B_per_function> # idk somewhere
  | ...
  | func:
  |    mv tmp1, ra
  |    auipc tmp2, <func_trace_target_data_8B_per_function:upper>
  |    mv tmp3, zero <=> ld tmp3, <func_trace_target_data_8B_per_function:lower>(tmp2)
  |    nop <=> jalr ra, tmp3
  |    ACTUAL_FUNC

There are 4 CMODX possibilities:
   mv, nop:  fully disabled, no problems
   mv, jalr: We will jump to zero. We would need to have the inst
             page/access fault handler take care of this case. Especially
             if we align the instructions so that they can be patched
             together, being interrupted in the middle and taking this
             path will be rare.
  ld, nop:   no problems
  ld, jalr:  fully enabled, no problems

Patching is a 64b store/sd, and we only need a fence.i at the end, since
we can handle all 4 possibilities.
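
Roughly, in C (illustrative; it ignores STRICT_KERNEL_RWX, so a real
version would route the store through the arch text-patching code):

/*
 * Write both 32-bit slots with one aligned 64-bit store, so a
 * concurrent reader observes one of the four combinations above and
 * never a torn instruction.  Assumes little-endian, an 8-byte-aligned
 * slot, and that the text is writable from here.
 */
static void patch_insn_pair(void *slot, u32 first, u32 second)
{
	u64 val = (u64)first | ((u64)second << 32);

	WRITE_ONCE(*(u64 *)slot, val);
	flush_icache_all();	/* the single remote fence.i mentioned above */
}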

For the disabled case we'll have:
A) mv, auipc, nop
D) mv, auipc, mv, nop.

Puranjay, I've flipped. Let's go with Mark's CALL_OPS together with a new
text patching mechanism w/o stop_machine().


Björn
Andy Chiu March 20, 2024, 4:41 p.m. UTC | #19
Hi Björn,

On Fri, Mar 15, 2024 at 4:50 AM Björn Töpel <bjorn@kernel.org> wrote:
>
> Björn Töpel <bjorn@kernel.org> writes:
>
> > Puranjay Mohan <puranjay12@gmail.com> writes:
> >
> >> Björn Töpel <bjorn@kernel.org> writes:
> >>
> >>>
> >>> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic
> >>> trampolines changes quite a bit.
> >>>
> >>> The more I look at the pains of patching two instruction ("split
> >>> immediates"), the better "patch data" + one insn patching look.
> >>
> >> I was looking at how dynamic trampolines would be implemented for RISC-V.
> >>
> >> With CALL-OPS we need to patch the auipc+jalr at function entry only, the
> >> ops pointer above the function can be patched atomically.
> >>
> >> With a dynamic trampoline we need a auipc+jalr pair at function entry to jump
> >> to the trampoline and then another auipc+jalr pair to jump from trampoline to
> >> ops->func. When the ops->func is modified, we would need to update the
> >> auipc+jalr at in the trampoline.
> >>
> >> So, I am not sure how to move forward here, CALL-OPS or Dynamic trampolines?
> >
> > Yeah. Honestly, we need to figure out the patching story prior
> > choosing the path, so let's start there.
> >
> > After reading Mark's reply, and discussing with OpenJDK folks (who does
> > the most crazy text patching on all platforms), having to patch multiple
> > instructions (where the address materialization is split over multiple
> > instructions) is a no-go. It's just a too big can of worms. So, if we
> > can only patch one insn, it's CALL_OPS.
> >
> > A couple of options (in addition to Andy's), and all require a
> > per-function landing address ala CALL_OPS) tweaking what Mark is doing
> > on Arm (given the poor branch range).
> >
> > ...and maybe we'll get RISC-V rainbows/unicorns in the future getting
> > better reach (full 64b! ;-)).
> >
> > A) Use auipc/jalr, only patch jalr to take us to a common
> >    dispatcher/trampoline
> >
> >  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
> >  | ...
> >  | func:
> >  |   ...make sure ra isn't messed up...
> >  |   aupic
> >  |   nop <=> jalr # Text patch point -> common_dispatch
> >  |   ACTUAL_FUNC
> >  |
> >  | common_dispatch:
> >  |   load <func_trace_target_data_8B> based on ra
> >  |   jalr
> >  |   ...
> >
> > The auipc is never touched, and will be overhead. Also, we need a mv to
> > store ra in a scratch register as well -- like Arm. We'll have two insn
> > per-caller overhead for a disabled caller.
> >
> > B) Use jal, which can only take us +/-1M, and requires multiple
> >    dispatchers (and tracking which one to use, and properly distribute
> >    them. Ick.)
> >
> >  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
> >  | ...
> >  | func:
> >  |   ...make sure ra isn't messed up...
> >  |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
> >  |   ACTUAL_FUNC
> >  |
> >  | within_1M_to_func_dispatch:
> >  |   load <func_trace_target_data_8B> based on ra
> >  |   jalr
> >
> > C) Use jal, which can only take us +/-1M, and use a per-function
> >    trampoline requires multiple dispatchers (and tracking which one to
> >    use). Blows up text size A LOT.
> >
> >  | <func_trace_target_data_8B> # somewhere, but probably on a different cacheline than the .text to avoid ping-ongs
> >  | ...
> >  | per_func_dispatch
> >  |   load <func_trace_target_data_8B> based on ra
> >  |   jalr
> >  | func:
> >  |   ...make sure ra isn't messed up...
> >  |   nop <=> jal # Text patch point -> per_func_dispatch
> >  |   ACTUAL_FUNC
>
> Brendan proposed yet another option, "in-function dispatch":
>
> D)
>   | <func_trace_target_data_8B_per_function> # idk somewhere
>   | ...
>   | func:
>   |    mv tmp1, ra
>   |    aupic tmp2, <func_trace_target_data_8B_per_function:upper>
>   |    mv tmp3, zero <=> ld tmp3, tmp2<func_trace_target_data_8B_per_function:lower>
>   |    nop <=> jalr ra, tmp3
>   |    ACTUAL_FUNC

My patch series takes a similar "in-function dispatch" approach. A
difference is that the <func_trace_target_data_8B_per_function> is
embedded within each function entry. I'd like to have it moved to a
run-time allocated array to reduce total text size.

Another difference is that my series changes the first instruction to
"j ACTUAL_FUNC" for the "ftrace disable" case. As long as the
architecture guarantees the atomicity of the first instruction, then
we are safe. For example, we are safe if the first instruction could
only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is
always valid, we can fix "mv + jalr" down so we don't have to
play with the exception handler trick. The guarantee from arch would
require ziccif (in RVA22) though, but I think it is the same for us
(unless with stop_machine). For ziccif, I would rather call that out
during boot than blindly assume.

However, one thing I am not very sure about is: do we need a destination
address in a "per-function" manner? It seems like most of the time the
destination address can only be ftrace_call, or ftrace_regs_call. If
the number of destination addresses is very few, then we could
potentially reduce the size of
<func_trace_target_data_8B_per_function>.

>
> There are 4 CMODX possiblities:
>    mv, nop:  fully disabled, no problems
>    mv, jalr: We will jump to zero. We would need to have the inst
>              page/access fault handler take care of this case. Especially
>              if we align the instructions so that they can be patched
>              together, being interrupted in the middle and taking this
>              path will be rare.
>   ld, nop:   no problems
>   ld, jalr:  fully enabled, no problems
>
> Patching is a 64b store/sd, and we only need a fence.i at the end, since
> we can handle all 4 possibilities.
>
> For the disabled case we'll have:
> A) mv, aupic, nop
> D) mv, aupic, mv, nop.
>
> Puranjay, I've flipped. Let's go Mark's CALL_OPS together with a new
> text patch mechanism w/o stop_machine().
>
>
> Björn

Cheers,
Andy
Mark Rutland March 20, 2024, 6:03 p.m. UTC | #20
On Thu, Mar 14, 2024 at 04:07:33PM +0100, Björn Töpel wrote:
> After reading Mark's reply, and discussing with OpenJDK folks (who does
> the most crazy text patching on all platforms), having to patch multiple
> instructions (where the address materialization is split over multiple
> instructions) is a no-go. It's just a too big can of worms. So, if we
> can only patch one insn, it's CALL_OPS.
> 
> A couple of options (in addition to Andy's), and all require a
> per-function landing address ala CALL_OPS) tweaking what Mark is doing
> on Arm (given the poor branch range).
> 
> ..and maybe we'll get RISC-V rainbows/unicorns in the future getting
> better reach (full 64b! ;-)).
> 
> A) Use auipc/jalr, only patch jalr to take us to a common
>    dispatcher/trampoline
>   
>  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>  | ...
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   aupic
>  |   nop <=> jalr # Text patch point -> common_dispatch
>  |   ACTUAL_FUNC
>  | 
>  | common_dispatch:
>  |   load <func_trace_target_data_8B> based on ra
>  |   jalr
>  |   ...
> 
> The auipc is never touched, and will be overhead. Also, we need a mv to
> store ra in a scratch register as well -- like Arm. We'll have two insn
> per-caller overhead for a disabled caller.

Is the AUIPC a significant overhead? IIUC that's similar to Arm's ADRP, and I'd
have expected that to be pretty cheap.

IIUC your JALR can choose which destination register to store the return
address in, and if so, you could leave the original ra untouched (and recover
that in the common trampoline). Have I misunderstood that?

Maybe that doesn't play nicely with something else?

> B) Use jal, which can only take us +/-1M, and requires multiple
>    dispatchers (and tracking which one to use, and properly distribute
>    them. Ick.)
> 
>  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>  | ...
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
>  |   ACTUAL_FUNC
>  | 
>  | within_1M_to_func_dispatch:
>  |   load <func_trace_target_data_8B> based on ra
>  |   jalr
> 
> C) Use jal, which can only take us +/-1M, and use a per-function
>    trampoline requires multiple dispatchers (and tracking which one to
>    use). Blows up text size A LOT.
> 
>  | <func_trace_target_data_8B> # somewhere, but probably on a different cacheline than the .text to avoid ping-ongs
>  | ...
>  | per_func_dispatch
>  |   load <func_trace_target_data_8B> based on ra
>  |   jalr
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   nop <=> jal # Text patch point -> per_func_dispatch
>  |   ACTUAL_FUNC

Beware that with option (C) you'll need to handle that in your unwinder for
RELIABLE_STACKTRACE. If you don't have a symbol for per_func_dispatch (or
func_trace_target_data_8B), PC values within per_func_dispatch would be
symbolized as the prior function/data.

> It's a bit sad that we'll always have to have a dispatcher/trampoline,
> but it's still better than stop_machine(). (And we'll need a fencei IPI
> as well, but only one. ;-))
> 
> Today, I'm leaning towards A (which is what Mark suggested, and also
> Robbin).. Any other options?

Assuming my understanding of JALR above is correct, I reckon A is the nicest
option out of A/B/C.

Mark.
Björn Töpel March 21, 2024, 8:48 a.m. UTC | #21
Andy,

Pulling out the A option:

>> > A) Use auipc/jalr, only patch jalr to take us to a common
>> >    dispatcher/trampoline
>> >
>> >  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>> >  | ...
>> >  | func:
>> >  |   ...make sure ra isn't messed up...
>> >  |   aupic
>> >  |   nop <=> jalr # Text patch point -> common_dispatch
>> >  |   ACTUAL_FUNC
>> >  |
>> >  | common_dispatch:
>> >  |   load <func_trace_target_data_8B> based on ra
>> >  |   jalr
>> >  |   ...
>> >
>> > The auipc is never touched, and will be overhead. Also, we need a mv to
>> > store ra in a scratch register as well -- like Arm. We'll have two insn
>> > per-caller overhead for a disabled caller.
>
> My patch series takes a similar "in-function dispatch" approach. A
> difference is that the <func_trace_target_data_8B_per_function> is
> embedded within each function entry. I'd like to have it moved to a
> run-time allocated array to reduce total text size.

This is what arm64 has as well. It's an 8B literal + 1-2 dirt-cheap
mov-ish instructions (save ra, prepare the jump with auipc). I think that's a
reasonable overhead.

> Another difference is that my series changes the first instruction to
> "j ACTUAL_FUNC" for the "ftrace disable" case. As long as the
> architecture guarantees the atomicity of the first instruction, then
> we are safe. For example, we are safe if the first instruction could
> only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is
> always valid, we can fix "mv + jalr" down so we don't have to
> play with the exception handler trick. The guarantee from arch would
> require ziccif (in RVA22) though, but I think it is the same for us
> (unless with stop_machine). For ziccif, I would rather call that out
> during boot than blindly assume.

I'm maybe biased, but I'd prefer A) over your version with the
unconditional jump. A) has the overhead of two, I'd say, free
instructions (again, "Meten is Weten" -- measuring is knowing! ;-)).

> However, one thing I am not very sure is: do we need a destination
> address in a "per-function" manner? It seems like most of the time the
> destination address can only be ftrace_call, or ftrace_regs_call. If
> the number of destination addresses is very few, then we could
> potentially reduce the size of
> <func_trace_target_data_8B_per_function>.

Yes, we do need it in a per-function manner. BPF, e.g., uses
dynamically JITed trampolines/targets.



Björn
Björn Töpel March 21, 2024, 8:58 a.m. UTC | #22
Mark,

Mark Rutland <mark.rutland@arm.com> writes:

>> A) Use auipc/jalr, only patch jalr to take us to a common
>>    dispatcher/trampoline
>>   
>>  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>>  | ...
>>  | func:
>>  |   ...make sure ra isn't messed up...
>>  |   aupic
>>  |   nop <=> jalr # Text patch point -> common_dispatch
>>  |   ACTUAL_FUNC
>>  | 
>>  | common_dispatch:
>>  |   load <func_trace_target_data_8B> based on ra
>>  |   jalr
>>  |   ...
>> 
>> The auipc is never touched, and will be overhead. Also, we need a mv to
>> store ra in a scratch register as well -- like Arm. We'll have two insn
>> per-caller overhead for a disabled caller.
>
> Is the AUIPC a significant overhead? IIUC that's similar to Arm's ADRP, and I'd
> have expected that to be pretty cheap.

No, reg-to-reg moves are dirt cheap in my book.

> IIUC your JALR can choose which destination register to store the return
> address in, and if so, you could leave the original ra untouched (and recover
> that in the common trampoline). Have I misunderstood that?
>
> Maybe that doesn't play nicely with something else?

No, you're right, we can link to another register, and shave off an
instruction. I can imagine that some implementations prefer x1/x5 for
branch-prediction reasons, but that's something we can measure.

So, 1-2 movs + a nop are unconditionally executed in the disabled case
(1-2 depending on the ra save/jalr reg strategy).

>> B) Use jal, which can only take us +/-1M, and requires multiple
>>    dispatchers (and tracking which one to use, and properly distribute
>>    them. Ick.)
>> 
>>  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>>  | ...
>>  | func:
>>  |   ...make sure ra isn't messed up...
>>  |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
>>  |   ACTUAL_FUNC
>>  | 
>>  | within_1M_to_func_dispatch:
>>  |   load <func_trace_target_data_8B> based on ra
>>  |   jalr
>> 
>> C) Use jal, which can only take us +/-1M, and use a per-function
>>    trampoline requires multiple dispatchers (and tracking which one to
>>    use). Blows up text size A LOT.
>> 
>>  | <func_trace_target_data_8B> # somewhere, but probably on a different cacheline than the .text to avoid ping-ongs
>>  | ...
>>  | per_func_dispatch
>>  |   load <func_trace_target_data_8B> based on ra
>>  |   jalr
>>  | func:
>>  |   ...make sure ra isn't messed up...
>>  |   nop <=> jal # Text patch point -> per_func_dispatch
>>  |   ACTUAL_FUNC
>
> Beware that with option (C) you'll need to handle that in your unwinder for
> RELIABLE_STACKTRACE. If you don't have a symbol for per_func_dispatch (or
> func_trace_target_data_8B), PC values within per_func_dispatch would be
> symbolized as the prior function/data.

Good point (but I don't like C much...)!

>> It's a bit sad that we'll always have to have a dispatcher/trampoline,
>> but it's still better than stop_machine(). (And we'll need a fencei IPI
>> as well, but only one. ;-))
>> 
>> Today, I'm leaning towards A (which is what Mark suggested, and also
>> Robbin).. Any other options?
>
> Assuming my understanding of JALR above is correct, I reckon A is the nicest
> option out of A/B/C.

Yes! +1!


Björn
Steven Rostedt March 21, 2024, 2:49 p.m. UTC | #23
On Tue, 12 Mar 2024 13:42:28 +0000
Mark Rutland <mark.rutland@arm.com> wrote:

> > It would be interesting to see how the per-call performance would
> > improve on x86 with CALL_OPS! ;-)  
> 
> Heh. ;)

But this would require adding -fpatchable-function-entry on x86, which
would increase the size of text, which could possibly have a performance
regression when tracing is disabled.

I'd have no problem if someone were to implement it, but there's a strict
requirement that it does not slow down the system when tracing is disabled.
As tracing is a second class citizen compared to the rest of the kernel.

-- Steve
Andy Chiu March 21, 2024, 5:39 p.m. UTC | #24
On Thu, Mar 21, 2024 at 4:48 PM Björn Töpel <bjorn@kernel.org> wrote:
>
> Andy,
>
> Pulling out the A option:
>
> >> > A) Use auipc/jalr, only patch jalr to take us to a common
> >> >    dispatcher/trampoline
> >> >
> >> >  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
> >> >  | ...
> >> >  | func:
> >> >  |   ...make sure ra isn't messed up...
> >> >  |   aupic
> >> >  |   nop <=> jalr # Text patch point -> common_dispatch
> >> >  |   ACTUAL_FUNC
> >> >  |
> >> >  | common_dispatch:
> >> >  |   load <func_trace_target_data_8B> based on ra
> >> >  |   jalr
> >> >  |   ...
> >> >
> >> > The auipc is never touched, and will be overhead. Also, we need a mv to
> >> > store ra in a scratch register as well -- like Arm. We'll have two insn
> >> > per-caller overhead for a disabled caller.
> >
> > My patch series takes a similar "in-function dispatch" approach. A
> > difference is that the <func_trace_target_data_8B_per_function> is
> > embedded within each function entry. I'd like to have it moved to a
> > run-time allocated array to reduce total text size.
>
> This is what arm64 has as well. It's a 8B + 1-2 dirt cheap movish like
> instructions (save ra, prepare jump with auipc). I think that's a
> reasonable overhead.
>
> > Another difference is that my series changes the first instruction to
> > "j ACTUAL_FUNC" for the "ftrace disable" case. As long as the
> > architecture guarantees the atomicity of the first instruction, then
> > we are safe. For example, we are safe if the first instruction could
> > only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is
> > always valid, we can fix "mv + jalr" down so we don't have to
> > play with the exception handler trick. The guarantee from arch would
> > require ziccif (in RVA22) though, but I think it is the same for us
> > (unless with stop_machine). For ziccif, I would rather call that out
> > during boot than blindly assume.
>
> I'm maybe biased, but I'd prefer A) over your version with the
> unconditional jump. A) has the overhead of two, I'd say, free
> instructions (again "Meten is Weten!" -- measuring is knowing ;-)).

Yes, I'd also prefer A for the smaller overall patch size. We can also
optimize the overhead with a direct jump if that makes sense. Though, we
need to sort out a way to map functions to their corresponding trampolines.
A direct way I could imagine is a CALL_OPS'ish patching style, if the
ftrace destination has to be patched in a per-function manner. For example:

<index-in-dispatch-list>
func_symbol:
auipc t0, common_dispatch:high <=> j actual_func:
jalr t0, common_dispatch:low(t0)

common_dispatch:
load t1, index + dispatch-list
ld t1, 0(t1)
jr t1
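
(As a rough C model of that dispatch-list idea -- not part of the series, and
all names and offsets below are illustrative assumptions -- the common
dispatcher boils down to "read the index stored next to the patch site, look
it up, call the tracer target":)

#include <stdint.h>

typedef void (*trace_fn_t)(unsigned long ip, unsigned long parent_ip);

/* Hypothetical run-time allocated dispatch list, filled at patch time. */
static trace_fn_t dispatch_list[128];

/* t0 is the value the patched jalr leaves behind at the callsite
 * (func_symbol + 8), so a 4-byte index placed just above func_symbol
 * would sit at t0 - 12. */
static void common_dispatch_model(uintptr_t t0, unsigned long ip,
				  unsigned long parent_ip)
{
	uint32_t idx = *(uint32_t *)(t0 - 12);
	trace_fn_t fn = dispatch_list[idx];

	if (fn)
		fn(ip, parent_ip);
}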


>
> > However, one thing I am not very sure is: do we need a destination
> > address in a "per-function" manner? It seems like most of the time the
> > destination address can only be ftrace_call, or ftrace_regs_call. If
> > the number of destination addresses is very few, then we could
> > potentially reduce the size of
> > <func_trace_target_data_8B_per_function>.
>
> > Yes, we do need it in a per-function manner. BPF, e.g., uses
> dynamically/JIT:ed trampolines/targets.
>
>
>
> Björn

Cheers,
Andy
Björn Töpel March 21, 2024, 6:10 p.m. UTC | #25
Andy Chiu <andy.chiu@sifive.com> writes:

> On Thu, Mar 21, 2024 at 4:48 PM Björn Töpel <bjorn@kernel.org> wrote:
>>
>> Andy,
>>
>> Pulling out the A option:
>>
>> >> > A) Use auipc/jalr, only patch jalr to take us to a common
>> >> >    dispatcher/trampoline
>> >> >
>> >> >  | <func_trace_target_data_8B> # probably on a data cache-line != func .text to avoid ping-pong
>> >> >  | ...
>> >> >  | func:
>> >> >  |   ...make sure ra isn't messed up...
>> >> >  |   auipc
>> >> >  |   nop <=> jalr # Text patch point -> common_dispatch
>> >> >  |   ACTUAL_FUNC
>> >> >  |
>> >> >  | common_dispatch:
>> >> >  |   load <func_trace_target_data_8B> based on ra
>> >> >  |   jalr
>> >> >  |   ...
>> >> >
>> >> > The auipc is never touched, and will be overhead. Also, we need a mv to
>> >> > store ra in a scratch register as well -- like Arm. We'll have two insn
>> >> > per-caller overhead for a disabled caller.
>> >
>> > My patch series takes a similar "in-function dispatch" approach. A
>> > difference is that the <func_trace_target_data_8B_per_function> is
>> > embedded within each function entry. I'd like to have it moved to a
>> > run-time allocated array to reduce total text size.
>>
>> This is what arm64 has as well. It's an 8B literal plus 1-2 dirt cheap
>> mov-ish instructions (save ra, prepare the jump with auipc). I think
>> that's a reasonable overhead.
>>
>> > Another difference is that my series changes the first instruction to
>> > "j ACTUAL_FUNC" for the "ftrace disable" case. As long as the
>> > architecture guarantees the atomicity of the first instruction, then
>> > we are safe. For example, we are safe if the first instruction could
>> > only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is
>> > always valid, we can fix "mv + jalr" down so we don't have to
>> > play with the exception handler trick. The guarantee from arch would
>> > require ziccif (in RVA22) though, but I think it is the same for us
>> > (unless with stop_machine). For ziccif, I would rather call that out
>> > during boot than blindly assume.
>>
>> I'm maybe biased, but I'd prefer A) over your version with the
>> unconditional jump. A) has the overhead of two, I'd say, free
>> instructions (again "Meten is Weten!" -- measuring is knowing ;-)).
>
> Yes, I'd also prefer A for less overall patch size. We can also
> optimize the overhead with a direct jump if that makes sense. Though,
> we need to sort out a way to map functions to corresponding
> trampolines. A direct way I could image is CALL_OPS'ish patching
> style, if the ftrace destination has to be patched in a per-function
> manner. For example:
>
> <index-in-dispatch-list>
> func_symbol:
> auipc t0, common_dispatch:high <=> j actual_func:
> jalr t0, common_dispatch:low(t0)
>
> common_dispatch:
> load t1, index + dispatch-list
> ld t1, 0(t1)
> jr t1

Yup, exactly like that (but I'd put the actual target ptr in there,
instead of an additional indirection -- exactly what Mark does for arm64).

When we enter common_dispatch, the <index-in-dispatch-list> literal
would be at -12(t0).
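
(To make that concrete, here is a small C model of the refined scheme -- not
part of the series, all names are made up, and the -12 offset is simply taken
from the paragraph above. The literal next to the patch site holds the tracer
target directly, so the common dispatcher is just "load the word at a fixed
offset from the address left in t0, then jump to it", much like the earlier
dispatch-list model but without the extra indirection.)

#include <stdint.h>

typedef void (*trace_fn_t)(unsigned long ip, unsigned long parent_ip);

/* t0 is the value the patched jalr leaves behind at the callsite. */
static void common_dispatch_direct(uintptr_t t0, unsigned long ip,
				   unsigned long parent_ip)
{
	/* Per-function literal; the exact offset depends on the final
	 * layout of the literal relative to func_symbol. */
	trace_fn_t target = *(trace_fn_t *)(t0 - 12);

	if (target)
		target(ip, parent_ip);
}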

As for patching the auipc or the jalr, I guess we need to measure what's
best. My knee-jerk reaction is that the auipc is always better than a
jump -- but let's measure. ;-)


Björn
Steven Rostedt March 21, 2024, 8:02 p.m. UTC | #26
On Tue, 12 Mar 2024 13:42:28 +0000
Mark Rutland <mark.rutland@arm.com> wrote:

> There are ways around that, but they're complicated and/or expensive, e.g.
> 
> * Use a sequence of multiple patches, starting with replacing the JALR with an
>   exception-generating instruction with a fixup handler, which is sort-of what
>   x86 does with UD2. This may require multiple passes with
>   synchronize_rcu_tasks() to make sure all threads have seen the latest
>   instructions, and that cannot be done under stop_machine(), so if you need
>   stop_machine() for CMODx reasons, you may need to use that several times with
>   intervening calls to synchronize_rcu_tasks().

Just for clarification: x86 doesn't use UD2 for updating the call sites. It
uses the breakpoint (0xcc) instruction. This is because x86 instructions are
not a fixed size and can cross cache-line boundaries, making updates to text
sections dangerous, as another CPU may see half the old instruction and half
the new one!

Thus, when we have:

	0f 1f 44 00 00		nop

and want to convert it to:

	e8 e7 57 07 00		call ftrace_caller

We have to first add a breakpoint:

	cc 1f 44 00 00

Then we send an IPI to all CPUs so that they see the breakpoint (the IPI
acts as a memory barrier).

We have an ftrace breakpoint handler that looks at the site the breakpoint
happened on. If it was a nop, it just skips over the rest of the instruction
and returns from the breakpoint handler to just after the "cc 1f 44 00 00".
If it's supposed to call a function, it pushes the address after the
instruction onto the stack as the return address, and returns from the
breakpoint handler to ftrace_caller. (Although things have changed a little
since then; this update is now handled by text_poke_bp_batch().)

Then it changes the rest of the instruction:

	cc e7 57 07 00

Then we send out another IPI to all CPUs and remove the breakpoint by
writing the new instruction's opcode:

	e8 e7 57 07 00

and now all the callers of this function will call the ftrace_caller.
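
(As a rough, single-threaded C sketch of that sequence -- the helpers below
are stand-ins for the real text_poke()/IPI machinery, these days
text_poke_bp_batch(), and the whole thing is illustrative only:)

#include <string.h>

#define INT3 0xcc

/* Stand-in for the "send an IPI to all CPUs" serialization step. */
static void sync_cores(void) { }

static void patch_call_site(unsigned char *site, const unsigned char new5[5])
{
	/* 1) Guard the site: only the first byte becomes int3, so a CPU
	 *    racing with the update either sees the old instruction or
	 *    traps into the ftrace breakpoint handler. */
	site[0] = INT3;
	sync_cores();

	/* 2) Rewrite the remaining 4 bytes (the call's rel32 operand)
	 *    behind the int3 guard. */
	memcpy(site + 1, new5 + 1, 4);
	sync_cores();

	/* 3) Drop the int3 by writing the real opcode (0xe8 for call),
	 *    completing the new 5-byte instruction. */
	site[0] = new5[0];
	sync_cores();
}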

-- Steve
Robbin Ehn March 25, 2024, 12:50 p.m. UTC | #27
Hey,

> <index-in-dispatch-list>
> func_symbol:
> auipc t0, common_dispatch:high <=> j actual_func:
> jalr t0, common_dispatch:low(t0)
>

If you are patching in a jump, I don't see why you wouldn't jump over the
ld+jalr as well (then there is no need for a common dispatch)?
Patching the jalr with a nop, and keeping the auipc, addresses the issue of
having to jump in the disabled case, but it needs either a common dispatch
or a per-function dispatch.

Thanks, Robbin

> common_dispatch:
> load t1, index + dispatch-list
> ld t1, 0(t1)
> jr t1
>
>
> >
> > > However, one thing I am not very sure is: do we need a destination
> > > address in a "per-function" manner? It seems like most of the time the
> > > destination address can only be ftrace_call, or ftrace_regs_call. If
> > > the number of destination addresses is very few, then we could
> > > potentially reduce the size of
> > > <func_trace_target_data_8B_per_function>.
> >
> > Yes, we do need it in a per-function manner. BPF, e.g., uses
> > dynamically/JIT:ed trampolines/targets.
> >
> >
> >
> > Björn
>
> Cheers,
> Andy
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 0bfcfec67ed5..e474742e23b2 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -78,6 +78,7 @@  config RISCV
 	select EDAC_SUPPORT
 	select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)
 	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
+	select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS
 	select GENERIC_ARCH_TOPOLOGY
 	select GENERIC_ATOMIC64 if !64BIT
 	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
@@ -127,6 +128,7 @@  config RISCV
 	select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
 	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
+	select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_REGS && !CFI_CLANG)
 	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 252d63942f34..875ad5dc3d32 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -14,12 +14,20 @@  endif
 ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
 	LDFLAGS_vmlinux += --no-relax
 	KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
+ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS), y)
+ifeq ($(CONFIG_RISCV_ISA_C),y)
+	CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4
+else
+	CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2
+endif
+else
 ifeq ($(CONFIG_RISCV_ISA_C),y)
 	CC_FLAGS_FTRACE := -fpatchable-function-entry=4
 else
 	CC_FLAGS_FTRACE := -fpatchable-function-entry=2
 endif
 endif
+endif
 
 ifeq ($(CONFIG_CMODEL_MEDLOW),y)
 KBUILD_CFLAGS_MODULE += -mcmodel=medany
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 329172122952..c9a84222c9ea 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -28,6 +28,9 @@ 
 void MCOUNT_NAME(void);
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
+	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
+		return addr + 8;
+
 	return addr;
 }
 
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index a03129f40c46..7d7c4b486852 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -488,4 +488,7 @@  void asm_offsets(void)
 	DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN));
 	OFFSET(STACKFRAME_FP, stackframe, fp);
 	OFFSET(STACKFRAME_RA, stackframe, ra);
+#ifdef CONFIG_FUNCTION_TRACER
+	DEFINE(FTRACE_OPS_FUNC,		offsetof(struct ftrace_ops, func));
+#endif
 }
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index f5aa24d9e1c1..e2e75e15d32e 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -82,9 +82,52 @@  static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
 	return 0;
 }
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec)
+{
+	const struct ftrace_ops *ops = NULL;
+
+	if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+		ops = ftrace_find_unique_ops(rec);
+		WARN_ON_ONCE(!ops);
+	}
+
+	if (!ops)
+		ops = &ftrace_list_ops;
+
+	return ops;
+}
+
+static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
+			      const struct ftrace_ops *ops)
+{
+	unsigned long literal = rec->ip - 8;
+
+	return patch_text_nosync((void *)literal, &ops, sizeof(ops));
+}
+
+static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
+{
+	return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
+}
+
+static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
+{
+	return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec));
+}
+#else
+static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
+static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
+#endif
+
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
 	unsigned int call[2];
+	int ret;
+
+	ret = ftrace_rec_update_ops(rec);
+	if (ret)
+		return ret;
 
 	make_call_t0(rec->ip, addr, call);
 
@@ -98,6 +141,11 @@  int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
 		    unsigned long addr)
 {
 	unsigned int nops[2] = {NOP4, NOP4};
+	int ret;
+
+	ret = ftrace_rec_set_nop_ops(rec);
+	if (ret)
+		return ret;
 
 	if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
 		return -EPERM;
@@ -125,6 +173,13 @@  int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
 
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
+	/*
+	 * When using CALL_OPS, the function to call is associated with the
+	 * call site, and we don't have a global function pointer to update.
+	 */
+	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
+		return 0;
+
 	int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
 				       (unsigned long)func, true, true);
 	if (!ret) {
@@ -147,6 +202,10 @@  int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
 	make_call_t0(caller, old_addr, call);
 	ret = ftrace_check_current_call(caller, call);
 
+	if (ret)
+		return ret;
+
+	ret = ftrace_rec_update_ops(rec);
 	if (ret)
 		return ret;
 
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index b7561288e8da..cb241e36e514 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -191,11 +191,35 @@ 
 	.endm
 
 	.macro PREPARE_ARGS
-	addi	a0, t0, -FENTRY_RA_OFFSET
+	addi	a0, t0, -FENTRY_RA_OFFSET	// ip (callsite's auipc insn)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+	/*
+	 * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the
+	 * function entry, these are later overwritten with the pointer to the
+	 * associated struct ftrace_ops.
+	 *
+	 * -8: &ftrace_ops of the associated tracer function.
+	 *<ftrace enable>:
+	 *  0: auipc  t0/ra, 0x?
+	 *  4: jalr   t0/ra, ?(t0/ra)
+	 *
+	 * -8: &ftrace_nop_ops
+	 *<ftrace disable>:
+	 *  0: nop
+	 *  4: nop
+	 *
+	 * t0 is set to ip+8 after the jalr is executed at the callsite,
+	 * so we find the associated op at t0-16.
+	 */
+	mv 	a1, ra				// parent_ip
+	REG_L   a2, -16(t0)			// op
+	REG_L   ra, FTRACE_OPS_FUNC(a2)		// op->func
+#else
 	la	a1, function_trace_op
-	REG_L	a2, 0(a1)
-	mv	a1, ra
-	mv	a3, sp
+	REG_L	a2, 0(a1)			// op
+	mv	a1, ra				// parent_ip
+#endif
+	mv	a3, sp				// regs
 	.endm
 
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
@@ -233,8 +257,12 @@  SYM_FUNC_START(ftrace_regs_caller)
 	SAVE_ABI_REGS 1
 	PREPARE_ARGS
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+	jalr ra
+#else
 SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
 	call	ftrace_stub
+#endif
 
 	RESTORE_ABI_REGS 1
 	bnez	t1, .Ldirect
@@ -247,9 +275,13 @@  SYM_FUNC_START(ftrace_caller)
 	SAVE_ABI_REGS 0
 	PREPARE_ARGS
 
-SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+	jalr ra
+#else
+SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
 	call	ftrace_stub
 
+#endif
 	RESTORE_ABI_REGS 0
 	jr	t0
 SYM_FUNC_END(ftrace_caller)