
[RFC] riscv: use gp to save percpu offset

Message ID 20240824004920.35877-1-cuiyunhui@bytedance.com (mailing list archive)
State New
Series [RFC] riscv: use gp to save percpu offset

Commit Message

Yunhui Cui Aug. 24, 2024, 12:49 a.m. UTC
Compared to directly fetching the per-CPU offset from memory (or cache),
using the global pointer (gp) to store the per-CPU offset can save one
memory access.
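
For illustration, the saving amounts to roughly the following (a sketch
of the generated code on RV64; exact sequences depend on the compiler):

    # Generic path: the offset is loaded from __per_cpu_offset[] in memory
    lw   t0, TASK_TI_CPU(tp)       # cpu id from thread_info
    slli t0, t0, 3                 # scale by sizeof(unsigned long)
    la   t1, __per_cpu_offset
    add  t1, t1, t0
    ld   t1, 0(t1)                 # the memory access this patch removes

    # With this patch: the offset is already live in gp
    mv   t1, gp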

When building the kernel, the following flags must be specified
explicitly, since gp is repurposed and can no longer be used for linker
relaxation against __global_pointer$:
export KCFLAGS="... -mno-relax"
export KAFLAGS="... -mno-relax"

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
 arch/riscv/include/asm/asm.h      | 18 ++++++------------
 arch/riscv/include/asm/percpu.h   | 24 ++++++++++++++++++++++++
 arch/riscv/kernel/asm-offsets.c   |  1 +
 arch/riscv/kernel/entry.S         |  4 ++--
 arch/riscv/kernel/head.S          |  9 ---------
 arch/riscv/kernel/smpboot.c       |  7 +++++++
 arch/riscv/kernel/suspend_entry.S |  2 --
 7 files changed, 40 insertions(+), 25 deletions(-)
 create mode 100644 arch/riscv/include/asm/percpu.h

Comments

Christoph Lameter (Ampere) Aug. 24, 2024, 1:57 a.m. UTC | #1
On Sat, 24 Aug 2024, Yunhui Cui wrote:

> Compared to directly fetching the per-CPU offset from memory (or cache),
> using the global pointer (gp) to store the per-CPU offset can save one
> memory access.

Yes! That is a step in the right direction.

Is there something like gp-relative addressing so that we can do loads
and stores relative to gp as well?

Are there atomics that can do read-modify-write relative to gp? That would
get you comparable per-CPU efficiency to x86. x86 can do relative
addressing and RMW in one instruction, which allows one to drop the preempt
enable/disable since a single instruction cannot be interrupted.
Yunhui Cui Aug. 30, 2024, 10:01 a.m. UTC | #2
Hi Christoph,


On Sat, Aug 24, 2024 at 9:57 AM Christoph Lameter (Ampere)
<cl@gentwo.org> wrote:
>
> On Sat, 24 Aug 2024, Yunhui Cui wrote:
>
> > Compared to directly fetching the per-CPU offset from memory (or cache),
> > using the global pointer (gp) to store the per-CPU offset can save one
> > memory access.
>
> Yes! That is a step in the right direction.
>
> Is there something like gp-relative addressing so that we can do loads
> and stores relative to gp as well?
>
> Are there atomics that can do read-modify-write relative to gp? That would
> get you comparable per-CPU efficiency to x86. x86 can do relative
> addressing and RMW in one instruction, which allows one to drop the preempt
> enable/disable since a single instruction cannot be interrupted.

Your suggestion is excellent. If the ISA allows it, we can indeed move
closer to x86's per-CPU efficiency.

Thanks,
Yunhui
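
For reference on the ISA question: RISC-V loads and stores accept a
12-bit signed immediate off any base register, gp included, so anything
within +/-2 KiB of gp is reachable in one instruction. The A-extension
AMOs take their address only in a register, with no immediate, so a
gp-relative read-modify-write still needs the address computed first.
An illustrative sketch (my_counter is a hypothetical per-CPU variable):

    la       t0, my_counter        # link-time address inside .data..percpu
    add      t0, t0, gp            # apply this CPU's offset held in gp
    amoadd.d zero, t1, (t0)        # one atomic RMW at the final address

The AMO itself is a single atomic instruction, but the address
computation before it can still be split by an interrupt, so the preempt
enable/disable cannot be dropped the way x86's one-instruction
gs-relative RMW allows.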

Patch

diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h
index 776354895b81..be4e4e5ac134 100644
--- a/arch/riscv/include/asm/asm.h
+++ b/arch/riscv/include/asm/asm.h
@@ -109,19 +109,13 @@ 
 	REG_L \dst, 0(\dst)
 .endm
 
-#ifdef CONFIG_SHADOW_CALL_STACK
-/* gp is used as the shadow call stack pointer instead */
-.macro load_global_pointer
+.macro load_pcpu_off_gp tmp
+	REG_L \tmp, TASK_TI_CPU(tp)
+	slli \tmp, \tmp, 3
+	la gp, __per_cpu_offset
+	add gp, gp, \tmp
+	REG_L gp, 0(gp)
 .endm
-#else
-/* load __global_pointer to gp */
-.macro load_global_pointer
-.option push
-.option norelax
-	la gp, __global_pointer$
-.option pop
-.endm
-#endif /* CONFIG_SHADOW_CALL_STACK */
 
 	/* save all GPs except x1 ~ x5 */
 	.macro save_from_x6_to_x31
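
In C terms, the new load_pcpu_off_gp helper computes (an illustrative
rendering, not part of the patch; tp holds the current task_struct
pointer on riscv):

    /* gp = __per_cpu_offset[current's CPU id] */
    gp = __per_cpu_offset[current->thread_info.cpu];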
diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 000000000000..858d0a93ff14
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,24 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+static inline void set_my_cpu_offset(unsigned long off)
+{
+	asm volatile("addi gp, %0, 0" :: "r" (off));
+}
+
+static inline unsigned long __kern_my_cpu_offset(void)
+{
+	unsigned long off;
+
+	asm ("mv %0, gp":"=r" (off) :);
+	return off;
+}
+
+#define __my_cpu_offset __kern_my_cpu_offset()
+
+#include <asm-generic/percpu.h>
+
+#endif /* __ASM_PERCPU_H */
+
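
With __my_cpu_offset defined this way, the accessors all come from
asm-generic/percpu.h unchanged; conceptually (a simplified excerpt of
the existing generic code, not part of this patch):

    /* include/asm-generic/percpu.h, simplified */
    #define arch_raw_cpu_ptr(ptr)	SHIFT_PERCPU_PTR(ptr, __my_cpu_offset)
    /*
     * this_cpu_ptr(&var) thus becomes &var plus the offset that
     * __kern_my_cpu_offset() reads back out of gp with a single mv.
     */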
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index b09ca5f944f7..5cc6d1de4ab4 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -36,6 +36,7 @@  void asm_offsets(void)
 	OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
 	OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
 	OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+	OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
 	OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
 	OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
 	OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index ac2e908d4418..39d7e66567cf 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -77,8 +77,8 @@  SYM_CODE_START(handle_exception)
 	 */
 	csrw CSR_SCRATCH, x0
 
-	/* Load the global pointer */
-	load_global_pointer
+	/* Load __per_cpu_offset[cpu] into gp */
+	load_pcpu_off_gp t6
 
 	/* Load the kernel shadow call stack pointer if coming from userspace */
 	scs_load_current_if_task_changed s5
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 356d5397b2a2..aa3d22967eef 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -110,9 +110,6 @@  relocate_enable_mmu:
 	la a0, .Lsecondary_park
 	csrw CSR_TVEC, a0
 
-	/* Reload the global pointer */
-	load_global_pointer
-
 	/*
 	 * Switch to kernel page tables.  A full fence is necessary in order to
 	 * avoid using the trampoline translations, which are only correct for
@@ -131,9 +128,6 @@  secondary_start_sbi:
 	csrw CSR_IE, zero
 	csrw CSR_IP, zero
 
-	/* Load the global pointer */
-	load_global_pointer
-
 	/*
 	 * Disable FPU & VECTOR to detect illegal usage of
 	 * floating point or vector in kernel space
@@ -228,9 +222,6 @@  SYM_CODE_START(_start_kernel)
 	csrr a0, CSR_MHARTID
 #endif /* CONFIG_RISCV_M_MODE */
 
-	/* Load the global pointer */
-	load_global_pointer
-
 	/*
 	 * Disable FPU & VECTOR to detect illegal usage of
 	 * floating point or vector in kernel space
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 0f8f1c95ac38..844aede75662 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -41,6 +41,11 @@ 
 
 static DECLARE_COMPLETION(cpu_running);
 
+void __init smp_prepare_boot_cpu(void)
+{
+	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+}
+
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	int cpuid;
@@ -212,6 +217,8 @@  asmlinkage __visible void smp_callin(void)
 	struct mm_struct *mm = &init_mm;
 	unsigned int curr_cpuid = smp_processor_id();
 
+	set_my_cpu_offset(per_cpu_offset(curr_cpuid));
+
 	if (has_vector()) {
 		/*
 		 * Return as early as possible so the hart with a mismatching
diff --git a/arch/riscv/kernel/suspend_entry.S b/arch/riscv/kernel/suspend_entry.S
index 2d54f309c140..0ec850489e0c 100644
--- a/arch/riscv/kernel/suspend_entry.S
+++ b/arch/riscv/kernel/suspend_entry.S
@@ -60,8 +60,6 @@  SYM_FUNC_START(__cpu_suspend_enter)
 SYM_FUNC_END(__cpu_suspend_enter)
 
 SYM_TYPED_FUNC_START(__cpu_resume_enter)
-	/* Load the global pointer */
-	load_global_pointer
 
 #ifdef CONFIG_MMU
 	/* Save A0 and A1 */