Message ID | 20250123-riscv_optimize_entry-v2-1-7c259492d508@rivosinc.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | entry: Move ret_from_fork() to C and inline syscall_exit_to_user_mode() | expand |
On Thu, Jan 23, 2025 at 2:15 PM Charlie Jenkins <charlie@rivosinc.com> wrote: > > Move the main section of ret_from_fork() to C to allow inlining of > syscall_exit_to_user_mode(). > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com> > --- > arch/riscv/include/asm/asm-prototypes.h | 1 + > arch/riscv/kernel/entry.S | 15 ++++++--------- > arch/riscv/kernel/process.c | 14 ++++++++++++-- > 3 files changed, 19 insertions(+), 11 deletions(-) > > diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h > index cd627ec289f163a630b73dd03dd52a6b28692997..733ff609778797001006c33bba9e3cc5b1f15387 100644 > --- a/arch/riscv/include/asm/asm-prototypes.h > +++ b/arch/riscv/include/asm/asm-prototypes.h > @@ -52,6 +52,7 @@ DECLARE_DO_ERROR_INFO(do_trap_ecall_s); > DECLARE_DO_ERROR_INFO(do_trap_ecall_m); > DECLARE_DO_ERROR_INFO(do_trap_break); > > +asmlinkage void ret_from_fork(void *fn_arg, int (*fn)(void *), struct pt_regs *regs); > asmlinkage void handle_bad_stack(struct pt_regs *regs); > asmlinkage void do_page_fault(struct pt_regs *regs); > asmlinkage void do_irq(struct pt_regs *regs); > diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S > index 33a5a9f2a0d4e1eeccfb3621b9e518b88e1b0704..9225c322279aa90e737b1d7144db084319cf8103 100644 > --- a/arch/riscv/kernel/entry.S > +++ b/arch/riscv/kernel/entry.S > @@ -319,17 +319,14 @@ SYM_CODE_END(handle_kernel_stack_overflow) > ASM_NOKPROBE(handle_kernel_stack_overflow) > #endif > > -SYM_CODE_START(ret_from_fork) > +SYM_CODE_START(ret_from_fork_asm) > call schedule_tail > - beqz s0, 1f /* not from kernel thread */ > - /* Call fn(arg) */ > - move a0, s1 > - jalr s0 > -1: > - move a0, sp /* pt_regs */ > - call syscall_exit_to_user_mode > + move a0, s1 /* fn */ > + move a1, s0 /* fn_arg */ > + move a2, sp /* pt_regs */ > + call ret_from_fork > j ret_from_exception > -SYM_CODE_END(ret_from_fork) > +SYM_CODE_END(ret_from_fork_asm) > > #ifdef CONFIG_IRQ_STACKS > /* > diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c > index 58b6482c2bf662bf5224ca50c8e21a68760a6b41..0d07e6d8f6b57beba438dbba5e8c74a014582bee 100644 > --- a/arch/riscv/kernel/process.c > +++ b/arch/riscv/kernel/process.c > @@ -17,7 +17,9 @@ > #include <linux/ptrace.h> > #include <linux/uaccess.h> > #include <linux/personality.h> > +#include <linux/entry-common.h> > > +#include <asm/asm-prototypes.h> > #include <asm/unistd.h> > #include <asm/processor.h> > #include <asm/csr.h> > @@ -36,7 +38,7 @@ unsigned long __stack_chk_guard __read_mostly; > EXPORT_SYMBOL(__stack_chk_guard); > #endif > > -extern asmlinkage void ret_from_fork(void); > +extern asmlinkage void ret_from_fork_asm(void); > > void noinstr arch_cpu_idle(void) > { > @@ -206,6 +208,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) > return 0; > } > > +asmlinkage void ret_from_fork(void *fn_arg, int (*fn)(void *), struct pt_regs *regs) > +{ > + if (unlikely(fn)) > + fn(fn_arg); > + > + syscall_exit_to_user_mode(regs); > +} > + > int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) > { > unsigned long clone_flags = args->flags; > @@ -242,7 +252,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) > p->thread.riscv_v_flags = 0; > if (has_vector()) > riscv_v_thread_alloc(p); > - p->thread.ra = (unsigned long)ret_from_fork; > + p->thread.ra = (unsigned long)ret_from_fork_asm; > p->thread.sp = (unsigned long)childregs; /* kernel sp */ > return 0; > } > > -- > 2.43.0 > > Is there a specific reason you didn't move the call to schedule_tail() to the C function, like on x86? Brian Gerst
On Fri, Jan 24, 2025 at 08:14:08AM -0500, Brian Gerst wrote: > On Thu, Jan 23, 2025 at 2:15 PM Charlie Jenkins <charlie@rivosinc.com> wrote: > > > > Move the main section of ret_from_fork() to C to allow inlining of > > syscall_exit_to_user_mode(). > > > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com> > > --- > > arch/riscv/include/asm/asm-prototypes.h | 1 + > > arch/riscv/kernel/entry.S | 15 ++++++--------- > > arch/riscv/kernel/process.c | 14 ++++++++++++-- > > 3 files changed, 19 insertions(+), 11 deletions(-) > > > > diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h > > index cd627ec289f163a630b73dd03dd52a6b28692997..733ff609778797001006c33bba9e3cc5b1f15387 100644 > > --- a/arch/riscv/include/asm/asm-prototypes.h > > +++ b/arch/riscv/include/asm/asm-prototypes.h > > @@ -52,6 +52,7 @@ DECLARE_DO_ERROR_INFO(do_trap_ecall_s); > > DECLARE_DO_ERROR_INFO(do_trap_ecall_m); > > DECLARE_DO_ERROR_INFO(do_trap_break); > > > > +asmlinkage void ret_from_fork(void *fn_arg, int (*fn)(void *), struct pt_regs *regs); > > asmlinkage void handle_bad_stack(struct pt_regs *regs); > > asmlinkage void do_page_fault(struct pt_regs *regs); > > asmlinkage void do_irq(struct pt_regs *regs); > > diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S > > index 33a5a9f2a0d4e1eeccfb3621b9e518b88e1b0704..9225c322279aa90e737b1d7144db084319cf8103 100644 > > --- a/arch/riscv/kernel/entry.S > > +++ b/arch/riscv/kernel/entry.S > > @@ -319,17 +319,14 @@ SYM_CODE_END(handle_kernel_stack_overflow) > > ASM_NOKPROBE(handle_kernel_stack_overflow) > > #endif > > > > -SYM_CODE_START(ret_from_fork) > > +SYM_CODE_START(ret_from_fork_asm) > > call schedule_tail > > - beqz s0, 1f /* not from kernel thread */ > > - /* Call fn(arg) */ > > - move a0, s1 > > - jalr s0 > > -1: > > - move a0, sp /* pt_regs */ > > - call syscall_exit_to_user_mode > > + move a0, s1 /* fn */ > > + move a1, s0 /* fn_arg */ > > + move a2, sp /* pt_regs */ > > + call ret_from_fork > > j ret_from_exception > > -SYM_CODE_END(ret_from_fork) > > +SYM_CODE_END(ret_from_fork_asm) > > > > #ifdef CONFIG_IRQ_STACKS > > /* > > diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c > > index 58b6482c2bf662bf5224ca50c8e21a68760a6b41..0d07e6d8f6b57beba438dbba5e8c74a014582bee 100644 > > --- a/arch/riscv/kernel/process.c > > +++ b/arch/riscv/kernel/process.c > > @@ -17,7 +17,9 @@ > > #include <linux/ptrace.h> > > #include <linux/uaccess.h> > > #include <linux/personality.h> > > +#include <linux/entry-common.h> > > > > +#include <asm/asm-prototypes.h> > > #include <asm/unistd.h> > > #include <asm/processor.h> > > #include <asm/csr.h> > > @@ -36,7 +38,7 @@ unsigned long __stack_chk_guard __read_mostly; > > EXPORT_SYMBOL(__stack_chk_guard); > > #endif > > > > -extern asmlinkage void ret_from_fork(void); > > +extern asmlinkage void ret_from_fork_asm(void); > > > > void noinstr arch_cpu_idle(void) > > { > > @@ -206,6 +208,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) > > return 0; > > } > > > > +asmlinkage void ret_from_fork(void *fn_arg, int (*fn)(void *), struct pt_regs *regs) > > +{ > > + if (unlikely(fn)) > > + fn(fn_arg); > > + > > + syscall_exit_to_user_mode(regs); > > +} > > + > > int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) > > { > > unsigned long clone_flags = args->flags; > > @@ -242,7 +252,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) > > p->thread.riscv_v_flags = 0; > > if (has_vector()) > > riscv_v_thread_alloc(p); > > - p->thread.ra = (unsigned long)ret_from_fork; > > + p->thread.ra = (unsigned long)ret_from_fork_asm; > > p->thread.sp = (unsigned long)childregs; /* kernel sp */ > > return 0; > > } > > > > -- > > 2.43.0 > > > > > > Is there a specific reason you didn't move the call to schedule_tail() > to the C function, like on x86? Yes, the generated code ends up being dramatically worse if schedule_tail() is moved into C. This is because the arg for schedule_tail() is already in a0 so the extra stack manipulation instructions end up taking up a lot of instructions. With this change: <ret_from_fork_asm>: ff65b097 auipc ra,0xff65b 1ee080e7 jalr 494(ra) # ffffffff8005038a <schedule_tail> 8526 mv a0,s1 85a2 mv a1,s0 860a mv a2,sp ff61b097 auipc ra,0xff61b 606080e7 jalr 1542(ra) # ffffffff800107b0 <ret_from_fork> b5f5 j ffffffff809f509e <ret_from_exception> <ret_from_fork>: 1101 addi sp,sp,-32 e822 sd s0,16(sp) ec06 sd ra,24(sp) 1000 addi s0,sp,32 e991 bnez a1,ffffffff800107cc <ret_from_fork+0x1c> 8532 mv a0,a2 009db097 auipc ra,0x9db a32080e7 jalr -1486(ra) # ffffffff809eb1ee <syscall_exit_to_user_mode> 60e2 ld ra,24(sp) 6442 ld s0,16(sp) 6105 addi sp,sp,32 8082 ret <following instructions used only in the kernel thread case> fec43423 sd a2,-24(s0) 9582 jalr a1 fe843603 ld a2,-24(s0) 8532 mv a0,a2 009db097 auipc ra,0x9db a16080e7 jalr -1514(ra) # ffffffff809eb1ee <syscall_exit_to_user_mode> 60e2 ld ra,24(sp) 6442 ld s0,16(sp) 6105 addi sp,sp,32 8082 ret Contrasted with what this looks like if schedule_tail() is called from C. <ret_from_fork_asm>: 85a6 mv a1,s1 8622 mv a2,s0 868a mv a3,sp ff61b097 auipc ra,0xff61b 60e080e7 jalr 1550(ra) # ffffffff800107b0 <ret_from_fork> bdd5 j ffffffff809f509e <ret_from_exception> <ret_from_fork>: 7179 addi sp,sp,-48 f022 sd s0,32(sp) ec26 sd s1,24(sp) e84a sd s2,16(sp) e44e sd s3,8(sp) f406 sd ra,40(sp) 1800 addi s0,sp,48 84b2 mv s1,a2 89ae mv s3,a1 8936 mv s2,a3 3c73f0ef jal ffffffff8005038a <schedule_tail> ec89 bnez s1,ffffffff800107e2 <ret_from_fork+0x32> 854a mv a0,s2 009db097 auipc ra,0x9db a22080e7 jalr -1502(ra) # ffffffff809eb1ee <syscall_exit_to_user_mode> 70a2 ld ra,40(sp) 7402 ld s0,32(sp) 64e2 ld s1,24(sp) 6942 ld s2,16(sp) 69a2 ld s3,8(sp) 6145 addi sp,sp,48 8082 ret 854e mv a0,s3 9482 jalr s1 b7d5 j ffffffff800107ca <ret_from_fork+0x1a> ret_from_fork_asm ends up being 2 instructions more when calling from asm, but the user fork ret_from_fork ends up being only 12 instructions rather than 22 instructions when calling from C. If we were able to mix asm and C code in a naked function we would be able to get rid of the stack manipulation and still be able to inline C but we don't live in that world... - Charlie > > > Brian Gerst
diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index cd627ec289f163a630b73dd03dd52a6b28692997..733ff609778797001006c33bba9e3cc5b1f15387 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -52,6 +52,7 @@ DECLARE_DO_ERROR_INFO(do_trap_ecall_s); DECLARE_DO_ERROR_INFO(do_trap_ecall_m); DECLARE_DO_ERROR_INFO(do_trap_break); +asmlinkage void ret_from_fork(void *fn_arg, int (*fn)(void *), struct pt_regs *regs); asmlinkage void handle_bad_stack(struct pt_regs *regs); asmlinkage void do_page_fault(struct pt_regs *regs); asmlinkage void do_irq(struct pt_regs *regs); diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 33a5a9f2a0d4e1eeccfb3621b9e518b88e1b0704..9225c322279aa90e737b1d7144db084319cf8103 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -319,17 +319,14 @@ SYM_CODE_END(handle_kernel_stack_overflow) ASM_NOKPROBE(handle_kernel_stack_overflow) #endif -SYM_CODE_START(ret_from_fork) +SYM_CODE_START(ret_from_fork_asm) call schedule_tail - beqz s0, 1f /* not from kernel thread */ - /* Call fn(arg) */ - move a0, s1 - jalr s0 -1: - move a0, sp /* pt_regs */ - call syscall_exit_to_user_mode + move a0, s1 /* fn */ + move a1, s0 /* fn_arg */ + move a2, sp /* pt_regs */ + call ret_from_fork j ret_from_exception -SYM_CODE_END(ret_from_fork) +SYM_CODE_END(ret_from_fork_asm) #ifdef CONFIG_IRQ_STACKS /* diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 58b6482c2bf662bf5224ca50c8e21a68760a6b41..0d07e6d8f6b57beba438dbba5e8c74a014582bee 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -17,7 +17,9 @@ #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/personality.h> +#include <linux/entry-common.h> +#include <asm/asm-prototypes.h> #include <asm/unistd.h> #include <asm/processor.h> #include <asm/csr.h> @@ -36,7 +38,7 @@ unsigned long __stack_chk_guard __read_mostly; EXPORT_SYMBOL(__stack_chk_guard); #endif -extern asmlinkage void ret_from_fork(void); +extern asmlinkage void ret_from_fork_asm(void); void noinstr arch_cpu_idle(void) { @@ -206,6 +208,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } +asmlinkage void ret_from_fork(void *fn_arg, int (*fn)(void *), struct pt_regs *regs) +{ + if (unlikely(fn)) + fn(fn_arg); + + syscall_exit_to_user_mode(regs); +} + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; @@ -242,7 +252,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.riscv_v_flags = 0; if (has_vector()) riscv_v_thread_alloc(p); - p->thread.ra = (unsigned long)ret_from_fork; + p->thread.ra = (unsigned long)ret_from_fork_asm; p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; }
Move the main section of ret_from_fork() to C to allow inlining of syscall_exit_to_user_mode(). Signed-off-by: Charlie Jenkins <charlie@rivosinc.com> --- arch/riscv/include/asm/asm-prototypes.h | 1 + arch/riscv/kernel/entry.S | 15 ++++++--------- arch/riscv/kernel/process.c | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 11 deletions(-)