diff mbox series

[RFC,perf/core,06/11] uprobes: Add uprobe syscall to speed up uprobe

Message ID 20241105133405.2703607-7-jolsa@kernel.org (mailing list archive)
State New
Headers show
Series uprobes: Add support to optimize usdt probes on x86_64 | expand

Commit Message

Jiri Olsa Nov. 5, 2024, 1:34 p.m. UTC
Add a new uprobe syscall that calls the uprobe handlers for a given
'breakpoint' address.

The idea is that the 'breakpoint' address calls the user space
trampoline which executes the uprobe syscall.

The syscall handler reads the return address of the initial call
to retrieve the original 'breakpoint' address.

With this address we find the related uprobe object and call its
consumers.

TODO: allow the uprobe syscall to be called only from the uprobe trampoline.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 arch/x86/kernel/uprobes.c              | 48 ++++++++++++++++++++++++++
 include/linux/syscalls.h               |  2 ++
 include/linux/uprobes.h                |  2 ++
 kernel/events/uprobes.c                | 35 +++++++++++++++++++
 kernel/sys_ni.c                        |  1 +
 6 files changed, 89 insertions(+)
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7093ee21c0d1..f6299d57afe5 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@ 
 333	common	io_pgetevents		sys_io_pgetevents
 334	common	rseq			sys_rseq
 335	common	uretprobe		sys_uretprobe
+336	common	uprobe			sys_uprobe
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 22a17c149a55..02aa4519b677 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -425,6 +425,54 @@  SYSCALL_DEFINE0(uretprobe)
 	return -1;
 }
 
+SYSCALL_DEFINE0(uprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	unsigned long ret_addr;
+
+	/* The caller's return address sits above the three values pushed by the trampoline. */
+	if (copy_from_user(&ret_addr, (void __user *)regs->sp + 3*8, sizeof(ret_addr))) {
+		force_sig(SIGILL);
+		return -1;
+	}
+
+	/* Step back 5 bytes (presumably the length of the call) to get the probed address. */
+	handle_syscall_uprobe(regs, ret_addr - 5);
+	return 0;
+}
+
+asm (	/* user-space trampoline blob: save regs, run the uprobe syscall, restore, return */
+	".pushsection .rodata\n"
+	".global uprobe_trampoline_entry\n"
+	"uprobe_trampoline_entry:\n"	/* start label used by arch_uprobe_trampoline() */
+	"push %rcx\n"			/* three pushes: sys_uprobe reads the slot at sp + 3*8 */
+	"push %r11\n"
+	"push %rax\n"
+	"movq $" __stringify(__NR_uprobe) ", %rax\n"	/* syscall number goes in rax */
+	"syscall\n"
+	"pop %rax\n"			/* restore in reverse order of the pushes */
+	"pop %r11\n"
+	"pop %rcx\n"
+	"ret\n"
+	".global uprobe_trampoline_end\n"
+	"uprobe_trampoline_end:\n"	/* end label; end - entry is the blob size */
+	".popsection\n"
+);
+
+extern __visible u8 uprobe_trampoline_entry[];
+extern __visible u8 uprobe_trampoline_end[];
+
+void *arch_uprobe_trampoline(unsigned long *psize)
+{
+	/* No trampoline (and *psize left untouched) outside 64-bit user mode. */
+	if (!user_64bit_mode(task_pt_regs(current)))
+		return NULL;
+
+	/* The blob size is the distance between its two asm labels. */
+	*psize = uprobe_trampoline_end - uprobe_trampoline_entry;
+	return uprobe_trampoline_entry;
+}
+
 /*
  * If arch_uprobe->insn doesn't use rip-relative addressing, return
  * immediately.  Otherwise, rewrite the instruction so that it accesses
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5758104921e6..a2573f9dd248 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -981,6 +981,8 @@  asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
 asmlinkage long sys_uretprobe(void);
 
+asmlinkage long sys_uprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 222d8e82cee2..4024e6ea52a4 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -231,6 +231,8 @@  extern bool arch_uprobe_is_register(uprobe_opcode_t *insn, int len, void *data);
 struct tramp_area *get_tramp_area(unsigned long vaddr);
 void put_tramp_area(struct tramp_area *area);
 bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr);
+extern void *arch_uprobe_trampoline(unsigned long *psize);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a44305c559a4..b8399684231c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -621,6 +621,11 @@  bool __weak arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr)
 	return false;
 }
 
+void * __weak arch_uprobe_trampoline(unsigned long *psize)	/* weak default definition */
+{
+	return NULL;	/* no trampoline; *psize is not written, create_tramp_area() bails out on NULL */
+}
+
 static unsigned long find_nearest_page(unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
@@ -673,7 +678,13 @@  static struct tramp_area *create_tramp_area(unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
+	unsigned long tramp_size;
 	struct tramp_area *area;
+	void *tramp;
+
+	tramp = arch_uprobe_trampoline(&tramp_size);
+	if (!tramp)
+		return NULL;
 
 	vaddr = find_nearest_page(vaddr);
 	if (!vaddr)
@@ -690,6 +701,8 @@  static struct tramp_area *create_tramp_area(unsigned long vaddr)
 	refcount_set(&area->ref, 1);
 	area->vaddr = vaddr;
 
+	arch_uprobe_copy_ixol(area->page, 0, tramp, tramp_size);
+
 	vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
 				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
 				&tramp_mapping);
@@ -2757,6 +2770,28 @@  static void handle_swbp(struct pt_regs *regs)
 	rcu_read_unlock_trace();
 }
 
+/*
+ * Run the handler chain of the uprobe found at @bp_vaddr, if there is one.
+ *
+ * Nothing is called when no active uprobe is found, when get_utask()
+ * fails, or when arch_uprobe_ignore() asks for this hit to be skipped.
+ */
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+	struct uprobe *probe;
+	int is_swbp;
+
+	/* Lookup and handlers run between rcu_read_lock_trace()/unlock. */
+	rcu_read_lock_trace();
+
+	/* is_swbp is handed to the lookup by address; its value is not read here. */
+	probe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+	if (probe && get_utask() && !arch_uprobe_ignore(&probe->arch, regs))
+		handler_chain(probe, regs);
+
+	rcu_read_unlock_trace();
+}
+
 /*
  * Perform required fix-ups and disable singlestep.
  * Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@  COND_SYSCALL(setuid16);
 COND_SYSCALL(rseq);
 
 COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);