diff mbox series

[RFC,perf/core,05/11] uprobes: Add mapping for optimized uprobe trampolines

Message ID 20241105133405.2703607-6-jolsa@kernel.org (mailing list archive)
State New
Headers show
Series uprobes: Add support to optimize usdt probes on x86_64 | expand

Commit Message

Jiri Olsa Nov. 5, 2024, 1:33 p.m. UTC
Add an interface for installing a special mapping for a user space page
that will be used as a placeholder for the uprobe trampoline in the
following changes.

The get_tramp_area(vaddr) function either finds a 'callable' page or creates
a new one.  'Callable' means the page is reachable by a call instruction
(from the vaddr argument); this is decided by each arch via the new
arch_uprobe_is_callable function.

The put_tramp_area function either drops the refcount or destroys the special
mapping, and all the mappings are cleaned up when the process goes down.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 include/linux/uprobes.h |  12 ++++
 kernel/events/uprobes.c | 141 ++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c           |   2 +
 3 files changed, 155 insertions(+)

Comments

Peter Zijlstra Nov. 5, 2024, 2:23 p.m. UTC | #1
On Tue, Nov 05, 2024 at 02:33:59PM +0100, Jiri Olsa wrote:
> Adding interface to add special mapping for user space page that will be
> used as place holder for uprobe trampoline in following changes.
> 
> The get_tramp_area(vaddr) function either finds 'callable' page or create
> new one.  The 'callable' means it's reachable by call instruction (from
> vaddr argument) and is decided by each arch via new arch_uprobe_is_callable
> function.
> 
> The put_tramp_area function either drops refcount or destroys the special
> mapping and all the maps are clean up when the process goes down.

In another thread somewhere, Andrii mentioned that Meta has executables
with more than 4G of .text. This isn't going to work for them, is it?
Jiri Olsa Nov. 5, 2024, 4:33 p.m. UTC | #2
On Tue, Nov 05, 2024 at 03:23:27PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 05, 2024 at 02:33:59PM +0100, Jiri Olsa wrote:
> > Adding interface to add special mapping for user space page that will be
> > used as place holder for uprobe trampoline in following changes.
> > 
> > The get_tramp_area(vaddr) function either finds 'callable' page or create
> > new one.  The 'callable' means it's reachable by call instruction (from
> > vaddr argument) and is decided by each arch via new arch_uprobe_is_callable
> > function.
> > 
> > The put_tramp_area function either drops refcount or destroys the special
> > mapping and all the maps are clean up when the process goes down.
> 
> In another thread somewhere, Andrii mentioned that Meta has executables
> with more than 4G of .text. This isn't going to work for them, is it?
> 

not if you can't reach the trampoline from the probed address

jirka
diff mbox series

Patch

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index be306028ed59..222d8e82cee2 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -172,6 +172,15 @@  struct xol_area;
 
 struct uprobes_state {
 	struct xol_area		*xol_area;
+	struct hlist_head	tramp_head;	/* list of struct tramp_area, linked through tramp_area.node */
+	struct mutex		tramp_mutex;	/* taken by get_tramp_area() and put_tramp_area() around tramp_head use */
+};
+
+struct tramp_area {
+	unsigned long		vaddr;	/* user address the trampoline page is mapped at */
+	struct page		*page;	/* page handed out by the mapping's fault handler */
+	struct hlist_node	node;	/* entry in uprobes_state.tramp_head */
+	refcount_t		ref;	/* set to 1 on creation; get/put_tramp_area() inc/dec it */
 };
 
 extern void __init uprobes_init(void);
@@ -219,6 +228,9 @@  extern int uprobe_verify_opcode(struct page *page, unsigned long vaddr, uprobe_o
 extern int arch_uprobe_verify_opcode(struct page *page, unsigned long vaddr,
 				     uprobe_opcode_t *new_opcode, void *data);
 extern bool arch_uprobe_is_register(uprobe_opcode_t *insn, int len, void *data);
+struct tramp_area *get_tramp_area(unsigned long vaddr);
+void put_tramp_area(struct tramp_area *area);
+bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 944d9df1f081..a44305c559a4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -616,6 +616,145 @@  set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
 			(uprobe_opcode_t *)&auprobe->insn, UPROBE_SWBP_INSN_SIZE, NULL);
 }
 
+/*
+ * Weak default for the per-arch "is the trampoline at @vtramp reachable by a
+ * call instruction placed at @vaddr" check.  This generic version always
+ * answers false, so no trampoline area is ever found or created unless an
+ * architecture provides its own implementation.
+ */
+bool __weak arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr)
+{
+	return false;
+}
+
+/*
+ * Walk the VMAs of current->mm in address order and look at the hole between
+ * each pair of neighbours.  Return the start of the first hole that is at
+ * least PAGE_SIZE big and that arch_uprobe_is_callable() accepts for @vaddr,
+ * or 0 when there is no such hole.
+ *
+ * NOTE(review): no mmap locking is visible in this function; presumably the
+ * caller is expected to hold mmap_lock -- confirm.
+ */
+static unsigned long find_nearest_page(unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *lower, *upper;
+	VMA_ITERATOR(vmi, mm, 0);
+
+	lower = vma_next(&vmi);
+	for (upper = vma_next(&vmi); upper; lower = upper, upper = vma_next(&vmi)) {
+		unsigned long hole = lower->vm_end;
+
+		/* The hole must be able to hold the one-page trampoline. */
+		if (upper->vm_start - hole < PAGE_SIZE)
+			continue;
+
+		if (arch_uprobe_is_callable(hole, vaddr))
+			return hole;
+	}
+
+	return 0;
+}
+
+/*
+ * Fault handler of the "[uprobes-trampoline]" special mapping.
+ *
+ * Looks up the tramp_area whose vaddr matches the start of the faulting VMA
+ * in the owning mm's tramp_head list and hands its page to the fault code
+ * with an extra page reference.
+ *
+ * Returns 0 when a page was provided.  When no area matches the VMA the
+ * fault cannot be resolved, so VM_FAULT_SIGBUS is returned; the handler
+ * returns vm_fault_t, for which a negative errno is not a valid value.
+ *
+ * NOTE(review): the list is walked without taking tramp_mutex here --
+ * confirm this cannot race with put_tramp_area().
+ */
+static vm_fault_t tramp_fault(const struct vm_special_mapping *sm,
+			      struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct hlist_head *head = &vma->vm_mm->uprobes_state.tramp_head;
+	struct tramp_area *area;
+
+	hlist_for_each_entry(area, head, node) {
+		if (vma->vm_start == area->vaddr) {
+			vmf->page = area->page;
+			get_page(vmf->page);
+			return 0;
+		}
+	}
+
+	return VM_FAULT_SIGBUS;
+}
+
+/*
+ * mremap hook of the trampoline special mapping: unconditionally returns
+ * -EPERM.  Presumably this keeps the mapping from being moved away from the
+ * address recorded in tramp_area.vaddr -- verify against the mremap core.
+ */
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+	return -EPERM;
+}
+
+static const struct vm_special_mapping tramp_mapping = {
+	.name = "[uprobes-trampoline]",
+	.fault = tramp_fault,	/* supplies the matching tramp_area page on fault */
+	.mremap = tramp_mremap,	/* always refuses with -EPERM */
+};
+
+/*
+ * Create a new trampoline area that is callable from @vaddr.
+ *
+ * Picks a free page-sized hole via find_nearest_page(), allocates the
+ * tramp_area descriptor and its backing page, and installs a one-page
+ * read/exec special mapping at the chosen address in current->mm.
+ *
+ * Returns the new area with its refcount set to 1, or NULL when no suitable
+ * address exists, an allocation fails, or the mapping cannot be installed.
+ * The area is not linked into tramp_head here; the caller does that.
+ *
+ * NOTE(review): no mmap locking is visible here; presumably the caller must
+ * hold mmap_lock for write around _install_special_mapping() -- confirm.
+ */
+static struct tramp_area *create_tramp_area(unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct tramp_area *area;
+
+	vaddr = find_nearest_page(vaddr);
+	if (!vaddr)
+		return NULL;
+
+	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	if (unlikely(!area))
+		return NULL;
+
+	/*
+	 * The page becomes readable from user space, so it must be zeroed to
+	 * avoid exposing stale kernel memory contents.
+	 */
+	area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+	if (!area->page)
+		goto free_area;
+
+	refcount_set(&area->ref, 1);
+	area->vaddr = vaddr;
+
+	vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+				&tramp_mapping);
+	if (!IS_ERR(vma))
+		return area;
+
+	/* Mapping failed: undo the page and descriptor allocations. */
+	__free_page(area->page);
+ free_area:
+	kfree(area);
+	return NULL;
+}
+
+/*
+ * Get a trampoline area that is callable from @vaddr for current->mm.
+ *
+ * Under tramp_mutex, reuse the first already registered area that
+ * arch_uprobe_is_callable() accepts (taking an extra reference on it);
+ * otherwise create a new area and add it to the head of tramp_head.
+ *
+ * Returns the area, or NULL when none exists and none could be created.
+ */
+struct tramp_area *get_tramp_area(unsigned long vaddr)
+{
+	struct uprobes_state *state = &current->mm->uprobes_state;
+	struct tramp_area *found = NULL;
+	struct tramp_area *pos;
+
+	mutex_lock(&state->tramp_mutex);
+
+	hlist_for_each_entry(pos, &state->tramp_head, node) {
+		if (!arch_uprobe_is_callable(pos->vaddr, vaddr))
+			continue;
+
+		refcount_inc(&pos->ref);
+		found = pos;
+		break;
+	}
+
+	if (!found) {
+		found = create_tramp_area(vaddr);
+		if (found)
+			hlist_add_head(&found->node, &state->tramp_head);
+	}
+
+	mutex_unlock(&state->tramp_mutex);
+	return found;
+}
+
+/*
+ * Unlink @area from the list it is on, drop the reference on its page and
+ * free the descriptor.  @area must not be used afterwards.
+ *
+ * NOTE(review): nothing here removes the special mapping's VMA from the
+ * address space -- confirm whether that is handled elsewhere.
+ */
+static void destroy_tramp_area(struct tramp_area *area)
+{
+	hlist_del(&area->node);
+	put_page(area->page);
+	kfree(area);
+}
+
+/*
+ * Drop one reference on @area; a NULL @area is ignored.  When the last
+ * reference goes away the area is destroyed.  Both steps run under the
+ * tramp_mutex of current->mm.
+ */
+void put_tramp_area(struct tramp_area *area)
+{
+	struct uprobes_state *state;
+
+	if (!area)
+		return;
+
+	state = &current->mm->uprobes_state;
+
+	mutex_lock(&state->tramp_mutex);
+	if (refcount_dec_and_test(&area->ref))
+		destroy_tramp_area(area);
+	mutex_unlock(&state->tramp_mutex);
+}
+
+/*
+ * Destroy every trampoline area still linked on @mm's tramp_head,
+ * regardless of its reference count.  The _safe iterator is needed because
+ * destroy_tramp_area() unlinks and frees the current entry.  tramp_mutex is
+ * not taken here.
+ */
+static void clear_tramp_head(struct mm_struct *mm)
+{
+	struct uprobes_state *state = &mm->uprobes_state;
+	struct tramp_area *area;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(area, n, &state->tramp_head, node)
+		destroy_tramp_area(area);
+}
+
 /* uprobe should have guaranteed positive refcount */
 static struct uprobe *get_uprobe(struct uprobe *uprobe)
 {
@@ -1788,6 +1927,8 @@  void uprobe_clear_state(struct mm_struct *mm)
 	delayed_uprobe_remove(NULL, mm);
 	mutex_unlock(&delayed_uprobe_lock);
 
+	clear_tramp_head(mm);
+
 	if (!area)
 		return;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 89ceb4a68af2..b1fe431e5cce 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1248,6 +1248,8 @@  static void mm_init_uprobes_state(struct mm_struct *mm)
 {
 #ifdef CONFIG_UPROBES
 	mm->uprobes_state.xol_area = NULL;
+	mutex_init(&mm->uprobes_state.tramp_mutex);
+	INIT_HLIST_HEAD(&mm->uprobes_state.tramp_head);
 #endif
 }