[RFCv2,12/18] uprobes/x86: Add support to optimize uprobes

Message ID 20250224140151.667679-13-jolsa@kernel.org (mailing list archive)
State RFC
Series uprobes: Add support to optimize usdt probes on x86_64

Commit Message

Jiri Olsa Feb. 24, 2025, 2:01 p.m. UTC
Putting together all the previously added pieces to support optimized
uprobes on top of 5-byte nop instruction.

The current uprobe execution goes through the following steps:
  - a breakpoint instruction is installed over the original instruction
  - the exception handler is hit and calls the related uprobe consumers
  - the original instruction is either simulated or executed out of line
    with a single step
  - execution returns to user space

The optimized uprobe path:

  - checks that the original instruction is a 5-byte nop (plus other checks)
  - adds (or reuses an existing) user space trampoline and overwrites the
    original instruction (the 5-byte nop) with a call to the trampoline,
    as sketched after this list
  - the user space trampoline executes the uprobe syscall, which calls the
    related uprobe consumers
  - the trampoline returns to the next instruction
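
For illustration, here is a minimal user space sketch of the rewrite itself,
assuming little-endian x86_64 (in the patch the encoding is done by
relative_call() and written via uprobe_write(); encode_call below is my
hypothetical helper, not part of the patch):

        #include <stdint.h>
        #include <string.h>

        /* Encode "call rel32" over a 5-byte probe site; the displacement
         * is relative to the instruction following the call (site + 5).
         */
        static void encode_call(uint8_t insn[5], uint64_t site, uint64_t target)
        {
                int32_t rel = (int32_t)(target - (site + 5));

                insn[0] = 0xe8;                 /* CALL_INSN_OPCODE */
                memcpy(&insn[1], &rel, sizeof(rel));
        }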

This approach won't speed up all uprobes, as it's limited to nop5 as the
original instruction, but we could use nop5 as the USDT probe instruction
(which currently uses a single-byte nop) and speed up USDT probes.
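
As a sketch of that idea (illustration only; real USDT probe sites are
generated by the sys/sdt.h macros, and NOP5 here is a hypothetical macro):

        /* 0f 1f 44 00 00 is the recommended 5-byte nop on x86_64. */
        #define NOP5 asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00")

        void traced_function(void)
        {
                NOP5;   /* can later be rewritten into a call to the trampoline */
        }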

This patch overloads the related arch functions in uprobe_write_opcode and
set_orig_insn so they can install a call instruction when needed.

The arch_uprobe_optimize function triggers the uprobe optimization and is
called after the first uprobe hit. I originally had it called on uprobe
installation, but that clashed with the ELF loader, because the user space
trampoline could be placed where the loader needs to put ELF segments, so I
decided to do it after the first uprobe hit, when loading is done.

We do not unmap and release the uprobe trampoline when it's no longer
needed, because there's no easy way to make sure none of the threads is
still inside the trampoline. But we do not waste memory, because there's
just a single page for all the uprobe trampoline mappings.

We do waste a frame on the page mapping for every 4GB by keeping the uprobe
trampoline page mapped, but that seems ok.
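
The per-4GB granularity follows from the call encoding: a rel32 displacement
reaches at most +/-2GB, so a trampoline page can only serve call sites within
that range. A sketch of the reachability check (my illustration; the series
has its own placement logic):

        #include <stdbool.h>
        #include <stdint.h>

        static bool in_call_reach(uint64_t site, uint64_t tramp)
        {
                int64_t delta = (int64_t)(tramp - (site + 5));

                /* true iff delta fits in the signed 32-bit displacement */
                return delta == (int64_t)(int32_t)delta;
        }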

Attaching the speedup numbers from the benchs/run_bench_uprobes.sh script:

current:
        usermode-count :  818.836 ± 2.842M/s
        syscall-count  :    8.917 ± 0.003M/s
        uprobe-nop     :    3.056 ± 0.013M/s
        uprobe-push    :    2.903 ± 0.002M/s
        uprobe-ret     :    1.533 ± 0.001M/s
-->     uprobe-nop5    :    1.492 ± 0.000M/s
        uretprobe-nop  :    1.783 ± 0.000M/s
        uretprobe-push :    1.672 ± 0.001M/s
        uretprobe-ret  :    1.067 ± 0.002M/s
-->     uretprobe-nop5 :    1.052 ± 0.000M/s

after the change:

        usermode-count :  818.386 ± 1.886M/s
        syscall-count  :    8.923 ± 0.003M/s
        uprobe-nop     :    3.086 ± 0.005M/s
        uprobe-push    :    2.751 ± 0.001M/s
        uprobe-ret     :    1.481 ± 0.000M/s
-->     uprobe-nop5    :    4.016 ± 0.002M/s
        uretprobe-nop  :    1.712 ± 0.008M/s
        uretprobe-push :    1.616 ± 0.001M/s
        uretprobe-ret  :    1.052 ± 0.000M/s
-->     uretprobe-nop5 :    2.015 ± 0.000M/s

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/include/asm/uprobes.h |   6 ++
 arch/x86/kernel/uprobes.c      | 191 ++++++++++++++++++++++++++++++++-
 include/linux/uprobes.h        |   6 +-
 kernel/events/uprobes.c        |  16 ++-
 4 files changed, 209 insertions(+), 10 deletions(-)

Patch

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..7d4df920bb59 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,10 @@  typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		   1
 
+enum {
+	ARCH_UPROBE_FLAG_CAN_OPTIMIZE   = 0,
+};
+
 struct uprobe_xol_ops;
 
 struct arch_uprobe {
@@ -45,6 +49,8 @@  struct arch_uprobe {
 			u8	ilen;
 		}			push;
 	};
+
+	unsigned long flags;
 };
 
 struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index e8aebbda83bc..73ddff823904 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@ 
 #include <asm/processor.h>
 #include <asm/insn.h>
 #include <asm/mmu_context.h>
+#include <asm/nops.h>
 
 /* Post-execution fixups. */
 
@@ -768,7 +769,7 @@  static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
 	return NULL;
 }
 
-static __maybe_unused struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr)
+static struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr)
 {
 	struct uprobes_state *state = &current->mm->uprobes_state;
 	struct uprobe_trampoline *tramp = NULL;
@@ -794,7 +795,7 @@  static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
 	kfree(tramp);
 }
 
-static __maybe_unused void uprobe_trampoline_put(struct uprobe_trampoline *tramp)
+static void uprobe_trampoline_put(struct uprobe_trampoline *tramp)
 {
 	if (tramp == NULL)
 		return;
@@ -807,6 +808,7 @@  struct mm_uprobe {
 	struct rb_node rb_node;
 	unsigned long auprobe;
 	unsigned long vaddr;
+	bool optimized;
 };
 
 #define __node_2_mm_uprobe(node) rb_entry((node), struct mm_uprobe, rb_node)
@@ -874,6 +876,7 @@  static struct mm_uprobe *insert_mm_uprobe(struct mm_struct *mm, struct arch_upro
 	if (mmu) {
 		mmu->auprobe = (unsigned long) auprobe;
 		mmu->vaddr = vaddr;
+		mmu->optimized = false;
 		RB_CLEAR_NODE(&mmu->rb_node);
 		rb_add(&mmu->rb_node, &mm->uprobes_state.root_uprobes, __mm_uprobe_less);
 	}
@@ -886,6 +889,134 @@  static void destroy_mm_uprobe(struct mm_uprobe *mmu, struct rb_root *root)
 	kfree(mmu);
 }
 
+enum {
+	OPT_PART,
+	OPT_INSN,
+	UNOPT_INT3,
+	UNOPT_PART,
+};
+
+struct write_opcode_ctx {
+	unsigned long base;
+	int update;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+	return *insn == CALL_INSN_OPCODE;
+}
+
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+		       int nbytes, void *data)
+{
+	struct write_opcode_ctx *ctx = data;
+	uprobe_opcode_t old_opcode[5];
+
+	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+	switch (ctx->update) {
+	case OPT_PART:
+	case OPT_INSN:
+		if (is_swbp_insn(&old_opcode[0]))
+			return 1;
+		break;
+	case UNOPT_INT3:
+		if (is_call_insn(&old_opcode[0]))
+			return 1;
+		break;
+	case UNOPT_PART:
+		if (is_swbp_insn(&old_opcode[0]))
+			return 1;
+		break;
+	}
+
+	return -1;
+}
+
+static int write_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr,
+		      uprobe_opcode_t *insn, int nbytes, void *ctx)
+{
+	return uprobe_write(auprobe, mm, vaddr, insn, nbytes, verify_insn, false, ctx);
+}
+
+static void relative_call(void *dest, long from, long to)
+{
+	struct __packed __arch_relative_insn {
+		u8 op;
+		s32 raddr;
+	} *insn;
+
+	insn = (struct __arch_relative_insn *)dest;
+	insn->raddr = (s32)(to - (from + 5));
+	insn->op = CALL_INSN_OPCODE;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr,
+			 unsigned long tramp)
+{
+	struct write_opcode_ctx ctx = {
+		.base = vaddr,
+	};
+	char call[5];
+	int err;
+
+	relative_call(call, vaddr, tramp);
+
+	/*
+	 * We are in state where breakpoint (int3) is installed on top of first
+	 * byte of the nop5 instruction. We will do following steps to overwrite
+	 * this to call instruction:
+	 *
+	 * - sync cores
+	 * - write last 4 bytes of the call instruction
+	 * - sync cores
+	 * - update the call instruction opcode
+	 */
+	text_poke_sync();
+
+	ctx.update = OPT_PART;
+	err = write_insn(auprobe, mm, vaddr + 1, call + 1, 4, &ctx);
+	if (err)
+		return err;
+
+	text_poke_sync();
+
+	ctx.update = OPT_INSN;
+	return write_insn(auprobe, mm, vaddr, call, 1, &ctx);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+	struct write_opcode_ctx ctx = {
+		.base = vaddr,
+	};
+	int err;
+
+	/*
+	 * We need to overwrite call instruction into nop5 instruction with
+	 * breakpoint (int3) installed on top of its first byte. We will:
+	 *
+	 * - overwrite call opcode with breakpoint (int3)
+	 * - sync cores
+	 * - write last 4 bytes of the nop5 instruction
+	 * - sync cores
+	 */
+
+	ctx.update = UNOPT_INT3;
+	err = write_insn(auprobe, mm, vaddr, &int3, 1, &ctx);
+	if (err)
+		return err;
+
+	text_poke_sync();
+
+	ctx.update = UNOPT_PART;
+	err = write_insn(auprobe, mm, vaddr + 1, (uprobe_opcode_t *) auprobe->insn + 1, 4, &ctx);
+
+	text_poke_sync();
+	return err;
+}
+
 int set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
 	struct mm_uprobe *mmu;
@@ -905,6 +1036,8 @@  int set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned lo
 	mmu = find_mm_uprobe(mm, auprobe, vaddr);
 	if (!mmu)
 		return 0;
+	if (mmu->optimized)
+		WARN_ON_ONCE(swbp_unoptimize(auprobe, mm, vaddr));
 	destroy_mm_uprobe(mmu, &mm->uprobes_state.root_uprobes);
 	return uprobe_write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn, true);
 }
@@ -937,6 +1070,41 @@  static bool emulate_nop5_insn(struct arch_uprobe *auprobe)
 {
 	return is_nop5_insn((uprobe_opcode_t *) &auprobe->insn);
 }
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	struct uprobe_trampoline *tramp;
+	struct mm_uprobe *mmu;
+
+	if (!test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags))
+		return;
+
+	mmap_write_lock(mm);
+	mmu = find_mm_uprobe(mm, auprobe, vaddr);
+	if (!mmu || mmu->optimized)
+		goto unlock;
+
+	tramp = uprobe_trampoline_get(vaddr);
+	if (!tramp)
+		goto unlock;
+
+	if (WARN_ON_ONCE(swbp_optimize(auprobe, mm, vaddr, tramp->vaddr)))
+		uprobe_trampoline_put(tramp);
+	else
+		mmu->optimized = true;
+
+unlock:
+	mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	if (!is_nop5_insn((uprobe_opcode_t *) &auprobe->insn))
+		return false;
+	/* We can't do cross page atomic writes yet. */
+	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
 #else /* 32-bit: */
 /*
  * No RIP-relative addressing on 32-bit
@@ -954,6 +1122,10 @@  static bool emulate_nop5_insn(struct arch_uprobe *auprobe)
 {
 	return false;
 }
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	return false;
+}
 #endif /* CONFIG_X86_64 */
 
 struct uprobe_xol_ops {
@@ -1317,6 +1489,9 @@  int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	if (ret)
 		return ret;
 
+	if (can_optimize(auprobe, addr))
+		set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
 	ret = branch_setup_xol_ops(auprobe, &insn);
 	if (ret != -ENOSYS)
 		return ret;
@@ -1523,15 +1698,23 @@  arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 {
 	int rasize = sizeof_long(regs), nleft;
 	unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+	unsigned long off = 0;
+
+	/*
+	 * Optimized uprobe goes through uprobe trampoline which adds 4 8-byte
+	 * values on stack, check uprobe_trampoline_entry for details.
+	 */
+	if (!swbp)
+		off = 4*8;
 
-	if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
+	if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp + off, rasize))
 		return -1;
 
 	/* check whether address has been already hijacked */
 	if (orig_ret_vaddr == trampoline_vaddr)
 		return orig_ret_vaddr;
 
-	nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+	nleft = copy_to_user((void __user *)regs->sp + off, &trampoline_vaddr, rasize);
 	if (likely(!nleft)) {
 		if (shstk_update_last_frame(trampoline_vaddr)) {
 			force_sig(SIGSEGV);
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index bd726daa4428..22c79905754c 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -190,7 +190,8 @@  struct uprobes_state {
 #endif
 };
 
-typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode, int nbytes);
+typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode,
+				     int nbytes, void *data);
 
 extern void __init uprobes_init(void);
 extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
@@ -202,7 +203,7 @@  extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr,
 			       uprobe_opcode_t, bool);
 extern int uprobe_write(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr,
-			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool orig);
+			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool orig, void *data);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@@ -239,6 +240,7 @@  extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *
 extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
 extern void arch_uprobe_clear_state(struct mm_struct *mm);
 extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c690cde4442c..ba4f18bac73e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -264,7 +264,8 @@  static void uprobe_copy_to_page(struct page *page, unsigned long vaddr, const vo
 	kunmap_atomic(kaddr);
 }
 
-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, int nbytes)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+			 int nbytes, void *data)
 {
 	uprobe_opcode_t old_opcode;
 	bool is_swbp;
@@ -473,12 +474,12 @@  static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 			unsigned long vaddr, uprobe_opcode_t opcode, bool orig)
 {
-	return uprobe_write(auprobe, mm, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode, orig);
+	return uprobe_write(auprobe, mm, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode, orig, NULL);
 }
 
 int uprobe_write(struct arch_uprobe *auprobe, struct mm_struct *mm,
 		 unsigned long vaddr, uprobe_opcode_t *insn,
-		 int nbytes, uprobe_write_verify_t verify, bool orig)
+		 int nbytes, uprobe_write_verify_t verify, bool orig, void *data)
 {
 	struct page *old_page, *new_page;
 	struct vm_area_struct *vma;
@@ -494,7 +495,7 @@  int uprobe_write(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	if (IS_ERR(old_page))
 		return PTR_ERR(old_page);
 
-	ret = verify(old_page, vaddr, insn, nbytes);
+	ret = verify(old_page, vaddr, insn, nbytes, data);
 	if (ret <= 0)
 		goto put_old;
 
@@ -2668,6 +2669,10 @@  bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
 	return true;
 }
 
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2732,6 +2737,9 @@  static void handle_swbp(struct pt_regs *regs)
 
 	handler_chain(uprobe, regs, true);
 
+	/* Try to optimize after first hit. */
+	arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
 	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
 		goto out;