[bpf-next,06/13] uprobes/x86: Add uprobe syscall to speed up uprobe

Message ID 20241211133403.208920-7-jolsa@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Series uprobes: Add support to optimize usdt probes on x86_64

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 568 this patch: 568
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 19 maintainers not CCed: alexander.shishkin@linux.intel.com mingo@redhat.com linux-api@vger.kernel.org namhyung@kernel.org acme@kernel.org x86@kernel.org tglx@linutronix.de arnd@arndb.de brauner@kernel.org jpoimboe@kernel.org mark.rutland@arm.com bp@alien8.de adrian.hunter@intel.com hpa@zytor.com dave.hansen@linux.intel.com irogers@google.com linux-perf-users@vger.kernel.org luto@kernel.org kan.liang@linux.intel.com
netdev/build_clang success Errors and warnings before: 980 this patch: 980
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 15902 this patch: 15902
netdev/checkpatch warning CHECK: Lines should not end with a '(' CHECK: No space is necessary after a cast CHECK: extern prototypes should be avoided in .h files CHECK: spaces preferred around that '*' (ctx:VxV) WARNING: externs should be avoided in .c files WARNING: line length of 89 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: use of RCU tasks trace is incorrect outside BPF or core RCU code
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 7 this patch: 7
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-19 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Jiri Olsa Dec. 11, 2024, 1:33 p.m. UTC
Adding a new uprobe syscall that calls uprobe handlers for a given
'breakpoint' address.

The idea is that the 'breakpoint' address calls the user space
trampoline, which executes the uprobe syscall.

The syscall handler reads the return address of the initial call
to retrieve the original 'breakpoint' address. With this address
we find the related uprobe object and call its consumers.

Adding the arch_uprobe_trampoline_mapping function that provides
the uprobe trampoline mapping. This mapping is backed by one global
page initialized at __init time and shared by all the mapping
instances.

We do not allow the uprobe syscall to be executed if the caller is
not from the uprobe trampoline mapping.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 arch/x86/kernel/uprobes.c              | 80 ++++++++++++++++++++++++++
 include/linux/syscalls.h               |  2 +
 include/linux/uprobes.h                |  1 +
 kernel/events/uprobes.c                | 22 +++++++
 kernel/sys_ni.c                        |  1 +
 6 files changed, 107 insertions(+)
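
For orientation, here is a minimal sketch (illustrative only, not part of
the patch) of the arithmetic the syscall handler relies on: the trampoline
pushes three registers before issuing the syscall, and the probed location
is rewritten into a 5-byte call, so the original 'breakpoint' address is
the saved return address minus 5.

#include <stdint.h>

#define TRAMP_SAVED_REGS  3   /* rcx, r11, rax pushed by the trampoline */
#define CALL_INSN_SIZE    5   /* 5-byte rel32 call emitted at the probe site */

/* Given the user stack pointer at syscall entry, recover the probed address. */
static inline uint64_t bp_vaddr_from_sp(const uint64_t *sp)
{
	uint64_t ret_addr = sp[TRAMP_SAVED_REGS];  /* skip the 3 saved registers */

	return ret_addr - CALL_INSN_SIZE;          /* back to the probed address */
}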

Comments

Thomas Weißschuh Dec. 13, 2024, 1:48 p.m. UTC | #1
On 2024-12-11 14:33:55+0100, Jiri Olsa wrote:
> Adding a new uprobe syscall that calls uprobe handlers for a given
> 'breakpoint' address.
> 
> The idea is that the 'breakpoint' address calls the user space
> trampoline, which executes the uprobe syscall.
> 
> The syscall handler reads the return address of the initial call
> to retrieve the original 'breakpoint' address. With this address
> we find the related uprobe object and call its consumers.
> 
> Adding the arch_uprobe_trampoline_mapping function that provides
> the uprobe trampoline mapping. This mapping is backed by one global
> page initialized at __init time and shared by all the mapping
> instances.
> 
> We do not allow the uprobe syscall to be executed if the caller is
> not from the uprobe trampoline mapping.
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  arch/x86/kernel/uprobes.c              | 80 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  2 +
>  include/linux/uprobes.h                |  1 +
>  kernel/events/uprobes.c                | 22 +++++++
>  kernel/sys_ni.c                        |  1 +
>  6 files changed, 107 insertions(+)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5eb708bff1c7..88e388c7675b 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -345,6 +345,7 @@
>  333	common	io_pgetevents		sys_io_pgetevents
>  334	common	rseq			sys_rseq
>  335	common	uretprobe		sys_uretprobe
> +336	common	uprobe			sys_uprobe
>  # don't use numbers 387 through 423, add new calls after the last
>  # 'common' entry
>  424	common	pidfd_send_signal	sys_pidfd_send_signal
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 22a17c149a55..23e4f2821cff 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -425,6 +425,86 @@ SYSCALL_DEFINE0(uretprobe)
>  	return -1;
>  }
>  
> +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
> +{
> +	return -EPERM;
> +}
> +
> +static struct vm_special_mapping tramp_mapping = {
> +	.name   = "[uprobes-trampoline]",
> +	.mremap = tramp_mremap,
> +};
> +
> +SYSCALL_DEFINE0(uprobe)
> +{
> +	struct pt_regs *regs = task_pt_regs(current);
> +	struct vm_area_struct *vma;
> +	unsigned long bp_vaddr;
> +	int err;
> +
> +	err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));

A #define for the magic values would be nice.

> +	if (err) {
> +		force_sig(SIGILL);
> +		return -1;
> +	}
> +
> +	/* Allow execution only from uprobe trampolines. */
> +	vma = vma_lookup(current->mm, regs->ip);
> +	if (!vma || vma->vm_private_data != (void *) &tramp_mapping) {

vma_is_special_mapping()
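
With vma_is_special_mapping() the check could read roughly like this
(untested sketch of the suggested change):

	/* Allow execution only from uprobe trampolines. */
	vma = vma_lookup(current->mm, regs->ip);
	if (!vma || !vma_is_special_mapping(vma, &tramp_mapping)) {
		force_sig(SIGILL);
		return -1;
	}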

> +		force_sig(SIGILL);
> +		return -1;
> +	}
> +
> +	handle_syscall_uprobe(regs, bp_vaddr - 5);
> +	return 0;
> +}
> +
> +asm (
> +	".pushsection .rodata\n"
> +	".global uprobe_trampoline_entry\n"
> +	"uprobe_trampoline_entry:\n"
> +	"endbr64\n"
> +	"push %rcx\n"
> +	"push %r11\n"
> +	"push %rax\n"
> +	"movq $" __stringify(__NR_uprobe) ", %rax\n"
> +	"syscall\n"
> +	"pop %rax\n"
> +	"pop %r11\n"
> +	"pop %rcx\n"
> +	"ret\n"
> +	".global uprobe_trampoline_end\n"
> +	"uprobe_trampoline_end:\n"
> +	".popsection\n"
> +);
> +
> +extern __visible u8 uprobe_trampoline_entry[];
> +extern __visible u8 uprobe_trampoline_end[];
> +
> +const struct vm_special_mapping *arch_uprobe_trampoline_mapping(void)
> +{
> +	struct pt_regs *regs = task_pt_regs(current);
> +
> +	return user_64bit_mode(regs) ? &tramp_mapping : NULL;
> +}
> +
> +static int __init arch_uprobes_init(void)
> +{
> +	unsigned long size = uprobe_trampoline_end - uprobe_trampoline_entry;
> +	static struct page *pages[2];
> +	struct page *page;
> +
> +	page = alloc_page(GFP_HIGHUSER);

That page could be in static memory, removing the need for the explicit
allocation. It could also be __ro_after_init.
Then tramp_mapping itself can be const.

Also this seems to waste the page on 32bit kernels.

> +	if (!page)
> +		return -ENOMEM;
> +	pages[0] = page;
> +	tramp_mapping.pages = (struct page **) &pages;

tramp_mapping.pages = pages; ?

> +	arch_uprobe_copy_ixol(page, 0, uprobe_trampoline_entry, size);
> +	return 0;
> +}
> +
> +late_initcall(arch_uprobes_init);
> +
>  /*
>   * If arch_uprobe->insn doesn't use rip-relative addressing, return
>   * immediately.  Otherwise, rewrite the instruction so that it accesses

[..]
Jiri Olsa Dec. 13, 2024, 2:51 p.m. UTC | #2
On Fri, Dec 13, 2024 at 02:48:00PM +0100, Thomas Weißschuh wrote:

SNIP

> > +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
> > +{
> > +	return -EPERM;
> > +}
> > +
> > +static struct vm_special_mapping tramp_mapping = {
> > +	.name   = "[uprobes-trampoline]",
> > +	.mremap = tramp_mremap,
> > +};
> > +
> > +SYSCALL_DEFINE0(uprobe)
> > +{
> > +	struct pt_regs *regs = task_pt_regs(current);
> > +	struct vm_area_struct *vma;
> > +	unsigned long bp_vaddr;
> > +	int err;
> > +
> > +	err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));
> 
> A #define for the magic values would be nice.

the 3*8 is to skip the 3 values pushed on the stack and get the return ip
value. I'd prefer to keep 3*8, but it's definitely missing an explanatory
comment above, wdyt?

> 
> > +	if (err) {
> > +		force_sig(SIGILL);
> > +		return -1;
> > +	}
> > +
> > +	/* Allow execution only from uprobe trampolines. */
> > +	vma = vma_lookup(current->mm, regs->ip);
> > +	if (!vma || vma->vm_private_data != (void *) &tramp_mapping) {
> 
> vma_is_special_mapping()

did not know about this function, thanks

> 
> > +		force_sig(SIGILL);
> > +		return -1;
> > +	}
> > +
> > +	handle_syscall_uprobe(regs, bp_vaddr - 5);
> > +	return 0;
> > +}
> > +
> > +asm (
> > +	".pushsection .rodata\n"
> > +	".global uprobe_trampoline_entry\n"
> > +	"uprobe_trampoline_entry:\n"
> > +	"endbr64\n"
> > +	"push %rcx\n"
> > +	"push %r11\n"
> > +	"push %rax\n"
> > +	"movq $" __stringify(__NR_uprobe) ", %rax\n"
> > +	"syscall\n"
> > +	"pop %rax\n"
> > +	"pop %r11\n"
> > +	"pop %rcx\n"
> > +	"ret\n"
> > +	".global uprobe_trampoline_end\n"
> > +	"uprobe_trampoline_end:\n"
> > +	".popsection\n"
> > +);
> > +
> > +extern __visible u8 uprobe_trampoline_entry[];
> > +extern __visible u8 uprobe_trampoline_end[];
> > +
> > +const struct vm_special_mapping *arch_uprobe_trampoline_mapping(void)
> > +{
> > +	struct pt_regs *regs = task_pt_regs(current);
> > +
> > +	return user_64bit_mode(regs) ? &tramp_mapping : NULL;
> > +}
> > +
> > +static int __init arch_uprobes_init(void)
> > +{
> > +	unsigned long size = uprobe_trampoline_end - uprobe_trampoline_entry;
> > +	static struct page *pages[2];
> > +	struct page *page;
> > +
> > +	page = alloc_page(GFP_HIGHUSER);
> 
> That page could be in static memory, removing the need for the explicit
> allocation. It could also be __ro_after_init.
> Then tramp_mapping itself can be const.

hum, how would that look? I think that to get a proper page object
you have to call alloc_page or some other page alloc family function..
what am I missing?

> 
> Also this seems to waste the page on 32bit kernels.

it's inside CONFIG_X86_64 ifdef

> 
> > +	if (!page)
> > +		return -ENOMEM;
> > +	pages[0] = page;
> > +	tramp_mapping.pages = (struct page **) &pages;
> 
> tramp_mapping.pages = pages; ?

I think the compiler will cry about a *pages[2] vs **pages type mismatch,
but I'll double-check that

thanks,
jirka

> 
> > +	arch_uprobe_copy_ixol(page, 0, uprobe_trampoline_entry, size);
> > +	return 0;
> > +}
> > +
> > +late_initcall(arch_uprobes_init);
> > +
> >  /*
> >   * If arch_uprobe->insn doesn't use rip-relative addressing, return
> >   * immediately.  Otherwise, rewrite the instruction so that it accesses
> 
> [..]
Thomas Weißschuh Dec. 13, 2024, 3:12 p.m. UTC | #3
On 2024-12-13 15:51:44+0100, Jiri Olsa wrote:
> On Fri, Dec 13, 2024 at 02:48:00PM +0100, Thomas Weißschuh wrote:
> 
> SNIP
> 
> > > +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
> > > +{
> > > +	return -EPERM;
> > > +}
> > > +
> > > +static struct vm_special_mapping tramp_mapping = {
> > > +	.name   = "[uprobes-trampoline]",
> > > +	.mremap = tramp_mremap,
> > > +};
> > > +
> > > +SYSCALL_DEFINE0(uprobe)
> > > +{
> > > +	struct pt_regs *regs = task_pt_regs(current);
> > > +	struct vm_area_struct *vma;
> > > +	unsigned long bp_vaddr;
> > > +	int err;
> > > +
> > > +	err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));
> > 
> > A #define for the magic values would be nice.
> 
> the 3*8 is to skip the 3 values pushed on the stack and get the return ip
> value. I'd prefer to keep 3*8, but it's definitely missing an explanatory
> comment above, wdyt?

A comment sounds good.
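
One possible wording for such a comment (illustrative, not from the series):

	/*
	 * The trampoline pushes rcx, r11 and rax (3 * 8 bytes) before the
	 * syscall instruction, so the return address of the original call
	 * sits at regs->sp + 3*8.
	 */
	err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));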

> > > +	if (err) {
> > > +		force_sig(SIGILL);
> > > +		return -1;
> > > +	}
> > > +
> > > +	/* Allow execution only from uprobe trampolines. */
> > > +	vma = vma_lookup(current->mm, regs->ip);
> > > +	if (!vma || vma->vm_private_data != (void *) &tramp_mapping) {
> > 
> > vma_is_special_mapping()
> 
> did not know about this function, thanks
> 
> > 
> > > +		force_sig(SIGILL);
> > > +		return -1;
> > > +	}
> > > +
> > > +	handle_syscall_uprobe(regs, bp_vaddr - 5);
> > > +	return 0;
> > > +}
> > > +
> > > +asm (
> > > +	".pushsection .rodata\n"
> > > +	".global uprobe_trampoline_entry\n"
> > > +	"uprobe_trampoline_entry:\n"
> > > +	"endbr64\n"
> > > +	"push %rcx\n"
> > > +	"push %r11\n"
> > > +	"push %rax\n"
> > > +	"movq $" __stringify(__NR_uprobe) ", %rax\n"
> > > +	"syscall\n"
> > > +	"pop %rax\n"
> > > +	"pop %r11\n"
> > > +	"pop %rcx\n"
> > > +	"ret\n"
> > > +	".global uprobe_trampoline_end\n"
> > > +	"uprobe_trampoline_end:\n"
> > > +	".popsection\n"
> > > +);
> > > +
> > > +extern __visible u8 uprobe_trampoline_entry[];
> > > +extern __visible u8 uprobe_trampoline_end[];
> > > +
> > > +const struct vm_special_mapping *arch_uprobe_trampoline_mapping(void)
> > > +{
> > > +	struct pt_regs *regs = task_pt_regs(current);
> > > +
> > > +	return user_64bit_mode(regs) ? &tramp_mapping : NULL;
> > > +}
> > > +
> > > +static int __init arch_uprobes_init(void)
> > > +{
> > > +	unsigned long size = uprobe_trampoline_end - uprobe_trampoline_entry;
> > > +	static struct page *pages[2];
> > > +	struct page *page;
> > > +
> > > +	page = alloc_page(GFP_HIGHUSER);
> > 
> > That page could be in static memory, removing the need for the explicit
> > allocation. It could also be __ro_after_init.
> > Then tramp_mapping itself can be const.
> 
> hum, how would that look? I think that to get a proper page object
> you have to call alloc_page or some other page alloc family function..
> what am I missing?

static u8 trampoline_page[PAGE_SIZE] __ro_after_init __aligned(PAGE_SIZE);
static struct page *tramp_mapping_pages[2] __ro_after_init;

static const struct vm_special_mapping tramp_mapping = {
	.name   = "[uprobes-trampoline]",
	.pages  = tramp_mapping_pages,
	.mremap = tramp_mremap,
};

static int __init arch_uprobes_init(void)
{
	...
	tramp_mapping_pages[0] = virt_to_page(trampoline_page);
	...
}

Untested, but it's similar to the stuff the vDSO implementations are
doing which I am working with at the moment.

> > 
> > Also this seems to waste the page on 32bit kernels.
> 
> it's inside CONFIG_X86_64 ifdef
> 
> > 
> > > +	if (!page)
> > > +		return -ENOMEM;
> > > +	pages[0] = page;
> > > +	tramp_mapping.pages = (struct page **) &pages;
> > 
> > tramp_mapping.pages = pages; ?
> 
> I think the compiler will cry about a *pages[2] vs **pages type mismatch,
> but I'll double-check that

It compiles for me.
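
It compiles because the array name decays to a pointer in the assignment;
a freestanding sketch (with a stand-in struct, just to show the types line
up):

struct page;

struct special_mapping_like {	/* stand-in for struct vm_special_mapping */
	struct page **pages;
};

static struct page *pages[2];
static struct special_mapping_like m;

static void assign(void)
{
	m.pages = pages;	/* struct page *[2] decays to struct page **, no cast needed */
}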

> thanks,
> jirka
> 
> > 
> > > +	arch_uprobe_copy_ixol(page, 0, uprobe_trampoline_entry, size);
> > > +	return 0;
> > > +}
> > > +
> > > +late_initcall(arch_uprobes_init);
> > > +
> > >  /*
> > >   * If arch_uprobe->insn doesn't use rip-relative addressing, return
> > >   * immediately.  Otherwise, rewrite the instruction so that it accesses
> > 
> > [..]
Jiri Olsa Dec. 13, 2024, 9:52 p.m. UTC | #4
On Fri, Dec 13, 2024 at 04:12:46PM +0100, Thomas Weißschuh wrote:

SNIP

> > > > +static int __init arch_uprobes_init(void)
> > > > +{
> > > > +	unsigned long size = uprobe_trampoline_end - uprobe_trampoline_entry;
> > > > +	static struct page *pages[2];
> > > > +	struct page *page;
> > > > +
> > > > +	page = alloc_page(GFP_HIGHUSER);
> > > 
> > > That page could be in static memory, removing the need for the explicit
> > > allocation. It could also be __ro_after_init.
> > > Then tramp_mapping itself can be const.
> > 
> > hum, how would that look like? I think that to get proper page object
> > you have to call alloc_page or some other page alloc family function..
> > what do I miss?
> 
> static u8 trampoline_page[PAGE_SIZE] __ro_after_init __aligned(PAGE_SIZE);
> static struct page *tramp_mapping_pages[2] __ro_after_init;
> 
> static const struct vm_special_mapping tramp_mapping = {
> 	.name   = "[uprobes-trampoline]",
> 	.pages  = tramp_mapping_pages,
> 	.mremap = tramp_mremap,
> };
> 
> static int __init arch_uprobes_init(void)
> {
> 	...
> 	tramp_mapping_pages[0] = virt_to_page(trampoline_page);
> 	...
> }
> 
> Untested, but it's similar to the stuff the vDSO implementations are
> doing which I am working with at the moment.

nice idea, better than allocating the page, will do that

> 
> > > 
> > > Also this seems to waste the page on 32bit kernels.
> > 
> > it's inside CONFIG_X86_64 ifdef
> > 
> > > 
> > > > +	if (!page)
> > > > +		return -ENOMEM;
> > > > +	pages[0] = page;
> > > > +	tramp_mapping.pages = (struct page **) &pages;
> > > 
> > > tramp_mapping.pages = pages; ?
> > 
> > I think the compiler will cry about a *pages[2] vs **pages type mismatch,
> > but I'll double-check that
> 
> It compiles for me.

ok

thanks,
jirka
Thomas Weißschuh Dec. 14, 2024, 1:21 p.m. UTC | #5
On 2024-12-13 22:52:15+0100, Jiri Olsa wrote:
> On Fri, Dec 13, 2024 at 04:12:46PM +0100, Thomas Weißschuh wrote:
> 
> SNIP
> 
> > > > > +static int __init arch_uprobes_init(void)
> > > > > +{
> > > > > +	unsigned long size = uprobe_trampoline_end - uprobe_trampoline_entry;
> > > > > +	static struct page *pages[2];
> > > > > +	struct page *page;
> > > > > +
> > > > > +	page = alloc_page(GFP_HIGHUSER);
> > > > 
> > > > That page could be in static memory, removing the need for the explicit
> > > > allocation. It could also be __ro_after_init.
> > > > Then tramp_mapping itself can be const.
> > > 
> > > hum, how would that look? I think that to get a proper page object
> > > you have to call alloc_page or some other page alloc family function..
> > > what am I missing?
> > 
> > static u8 trampoline_page[PAGE_SIZE] __ro_after_init __aligned(PAGE_SIZE);
> > static struct page *tramp_mapping_pages[2] __ro_after_init;
> > 
> > static const struct vm_special_mapping tramp_mapping = {
> > 	.name   = "[uprobes-trampoline]",
> > 	.pages  = tramp_mapping_pages,
> > 	.mremap = tramp_mremap,
> > };
> > 
> > static int __init arch_uprobes_init(void)
> > {
> > 	...
> > 	tramp_mapping_pages[0] = virt_to_page(trampoline_page);
> > 	...
> > }
> > 
> > Untested, but it's similar to the stuff the vDSO implementations are
> > doing which I am working with at the moment.
> 
> nice idea, better than allocating the page, will do that

Or better yet, just allocate the whole page directly in the inline asm
and avoid the copying, too:

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index b2420eeee23a..c5e6ca7f998a 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -462,7 +462,7 @@ SYSCALL_DEFINE0(uprobe)

 asm (
        ".pushsection .rodata\n"
-       ".global uprobe_trampoline_entry\n"
+       ".balign " __stringify(PAGE_SIZE) "\n"
        "uprobe_trampoline_entry:\n"
        "endbr64\n"
        "push %rcx\n"
@@ -474,13 +474,11 @@ asm (
        "pop %r11\n"
        "pop %rcx\n"
        "ret\n"
-       ".global uprobe_trampoline_end\n"
-       "uprobe_trampoline_end:\n"
+       ".balign " __stringify(PAGE_SIZE) "\n"
        ".popsection\n"
 );

-extern __visible u8 uprobe_trampoline_entry[];
-extern __visible u8 uprobe_trampoline_end[];
+extern u8 uprobe_trampoline_entry[];


If you want to keep the copying for some reason, the asm code should be
in the section ".init.rodata" as it's not used afterwards.

(A bit bikesheddy, I admit)


Thomas
Jiri Olsa Dec. 16, 2024, 8:03 a.m. UTC | #6
On Sat, Dec 14, 2024 at 02:21:43PM +0100, Thomas Weißschuh wrote:
> On 2024-12-13 22:52:15+0100, Jiri Olsa wrote:
> > On Fri, Dec 13, 2024 at 04:12:46PM +0100, Thomas Weißschuh wrote:
> > 
> > SNIP
> > 
> > > > > > +static int __init arch_uprobes_init(void)
> > > > > > +{
> > > > > > +	unsigned long size = uprobe_trampoline_end - uprobe_trampoline_entry;
> > > > > > +	static struct page *pages[2];
> > > > > > +	struct page *page;
> > > > > > +
> > > > > > +	page = alloc_page(GFP_HIGHUSER);
> > > > > 
> > > > > That page could be in static memory, removing the need for the explicit
> > > > > allocation. It could also be __ro_after_init.
> > > > > Then tramp_mapping itself can be const.
> > > > 
> > > > hum, how would that look like? I think that to get proper page object
> > > > you have to call alloc_page or some other page alloc family function..
> > > > what do I miss?
> > > 
> > > static u8 trampoline_page[PAGE_SIZE] __ro_after_init __aligned(PAGE_SIZE);
> > > static struct page *tramp_mapping_pages[2] __ro_after_init;
> > > 
> > > static const struct vm_special_mapping tramp_mapping = {
> > > 	.name   = "[uprobes-trampoline]",
> > > 	.pages  = tramp_mapping_pages,
> > > 	.mremap = tramp_mremap,
> > > };
> > > 
> > > static int __init arch_uprobes_init(void)
> > > {
> > > 	...
> > > 	tramp_mapping_pages[0] = virt_to_page(trampoline_page);
> > > 	...
> > > }
> > > 
> > > Untested, but it's similar to the stuff the vDSO implementations are
> > > doing which I am working with at the moment.
> > 
> > nice idea, better than allocating the page, will do that
> 
> Or better yet, just allocate the whole page directly in the inline asm
> and avoid the copying, too:
> 
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index b2420eeee23a..c5e6ca7f998a 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -462,7 +462,7 @@ SYSCALL_DEFINE0(uprobe)
> 
>  asm (
>         ".pushsection .rodata\n"
> -       ".global uprobe_trampoline_entry\n"
> +       ".balign " __stringify(PAGE_SIZE) "\n"
>         "uprobe_trampoline_entry:\n"
>         "endbr64\n"
>         "push %rcx\n"
> @@ -474,13 +474,11 @@ asm (
>         "pop %r11\n"
>         "pop %rcx\n"
>         "ret\n"
> -       ".global uprobe_trampoline_end\n"
> -       "uprobe_trampoline_end:\n"
> +       ".balign " __stringify(PAGE_SIZE) "\n"
>         ".popsection\n"
>  );
> 
> -extern __visible u8 uprobe_trampoline_entry[];
> -extern __visible u8 uprobe_trampoline_end[];
> +extern u8 uprobe_trampoline_entry[];
> 
> 
> If you want to keep the copying for some reason, the asm code should be
> in the section ".init.rodata" as it's not used afterwards.

perfect, no need for copy, I'll do what you propose above
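
Combining the page-aligned .rodata trampoline with the static pages array,
arch_uprobes_init could then shrink to something like this (untested sketch
under those assumptions):

static struct page *tramp_mapping_pages[2] __ro_after_init;

static const struct vm_special_mapping tramp_mapping = {
	.name   = "[uprobes-trampoline]",
	.pages  = tramp_mapping_pages,
	.mremap = tramp_mremap,
};

static int __init arch_uprobes_init(void)
{
	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
	return 0;
}
late_initcall(arch_uprobes_init);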

> 
> (A bit bikesheddy, I admit)

thanks for the review,

jirka

Patch

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5eb708bff1c7..88e388c7675b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@ 
 333	common	io_pgetevents		sys_io_pgetevents
 334	common	rseq			sys_rseq
 335	common	uretprobe		sys_uretprobe
+336	common	uprobe			sys_uprobe
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 22a17c149a55..23e4f2821cff 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -425,6 +425,86 @@  SYSCALL_DEFINE0(uretprobe)
 	return -1;
 }
 
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+	return -EPERM;
+}
+
+static struct vm_special_mapping tramp_mapping = {
+	.name   = "[uprobes-trampoline]",
+	.mremap = tramp_mremap,
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct vm_area_struct *vma;
+	unsigned long bp_vaddr;
+	int err;
+
+	err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));
+	if (err) {
+		force_sig(SIGILL);
+		return -1;
+	}
+
+	/* Allow execution only from uprobe trampolines. */
+	vma = vma_lookup(current->mm, regs->ip);
+	if (!vma || vma->vm_private_data != (void *) &tramp_mapping) {
+		force_sig(SIGILL);
+		return -1;
+	}
+
+	handle_syscall_uprobe(regs, bp_vaddr - 5);
+	return 0;
+}
+
+asm (
+	".pushsection .rodata\n"
+	".global uprobe_trampoline_entry\n"
+	"uprobe_trampoline_entry:\n"
+	"endbr64\n"
+	"push %rcx\n"
+	"push %r11\n"
+	"push %rax\n"
+	"movq $" __stringify(__NR_uprobe) ", %rax\n"
+	"syscall\n"
+	"pop %rax\n"
+	"pop %r11\n"
+	"pop %rcx\n"
+	"ret\n"
+	".global uprobe_trampoline_end\n"
+	"uprobe_trampoline_end:\n"
+	".popsection\n"
+);
+
+extern __visible u8 uprobe_trampoline_entry[];
+extern __visible u8 uprobe_trampoline_end[];
+
+const struct vm_special_mapping *arch_uprobe_trampoline_mapping(void)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+
+	return user_64bit_mode(regs) ? &tramp_mapping : NULL;
+}
+
+static int __init arch_uprobes_init(void)
+{
+	unsigned long size = uprobe_trampoline_end - uprobe_trampoline_entry;
+	static struct page *pages[2];
+	struct page *page;
+
+	page = alloc_page(GFP_HIGHUSER);
+	if (!page)
+		return -ENOMEM;
+	pages[0] = page;
+	tramp_mapping.pages = (struct page **) &pages;
+	arch_uprobe_copy_ixol(page, 0, uprobe_trampoline_entry, size);
+	return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
 /*
  * If arch_uprobe->insn doesn't use rip-relative addressing, return
  * immediately.  Otherwise, rewrite the instruction so that it accesses
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6333204d451..002f4e1debe5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -994,6 +994,8 @@  asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
 asmlinkage long sys_uretprobe(void);
 
+asmlinkage long sys_uprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index c4ee755ca2a1..5e9a33bfb747 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -232,6 +232,7 @@  extern struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr);
 extern void uprobe_trampoline_put(struct uprobe_trampoline *area);
 extern bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr);
 extern const struct vm_special_mapping *arch_uprobe_trampoline_mapping(void);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f57918c624da..52f38d1ef276 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2729,6 +2729,28 @@  static void handle_swbp(struct pt_regs *regs)
 	rcu_read_unlock_trace();
 }
 
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+	struct uprobe *uprobe;
+	int is_swbp;
+
+	rcu_read_lock_trace();
+	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+	if (!uprobe)
+		goto unlock;
+
+	if (!get_utask())
+		goto unlock;
+
+	if (arch_uprobe_ignore(&uprobe->arch, regs))
+		goto unlock;
+
+	handler_chain(uprobe, regs);
+
+ unlock:
+	rcu_read_unlock_trace();
+}
+
 /*
  * Perform required fix-ups and disable singlestep.
  * Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@  COND_SYSCALL(setuid16);
 COND_SYSCALL(rseq);
 
 COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);