
[RFC,bpf-next,1/3] uprobe: Add uretprobe syscall to speed up return probe

Message ID 20240318093139.293497-2-jolsa@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series uprobe: uretprobe speed up

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 15227 this patch: 15227
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 16 maintainers not CCed: adrian.hunter@intel.com mingo@redhat.com arnd@arndb.de alexander.shishkin@linux.intel.com linux-arch@vger.kernel.org mark.rutland@arm.com dave.hansen@linux.intel.com linux-perf-users@vger.kernel.org sohil.mehta@intel.com acme@kernel.org geert@linux-m68k.org linux-api@vger.kernel.org luto@kernel.org irogers@google.com namhyung@kernel.org hpa@zytor.com
netdev/build_clang success Errors and warnings before: 3081 this patch: 3081
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 16144 this patch: 16145
netdev/checkpatch warning CHECK: Lines should not end with a '(' CHECK: No space is necessary after a cast CHECK: extern prototypes should be avoided in .h files WARNING: externs should be avoided in .c files
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Jiri Olsa March 18, 2024, 9:31 a.m. UTC
Adding uretprobe syscall instead of trap to speed up return probe.

At the moment the uretprobe setup/path is:

  - install entry uprobe

  - when the uprobe is hit, overwrite the probed function's return address
    on the stack with the address of a trampoline that contains a breakpoint
    instruction

  - the breakpoint trap code runs the uretprobe consumers and jumps back to
    the original return address
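
Schematically, that flow looks roughly like this (a sketch based on
handle_swbp()/handle_trampoline() in the patch below, not actual code):

    entry int3  -> handle_swbp()
                     the return address on the stack is replaced with the
                     trampoline address (get_trampoline_vaddr())
    ret         -> jumps to the trampoline, which holds UPROBE_SWBP_INSN (int3)
    int3        -> handle_swbp() -> handle_trampoline()
                     runs the uretprobe consumers and sets the instruction
                     pointer back to the original return address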

This patch changes the above trampoline's breakpoint instruction to the new
uretprobe syscall. The syscall does exactly the same job as the trap, with
some extra work:

  - the syscall trampoline must save the original values of the rax/r11/rcx
    registers on the stack - rax is set to the syscall number and r11/rcx
    are clobbered by the syscall instruction

  - the syscall code reads the original values of those registers and
    restores them in the task's pt_regs area

Even with the extra register handling code, having uretprobes handled by a
syscall shows a speed improvement.
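
To make the register handling above concrete, here is a sketch (not part of
the patch) of the user stack as seen by the new sys_uretprobe handler, given
the trampoline pushes and the sregs[] array in the code below:

    regs->sp + 0x00 : saved r11  ->  sregs[0]
    regs->sp + 0x08 : saved rcx  ->  sregs[1]
    regs->sp + 0x10 : saved rax  ->  sregs[2]

The handler copies these values back into the task's pt_regs and adds
sizeof(sregs) to regs->sp, so the original user stack pointer is restored
before the syscall returns to user space.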

  On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz)

  current:

    base           :   15.173 ± 0.052M/s
    uprobe-nop     :    2.797 ± 0.090M/s
    uprobe-push    :    2.580 ± 0.007M/s
    uprobe-ret     :    1.001 ± 0.003M/s
    uretprobe-nop  :    1.373 ± 0.002M/s
    uretprobe-push :    1.346 ± 0.002M/s
    uretprobe-ret  :    0.747 ± 0.001M/s

  with the fix:

    base           :   15.704 ± 0.076M/s
    uprobe-nop     :    2.841 ± 0.008M/s
    uprobe-push    :    2.666 ± 0.029M/s
    uprobe-ret     :    1.037 ± 0.008M/s
    uretprobe-nop  :    1.718 ± 0.010M/s  < ~25% speed up
    uretprobe-push :    1.658 ± 0.008M/s  < ~23% speed up
    uretprobe-ret  :    0.853 ± 0.004M/s  < ~14% speed up

  On AMD (AMD Ryzen 7 5700U)

  current:

    base           :    5.702 ± 0.003M/s
    uprobe-nop     :    1.505 ± 0.011M/s
    uprobe-push    :    1.388 ± 0.008M/s
    uprobe-ret     :    0.825 ± 0.001M/s
    uretprobe-nop  :    0.782 ± 0.001M/s
    uretprobe-push :    0.750 ± 0.001M/s
    uretprobe-ret  :    0.544 ± 0.001M/s

  with the fix:

    base           :    5.669 ± 0.004M/s
    uprobe-nop     :    1.539 ± 0.001M/s
    uprobe-push    :    1.385 ± 0.003M/s
    uprobe-ret     :    0.819 ± 0.001M/s
    uretprobe-nop  :    0.889 ± 0.001M/s < ~13% speed up
    uretprobe-push :    0.846 ± 0.001M/s < ~12% speed up
    uretprobe-ret  :    0.594 ± 0.000M/s <  ~9% speed up

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 arch/x86/kernel/uprobes.c              | 48 ++++++++++++++++++++++++++
 include/linux/syscalls.h               |  2 ++
 include/linux/uprobes.h                |  2 ++
 include/uapi/asm-generic/unistd.h      |  5 ++-
 kernel/events/uprobes.c                | 18 +++++++---
 kernel/sys_ni.c                        |  2 ++
 7 files changed, 73 insertions(+), 5 deletions(-)

Comments

Oleg Nesterov March 18, 2024, 2:22 p.m. UTC | #1
On 03/18, Jiri Olsa wrote:
>
> +SYSCALL_DEFINE0(uretprobe)
> +{
> +	struct pt_regs *regs = task_pt_regs(current);
> +	unsigned long sregs[3], err;
> +
> +	/*
> +	 * We set rax and syscall itself changes rcx and r11, so the syscall
> +	 * trampoline saves their original values on stack. We need to read
> +	 * them and set original register values and fix the rsp pointer back.
> +	 */
> +	err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
                                              ^^^^^^^^^^^^^^^^^

IIUC, it should be (void __user *)regs->sp to shut up the sparse checks.
The 1st "(void *)" typecast is not needed.
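
IOW, something along these lines (just a sketch of the suggested change):

	err = copy_from_user(sregs, (void __user *)regs->sp, sizeof(sregs));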

Correctness-wise looks good to me, FWIW

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Andrii Nakryiko March 19, 2024, 1:11 a.m. UTC | #2
On Mon, Mar 18, 2024 at 2:31 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Adding uretprobe syscall instead of trap to speed up return probe.
>
> At the moment the uretprobe setup/path is:
>
>   - install entry uprobe
>
>   - when the uprobe is hit, overwrite the probed function's return address
>     on the stack with the address of a trampoline that contains a breakpoint
>     instruction
>
>   - the breakpoint trap code runs the uretprobe consumers and jumps back to
>     the original return address
>
> This patch changes the above trampoline's breakpoint instruction to the new
> uretprobe syscall. The syscall does exactly the same job as the trap, with
> some extra work:
>
>   - the syscall trampoline must save the original values of the rax/r11/rcx
>     registers on the stack - rax is set to the syscall number and r11/rcx
>     are clobbered by the syscall instruction
>
>   - the syscall code reads the original values of those registers and
>     restores them in the task's pt_regs area
>
> Even with the extra register handling code, having uretprobes handled by a
> syscall shows a speed improvement.
>
>   On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz)
>
>   current:
>
>     base           :   15.173 ± 0.052M/s
>     uprobe-nop     :    2.797 ± 0.090M/s
>     uprobe-push    :    2.580 ± 0.007M/s
>     uprobe-ret     :    1.001 ± 0.003M/s
>     uretprobe-nop  :    1.373 ± 0.002M/s
>     uretprobe-push :    1.346 ± 0.002M/s
>     uretprobe-ret  :    0.747 ± 0.001M/s
>
>   with the fix:
>
>     base           :   15.704 ± 0.076M/s
>     uprobe-nop     :    2.841 ± 0.008M/s
>     uprobe-push    :    2.666 ± 0.029M/s
>     uprobe-ret     :    1.037 ± 0.008M/s
>     uretprobe-nop  :    1.718 ± 0.010M/s  < ~25% speed up
>     uretprobe-push :    1.658 ± 0.008M/s  < ~23% speed up
>     uretprobe-ret  :    0.853 ± 0.004M/s  < ~14% speed up
>
>   On AMD (AMD Ryzen 7 5700U)
>
>   current:
>
>     base           :    5.702 ± 0.003M/s
>     uprobe-nop     :    1.505 ± 0.011M/s
>     uprobe-push    :    1.388 ± 0.008M/s
>     uprobe-ret     :    0.825 ± 0.001M/s
>     uretprobe-nop  :    0.782 ± 0.001M/s
>     uretprobe-push :    0.750 ± 0.001M/s
>     uretprobe-ret  :    0.544 ± 0.001M/s
>
>   with the fix:
>
>     base           :    5.669 ± 0.004M/s
>     uprobe-nop     :    1.539 ± 0.001M/s
>     uprobe-push    :    1.385 ± 0.003M/s
>     uprobe-ret     :    0.819 ± 0.001M/s
>     uretprobe-nop  :    0.889 ± 0.001M/s < ~13% speed up
>     uretprobe-push :    0.846 ± 0.001M/s < ~12% speed up
>     uretprobe-ret  :    0.594 ± 0.000M/s <  ~9% speed up
>
> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  arch/x86/kernel/uprobes.c              | 48 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  2 ++
>  include/linux/uprobes.h                |  2 ++
>  include/uapi/asm-generic/unistd.h      |  5 ++-
>  kernel/events/uprobes.c                | 18 +++++++---
>  kernel/sys_ni.c                        |  2 ++
>  7 files changed, 73 insertions(+), 5 deletions(-)
>

LGTM, but see some questions and nits below:

Acked-by: Andrii Nakryiko <andrii@kernel.org>

> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 7e8d46f4147f..af0a33ab06ee 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -383,6 +383,7 @@
>  459    common  lsm_get_self_attr       sys_lsm_get_self_attr
>  460    common  lsm_set_self_attr       sys_lsm_set_self_attr
>  461    common  lsm_list_modules        sys_lsm_list_modules
> +462    64      uretprobe               sys_uretprobe
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 6c07f6daaa22..069371e86180 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -12,6 +12,7 @@
>  #include <linux/ptrace.h>
>  #include <linux/uprobes.h>
>  #include <linux/uaccess.h>
> +#include <linux/syscalls.h>
>
>  #include <linux/kdebug.h>
>  #include <asm/processor.h>
> @@ -308,6 +309,53 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
>  }
>
>  #ifdef CONFIG_X86_64
> +
> +asm (
> +       ".pushsection .rodata\n"
> +       ".global uretprobe_syscall_entry\n"
> +       "uretprobe_syscall_entry:\n"
> +       "pushq %rax\n"
> +       "pushq %rcx\n"
> +       "pushq %r11\n"
> +       "movq $462, %rax\n"

nit: is it possible to avoid hardcoding 462 here? Can we use
__NR_uretprobe instead?
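
For illustration, something like this might work (a sketch; it assumes
__NR_uretprobe is visible here and __stringify() from <linux/stringify.h>):

	"movq $" __stringify(__NR_uretprobe) ", %rax\n"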

> +       "syscall\n"

oh, btw, do we need to save flags register as well or it's handled
somehow? I think according to manual syscall instruction does
something to rflags register. So do we need pushfq before syscall?

> +       ".global uretprobe_syscall_end\n"
> +       "uretprobe_syscall_end:\n"
> +       ".popsection\n"
> +);
> +
> +extern u8 uretprobe_syscall_entry[];
> +extern u8 uretprobe_syscall_end[];
> +
> +void *arch_uprobe_trampoline(unsigned long *psize)
> +{
> +       *psize = uretprobe_syscall_end - uretprobe_syscall_entry;
> +       return uretprobe_syscall_entry;
> +}
> +
> +SYSCALL_DEFINE0(uretprobe)
> +{
> +       struct pt_regs *regs = task_pt_regs(current);
> +       unsigned long sregs[3], err;
> +
> +       /*
> +        * We set rax and syscall itself changes rcx and r11, so the syscall
> +        * trampoline saves their original values on stack. We need to read
> +        * them and set original register values and fix the rsp pointer back.
> +        */
> +       err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
> +       WARN_ON_ONCE(err);
> +
> +       regs->r11 = sregs[0];
> +       regs->cx = sregs[1];
> +       regs->ax = sregs[2];
> +       regs->orig_ax = -1;
> +       regs->sp += sizeof(sregs);
> +
> +       uprobe_handle_trampoline(regs);

probably worth leaving a comment that uprobe_handle_trampoline() is
rewriting userspace RIP and so syscall "returns" to the original
caller.

> +       return regs->ax;

and this is making sure that caller function gets the correct function
return value, right? It's all a bit magical, so worth leaving a
comment here, IMO.
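
E.g. something along these lines (illustrative only, not the actual patch):

	uprobe_handle_trampoline(regs);
	/*
	 * uprobe_handle_trampoline() sets regs->ip back to the original
	 * return address, so this "syscall" returns straight to the caller
	 * of the probed function.
	 */
	return regs->ax;	/* probed function's return value, restored above */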

> +}
> +
>  /*
>   * If arch_uprobe->insn doesn't use rip-relative addressing, return
>   * immediately.  Otherwise, rewrite the instruction so that it accesses
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 77eb9b0e7685..db150794f89d 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -972,6 +972,8 @@ asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags);
>  /* x86 */
>  asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
>
> +asmlinkage long sys_uretprobe(void);
> +
>  /* pciconfig: alpha, arm, arm64, ia64, sparc */
>  asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
>                                 unsigned long off, unsigned long len,
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..a490146ad89d 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -138,6 +138,8 @@ extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
>  extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
>  extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
>                                          void *src, unsigned long len);
> +extern void uprobe_handle_trampoline(struct pt_regs *regs);
> +extern void *arch_uprobe_trampoline(unsigned long *psize);
>  #else /* !CONFIG_UPROBES */
>  struct uprobes_state {
>  };
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 75f00965ab15..8a747cd1d735 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
>  #define __NR_lsm_list_modules 461
>  __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
>
> +#define __NR_uretprobe 462
> +__SYSCALL(__NR_uretprobe, sys_uretprobe)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 462
> +#define __NR_syscalls 463
>
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 929e98c62965..90395b16bde0 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1474,11 +1474,20 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
>         return ret;
>  }
>
> +void * __weak arch_uprobe_trampoline(unsigned long *psize)
> +{
> +       static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
> +
> +       *psize = UPROBE_SWBP_INSN_SIZE;
> +       return &insn;
> +}
> +
>  static struct xol_area *__create_xol_area(unsigned long vaddr)
>  {
>         struct mm_struct *mm = current->mm;
> -       uprobe_opcode_t insn = UPROBE_SWBP_INSN;
> +       unsigned long insns_size;
>         struct xol_area *area;
> +       void *insns;
>
>         area = kmalloc(sizeof(*area), GFP_KERNEL);
>         if (unlikely(!area))
> @@ -1502,7 +1511,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
>         /* Reserve the 1st slot for get_trampoline_vaddr() */
>         set_bit(0, area->bitmap);
>         atomic_set(&area->slot_count, 1);
> -       arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
> +       insns = arch_uprobe_trampoline(&insns_size);
> +       arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
>
>         if (!xol_add_vma(mm, area))
>                 return area;
> @@ -2123,7 +2133,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
>         return ri;
>  }
>
> -static void handle_trampoline(struct pt_regs *regs)
> +void uprobe_handle_trampoline(struct pt_regs *regs)
>  {
>         struct uprobe_task *utask;
>         struct return_instance *ri, *next;
> @@ -2188,7 +2198,7 @@ static void handle_swbp(struct pt_regs *regs)
>
>         bp_vaddr = uprobe_get_swbp_addr(regs);
>         if (bp_vaddr == get_trampoline_vaddr())
> -               return handle_trampoline(regs);
> +               return uprobe_handle_trampoline(regs);
>
>         uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
>         if (!uprobe) {
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index faad00cce269..be6195e0d078 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -391,3 +391,5 @@ COND_SYSCALL(setuid16);
>
>  /* restartable sequence */
>  COND_SYSCALL(rseq);
> +
> +COND_SYSCALL(uretprobe);
> --
> 2.44.0
>
Oleg Nesterov March 19, 2024, 6:32 a.m. UTC | #3
On 03/18, Andrii Nakryiko wrote:
>
> > +       "syscall\n"
>
> oh, btw, do we need to save flags register as well or it's handled
> somehow? I think according to manual syscall instruction does
> something to rflags register. So do we need pushfq before syscall?

The comment above entry_SYSCALL_64() says

	64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11

then entry_SYSCALL_64 does

	pushq   %r11                                    /* pt_regs->flags */

which should be restored on return to userspace. So I think that
only X86_EFLAGS_RF can be lost, we probably do not care.

Oleg.
Jiri Olsa March 19, 2024, 10:54 a.m. UTC | #4
On Mon, Mar 18, 2024 at 06:11:06PM -0700, Andrii Nakryiko wrote:

SNIP

> > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> > index 7e8d46f4147f..af0a33ab06ee 100644
> > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > @@ -383,6 +383,7 @@
> >  459    common  lsm_get_self_attr       sys_lsm_get_self_attr
> >  460    common  lsm_set_self_attr       sys_lsm_set_self_attr
> >  461    common  lsm_list_modules        sys_lsm_list_modules
> > +462    64      uretprobe               sys_uretprobe
> >
> >  #
> >  # Due to a historical design error, certain syscalls are numbered differently
> > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > index 6c07f6daaa22..069371e86180 100644
> > --- a/arch/x86/kernel/uprobes.c
> > +++ b/arch/x86/kernel/uprobes.c
> > @@ -12,6 +12,7 @@
> >  #include <linux/ptrace.h>
> >  #include <linux/uprobes.h>
> >  #include <linux/uaccess.h>
> > +#include <linux/syscalls.h>
> >
> >  #include <linux/kdebug.h>
> >  #include <asm/processor.h>
> > @@ -308,6 +309,53 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
> >  }
> >
> >  #ifdef CONFIG_X86_64
> > +
> > +asm (
> > +       ".pushsection .rodata\n"
> > +       ".global uretprobe_syscall_entry\n"
> > +       "uretprobe_syscall_entry:\n"
> > +       "pushq %rax\n"
> > +       "pushq %rcx\n"
> > +       "pushq %r11\n"
> > +       "movq $462, %rax\n"
> 
> nit: is it possible to avoid hardcoding 462 here? Can we use
> __NR_uretprobe instead?

yep, will do that

> 
> > +       "syscall\n"
> 
> oh, btw, do we need to save flags register as well or it's handled
> somehow? I think according to manual syscall instruction does
> something to rflags register. So do we need pushfq before syscall?

it's saved and restored by syscall instruction.. but apart from RF
flag as Oleg mentioned, it looks like we don't need to care about
that one

> 
> > +       ".global uretprobe_syscall_end\n"
> > +       "uretprobe_syscall_end:\n"
> > +       ".popsection\n"
> > +);
> > +
> > +extern u8 uretprobe_syscall_entry[];
> > +extern u8 uretprobe_syscall_end[];
> > +
> > +void *arch_uprobe_trampoline(unsigned long *psize)
> > +{
> > +       *psize = uretprobe_syscall_end - uretprobe_syscall_entry;
> > +       return uretprobe_syscall_entry;
> > +}
> > +
> > +SYSCALL_DEFINE0(uretprobe)
> > +{
> > +       struct pt_regs *regs = task_pt_regs(current);
> > +       unsigned long sregs[3], err;
> > +
> > +       /*
> > +        * We set rax and syscall itself changes rcx and r11, so the syscall
> > +        * trampoline saves their original values on stack. We need to read
> > +        * them and set original register values and fix the rsp pointer back.
> > +        */
> > +       err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
> > +       WARN_ON_ONCE(err);
> > +
> > +       regs->r11 = sregs[0];
> > +       regs->cx = sregs[1];
> > +       regs->ax = sregs[2];
> > +       regs->orig_ax = -1;
> > +       regs->sp += sizeof(sregs);
> > +
> > +       uprobe_handle_trampoline(regs);
> 
> probably worth leaving a comment that uprobe_handle_trampoline() is
> rewriting userspace RIP and so syscall "returns" to the original
> caller.
> 
> > +       return regs->ax;
> 
> and this is making sure that caller function gets the correct function
> return value, right? It's all a bit magical, so worth leaving a
> comment here, IMO.

ok will add more comments about that

thanks,
jirka
Andrii Nakryiko March 19, 2024, 4:20 p.m. UTC | #5
On Mon, Mar 18, 2024 at 11:33 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 03/18, Andrii Nakryiko wrote:
> >
> > > +       "syscall\n"
> >
> > oh, btw, do we need to save flags register as well or it's handled
> > somehow? I think according to manual syscall instruction does
> > something to rflags register. So do we need pushfq before syscall?
>
> The comment above entry_SYSCALL_64() says
>
>         64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11
>
> then entry_SYSCALL_64 does
>
>         pushq   %r11                                    /* pt_regs->flags */
>
> which should be restored on return to userspace. So I think that
> only X86_EFLAGS_RF can be lost, we probably do not care.
>

Cool, thanks for elaborating!


> Oleg.
>

Patch

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7e8d46f4147f..af0a33ab06ee 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -383,6 +383,7 @@ 
 459	common	lsm_get_self_attr	sys_lsm_get_self_attr
 460	common	lsm_set_self_attr	sys_lsm_set_self_attr
 461	common	lsm_list_modules	sys_lsm_list_modules
+462	64	uretprobe		sys_uretprobe
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c07f6daaa22..069371e86180 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -12,6 +12,7 @@ 
 #include <linux/ptrace.h>
 #include <linux/uprobes.h>
 #include <linux/uaccess.h>
+#include <linux/syscalls.h>
 
 #include <linux/kdebug.h>
 #include <asm/processor.h>
@@ -308,6 +309,53 @@  static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
 }
 
 #ifdef CONFIG_X86_64
+
+asm (
+	".pushsection .rodata\n"
+	".global uretprobe_syscall_entry\n"
+	"uretprobe_syscall_entry:\n"
+	"pushq %rax\n"
+	"pushq %rcx\n"
+	"pushq %r11\n"
+	"movq $462, %rax\n"
+	"syscall\n"
+	".global uretprobe_syscall_end\n"
+	"uretprobe_syscall_end:\n"
+	".popsection\n"
+);
+
+extern u8 uretprobe_syscall_entry[];
+extern u8 uretprobe_syscall_end[];
+
+void *arch_uprobe_trampoline(unsigned long *psize)
+{
+	*psize = uretprobe_syscall_end - uretprobe_syscall_entry;
+	return uretprobe_syscall_entry;
+}
+
+SYSCALL_DEFINE0(uretprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	unsigned long sregs[3], err;
+
+	/*
+	 * We set rax and syscall itself changes rcx and r11, so the syscall
+	 * trampoline saves their original values on stack. We need to read
+	 * them and set original register values and fix the rsp pointer back.
+	 */
+	err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
+	WARN_ON_ONCE(err);
+
+	regs->r11 = sregs[0];
+	regs->cx = sregs[1];
+	regs->ax = sregs[2];
+	regs->orig_ax = -1;
+	regs->sp += sizeof(sregs);
+
+	uprobe_handle_trampoline(regs);
+	return regs->ax;
+}
+
 /*
  * If arch_uprobe->insn doesn't use rip-relative addressing, return
  * immediately.  Otherwise, rewrite the instruction so that it accesses
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 77eb9b0e7685..db150794f89d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -972,6 +972,8 @@  asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags);
 /* x86 */
 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
+asmlinkage long sys_uretprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f46e0ca0169c..a490146ad89d 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -138,6 +138,8 @@  extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
 extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 					 void *src, unsigned long len);
+extern void uprobe_handle_trampoline(struct pt_regs *regs);
+extern void *arch_uprobe_trampoline(unsigned long *psize);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 75f00965ab15..8a747cd1d735 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -842,8 +842,11 @@  __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
 #define __NR_lsm_list_modules 461
 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 
+#define __NR_uretprobe 462
+__SYSCALL(__NR_uretprobe, sys_uretprobe)
+
 #undef __NR_syscalls
-#define __NR_syscalls 462
+#define __NR_syscalls 463
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 929e98c62965..90395b16bde0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1474,11 +1474,20 @@  static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 	return ret;
 }
 
+void * __weak arch_uprobe_trampoline(unsigned long *psize)
+{
+	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+	*psize = UPROBE_SWBP_INSN_SIZE;
+	return &insn;
+}
+
 static struct xol_area *__create_xol_area(unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
-	uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+	unsigned long insns_size;
 	struct xol_area *area;
+	void *insns;
 
 	area = kmalloc(sizeof(*area), GFP_KERNEL);
 	if (unlikely(!area))
@@ -1502,7 +1511,8 @@  static struct xol_area *__create_xol_area(unsigned long vaddr)
 	/* Reserve the 1st slot for get_trampoline_vaddr() */
 	set_bit(0, area->bitmap);
 	atomic_set(&area->slot_count, 1);
-	arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
+	insns = arch_uprobe_trampoline(&insns_size);
+	arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
 
 	if (!xol_add_vma(mm, area))
 		return area;
@@ -2123,7 +2133,7 @@  static struct return_instance *find_next_ret_chain(struct return_instance *ri)
 	return ri;
 }
 
-static void handle_trampoline(struct pt_regs *regs)
+void uprobe_handle_trampoline(struct pt_regs *regs)
 {
 	struct uprobe_task *utask;
 	struct return_instance *ri, *next;
@@ -2188,7 +2198,7 @@  static void handle_swbp(struct pt_regs *regs)
 
 	bp_vaddr = uprobe_get_swbp_addr(regs);
 	if (bp_vaddr == get_trampoline_vaddr())
-		return handle_trampoline(regs);
+		return uprobe_handle_trampoline(regs);
 
 	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
 	if (!uprobe) {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index faad00cce269..be6195e0d078 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -391,3 +391,5 @@  COND_SYSCALL(setuid16);
 
 /* restartable sequence */
 COND_SYSCALL(rseq);
+
+COND_SYSCALL(uretprobe);