diff mbox series

[RFC,bpf-next,4/3] uprobe: ensure sys_uretprobe uses sysret

Message ID 20240319102523.GC20287@redhat.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series uprobe: uretprobe speed up | expand

Commit Message

Oleg Nesterov March 19, 2024, 10:25 a.m. UTC
Obviously not for inclusion yet ;) untested, lacks the comments, and I am not
sure it makes sense.

But I am wondering if this change can speedup uretprobes a bit more. Any chance
you can test it?

With 1/3 sys_uretprobe() changes regs->r11/cx, this is correct but implies iret.
See the /* SYSRET requires RCX == RIP and R11 == EFLAGS */ code in do_syscall_64().

With this patch uretprobe_syscall_entry restores rcx/r11 itself and does retq,
sys_uretprobe() needs to hijack regs->ip after uprobe_handle_trampoline() to
make it possible.

Comments?

Oleg.
---

Comments

Jiri Olsa March 19, 2024, 11:08 a.m. UTC | #1
On Tue, Mar 19, 2024 at 11:25:24AM +0100, Oleg Nesterov wrote:
> Obviously not for inclusion yet ;) untested, lacks the comments, and I am not
> sure it makes sense.
> 
> But I am wondering if this change can speedup uretprobes a bit more. Any chance
> you can test it?
> 
> With 1/3 sys_uretprobe() changes regs->r11/cx, this is correct but implies iret.
> See the /* SYSRET requires RCX == RIP and R11 == EFLAGS */ code in do_syscall_64().

nice idea, looks like sysexit should be faster

> 
> With this patch uretprobe_syscall_entry restores rcx/r11 itself and does retq,
> sys_uretprobe() needs to hijack regs->ip after uprobe_handle_trampoline() to
> make it possible.
> 
> Comments?
> 
> Oleg.
> ---
> 
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 069371e86180..b99f1d80a8c8 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -319,6 +319,9 @@ asm (
>  	"pushq %r11\n"
>  	"movq $462, %rax\n"
>  	"syscall\n"
> +	"popq %r11\n"
> +	"popq %rcx\n"
> +	"retq\n"

using rax space on stack for return pointer, cool :)

I'll run the test with this change

thanks,
jirka

>  	".global uretprobe_syscall_end\n"
>  	"uretprobe_syscall_end:\n"
>  	".popsection\n"
> @@ -336,23 +339,20 @@ void *arch_uprobe_trampoline(unsigned long *psize)
>  SYSCALL_DEFINE0(uretprobe)
>  {
>  	struct pt_regs *regs = task_pt_regs(current);
> -	unsigned long sregs[3], err;
> +	unsigned long __user *ax_and_ret = (unsigned long __user *)regs->sp + 2;
> +	unsigned long ip, err;
>  
> -	/*
> -	 * We set rax and syscall itself changes rcx and r11, so the syscall
> -	 * trampoline saves their original values on stack. We need to read
> -	 * them and set original register values and fix the rsp pointer back.
> -	 */
> -	err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
> -	WARN_ON_ONCE(err);
> -
> -	regs->r11 = sregs[0];
> -	regs->cx = sregs[1];
> -	regs->ax = sregs[2];
> +	ip = regs->ip;
>  	regs->orig_ax = -1;
> -	regs->sp += sizeof(sregs);
> +	err = get_user(regs->ax, ax_and_ret);
> +	WARN_ON_ONCE(err);
>  
>  	uprobe_handle_trampoline(regs);
> +
> +	err = put_user(regs->ip, ax_and_ret);
> +	WARN_ON_ONCE(err);
> +	regs->ip = ip;
> +
>  	return regs->ax;
>  }
>  
>
Andrii Nakryiko March 19, 2024, 4:25 p.m. UTC | #2
On Tue, Mar 19, 2024 at 4:08 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Tue, Mar 19, 2024 at 11:25:24AM +0100, Oleg Nesterov wrote:
> > Obviously not for inclusion yet ;) untested, lacks the comments, and I am not
> > sure it makes sense.
> >
> > But I am wondering if this change can speedup uretprobes a bit more. Any chance
> > you can test it?
> >
> > With 1/3 sys_uretprobe() changes regs->r11/cx, this is correct but implies iret.
> > See the /* SYSRET requires RCX == RIP and R11 == EFLAGS */ code in do_syscall_64().
>
> nice idea, looks like sysexit should be faster
>
> >
> > With this patch uretprobe_syscall_entry restores rcx/r11 itself and does retq,
> > sys_uretprobe() needs to hijack regs->ip after uprobe_handle_trampoline() to
> > make it possible.
> >
> > Comments?
> >
> > Oleg.
> > ---
> >
> > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > index 069371e86180..b99f1d80a8c8 100644
> > --- a/arch/x86/kernel/uprobes.c
> > +++ b/arch/x86/kernel/uprobes.c
> > @@ -319,6 +319,9 @@ asm (
> >       "pushq %r11\n"
> >       "movq $462, %rax\n"
> >       "syscall\n"
> > +     "popq %r11\n"
> > +     "popq %rcx\n"
> > +     "retq\n"
>
> using rax space on stack for return pointer, cool :)
>
> I'll run the test with this change
>

I can do some benchmarking on my side as well, given I have everything
set up for this anyways. Thanks for the help with speeding all this
up!

BTW, Jiri, what are you plans regarding sys_uprobe (entry probe
optimization through syscall), while we are on the topic?


> thanks,
> jirka
>
> >       ".global uretprobe_syscall_end\n"
> >       "uretprobe_syscall_end:\n"
> >       ".popsection\n"
> > @@ -336,23 +339,20 @@ void *arch_uprobe_trampoline(unsigned long *psize)
> >  SYSCALL_DEFINE0(uretprobe)
> >  {
> >       struct pt_regs *regs = task_pt_regs(current);
> > -     unsigned long sregs[3], err;
> > +     unsigned long __user *ax_and_ret = (unsigned long __user *)regs->sp + 2;
> > +     unsigned long ip, err;
> >
> > -     /*
> > -      * We set rax and syscall itself changes rcx and r11, so the syscall
> > -      * trampoline saves their original values on stack. We need to read
> > -      * them and set original register values and fix the rsp pointer back.
> > -      */
> > -     err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
> > -     WARN_ON_ONCE(err);
> > -
> > -     regs->r11 = sregs[0];
> > -     regs->cx = sregs[1];
> > -     regs->ax = sregs[2];
> > +     ip = regs->ip;
> >       regs->orig_ax = -1;
> > -     regs->sp += sizeof(sregs);
> > +     err = get_user(regs->ax, ax_and_ret);
> > +     WARN_ON_ONCE(err);
> >
> >       uprobe_handle_trampoline(regs);
> > +
> > +     err = put_user(regs->ip, ax_and_ret);
> > +     WARN_ON_ONCE(err);
> > +     regs->ip = ip;
> > +
> >       return regs->ax;
> >  }
> >
> >
Oleg Nesterov March 19, 2024, 4:38 p.m. UTC | #3
On 03/19, Andrii Nakryiko wrote:
>
> On Tue, Mar 19, 2024 at 4:08 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> >
> > I'll run the test with this change
> >
>
> I can do some benchmarking on my side as well, given I have everything
> set up for this anyways. Thanks for the help with speeding all this
> up!

Great, thank you both!

but just in case let me repeat, this is just a poc, the patch is obviously
incomplete and needs more discussion even _if_ it actually makes the syscall
faster, and I am not sure it does.

Oleg.
Jiri Olsa March 19, 2024, 7:31 p.m. UTC | #4
On Tue, Mar 19, 2024 at 12:08:35PM +0100, Jiri Olsa wrote:
> On Tue, Mar 19, 2024 at 11:25:24AM +0100, Oleg Nesterov wrote:
> > Obviously not for inclusion yet ;) untested, lacks the comments, and I am not
> > sure it makes sense.
> > 
> > But I am wondering if this change can speedup uretprobes a bit more. Any chance
> > you can test it?
> > 
> > With 1/3 sys_uretprobe() changes regs->r11/cx, this is correct but implies iret.
> > See the /* SYSRET requires RCX == RIP and R11 == EFLAGS */ code in do_syscall_64().
> 
> nice idea, looks like sysexit should be faster
> 
> > 
> > With this patch uretprobe_syscall_entry restores rcx/r11 itself and does retq,
> > sys_uretprobe() needs to hijack regs->ip after uprobe_handle_trampoline() to
> > make it possible.
> > 
> > Comments?
> > 
> > Oleg.
> > ---
> > 
> > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > index 069371e86180..b99f1d80a8c8 100644
> > --- a/arch/x86/kernel/uprobes.c
> > +++ b/arch/x86/kernel/uprobes.c
> > @@ -319,6 +319,9 @@ asm (
> >  	"pushq %r11\n"
> >  	"movq $462, %rax\n"
> >  	"syscall\n"
> > +	"popq %r11\n"
> > +	"popq %rcx\n"
> > +	"retq\n"
> 
> using rax space on stack for return pointer, cool :)
> 
> I'll run the test with this change

I got bigger speed up on intel, amd stays the same (I'll double check that)

current:
  base           :   16.133 ± 0.035M/s
  uprobe-nop     :    3.003 ± 0.002M/s
  uprobe-push    :    2.829 ± 0.001M/s
  uprobe-ret     :    1.101 ± 0.001M/s
  uretprobe-nop  :    1.485 ± 0.001M/s
  uretprobe-push :    1.447 ± 0.000M/s
  uretprobe-ret  :    0.812 ± 0.000M/s

fix:
  base           :   16.522 ± 0.103M/s
  uprobe-nop     :    2.920 ± 0.034M/s
  uprobe-push    :    2.749 ± 0.047M/s
  uprobe-ret     :    1.094 ± 0.003M/s
  uretprobe-nop  :    2.004 ± 0.006M/s  < ~34% speed up
  uretprobe-push :    1.940 ± 0.003M/s  < ~34% speed up
  uretprobe-ret  :    0.921 ± 0.050M/s  < ~13% speed up

original fix:
  base           :   15.704 ± 0.076M/s
  uprobe-nop     :    2.841 ± 0.008M/s
  uprobe-push    :    2.666 ± 0.029M/s
  uprobe-ret     :    1.037 ± 0.008M/s
  uretprobe-nop  :    1.718 ± 0.010M/s  < ~25% speed up
  uretprobe-push :    1.658 ± 0.008M/s  < ~23% speed up
  uretprobe-ret  :    0.853 ± 0.004M/s  < ~14% speed up


jirka
Jiri Olsa March 19, 2024, 7:35 p.m. UTC | #5
On Tue, Mar 19, 2024 at 09:25:57AM -0700, Andrii Nakryiko wrote:
> On Tue, Mar 19, 2024 at 4:08 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Tue, Mar 19, 2024 at 11:25:24AM +0100, Oleg Nesterov wrote:
> > > Obviously not for inclusion yet ;) untested, lacks the comments, and I am not
> > > sure it makes sense.
> > >
> > > But I am wondering if this change can speedup uretprobes a bit more. Any chance
> > > you can test it?
> > >
> > > With 1/3 sys_uretprobe() changes regs->r11/cx, this is correct but implies iret.
> > > See the /* SYSRET requires RCX == RIP and R11 == EFLAGS */ code in do_syscall_64().
> >
> > nice idea, looks like sysexit should be faster
> >
> > >
> > > With this patch uretprobe_syscall_entry restores rcx/r11 itself and does retq,
> > > sys_uretprobe() needs to hijack regs->ip after uprobe_handle_trampoline() to
> > > make it possible.
> > >
> > > Comments?
> > >
> > > Oleg.
> > > ---
> > >
> > > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > > index 069371e86180..b99f1d80a8c8 100644
> > > --- a/arch/x86/kernel/uprobes.c
> > > +++ b/arch/x86/kernel/uprobes.c
> > > @@ -319,6 +319,9 @@ asm (
> > >       "pushq %r11\n"
> > >       "movq $462, %rax\n"
> > >       "syscall\n"
> > > +     "popq %r11\n"
> > > +     "popq %rcx\n"
> > > +     "retq\n"
> >
> > using rax space on stack for return pointer, cool :)
> >
> > I'll run the test with this change
> >
> 
> I can do some benchmarking on my side as well, given I have everything

that'd be great, thanks

> set up for this anyways. Thanks for the help with speeding all this
> up!
> 
> BTW, Jiri, what are you plans regarding sys_uprobe (entry probe
> optimization through syscall), while we are on the topic?

I plan to work on that after this one is sorted out

jirka
Andrii Nakryiko March 19, 2024, 8:13 p.m. UTC | #6
On Tue, Mar 19, 2024 at 12:32 PM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Tue, Mar 19, 2024 at 12:08:35PM +0100, Jiri Olsa wrote:
> > On Tue, Mar 19, 2024 at 11:25:24AM +0100, Oleg Nesterov wrote:
> > > Obviously not for inclusion yet ;) untested, lacks the comments, and I am not
> > > sure it makes sense.
> > >
> > > But I am wondering if this change can speedup uretprobes a bit more. Any chance
> > > you can test it?
> > >
> > > With 1/3 sys_uretprobe() changes regs->r11/cx, this is correct but implies iret.
> > > See the /* SYSRET requires RCX == RIP and R11 == EFLAGS */ code in do_syscall_64().
> >
> > nice idea, looks like sysexit should be faster
> >
> > >
> > > With this patch uretprobe_syscall_entry restores rcx/r11 itself and does retq,
> > > sys_uretprobe() needs to hijack regs->ip after uprobe_handle_trampoline() to
> > > make it possible.
> > >
> > > Comments?
> > >
> > > Oleg.
> > > ---
> > >
> > > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > > index 069371e86180..b99f1d80a8c8 100644
> > > --- a/arch/x86/kernel/uprobes.c
> > > +++ b/arch/x86/kernel/uprobes.c
> > > @@ -319,6 +319,9 @@ asm (
> > >     "pushq %r11\n"
> > >     "movq $462, %rax\n"
> > >     "syscall\n"
> > > +   "popq %r11\n"
> > > +   "popq %rcx\n"
> > > +   "retq\n"
> >
> > using rax space on stack for return pointer, cool :)
> >
> > I'll run the test with this change
>
> I got bigger speed up on intel, amd stays the same (I'll double check that)
>
> current:
>   base           :   16.133 ± 0.035M/s
>   uprobe-nop     :    3.003 ± 0.002M/s
>   uprobe-push    :    2.829 ± 0.001M/s
>   uprobe-ret     :    1.101 ± 0.001M/s
>   uretprobe-nop  :    1.485 ± 0.001M/s
>   uretprobe-push :    1.447 ± 0.000M/s
>   uretprobe-ret  :    0.812 ± 0.000M/s
>
> fix:
>   base           :   16.522 ± 0.103M/s
>   uprobe-nop     :    2.920 ± 0.034M/s
>   uprobe-push    :    2.749 ± 0.047M/s
>   uprobe-ret     :    1.094 ± 0.003M/s
>   uretprobe-nop  :    2.004 ± 0.006M/s  < ~34% speed up
>   uretprobe-push :    1.940 ± 0.003M/s  < ~34% speed up
>   uretprobe-ret  :    0.921 ± 0.050M/s  < ~13% speed up
>
> original fix:
>   base           :   15.704 ± 0.076M/s
>   uprobe-nop     :    2.841 ± 0.008M/s
>   uprobe-push    :    2.666 ± 0.029M/s
>   uprobe-ret     :    1.037 ± 0.008M/s
>   uretprobe-nop  :    1.718 ± 0.010M/s  < ~25% speed up
>   uretprobe-push :    1.658 ± 0.008M/s  < ~23% speed up
>   uretprobe-ret  :    0.853 ± 0.004M/s  < ~14% speed up
>

My machine is slower, even though I turned out mitigations and stuff
like that, I feel like there are still some slow downs. But either
way, data is at least consistent as far as baseline goes (it's called
syscall-count now in my local changes I'm yet to submit), and yes,
Oleg's change does produce a noticeable speed up:

baseline
========
usermode-count :   79.509 ± 0.038M/s
syscall-count  :    9.550 ± 0.002M/s
uprobe-nop     :    1.530 ± 0.000M/s
uprobe-push    :    1.457 ± 0.000M/s
uprobe-ret     :    0.642 ± 0.000M/s
uretprobe-nop  :    0.777 ± 0.000M/s
uretprobe-push :    0.761 ± 0.000M/s
uretprobe-ret  :    0.459 ± 0.000M/s

Jiri
====
usermode-count :   79.515 ± 0.014M/s
syscall-count  :    9.439 ± 0.006M/s
uprobe-nop     :    1.520 ± 0.001M/s
uprobe-push    :    1.464 ± 0.000M/s
uprobe-ret     :    0.640 ± 0.000M/s
uretprobe-nop  :    0.893 ± 0.000M/s (+15%)
uretprobe-push :    0.867 ± 0.000M/s (+14%)
uretprobe-ret  :    0.498 ± 0.000M/s (+8.5%)

Oleg+Jiri
=========
usermode-count :   79.471 ± 0.078M/s
syscall-count  :    9.434 ± 0.007M/s
uprobe-nop     :    1.516 ± 0.003M/s
uprobe-push    :    1.463 ± 0.000M/s
uprobe-ret     :    0.640 ± 0.001M/s
uretprobe-nop  :    1.020 ± 0.001M/s (+31%)
uretprobe-push :    0.998 ± 0.001M/s (+31%)
uretprobe-ret  :    0.537 ± 0.000M/s (+17%)

So it's 2x of just Jiri's changes, which is a very nice boost! I only
tested on Intel CPU.


>
> jirka
Jiri Olsa March 20, 2024, 11:04 a.m. UTC | #7
On Tue, Mar 19, 2024 at 08:31:55PM +0100, Jiri Olsa wrote:
> On Tue, Mar 19, 2024 at 12:08:35PM +0100, Jiri Olsa wrote:
> > On Tue, Mar 19, 2024 at 11:25:24AM +0100, Oleg Nesterov wrote:
> > > Obviously not for inclusion yet ;) untested, lacks the comments, and I am not
> > > sure it makes sense.
> > > 
> > > But I am wondering if this change can speedup uretprobes a bit more. Any chance
> > > you can test it?
> > > 
> > > With 1/3 sys_uretprobe() changes regs->r11/cx, this is correct but implies iret.
> > > See the /* SYSRET requires RCX == RIP and R11 == EFLAGS */ code in do_syscall_64().
> > 
> > nice idea, looks like sysexit should be faster
> > 
> > > 
> > > With this patch uretprobe_syscall_entry restores rcx/r11 itself and does retq,
> > > sys_uretprobe() needs to hijack regs->ip after uprobe_handle_trampoline() to
> > > make it possible.
> > > 
> > > Comments?
> > > 
> > > Oleg.
> > > ---
> > > 
> > > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > > index 069371e86180..b99f1d80a8c8 100644
> > > --- a/arch/x86/kernel/uprobes.c
> > > +++ b/arch/x86/kernel/uprobes.c
> > > @@ -319,6 +319,9 @@ asm (
> > >  	"pushq %r11\n"
> > >  	"movq $462, %rax\n"
> > >  	"syscall\n"
> > > +	"popq %r11\n"
> > > +	"popq %rcx\n"
> > > +	"retq\n"
> > 
> > using rax space on stack for return pointer, cool :)
> > 
> > I'll run the test with this change
> 
> I got bigger speed up on intel, amd stays the same (I'll double check that)

yes, I'm getting no speed up on AMD, but Intel's great

Oleg,
are you ok if I squash the patches together or you
want to send it separately?

jirka


> 
> current:
>   base           :   16.133 ± 0.035M/s
>   uprobe-nop     :    3.003 ± 0.002M/s
>   uprobe-push    :    2.829 ± 0.001M/s
>   uprobe-ret     :    1.101 ± 0.001M/s
>   uretprobe-nop  :    1.485 ± 0.001M/s
>   uretprobe-push :    1.447 ± 0.000M/s
>   uretprobe-ret  :    0.812 ± 0.000M/s
> 
> fix:
>   base           :   16.522 ± 0.103M/s
>   uprobe-nop     :    2.920 ± 0.034M/s
>   uprobe-push    :    2.749 ± 0.047M/s
>   uprobe-ret     :    1.094 ± 0.003M/s
>   uretprobe-nop  :    2.004 ± 0.006M/s  < ~34% speed up
>   uretprobe-push :    1.940 ± 0.003M/s  < ~34% speed up
>   uretprobe-ret  :    0.921 ± 0.050M/s  < ~13% speed up
> 
> original fix:
>   base           :   15.704 ± 0.076M/s
>   uprobe-nop     :    2.841 ± 0.008M/s
>   uprobe-push    :    2.666 ± 0.029M/s
>   uprobe-ret     :    1.037 ± 0.008M/s
>   uretprobe-nop  :    1.718 ± 0.010M/s  < ~25% speed up
>   uretprobe-push :    1.658 ± 0.008M/s  < ~23% speed up
>   uretprobe-ret  :    0.853 ± 0.004M/s  < ~14% speed up
> 
> 
> jirka
Oleg Nesterov March 20, 2024, 2:37 p.m. UTC | #8
On 03/20, Jiri Olsa wrote:
>
> are you ok if I squash the patches together

Yes, thanks, I am fine.

But lets discuss this change a bit more. So, with this poc we have the
(intentionally) oversimplified

	SYSCALL_DEFINE0(uretprobe)
	{
		struct pt_regs *regs = task_pt_regs(current);
		unsigned long __user *ax_and_ret = (unsigned long __user *)regs->sp + 2;
		unsigned long ip, err;

		ip = regs->ip;
		regs->orig_ax = -1;
		err = get_user(regs->ax, ax_and_ret);
		WARN_ON_ONCE(err);

		uprobe_handle_trampoline(regs);

		err = put_user(regs->ip, ax_and_ret);
		WARN_ON_ONCE(err);
		regs->ip = ip;

		return regs->ax;
	}

I have no idea what uprobe consumers / bpf programs can do, so let me ask:

	- uprobe_consumer's will see the "wrong" values of regs->cx/r11/sp
	  Is it OK? If not - easy to fix.

	- can uprobe_consumer change regs->cx/r11 ? If yes - easy to fix.

	- can uprobe_consumer change regs->sp ? If yes - easy to fix too,
	  but needs a separate check/code.

Oleg.
Oleg Nesterov March 20, 2024, 3:28 p.m. UTC | #9
On 03/20, Oleg Nesterov wrote:
>
> On 03/20, Jiri Olsa wrote:
> >
> > are you ok if I squash the patches together
>
> Yes, thanks, I am fine.
>
> But lets discuss this change a bit more. So, with this poc we have the
> (intentionally) oversimplified
>
> 	SYSCALL_DEFINE0(uretprobe)
> 	{
> 		struct pt_regs *regs = task_pt_regs(current);
> 		unsigned long __user *ax_and_ret = (unsigned long __user *)regs->sp + 2;
> 		unsigned long ip, err;
>
> 		ip = regs->ip;
> 		regs->orig_ax = -1;
> 		err = get_user(regs->ax, ax_and_ret);
> 		WARN_ON_ONCE(err);
>
> 		uprobe_handle_trampoline(regs);
>
> 		err = put_user(regs->ip, ax_and_ret);
> 		WARN_ON_ONCE(err);
> 		regs->ip = ip;
>
> 		return regs->ax;
> 	}
>
> I have no idea what uprobe consumers / bpf programs can do, so let me ask:
>
> 	- uprobe_consumer's will see the "wrong" values of regs->cx/r11/sp
> 	  Is it OK? If not - easy to fix.
>
> 	- can uprobe_consumer change regs->cx/r11 ? If yes - easy to fix.
>
> 	- can uprobe_consumer change regs->sp ? If yes - easy to fix too,
> 	  but needs a separate check/code.

IOW. If answer is "yes" to all the questions above, then we probably need
something like

	SYSCALL_DEFINE0(uretprobe)
	{
		struct pt_regs *regs = task_pt_regs(current);
		unsigned long err, ip, sp, r11_cx_ax[3];

		err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
		WARN_ON_ONCE(err);

		// Q1: apart from ax, do we really care?
		// expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
		regs->r11 = r11_cx_ax[0];
		regs->cx  = r11_cx_ax[1];
		regs->ax  = r11_cx_ax[2];
		regs->sp += sizeof(r11_cx_ax);
		regs->orig_ax = -1;

		ip = regs->ip;
		sp = regs->sp;

		uprobe_handle_trampoline(regs);

		// Q2: is it possible? do we care?
		// uprobe_consumer has changed sp, we can do nothing,
		// just return via iret.
		if (regs->sp != sp)
			return regs->ax;
		regs->sp -= sizeof(r11_cx_ax);

		// Q3: is it possible? do we care?
		// for the case uprobe_consumer has changed r11/cx
		r11_cx_ax[0] = regs->r11;
		r11_cx_ax[1] = regs->cx;

		// comment to explain this hack
		r11_cx_ax[2] = regs->ip;
		regs->ip = ip;

		err = copy_to_user((void __user*)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
		WARN_ON_ONCE(err);

		// ensure sysret, see do_syscall_64()
		regs->r11 = regs->flags;
		regs->cx  = regs->ip;

		return regs->ax;
	}

Oleg.
Andrii Nakryiko March 20, 2024, 5:44 p.m. UTC | #10
On Wed, Mar 20, 2024 at 8:30 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 03/20, Oleg Nesterov wrote:
> >
> > On 03/20, Jiri Olsa wrote:
> > >
> > > are you ok if I squash the patches together
> >
> > Yes, thanks, I am fine.
> >
> > But lets discuss this change a bit more. So, with this poc we have the
> > (intentionally) oversimplified
> >
> >       SYSCALL_DEFINE0(uretprobe)
> >       {
> >               struct pt_regs *regs = task_pt_regs(current);
> >               unsigned long __user *ax_and_ret = (unsigned long __user *)regs->sp + 2;
> >               unsigned long ip, err;
> >
> >               ip = regs->ip;
> >               regs->orig_ax = -1;
> >               err = get_user(regs->ax, ax_and_ret);
> >               WARN_ON_ONCE(err);
> >
> >               uprobe_handle_trampoline(regs);
> >
> >               err = put_user(regs->ip, ax_and_ret);
> >               WARN_ON_ONCE(err);
> >               regs->ip = ip;
> >
> >               return regs->ax;
> >       }
> >
> > I have no idea what uprobe consumers / bpf programs can do, so let me ask:
> >
> >       - uprobe_consumer's will see the "wrong" values of regs->cx/r11/sp
> >         Is it OK? If not - easy to fix.
> >
> >       - can uprobe_consumer change regs->cx/r11 ? If yes - easy to fix.
> >
> >       - can uprobe_consumer change regs->sp ? If yes - easy to fix too,
> >         but needs a separate check/code.
>
> IOW. If answer is "yes" to all the questions above, then we probably need
> something like

yes to first, so ideally we fix registers to "correct" values
(especially sp), but no to the last two (at least as far as BPF is
concerned)

>
>         SYSCALL_DEFINE0(uretprobe)
>         {
>                 struct pt_regs *regs = task_pt_regs(current);
>                 unsigned long err, ip, sp, r11_cx_ax[3];
>
>                 err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
>                 WARN_ON_ONCE(err);
>
>                 // Q1: apart from ax, do we really care?
>                 // expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
>                 regs->r11 = r11_cx_ax[0];
>                 regs->cx  = r11_cx_ax[1];
>                 regs->ax  = r11_cx_ax[2];
>                 regs->sp += sizeof(r11_cx_ax);
>                 regs->orig_ax = -1;
>
>                 ip = regs->ip;
>                 sp = regs->sp;
>
>                 uprobe_handle_trampoline(regs);
>
>                 // Q2: is it possible? do we care?
>                 // uprobe_consumer has changed sp, we can do nothing,
>                 // just return via iret.
>                 if (regs->sp != sp)
>                         return regs->ax;
>                 regs->sp -= sizeof(r11_cx_ax);
>
>                 // Q3: is it possible? do we care?
>                 // for the case uprobe_consumer has changed r11/cx
>                 r11_cx_ax[0] = regs->r11;
>                 r11_cx_ax[1] = regs->cx;
>
>                 // comment to explain this hack
>                 r11_cx_ax[2] = regs->ip;
>                 regs->ip = ip;
>
>                 err = copy_to_user((void __user*)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
>                 WARN_ON_ONCE(err);
>
>                 // ensure sysret, see do_syscall_64()
>                 regs->r11 = regs->flags;
>                 regs->cx  = regs->ip;
>
>                 return regs->ax;
>         }
>
> Oleg.
>
Jiri Olsa March 20, 2024, 7:08 p.m. UTC | #11
On Wed, Mar 20, 2024 at 10:44:30AM -0700, Andrii Nakryiko wrote:
> On Wed, Mar 20, 2024 at 8:30 AM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > On 03/20, Oleg Nesterov wrote:
> > >
> > > On 03/20, Jiri Olsa wrote:
> > > >
> > > > are you ok if I squash the patches together
> > >
> > > Yes, thanks, I am fine.
> > >
> > > But lets discuss this change a bit more. So, with this poc we have the
> > > (intentionally) oversimplified
> > >
> > >       SYSCALL_DEFINE0(uretprobe)
> > >       {
> > >               struct pt_regs *regs = task_pt_regs(current);
> > >               unsigned long __user *ax_and_ret = (unsigned long __user *)regs->sp + 2;
> > >               unsigned long ip, err;
> > >
> > >               ip = regs->ip;
> > >               regs->orig_ax = -1;
> > >               err = get_user(regs->ax, ax_and_ret);
> > >               WARN_ON_ONCE(err);
> > >
> > >               uprobe_handle_trampoline(regs);
> > >
> > >               err = put_user(regs->ip, ax_and_ret);
> > >               WARN_ON_ONCE(err);
> > >               regs->ip = ip;
> > >
> > >               return regs->ax;
> > >       }
> > >
> > > I have no idea what uprobe consumers / bpf programs can do, so let me ask:
> > >
> > >       - uprobe_consumer's will see the "wrong" values of regs->cx/r11/sp
> > >         Is it OK? If not - easy to fix.
> > >
> > >       - can uprobe_consumer change regs->cx/r11 ? If yes - easy to fix.
> > >
> > >       - can uprobe_consumer change regs->sp ? If yes - easy to fix too,
> > >         but needs a separate check/code.
> >
> > IOW. If answer is "yes" to all the questions above, then we probably need
> > something like
> 
> yes to first, so ideally we fix registers to "correct" values
> (especially sp), but no to the last two (at least as far as BPF is
> concerned)

I think we should keep the same behaviour as it was for the trap,
so I think we should restore all registers and allow consumer to change it

jirka

> 
> >
> >         SYSCALL_DEFINE0(uretprobe)
> >         {
> >                 struct pt_regs *regs = task_pt_regs(current);
> >                 unsigned long err, ip, sp, r11_cx_ax[3];
> >
> >                 err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
> >                 WARN_ON_ONCE(err);
> >
> >                 // Q1: apart from ax, do we really care?
> >                 // expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
> >                 regs->r11 = r11_cx_ax[0];
> >                 regs->cx  = r11_cx_ax[1];
> >                 regs->ax  = r11_cx_ax[2];
> >                 regs->sp += sizeof(r11_cx_ax);
> >                 regs->orig_ax = -1;
> >
> >                 ip = regs->ip;
> >                 sp = regs->sp;
> >
> >                 uprobe_handle_trampoline(regs);
> >
> >                 // Q2: is it possible? do we care?
> >                 // uprobe_consumer has changed sp, we can do nothing,
> >                 // just return via iret.
> >                 if (regs->sp != sp)
> >                         return regs->ax;
> >                 regs->sp -= sizeof(r11_cx_ax);
> >
> >                 // Q3: is it possible? do we care?
> >                 // for the case uprobe_consumer has changed r11/cx
> >                 r11_cx_ax[0] = regs->r11;
> >                 r11_cx_ax[1] = regs->cx;
> >
> >                 // comment to explain this hack
> >                 r11_cx_ax[2] = regs->ip;
> >                 regs->ip = ip;
> >
> >                 err = copy_to_user((void __user*)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
> >                 WARN_ON_ONCE(err);
> >
> >                 // ensure sysret, see do_syscall_64()
> >                 regs->r11 = regs->flags;
> >                 regs->cx  = regs->ip;
> >
> >                 return regs->ax;
> >         }
> >
> > Oleg.
> >
Jiri Olsa March 21, 2024, 9:59 a.m. UTC | #12
On Wed, Mar 20, 2024 at 04:28:48PM +0100, Oleg Nesterov wrote:

SNIP

> 	SYSCALL_DEFINE0(uretprobe)
> 	{
> 		struct pt_regs *regs = task_pt_regs(current);
> 		unsigned long err, ip, sp, r11_cx_ax[3];
> 
> 		err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
> 		WARN_ON_ONCE(err);
> 
> 		// Q1: apart from ax, do we really care?
> 		// expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
> 		regs->r11 = r11_cx_ax[0];
> 		regs->cx  = r11_cx_ax[1];
> 		regs->ax  = r11_cx_ax[2];
> 		regs->sp += sizeof(r11_cx_ax);
> 		regs->orig_ax = -1;
> 
> 		ip = regs->ip;
> 		sp = regs->sp;
> 
> 		uprobe_handle_trampoline(regs);
> 
> 		// Q2: is it possible? do we care?
> 		// uprobe_consumer has changed sp, we can do nothing,
> 		// just return via iret.
> 		if (regs->sp != sp)
> 			return regs->ax;
> 		regs->sp -= sizeof(r11_cx_ax);
> 
> 		// Q3: is it possible? do we care?
> 		// for the case uprobe_consumer has changed r11/cx
> 		r11_cx_ax[0] = regs->r11;
> 		r11_cx_ax[1] = regs->cx;

I wonder we could add test for this as well, and check we return
proper register values in case the consuer changed them, will check

> 
> 		// comment to explain this hack
> 		r11_cx_ax[2] = regs->ip;
> 		regs->ip = ip;

we still need restore regs->ip in case do_syscall_64 decides to do
iret for some reason, right?

overall lgtm, thanks

jirka

> 
> 		err = copy_to_user((void __user*)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
> 		WARN_ON_ONCE(err);
> 
> 		// ensure sysret, see do_syscall_64()
> 		regs->r11 = regs->flags;
> 		regs->cx  = regs->ip;
> 
> 		return regs->ax;
> 	}
> 
> Oleg.
>
Oleg Nesterov March 21, 2024, 10:10 a.m. UTC | #13
On 03/20, Jiri Olsa wrote:
>
> On Wed, Mar 20, 2024 at 10:44:30AM -0700, Andrii Nakryiko wrote:
> > On Wed, Mar 20, 2024 at 8:30 AM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > > I have no idea what uprobe consumers / bpf programs can do, so let me ask:
> > > >
> > > >       - uprobe_consumer's will see the "wrong" values of regs->cx/r11/sp
> > > >         Is it OK? If not - easy to fix.
> > > >
> > > >       - can uprobe_consumer change regs->cx/r11 ? If yes - easy to fix.
> > > >
> > > >       - can uprobe_consumer change regs->sp ? If yes - easy to fix too,
> > > >         but needs a separate check/code.
> > >
> > > IOW. If answer is "yes" to all the questions above, then we probably need
> > > something like
> >
> > yes to first, so ideally we fix registers to "correct" values
> > (especially sp), but no to the last two (at least as far as BPF is
> > concerned)
>
> I think we should keep the same behaviour as it was for the trap,
> so I think we should restore all registers and allow consumer to change it

OK, agreed. Then something like the code below.

Oleg.

> > >         SYSCALL_DEFINE0(uretprobe)
> > >         {
> > >                 struct pt_regs *regs = task_pt_regs(current);
> > >                 unsigned long err, ip, sp, r11_cx_ax[3];
> > >
> > >                 err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
> > >                 WARN_ON_ONCE(err);
> > >
> > >                 // Q1: apart from ax, do we really care?
> > >                 // expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
> > >                 regs->r11 = r11_cx_ax[0];
> > >                 regs->cx  = r11_cx_ax[1];
> > >                 regs->ax  = r11_cx_ax[2];
> > >                 regs->sp += sizeof(r11_cx_ax);
> > >                 regs->orig_ax = -1;
> > >
> > >                 ip = regs->ip;
> > >                 sp = regs->sp;
> > >
> > >                 uprobe_handle_trampoline(regs);
> > >
> > >                 // Q2: is it possible? do we care?
> > >                 // uprobe_consumer has changed sp, we can do nothing,
> > >                 // just return via iret.
> > >                 if (regs->sp != sp)
> > >                         return regs->ax;
> > >                 regs->sp -= sizeof(r11_cx_ax);
> > >
> > >                 // Q3: is it possible? do we care?
> > >                 // for the case uprobe_consumer has changed r11/cx
> > >                 r11_cx_ax[0] = regs->r11;
> > >                 r11_cx_ax[1] = regs->cx;
> > >
> > >                 // comment to explain this hack
> > >                 r11_cx_ax[2] = regs->ip;
> > >                 regs->ip = ip;
> > >
> > >                 err = copy_to_user((void __user*)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
> > >                 WARN_ON_ONCE(err);
> > >
> > >                 // ensure sysret, see do_syscall_64()
> > >                 regs->r11 = regs->flags;
> > >                 regs->cx  = regs->ip;
> > >
> > >                 return regs->ax;
> > >         }
> > >
> > > Oleg.
> > >
>
Oleg Nesterov March 21, 2024, 10:17 a.m. UTC | #14
On 03/21, Jiri Olsa wrote:
>
> On Wed, Mar 20, 2024 at 04:28:48PM +0100, Oleg Nesterov wrote:
>
> SNIP
>
> > 	SYSCALL_DEFINE0(uretprobe)
> > 	{
> > 		struct pt_regs *regs = task_pt_regs(current);
> > 		unsigned long err, ip, sp, r11_cx_ax[3];
> >
> > 		err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
> > 		WARN_ON_ONCE(err);
> >
> > 		// Q1: apart from ax, do we really care?
> > 		// expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
> > 		regs->r11 = r11_cx_ax[0];
> > 		regs->cx  = r11_cx_ax[1];
> > 		regs->ax  = r11_cx_ax[2];
> > 		regs->sp += sizeof(r11_cx_ax);
> > 		regs->orig_ax = -1;
> >
> > 		ip = regs->ip;
> > 		sp = regs->sp;
> >
> > 		uprobe_handle_trampoline(regs);
> >
> > 		// Q2: is it possible? do we care?
> > 		// uprobe_consumer has changed sp, we can do nothing,
> > 		// just return via iret.
> > 		if (regs->sp != sp)
> > 			return regs->ax;
> > 		regs->sp -= sizeof(r11_cx_ax);
> >
> > 		// Q3: is it possible? do we care?
> > 		// for the case uprobe_consumer has changed r11/cx
> > 		r11_cx_ax[0] = regs->r11;
> > 		r11_cx_ax[1] = regs->cx;
>
> I wonder we could add test for this as well, and check we return
> proper register values in case the consuer changed them, will check
>
> >
> > 		// comment to explain this hack
> > 		r11_cx_ax[2] = regs->ip;
> > 		regs->ip = ip;
>
> we still need restore regs->ip in case do_syscall_64 decides to do
> iret for some reason, right?

I don't understand... could you spell?

AFAICS everything should work correctly even if do_syscall_64() returns F
and entry_SYSCALL_64() returns via iret. No?

> overall lgtm, thanks

OK, great, feel free to update this code according to your preferences and
use it in V2.

Oleg.
Jiri Olsa March 21, 2024, 10:52 a.m. UTC | #15
On Thu, Mar 21, 2024 at 11:17:51AM +0100, Oleg Nesterov wrote:
> On 03/21, Jiri Olsa wrote:
> >
> > On Wed, Mar 20, 2024 at 04:28:48PM +0100, Oleg Nesterov wrote:
> >
> > SNIP
> >
> > > 	SYSCALL_DEFINE0(uretprobe)
> > > 	{
> > > 		struct pt_regs *regs = task_pt_regs(current);
> > > 		unsigned long err, ip, sp, r11_cx_ax[3];
> > >
> > > 		err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
> > > 		WARN_ON_ONCE(err);
> > >
> > > 		// Q1: apart from ax, do we really care?
> > > 		// expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
> > > 		regs->r11 = r11_cx_ax[0];
> > > 		regs->cx  = r11_cx_ax[1];
> > > 		regs->ax  = r11_cx_ax[2];
> > > 		regs->sp += sizeof(r11_cx_ax);
> > > 		regs->orig_ax = -1;
> > >
> > > 		ip = regs->ip;
> > > 		sp = regs->sp;
> > >
> > > 		uprobe_handle_trampoline(regs);
> > >
> > > 		// Q2: is it possible? do we care?
> > > 		// uprobe_consumer has changed sp, we can do nothing,
> > > 		// just return via iret.
> > > 		if (regs->sp != sp)
> > > 			return regs->ax;
> > > 		regs->sp -= sizeof(r11_cx_ax);
> > >
> > > 		// Q3: is it possible? do we care?
> > > 		// for the case uprobe_consumer has changed r11/cx
> > > 		r11_cx_ax[0] = regs->r11;
> > > 		r11_cx_ax[1] = regs->cx;
> >
> > I wonder we could add test for this as well, and check we return
> > proper register values in case the consuer changed them, will check
> >
> > >
> > > 		// comment to explain this hack
> > > 		r11_cx_ax[2] = regs->ip;
> > > 		regs->ip = ip;
> >
> > we still need restore regs->ip in case do_syscall_64 decides to do
> > iret for some reason, right?
> 
> I don't understand... could you spell?

I was wondering why to restore regs->ip for sysret path, but do_syscall_64
can decide to do iret return (for which we need proper regs->ip) even if we
prepare cx/r11 registers for sysexit

>
> AFAICS everything should work correctly even if do_syscall_64() returns F
> and entry_SYSCALL_64() returns via iret. No?
> 
> > overall lgtm, thanks
> 
> OK, great, feel free to update this code according to your preferences and
> use it in V2.

will do, thanks

jirka
Oleg Nesterov March 21, 2024, 12:14 p.m. UTC | #16
On 03/21, Jiri Olsa wrote:
>
> On Thu, Mar 21, 2024 at 11:17:51AM +0100, Oleg Nesterov wrote:
> > On 03/21, Jiri Olsa wrote:
> > >
> > > On Wed, Mar 20, 2024 at 04:28:48PM +0100, Oleg Nesterov wrote:
> > >
> > > SNIP
> > >
> > > > 	SYSCALL_DEFINE0(uretprobe)
> > > > 	{
> > > > 		struct pt_regs *regs = task_pt_regs(current);
> > > > 		unsigned long err, ip, sp, r11_cx_ax[3];
> > > >
> > > > 		err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
> > > > 		WARN_ON_ONCE(err);
> > > >
> > > > 		// Q1: apart from ax, do we really care?
> > > > 		// expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
> > > > 		regs->r11 = r11_cx_ax[0];
> > > > 		regs->cx  = r11_cx_ax[1];
> > > > 		regs->ax  = r11_cx_ax[2];
> > > > 		regs->sp += sizeof(r11_cx_ax);
> > > > 		regs->orig_ax = -1;
> > > >
> > > > 		ip = regs->ip;
> > > > 		sp = regs->sp;
> > > >
> > > > 		uprobe_handle_trampoline(regs);
> > > >
> > > > 		// Q2: is it possible? do we care?
> > > > 		// uprobe_consumer has changed sp, we can do nothing,
> > > > 		// just return via iret.
> > > > 		if (regs->sp != sp)
> > > > 			return regs->ax;
> > > > 		regs->sp -= sizeof(r11_cx_ax);
> > > >
> > > > 		// Q3: is it possible? do we care?
> > > > 		// for the case uprobe_consumer has changed r11/cx
> > > > 		r11_cx_ax[0] = regs->r11;
> > > > 		r11_cx_ax[1] = regs->cx;
> > >
> > > I wonder we could add test for this as well, and check we return
> > > proper register values in case the consuer changed them, will check
> > >
> > > >
> > > > 		// comment to explain this hack
> > > > 		r11_cx_ax[2] = regs->ip;
> > > > 		regs->ip = ip;
> > >
> > > we still need restore regs->ip in case do_syscall_64 decides to do
> > > iret for some reason, right?
> >
> > I don't understand... could you spell?
>
> I was wondering why to restore regs->ip for sysret path, but do_syscall_64
> can decide to do iret return (for which we need proper regs->ip) even if we
> prepare cx/r11 registers for sysexit

Still don't understand... Yes, we prepare cx/r11 to avoid iret if possible.
But (apart from performance) we do not care if do_syscall_64() picks iret.
Either way

			regs->ip = ip;

above ensures that usermode returns to uretprobe_syscall_entry right after
the syscall insn. Then popq %r11/cx will restore r11/cx even if they were
changed by uprobe_consumer's. And then "retq" will return to the address
"returned" by handle_trampoline(regs) because we do

			// comment to explain this hack
			r11_cx_ax[2] = regs->ip;

after handle_trampoline(). This all doesn't depend on iret-or-sysret.

OK, I am sure you understand this, so I guess I misunderstood your concerns.

Oleg.
Jiri Olsa March 21, 2024, 8:29 p.m. UTC | #17
On Thu, Mar 21, 2024 at 01:14:56PM +0100, Oleg Nesterov wrote:
> On 03/21, Jiri Olsa wrote:
> >
> > On Thu, Mar 21, 2024 at 11:17:51AM +0100, Oleg Nesterov wrote:
> > > On 03/21, Jiri Olsa wrote:
> > > >
> > > > On Wed, Mar 20, 2024 at 04:28:48PM +0100, Oleg Nesterov wrote:
> > > >
> > > > SNIP
> > > >
> > > > > 	SYSCALL_DEFINE0(uretprobe)
> > > > > 	{
> > > > > 		struct pt_regs *regs = task_pt_regs(current);
> > > > > 		unsigned long err, ip, sp, r11_cx_ax[3];
> > > > >
> > > > > 		err = copy_from_user(r11_cx_ax, (void __user*)regs->sp, sizeof(r11_cx_ax));
> > > > > 		WARN_ON_ONCE(err);
> > > > >
> > > > > 		// Q1: apart from ax, do we really care?
> > > > > 		// expose the "right" values of r11/cx/ax/sp to uprobe_consumer's
> > > > > 		regs->r11 = r11_cx_ax[0];
> > > > > 		regs->cx  = r11_cx_ax[1];
> > > > > 		regs->ax  = r11_cx_ax[2];
> > > > > 		regs->sp += sizeof(r11_cx_ax);
> > > > > 		regs->orig_ax = -1;
> > > > >
> > > > > 		ip = regs->ip;
> > > > > 		sp = regs->sp;
> > > > >
> > > > > 		uprobe_handle_trampoline(regs);
> > > > >
> > > > > 		// Q2: is it possible? do we care?
> > > > > 		// uprobe_consumer has changed sp, we can do nothing,
> > > > > 		// just return via iret.
> > > > > 		if (regs->sp != sp)
> > > > > 			return regs->ax;
> > > > > 		regs->sp -= sizeof(r11_cx_ax);
> > > > >
> > > > > 		// Q3: is it possible? do we care?
> > > > > 		// for the case uprobe_consumer has changed r11/cx
> > > > > 		r11_cx_ax[0] = regs->r11;
> > > > > 		r11_cx_ax[1] = regs->cx;
> > > >
> > > > I wonder we could add test for this as well, and check we return
> > > > proper register values in case the consuer changed them, will check
> > > >
> > > > >
> > > > > 		// comment to explain this hack
> > > > > 		r11_cx_ax[2] = regs->ip;
> > > > > 		regs->ip = ip;
> > > >
> > > > we still need restore regs->ip in case do_syscall_64 decides to do
> > > > iret for some reason, right?
> > >
> > > I don't understand... could you spell?
> >
> > I was wondering why to restore regs->ip for sysret path, but do_syscall_64
> > can decide to do iret return (for which we need proper regs->ip) even if we
> > prepare cx/r11 registers for sysexit
> 
> Still don't understand... Yes, we prepare cx/r11 to avoid iret if possible.
> But (apart from performance) we do not care if do_syscall_64() picks iret.
> Either way
> 
> 			regs->ip = ip;
> 
> above ensures that usermode returns to uretprobe_syscall_entry right after
> the syscall insn. 

hm, I think above ensures that do_syscall_64 will skip the 'regs->cx != regs->ip'
check.. and after the sysret returns to rcx register value and ignores regs->ip

but in any case we need to set it

> ... Then popq %r11/cx will restore r11/cx even if they were
> changed by uprobe_consumer's. And then "retq" will return to the address
> "returned" by handle_trampoline(regs) because we do
> 
> 			// comment to explain this hack
> 			r11_cx_ax[2] = regs->ip;
> 
> after handle_trampoline(). This all doesn't depend on iret-or-sysret.
> 
> OK, I am sure you understand this, so I guess I misunderstood your concerns.

thanks for the patience ;-)

jirka
Oleg Nesterov March 22, 2024, 8:48 a.m. UTC | #18
On 03/21, Jiri Olsa wrote:
>
> On Thu, Mar 21, 2024 at 01:14:56PM +0100, Oleg Nesterov wrote:
> > Either way
> >
> > 			regs->ip = ip;
> >
> > above ensures that usermode returns to uretprobe_syscall_entry right after
> > the syscall insn.
>
> hm, I think above ensures that do_syscall_64 will skip the 'regs->cx != regs->ip'
> check.. and after the sysret returns to rcx register value and ignores regs->ip
                                                                 ^^^^^^^^^^^^^^^^

Yes, that is why do_syscall_64() returns F if regs->cx != regs->ip.

IOW. To oversimplify, the logic is

	// %rcx == ret ip
	entry_SYSCALL_64:

		pt_regs->ip = %rcx;
		pt_regs->cx = %rcx;

		do-syscall;

		%rcx = pt_regs->cx; // POP_REGS

		if (%rcx == pt_regs->ip) {
			// OK, we can use sysret which returns to rcx
			sysret;
		} else {
			// debugger/whatever changed rcx or ip, can't use sysret.
			// return to pt_regs->ip, see the "Return frame for iretq"
			// comment in struct pt_regs.
			iret;
		}

So return-to-usermode always returns to regs->ip and restores all the registers
from pt_regs, just it can be faster if we can use sysret.

> but in any case we need to set it

Yes, yes, sure.

Oleg.
diff mbox series

Patch

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 069371e86180..b99f1d80a8c8 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -319,6 +319,9 @@  asm (
 	"pushq %r11\n"
 	"movq $462, %rax\n"
 	"syscall\n"
+	"popq %r11\n"
+	"popq %rcx\n"
+	"retq\n"
 	".global uretprobe_syscall_end\n"
 	"uretprobe_syscall_end:\n"
 	".popsection\n"
@@ -336,23 +339,20 @@  void *arch_uprobe_trampoline(unsigned long *psize)
 SYSCALL_DEFINE0(uretprobe)
 {
 	struct pt_regs *regs = task_pt_regs(current);
-	unsigned long sregs[3], err;
+	unsigned long __user *ax_and_ret = (unsigned long __user *)regs->sp + 2;
+	unsigned long ip, err;
 
-	/*
-	 * We set rax and syscall itself changes rcx and r11, so the syscall
-	 * trampoline saves their original values on stack. We need to read
-	 * them and set original register values and fix the rsp pointer back.
-	 */
-	err = copy_from_user((void *) &sregs, (void *) regs->sp, sizeof(sregs));
-	WARN_ON_ONCE(err);
-
-	regs->r11 = sregs[0];
-	regs->cx = sregs[1];
-	regs->ax = sregs[2];
+	ip = regs->ip;
 	regs->orig_ax = -1;
-	regs->sp += sizeof(sregs);
+	err = get_user(regs->ax, ax_and_ret);
+	WARN_ON_ONCE(err);
 
 	uprobe_handle_trampoline(regs);
+
+	err = put_user(regs->ip, ax_and_ret);
+	WARN_ON_ONCE(err);
+	regs->ip = ip;
+
 	return regs->ax;
 }