diff mbox series

[bpf-next,07/13] uprobes/x86: Add support to emulate nop5 instruction

Message ID 20241211133403.208920-8-jolsa@kernel.org (mailing list archive)
State New
Headers show
Series uprobes: Add support to optimize usdt probes on x86_64 | expand

Commit Message

Jiri Olsa Dec. 11, 2024, 1:33 p.m. UTC
Adding support to emulate nop5 as the original uprobe instruction.

This speeds up uprobes on top of nop5 instructions:
(results from benchs/run_bench_uprobes.sh)

current:

     uprobe-nop     :    3.252 ± 0.019M/s
     uprobe-push    :    3.097 ± 0.002M/s
     uprobe-ret     :    1.116 ± 0.001M/s
 --> uprobe-nop5    :    1.115 ± 0.001M/s
     uretprobe-nop  :    1.731 ± 0.016M/s
     uretprobe-push :    1.673 ± 0.023M/s
     uretprobe-ret  :    0.843 ± 0.009M/s
 --> uretprobe-nop5 :    1.124 ± 0.001M/s

after the change:

     uprobe-nop     :    3.281 ± 0.003M/s
     uprobe-push    :    3.085 ± 0.003M/s
     uprobe-ret     :    1.130 ± 0.000M/s
 --> uprobe-nop5    :    3.276 ± 0.007M/s
     uretprobe-nop  :    1.716 ± 0.016M/s
     uretprobe-push :    1.651 ± 0.017M/s
     uretprobe-ret  :    0.846 ± 0.006M/s
 --> uretprobe-nop5 :    3.279 ± 0.002M/s

Strangely, uretprobe-nop5 is now much faster than uretprobe-nop,
while the perf profiles for both are almost identical.
I'm still investigating this.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 7 +++++++
 1 file changed, 7 insertions(+)

Comments

Peter Zijlstra Dec. 13, 2024, 10:45 a.m. UTC | #1
On Wed, Dec 11, 2024 at 02:33:56PM +0100, Jiri Olsa wrote:
> Adding support to emulate nop5 as the original uprobe instruction.
> 
> This speeds up uprobes on top of nop5 instructions:
> (results from benchs/run_bench_uprobes.sh)
> 
> current:
> 
>      uprobe-nop     :    3.252 ± 0.019M/s
>      uprobe-push    :    3.097 ± 0.002M/s
>      uprobe-ret     :    1.116 ± 0.001M/s
>  --> uprobe-nop5    :    1.115 ± 0.001M/s
>      uretprobe-nop  :    1.731 ± 0.016M/s
>      uretprobe-push :    1.673 ± 0.023M/s
>      uretprobe-ret  :    0.843 ± 0.009M/s
>  --> uretprobe-nop5 :    1.124 ± 0.001M/s
> 
> after the change:
> 
>      uprobe-nop     :    3.281 ± 0.003M/s
>      uprobe-push    :    3.085 ± 0.003M/s
>      uprobe-ret     :    1.130 ± 0.000M/s
>  --> uprobe-nop5    :    3.276 ± 0.007M/s
>      uretprobe-nop  :    1.716 ± 0.016M/s
>      uretprobe-push :    1.651 ± 0.017M/s
>      uretprobe-ret  :    0.846 ± 0.006M/s
>  --> uretprobe-nop5 :    3.279 ± 0.002M/s
> 
> Strangely I can see uretprobe-nop5 is now much faster compared to
> uretprobe-nop, while perf profiles for both are almost identical.
> I'm still checking on that.
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/kernel/uprobes.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 23e4f2821cff..cdea97f8cd39 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -909,6 +909,11 @@ static const struct uprobe_xol_ops push_xol_ops = {
>  	.emulate  = push_emulate_op,
>  };
>  
> +static int is_nop5_insn(uprobe_opcode_t *insn)
> +{
> +	return !memcmp(insn, x86_nops[5], 5);
> +}
> +
>  /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
>  static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
>  {
> @@ -928,6 +933,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
>  		break;
>  
>  	case 0x0f:
> +		if (is_nop5_insn((uprobe_opcode_t *) &auprobe->insn))
> +			goto setup;

This isn't right: this is not x86_64-specific code, and there are a number
of 32-bit 5-byte NOPs that do not start with 0f.

Also, since you already have the insn decoded, I would suggest you
simply check OPCODE2(insn) == 0x1f /* NOPL */ and length == 5.

>  		if (insn->opcode.nbytes != 2)
>  			return -ENOSYS;
>  		/*
> -- 
> 2.47.0
>
Jiri Olsa Dec. 13, 2024, 1:02 p.m. UTC | #2
On Fri, Dec 13, 2024 at 11:45:36AM +0100, Peter Zijlstra wrote:
> On Wed, Dec 11, 2024 at 02:33:56PM +0100, Jiri Olsa wrote:
> > Adding support to emulate nop5 as the original uprobe instruction.
> > 
> > This speeds up uprobes on top of nop5 instructions:
> > (results from benchs/run_bench_uprobes.sh)
> > 
> > current:
> > 
> >      uprobe-nop     :    3.252 ± 0.019M/s
> >      uprobe-push    :    3.097 ± 0.002M/s
> >      uprobe-ret     :    1.116 ± 0.001M/s
> >  --> uprobe-nop5    :    1.115 ± 0.001M/s
> >      uretprobe-nop  :    1.731 ± 0.016M/s
> >      uretprobe-push :    1.673 ± 0.023M/s
> >      uretprobe-ret  :    0.843 ± 0.009M/s
> >  --> uretprobe-nop5 :    1.124 ± 0.001M/s
> > 
> > after the change:
> > 
> >      uprobe-nop     :    3.281 ± 0.003M/s
> >      uprobe-push    :    3.085 ± 0.003M/s
> >      uprobe-ret     :    1.130 ± 0.000M/s
> >  --> uprobe-nop5    :    3.276 ± 0.007M/s
> >      uretprobe-nop  :    1.716 ± 0.016M/s
> >      uretprobe-push :    1.651 ± 0.017M/s
> >      uretprobe-ret  :    0.846 ± 0.006M/s
> >  --> uretprobe-nop5 :    3.279 ± 0.002M/s
> > 
> > Strangely I can see uretprobe-nop5 is now much faster compared to
> > uretprobe-nop, while perf profiles for both are almost identical.
> > I'm still checking on that.
> > 
> > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > ---
> >  arch/x86/kernel/uprobes.c | 7 +++++++
> >  1 file changed, 7 insertions(+)
> > 
> > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > index 23e4f2821cff..cdea97f8cd39 100644
> > --- a/arch/x86/kernel/uprobes.c
> > +++ b/arch/x86/kernel/uprobes.c
> > @@ -909,6 +909,11 @@ static const struct uprobe_xol_ops push_xol_ops = {
> >  	.emulate  = push_emulate_op,
> >  };
> >  
> > +static int is_nop5_insn(uprobe_opcode_t *insn)
> > +{
> > +	return !memcmp(insn, x86_nops[5], 5);
> > +}
> > +
> >  /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
> >  static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
> >  {
> > @@ -928,6 +933,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
> >  		break;
> >  
> >  	case 0x0f:
> > +		if (is_nop5_insn((uprobe_opcode_t *) &auprobe->insn))
> > +			goto setup;
> 
> This isn't right, this is not x86_64 specific code, and there's a bunch
> of 32bit 5 byte nops that do not start with 0f.
> 
> Also, since you already have the insn decoded, I would suggest you
> simply check OPCODE2(insn) == 0x1f /* NOPL */ and length == 5.

ah right.. ok will change, thanks

jirka

> 
> >  		if (insn->opcode.nbytes != 2)
> >  			return -ENOSYS;
> >  		/*
> > -- 
> > 2.47.0
> >
diff mbox series

Patch

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 23e4f2821cff..cdea97f8cd39 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -909,6 +909,11 @@  static const struct uprobe_xol_ops push_xol_ops = {
 	.emulate  = push_emulate_op,
 };
 
+static int is_nop5_insn(uprobe_opcode_t *insn)
+{
+	return !memcmp(insn, x86_nops[5], 5);
+}
+
 /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
 static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 {
@@ -928,6 +933,8 @@  static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 		break;
 
 	case 0x0f:
+		if (is_nop5_insn((uprobe_opcode_t *) &auprobe->insn))
+			goto setup;
 		if (insn->opcode.nbytes != 2)
 			return -ENOSYS;
 		/*