
[bpf-next,v4,2/3] bpf,x64: use shrx/sarx/shlx when available

Message ID 20221002051143.831029-3-jmeng@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf,x64: Use BMI2 for shifts

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 18 maintainers not CCed: sdf@google.com john.fastabend@gmail.com dsahern@kernel.org davem@davemloft.net tglx@linutronix.de mingo@redhat.com martin.lau@linux.dev yhs@fb.com haoluo@google.com netdev@vger.kernel.org jolsa@kernel.org kpsingh@kernel.org song@kernel.org yoshfuji@linux-ipv6.org hpa@zytor.com x86@kernel.org dave.hansen@linux.intel.com bp@alien8.de
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2 this patch: 2
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-4 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_progs on s390x with gcc

Commit Message

Jie Meng Oct. 2, 2022, 5:11 a.m. UTC
Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
alternatives provided in BMI2 when advantageous; keep using the non BMI2
instructions when shift count is already in BPF_REG_4/rcx as non BMI2
instructions are shorter.

To summarize, when BMI2 is available:
-------------------------------------------------
            |   arbitrary dst
=================================================
src == ecx  |   shl dst, cl
-------------------------------------------------
src != ecx  |   shlx dst, dst, src
-------------------------------------------------

A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
%rdi:

Without BMI2:

 ef3:   push   %rcx
        51
 ef4:   mov    %rdi,%rcx
        48 89 f9
 ef7:   shl    %cl,%rsi
        48 d3 e6
 efa:   pop    %rcx
        59

With BMI2:

 f0b:   shlx   %rdi,%rsi,%rsi
        c4 e2 c1 f7 f6

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

Comments

KP Singh Oct. 6, 2022, 4:11 a.m. UTC | #1
On Sat, Oct 1, 2022 at 10:12 PM Jie Meng <jmeng@fb.com> wrote:
>
> Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> alternatives provided in BMI2 when advantageous; keep using the non BMI2
> instructions when shift count is already in BPF_REG_4/rcx as non BMI2
> instructions are shorter.

This is confusing: you mention %CL in the first sentence and then RCX in the
second sentence. Can you clarify this more here?

Also, it would be good to have some explanation of the performance
benefits here as well.

i.e. a load + store + non-vector instruction vs. a single vector instruction
omitting the load. How many cycles do we expect in each case? I expect the
latter to be fewer, but mentioning it in the commit removes any ambiguity.

>
> To summarize, when BMI2 is available:
> -------------------------------------------------
>             |   arbitrary dst
> =================================================
> src == ecx  |   shl dst, cl
> -------------------------------------------------
> src != ecx  |   shlx dst, dst, src
> -------------------------------------------------
>
> A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
> %rdi:
>
> Without BMI2:
>
>  ef3:   push   %rcx
>         51
>  ef4:   mov    %rdi,%rcx
>         48 89 f9
>  ef7:   shl    %cl,%rsi
>         48 d3 e6
>  efa:   pop    %rcx
>         59
>
> With BMI2:
>
>  f0b:   shlx   %rdi,%rsi,%rsi
>         c4 e2 c1 f7 f6
>
> Signed-off-by: Jie Meng <jmeng@fb.com>
> ---
>  arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 64 insertions(+)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index d9ba997c5891..d09c54f3d2e0 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
>         *pprog = prog;
>  }
>
> +/* emit the 3-byte VEX prefix */
> +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> +                     bool w, u8 src_reg2, bool l, u8 p)

Can you please use somewhat more descriptive variable names here?

or add more information about what x, b, m, w, l and p mean?

> +{
> +       u8 *prog = *pprog;
> +       u8 b0 = 0xc4, b1, b2;
> +       u8 src2 = reg2hex[src_reg2];
> +
> +       if (is_ereg(src_reg2))
> +               src2 |= 1 << 3;
> +
> +       /*
> +        *    7                           0
> +        *  +---+---+---+---+---+---+---+---+
> +        *  |~R |~X |~B |         m         |
> +        *  +---+---+---+---+---+---+---+---+
> +        */
> +       b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);

Some explanation here would help, not everyone is aware of x86 vex encoding :)

> +       /*
> +        *    7                           0
> +        *  +---+---+---+---+---+---+---+---+
> +        *  | W |     ~vvvv     | L |   pp  |
> +        *  +---+---+---+---+---+---+---+---+
> +        */
> +       b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> +
> +       EMIT3(b0, b1, b2);
> +       *pprog = prog;
> +}
> +
> +/* emit BMI2 shift instruction */
> +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
> +{
> +       u8 *prog = *pprog;
> +       bool r = is_ereg(dst_reg);
> +       u8 m = 2; /* escape code 0f38 */
> +
> +       emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
> +       EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> +       *pprog = prog;
> +}
> +
>  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
>
>  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> @@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>                 case BPF_ALU64 | BPF_LSH | BPF_X:
>                 case BPF_ALU64 | BPF_RSH | BPF_X:
>                 case BPF_ALU64 | BPF_ARSH | BPF_X:
> +                       /* BMI2 shifts aren't better when shift count is already in rcx */
> +                       if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> +                               /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> +                               bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> +                               u8 op;
> +
> +                               switch (BPF_OP(insn->code)) {
> +                               case BPF_LSH:
> +                                       op = 1; /* prefix 0x66 */
> +                                       break;
> +                               case BPF_RSH:
> +                                       op = 3; /* prefix 0xf2 */
> +                                       break;
> +                               case BPF_ARSH:
> +                                       op = 2; /* prefix 0xf3 */
> +                                       break;
> +                               }
> +
> +                               emit_shiftx(&prog, dst_reg, src_reg, w, op);
> +
> +                               break;
> +                       }
>
>                         if (src_reg != BPF_REG_4) { /* common case */
>                                 /* Check for bad case when dst_reg == rcx */
> --
> 2.30.2
>
Jie Meng Oct. 7, 2022, 3:14 a.m. UTC | #2
On Wed, Oct 05, 2022 at 09:11:01PM -0700, KP Singh wrote:
> On Sat, Oct 1, 2022 at 10:12 PM Jie Meng <jmeng@fb.com> wrote:
> >
> > Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> > alternatives provided in BMI2 when advantageous; keep using the non BMI2
> > instructions when shift count is already in BPF_REG_4/rcx as non BMI2
> > instructions are shorter.
> 
> This is confusing, you mention %CL in the first sentence and then RCX in the
> second sentence. Can you clarify this more here?

%cl is the lowest 8 bits of %rcx. In assembly, a non-BMI2 shift with the
shift count in a register is written as

 SHR eax, cl

The use of CL is mandatory, so if the shift count is in another register
it has to be moved into RCX first, unless of course the shift count is
already in BPF_REG_4, which is mapped to RCX in x86-64.

It is indeed awkward, but it is exactly what one would see in assembly:
a MOV to RCX and a shift that uses CL as the count register.
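
As a size comparison, using only the encodings already shown in the commit
message: with the count already in rcx, the legacy form

 shl    %cl,%rsi          # 48 d3 e6, 3 bytes, no extra moves needed

is shorter than any shlx, which always takes 5 bytes (3-byte VEX prefix +
opcode + ModRM, as in the c4 e2 c1 f7 f6 example above), so the legacy
encoding is kept for that case.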

> 
> Also, It would be good to have some explanations about the
> performance benefits here as well.
> 
> i.e. a load + store + non vector instruction v/s a single vector instruction
> omitting the load. How many cycles do we expect in each case, I do expect the
> latter to be lesser, but mentioning it in the commit removes any ambiguity.

Although it uses a similar encoding to AVX instructions, BMI2 actually
operates on general-purpose registers and no vector register is ever
involved [1]. Inside a CPU, all shift instructions (both baseline and BMI2
flavors) are almost always handled by the same units and have the same
latency and throughput [2].

[1] https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
[2] https://www.agner.org/optimize/instruction_tables.pdf
> 
> >
> > To summarize, when BMI2 is available:
> > -------------------------------------------------
> >             |   arbitrary dst
> > =================================================
> > src == ecx  |   shl dst, cl
> > -------------------------------------------------
> > src != ecx  |   shlx dst, dst, src
> > -------------------------------------------------
> >
> > A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
> > %rdi:
> >
> > Without BMI2:
> >
> >  ef3:   push   %rcx
> >         51
> >  ef4:   mov    %rdi,%rcx
> >         48 89 f9
> >  ef7:   shl    %cl,%rsi
> >         48 d3 e6
> >  efa:   pop    %rcx
> >         59
> >
> > With BMI2:
> >
> >  f0b:   shlx   %rdi,%rsi,%rsi
> >         c4 e2 c1 f7 f6
> >
> > Signed-off-by: Jie Meng <jmeng@fb.com>
> > ---
> >  arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
> >  1 file changed, 64 insertions(+)
> >
> > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> > index d9ba997c5891..d09c54f3d2e0 100644
> > --- a/arch/x86/net/bpf_jit_comp.c
> > +++ b/arch/x86/net/bpf_jit_comp.c
> > @@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
> >         *pprog = prog;
> >  }
> >
> > +/* emit the 3-byte VEX prefix */
> > +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> > +                     bool w, u8 src_reg2, bool l, u8 p)
> 
> Can you please use somewhat more descriptive variable names here?
> 
> or add more information about what x, b, m, w, l and p mean?

Apart from src_reg2, the names are the same as what Intel has chosen to
call the various fields in the VEX prefix. I'd rather keep them
consistent so that people won't get confused when comparing with other
documents across the Internet.
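
For reference, these are the standard 3-byte VEX prefix fields as defined
in the Intel SDM; roughly:

 R, X, B  extra (4th) bit for the ModRM.reg, SIB.index and ModRM.rm/SIB.base
          register numbers; stored inverted (hence ~R/~X/~B)
 m        opcode map selector (2 = the 0f 38 map used by these instructions)
 W        operand-size bit (set for the 64-bit forms)
 vvvv     an additional source register operand, also stored inverted
 L        vector length (must be 0 for BMI2 instructions)
 pp       implied legacy prefix (1 = 0x66, 2 = 0xf3, 3 = 0xf2)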
> 
> > +{
> > +       u8 *prog = *pprog;
> > +       u8 b0 = 0xc4, b1, b2;
> > +       u8 src2 = reg2hex[src_reg2];
> > +
> > +       if (is_ereg(src_reg2))
> > +               src2 |= 1 << 3;
> > +
> > +       /*
> > +        *    7                           0
> > +        *  +---+---+---+---+---+---+---+---+
> > +        *  |~R |~X |~B |         m         |
> > +        *  +---+---+---+---+---+---+---+---+
> > +        */
> > +       b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
> 
> Some explanation here would help, not everyone is aware of x86 vex encoding :)

The comment is the exact rule for how the different pieces of information
are encoded into the 3-byte VEX prefix, i.e. their position and length, and
whether a field needs to be bit-inverted. Combined with the code, the
comment should give a clear idea of what the intent is here.
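
As a worked example, the shlx bytes from the commit message
(c4 e2 c1 f7 f6 for "shlx %rdi,%rsi,%rsi") decode as:

 c4                3-byte VEX escape
 e2 = 1110 0010    ~R=1 ~X=1 ~B=1 (no extended registers), m = 2 (0f 38 map)
 c1 = 1100 0001    W=1 (64-bit), ~vvvv = 1000 -> vvvv = 7 (%rdi, the count),
                   L=0, pp=1 (0x66 prefix, i.e. shlx)
 f7                opcode
 f6 = 11 110 110   ModRM: mod=3, reg=%rsi (dst), rm=%rsi (src)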
> 
> > +       /*
> > +        *    7                           0
> > +        *  +---+---+---+---+---+---+---+---+
> > +        *  | W |     ~vvvv     | L |   pp  |
> > +        *  +---+---+---+---+---+---+---+---+
> > +        */
> > +       b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> > +
> > +       EMIT3(b0, b1, b2);
> > +       *pprog = prog;
> > +}
> > +
> > +/* emit BMI2 shift instruction */
> > +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
> > +{
> > +       u8 *prog = *pprog;
> > +       bool r = is_ereg(dst_reg);
> > +       u8 m = 2; /* escape code 0f38 */
> > +
> > +       emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
> > +       EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> > +       *pprog = prog;
> > +}
> > +
> >  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
> >
> >  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> > @@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
> >                 case BPF_ALU64 | BPF_LSH | BPF_X:
> >                 case BPF_ALU64 | BPF_RSH | BPF_X:
> >                 case BPF_ALU64 | BPF_ARSH | BPF_X:
> > +                       /* BMI2 shifts aren't better when shift count is already in rcx */
> > +                       if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> > +                               /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> > +                               bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> > +                               u8 op;
> > +
> > +                               switch (BPF_OP(insn->code)) {
> > +                               case BPF_LSH:
> > +                                       op = 1; /* prefix 0x66 */
> > +                                       break;
> > +                               case BPF_RSH:
> > +                                       op = 3; /* prefix 0xf2 */
> > +                                       break;
> > +                               case BPF_ARSH:
> > +                                       op = 2; /* prefix 0xf3 */
> > +                                       break;
> > +                               }
> > +
> > +                               emit_shiftx(&prog, dst_reg, src_reg, w, op);
> > +
> > +                               break;
> > +                       }
> >
> >                         if (src_reg != BPF_REG_4) { /* common case */
> >                                 /* Check for bad case when dst_reg == rcx */
> > --
> > 2.30.2
> >
KP Singh Oct. 7, 2022, 6:11 p.m. UTC | #3
On Thu, Oct 6, 2022 at 8:14 PM Jie Meng <jmeng@fb.com> wrote:
>
> On Wed, Oct 05, 2022 at 09:11:01PM -0700, KP Singh wrote:
> > On Sat, Oct 1, 2022 at 10:12 PM Jie Meng <jmeng@fb.com> wrote:
> > >
> > > Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> > > alternatives provided in BMI2 when advantageous; keep using the non BMI2
> > > instructions when shift count is already in BPF_REG_4/rcx as non BMI2
> > > instructions are shorter.
> >
> > This is confusing, you mention %CL in the first sentence and then RCX in the
> > second sentence. Can you clarify this more here?
>
> %cl is the lowest 8 bit of %rcx. In assembly, non BMI2 shifts with shift
> count in register is written as
>
>  SHR eax, cl
>
> Although the use of CL is mandatory and if the shift count is in another
> register it has to be moved into RCX first, unless of course when the
> shift count is already in BPF_REG_4, which is mapped to RCX in x86-64.
>
> It is indeed awkward but exactly what one would see in assembly: a MOV
> to RCX and a shift that uses CL as the source register.
>
> >
> > Also, It would be good to have some explanations about the
> > performance benefits here as well.
> >
> > i.e. a load + store + non vector instruction v/s a single vector instruction
> > omitting the load. How many cycles do we expect in each case, I do expect the
> > latter to be lesser, but mentioning it in the commit removes any ambiguity.
>
> Although it uses similar encoding as AVX instructions BMI2 actually
> operates on general purpose registers and no vector register is ever
> involved [1]. Inside a CPU all shifts instructions (both baseline and BMI2
> flavors) are almost always handled by the same units and have the same
> latency and throughput [2].
>
> [1] https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
> [2] https://www.agner.org/optimize/instruction_tables.pdf

Cool, please add this to the commit description.

> >
> > >
> > > To summarize, when BMI2 is available:
> > > -------------------------------------------------
> > >             |   arbitrary dst
> > > =================================================
> > > src == ecx  |   shl dst, cl
> > > -------------------------------------------------
> > > src != ecx  |   shlx dst, dst, src
> > > -------------------------------------------------
> > >
> > > A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
> > > %rdi:
> > >
> > > Without BMI2:
> > >
> > >  ef3:   push   %rcx
> > >         51
> > >  ef4:   mov    %rdi,%rcx
> > >         48 89 f9
> > >  ef7:   shl    %cl,%rsi
> > >         48 d3 e6
> > >  efa:   pop    %rcx
> > >         59
> > >
> > > With BMI2:
> > >
> > >  f0b:   shlx   %rdi,%rsi,%rsi
> > >         c4 e2 c1 f7 f6
> > >
> > > Signed-off-by: Jie Meng <jmeng@fb.com>
> > > ---
> > >  arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 64 insertions(+)
> > >
> > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> > > index d9ba997c5891..d09c54f3d2e0 100644
> > > --- a/arch/x86/net/bpf_jit_comp.c
> > > +++ b/arch/x86/net/bpf_jit_comp.c
> > > @@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
> > >         *pprog = prog;
> > >  }
> > >
> > > +/* emit the 3-byte VEX prefix */
> > > +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> > > +                     bool w, u8 src_reg2, bool l, u8 p)
> >
> > Can you please use somewhat more descriptive variable names here?
> >
> > or add more information about what x, b, m, w, l and p mean?
>
> Apart from src_reg2, the rest is the same as what Intel has chosen to
> name the various fields in the VEX prefix. Would rather keep them
> consistent so that people won't get confused when comparing with other
> documents across the Internet.

Sure, but it would be nice to have a comment about what they mean.

These bits allow indexing various kinds of registers, e.g.

"VEX.~R is an extra bit for indexing the ModRM register" or something
similar, if it's preferred not to change the variable names.


> >
> > > +{
> > > +       u8 *prog = *pprog;
> > > +       u8 b0 = 0xc4, b1, b2;
> > > +       u8 src2 = reg2hex[src_reg2];
> > > +
> > > +       if (is_ereg(src_reg2))
> > > +               src2 |= 1 << 3;
> > > +
> > > +       /*
> > > +        *    7                           0
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        *  |~R |~X |~B |         m         |
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        */
> > > +       b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
> >
> > Some explanation here would help, not everyone is aware of x86 vex encoding :)
>
> The comment is the exact rule how different pieces of information is
> encoded into the 3-byte VEX prefix i.e. their position and length, and
> whether a field needs to be bit inverted. Combined with code the comment
> should give one clear idea what the intent is here.

I am not sure this gives a clear picture; it assumes that the reader knows
about VEX encoding, which not everyone does. They can pull up a manual and
start reading, but that doesn't help much, so the comments need to
explain what's going on here.


> >
> > > +       /*
> > > +        *    7                           0
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        *  | W |     ~vvvv     | L |   pp  |
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        */
> > > +       b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);

By reading the code one should be able to understand what
b0, b1 and b2 are.


> > > +
> > > +       EMIT3(b0, b1, b2);
> > > +       *pprog = prog;
> > > +}
> > > +
> > > +/* emit BMI2 shift instruction */
> > > +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
> > > +{
> > > +       u8 *prog = *pprog;
> > > +       bool r = is_ereg(dst_reg);
> > > +       u8 m = 2; /* escape code 0f38 */
> > > +
> > > +       emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
> > > +       EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> > > +       *pprog = prog;
> > > +}
> > > +
> > >  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
> > >
> > >  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> > > @@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
> > >                 case BPF_ALU64 | BPF_LSH | BPF_X:
> > >                 case BPF_ALU64 | BPF_RSH | BPF_X:
> > >                 case BPF_ALU64 | BPF_ARSH | BPF_X:
> > > +                       /* BMI2 shifts aren't better when shift count is already in rcx */
> > > +                       if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> > > +                               /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> > > +                               bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> > > +                               u8 op;
> > > +
> > > +                               switch (BPF_OP(insn->code)) {
> > > +                               case BPF_LSH:
> > > +                                       op = 1; /* prefix 0x66 */
> > > +                                       break;
> > > +                               case BPF_RSH:
> > > +                                       op = 3; /* prefix 0xf2 */
> > > +                                       break;
> > > +                               case BPF_ARSH:
> > > +                                       op = 2; /* prefix 0xf3 */
> > > +                                       break;
> > > +                               }
> > > +
> > > +                               emit_shiftx(&prog, dst_reg, src_reg, w, op);
> > > +
> > > +                               break;
> > > +                       }
> > >
> > >                         if (src_reg != BPF_REG_4) { /* common case */
> > >                                 /* Check for bad case when dst_reg == rcx */
> > > --
> > > 2.30.2
> > >

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d9ba997c5891..d09c54f3d2e0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -889,6 +889,48 @@  static void emit_nops(u8 **pprog, int len)
 	*pprog = prog;
 }
 
+/* emit the 3-byte VEX prefix */
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+		      bool w, u8 src_reg2, bool l, u8 p)
+{
+	u8 *prog = *pprog;
+	u8 b0 = 0xc4, b1, b2;
+	u8 src2 = reg2hex[src_reg2];
+
+	if (is_ereg(src_reg2))
+		src2 |= 1 << 3;
+
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  |~R |~X |~B |         m         |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  | W |     ~vvvv     | L |   pp  |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
+
+	EMIT3(b0, b1, b2);
+	*pprog = prog;
+}
+
+/* emit BMI2 shift instruction */
+static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
+{
+	u8 *prog = *pprog;
+	bool r = is_ereg(dst_reg);
+	u8 m = 2; /* escape code 0f38 */
+
+	emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
+	EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
@@ -1135,6 +1177,28 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_LSH | BPF_X:
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			/* BMI2 shifts aren't better when shift count is already in rcx */
+			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
+				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+				u8 op;
+
+				switch (BPF_OP(insn->code)) {
+				case BPF_LSH:
+					op = 1; /* prefix 0x66 */
+					break;
+				case BPF_RSH:
+					op = 3; /* prefix 0xf2 */
+					break;
+				case BPF_ARSH:
+					op = 2; /* prefix 0xf3 */
+					break;
+				}
+
+				emit_shiftx(&prog, dst_reg, src_reg, w, op);
+
+				break;
+			}
 
 			if (src_reg != BPF_REG_4) { /* common case */
 				/* Check for bad case when dst_reg == rcx */
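
One way to see the effect on a BMI2-capable machine is to dump the JITed
image of a loaded program with bpftool, which produces listings like the
ones in the commit message (the program id below is a placeholder):

 bpftool prog dump jited id <PROG_ID> opcodes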