
[v2,bpf-next,2/5] bpf: Introduce "volatile compare" macro

Message ID 20231221033854.38397-3-alexei.starovoitov@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: volatile compare

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers warning 16 maintainers not CCed: ndesaulniers@google.com sdf@google.com llvm@lists.linux.dev kpsingh@kernel.org jolsa@kernel.org shuah@kernel.org justinstitt@google.com yonghong.song@linux.dev linux-kselftest@vger.kernel.org nathan@kernel.org morbo@google.com davemarchevsky@fb.com haoluo@google.com martin.lau@linux.dev mykolal@fb.com song@kernel.org
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch fail ERROR: Macros with complex values should be enclosed in parentheses WARNING: Macros with flow control statements should be avoided WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: please, no spaces at the start of a line WARNING: space prohibited between function name and open parenthesis '('
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Alexei Starovoitov Dec. 21, 2023, 3:38 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Compilers optimize conditional operators at will, but often bpf programmers
want to force compilers to keep the same operator in asm as it's written in C.
Introduce bpf_cmp(var1, conditional_op, var2) macro that can be used as:

-               if (seen >= 1000)
+               if (bpf_cmp(seen, >=, 1000))

The macro takes advantage of BPF assembly that is C like.

The macro checks the sign of variable 'seen' and emits either
signed or unsigned compare.

For example:
int a;
bpf_cmp(a, >, 0) will be translated to 'if rX s> 0 goto' in BPF assembly.

unsigned int a;
bpf_cmp(a, >, 0) will be translated to 'if rX > 0 goto' in BPF assembly.
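
For illustration only, here is a minimal sketch of the asm-goto mechanism such a
macro relies on. This is not the implementation from the patch: the real bpf_cmp
additionally prepends "s" to the operator for signed LHS types and is more
careful with the RHS constraint (see the note below).

  #define bpf_cmp_sketch(LHS, OP, RHS)                                   \
  	({                                                               \
  		__label__ l_true;                                        \
  		int __ret = 1;                                           \
  		asm goto("if %[lhs] " #OP " %[rhs] goto %l[l_true]"      \
  			 :: [lhs] "r"(LHS), [rhs] "ri"(RHS) :: l_true);  \
  		__ret = 0;                                               \
  	l_true:                                                          \
  		__ret;                                                   \
  	})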

C type conversions coupled with comparison operator are tricky.
  int i = -1;
  unsigned int j = 1;
  if (i < j) // this is false.

  long i = -1;
  unsigned int j = 1;
  if (i < j) // this is true.

Make sure the BPF program is compiled with -Wsign-compare; then the macro will
catch the mistake.

The macro checks LHS (left hand side) only to figure out the sign of compare.

'if 0 < rX goto' is not allowed in the assembly, so the users
have to use a variable on LHS anyway.

The patch updates a few tests to demonstrate the use of the macro.

The macro also makes it possible to use BPF_JSET from C code, since LLVM doesn't
generate it at present. For example:

if (i & j) compiles into r0 &= r1; if r0 == 0 goto

while

if (bpf_cmp(i, &, j)) compiles into if r0 & r1 goto

Note that the macro has to be careful with RHS assembly predicate.
Since:
u64 __rhs = 1ull << 42;
asm goto("if r0 < %[rhs] goto +1" :: [rhs] "ri" (__rhs));
LLVM will silently truncate 64-bit constant into s32 imm.
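
(As an illustrative aside, not part of the patch: one way to sidestep that
truncation is to drop the "i" alternative and force the constant into a
register, e.g.

  u64 __rhs = 1ull << 42;
  asm volatile("if r0 < %[rhs] goto +1" :: [rhs] "r" (__rhs));

so the full 64-bit value is compared.)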

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/bpf_experimental.h  | 44 +++++++++++++++++++
 .../testing/selftests/bpf/progs/exceptions.c  | 20 ++++-----
 .../selftests/bpf/progs/iters_task_vma.c      |  3 +-
 3 files changed, 55 insertions(+), 12 deletions(-)

Comments

Kumar Kartikeya Dwivedi Dec. 21, 2023, 4:27 a.m. UTC | #1
On Thu, 21 Dec 2023 at 04:39, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Compilers optimize conditional operators at will, but often bpf programmers
> want to force compilers to keep the same operator in asm as it's written in C.
> Introduce bpf_cmp(var1, conditional_op, var2) macro that can be used as:
>
> -               if (seen >= 1000)
> +               if (bpf_cmp(seen, >=, 1000))
>
> The macro takes advantage of BPF assembly that is C like.
>
> The macro checks the sign of variable 'seen' and emits either
> signed or unsigned compare.
>
> For example:
> int a;
> bpf_cmp(a, >, 0) will be translated to 'if rX s> 0 goto' in BPF assembly.
>
> unsigned int a;
> bpf_cmp(a, >, 0) will be translated to 'if rX > 0 goto' in BPF assembly.
>
> C type conversions coupled with comparison operator are tricky.
>   int i = -1;
>   unsigned int j = 1;
>   if (i < j) // this is false.
>
>   long i = -1;
>   unsigned int j = 1;
>   if (i < j) // this is true.
>
> Make sure BPF program is compiled with -Wsign-compare then the macro will catch
> the mistake.
>
> The macro checks LHS (left hand side) only to figure out the sign of compare.
>
> 'if 0 < rX goto' is not allowed in the assembly, so the users
> have to use a variable on LHS anyway.
>
> The patch updates few tests to demonstrate the use of the macro.
>
> The macro allows to use BPF_JSET in C code, since LLVM doesn't generate it at
> present. For example:
>
> if (i & j) compiles into r0 &= r1; if r0 == 0 goto
>
> while
>
> if (bpf_cmp(i, &, j)) compiles into if r0 & r1 goto
>
> Note that the macro has to be careful with RHS assembly predicate.
> Since:
> u64 __rhs = 1ull << 42;
> asm goto("if r0 < %[rhs] goto +1" :: [rhs] "ri" (__rhs));
> LLVM will silently truncate 64-bit constant into s32 imm.
>
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---

Sorry about the delayed feedback on this, I'm knocked out due to flu.

The macro looks very useful and I like the simplification for the
assertion macros, +1, but I would just like to make sure one important
point about this is not missed.

When I was writing the _eq/lt/... variants I noticed that unless I use
LHS to be a full 8-byte register the compiler can still play
shenanigans even with inline assembly, like choosing a different input
register for LHS than the one already allocated for a variable before
the assembly is emitted, doing left/right shifts to mask upper bits
before the inline assembly logic, and thus since the scalar id
association is broken on that, the cmp does not propagate to what are
logical copies.

I noticed it often when say a 8-byte value (known to be small) was
saved as int len = something, asserted upon, and then used later in
the program.

One of the contracts of the bpf_assert_eq* macros compared to just
plain bpf_assert was that the former was supposed to guarantee the
assertion resulted in the verifier preserving the new knowledge for
the LHS variable.

So maybe in addition to using bpf_cmp, we should instead do a
bpf_assert_op as a replacement that enforces this rule of LHS being 8
byte (basically just perform __bpf_assert_check) before dispatching to
the bpf_cmp.
WDYT? Or just enforce that in bpf_cmp (but that might be too strict
for general use of bpf_cmp on its own, so might be better to have a
separate macro).
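
For illustration, such a wrapper could look roughly like this (a hypothetical
sketch of the idea, not something from the series):

    #define bpf_assert_op(LHS, OP, RHS)                                \
            do {                                                       \
                    _Static_assert(sizeof(LHS) == 8,                   \
                                   "LHS must be an 8-byte value");     \
                    bpf_assert(bpf_cmp(LHS, OP, RHS));                 \
            } while (0)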

Apart from that, consider
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
for the whole set.
Alexei Starovoitov Dec. 22, 2023, 10:59 p.m. UTC | #2
On Wed, Dec 20, 2023 at 8:28 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> When I was writing the _eq/lt/... variants I noticed that unless I use
> LHS to be a full 8-byte register the compiler can still play
> shenanigans even with inline assembly, like choosing a different input
> register for LHS than the one already allocated for a variable before
> the assembly is emitted, doing left/right shifts to mask upper bits
> before the inline assembly logic, and thus since the scalar id
> association is broken on that, the cmp does not propagate to what are
> logical copies.

I saw that llvm can add a redundant r1 = w2 and use r1 as LHS in some cases,
but I haven't seen extra shifts.
When it's assignment like this then the register link is there and
the verifier correctly propagates the range knowledge.
Could you share an example where you saw shifts?

I also don't understand the point of:
_Static_assert(__builtin_constant_p((RHS)), "2nd argument must be a
constant expression")
in your macro.
bpf_assert(bpf_cmp(foo, ==, bar));
seems useful too. Restricting RHS to a constant looks odd.

> So maybe in addition to using bpf_cmp, we should instead do a
> bpf_assert_op as a replacement that enforces this rule of LHS being 8
> byte (basically just perform __bpf_assert_check) before dispatching to
> the bpf_cmp.

I don't see the need to restrict LHS to 8 bytes.

I considered making bpf_cmp macro smarter and use "w" registers
when sizeof(lhs)<=4, but I suspect it won't help the verifier much
since 32-bit compares update lower 32-bit only and range of
upper 32-bit may stay unknown (depending on what was in the code before 'if').
So I kept "r" only.

I still need to investigate why barrier_var is better in profiler.c.
Alexei Starovoitov Dec. 25, 2023, 8:33 p.m. UTC | #3
On Fri, Dec 22, 2023 at 2:59 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Dec 20, 2023 at 8:28 PM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > When I was writing the _eq/lt/... variants I noticed that unless I use
> > LHS to be a full 8-byte register the compiler can still play
> > shenanigans even with inline assembly, like choosing a different input
> > register for LHS than the one already allocated for a variable before
> > the assembly is emitted, doing left/right shifts to mask upper bits
> > before the inline assembly logic, and thus since the scalar id
> > association is broken on that, the cmp does not propagate to what are
> > logical copies.
>
> I saw that llvm can add a redundant r1 = w2 and use r1 as LHS in some cases,
> but I haven't seen extra shifts.

It turned out there are indeed a bunch of redundant shifts
when u32 or s32 is passed into "r" asm constraint.

Strangely the shifts are there when compiled with -mcpu=v3 or v4
and no shifts with -mcpu=v1 and v2.

Also weird that u8 and u16 are passed into "r" without redundant shifts.
Hence I found a "workaround": cast the u32 to a u16 while passing it.
The truncation of the u32 doesn't happen and the shifts that zero the
upper 32 bits are gone as well.

https://godbolt.org/z/Kqszr6q3v
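
For illustration, the workaround boils down to something like this
(function and variable names are placeholders):

    extern unsigned long bar(void);
    void pass_u32(void)
    {
            unsigned int v = bar();
            /* casting to u16 avoids the r0 <<= 32 / r0 >>= 32 pair that
             * the plain u32 pass-through produces with -mcpu=v3 */
            asm volatile("%[reg] += 1" :: [reg] "r"((unsigned short)v));
    }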

Another issue is that llvm removes asm() completely when an output "+r"
constraint is used. It has to be asm volatile to convince llvm
to keep that asm block. That's the bad1() case.
asm() stays in place when an input "r" constraint is used.
That's bad2().
asm() removal happens with the x86 backend too. So maybe it's a feature?

I was worried whether:
asm("mov %[reg],%[reg]"::[reg]"r"((short)var));
is buggy too. Meaning whether the compiler should truncate before
allocating an input register.
It turns out it's not.
At least there is no truncation on x86 and on arm64.

https://godbolt.org/z/ovMdPxnj5

So I'm planning to unconditionally use "(short)var" workaround in bpf_cmp.

Anyway, none of that is urgent.
Yonghong or Eduard,
Please take a look at redundant shifts issue with mcpu=v3 after the holidays.
Eduard Zingerman Jan. 4, 2024, 8:06 p.m. UTC | #4
On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
[...]

Regarding disappearing asm blocks.

> https://godbolt.org/z/Kqszr6q3v
> 
> Another issue is llvm removes asm() completely when output "+r"
> constraint is used. It has to be asm volatile to convince llvm
> to keep that asm block. That's bad1() case.
> asm() stays in place when input "r" constraint is used.
> That's bad2().
> asm() removal happens with x86 backend too. So maybe it's a feature?

The difference between bad1() and bad2() is small:

                             .---- output operand
                             v
bad1:    asm("%[reg] += 1":[reg]"+r"(var));
bad2:    asm("%[reg] += 1"::[reg]"r"(var));
                              ^
                              '--- input operand
                              
This leads to different IR generation at the beginning of translation:

  %1 = call i32 asm "$0 += 1", "=r,0"(i32 %0)

vs.

  call void asm sideeffect "$0 += 1", "r"(i32 %0)

Whether or not to add the "sideeffect" property to an asm call seems to be
controlled by the following condition in CodeGenFunction::EmitAsmStmt():

  bool HasSideEffect = S.isVolatile() || S.getNumOutputs() == 0;

See [1].
This is documented in [2] (assuming that clang and gcc are compatible):

>  asm statements that have no output operands and asm goto statements,
>  are implicitly volatile.

An asm block without the sideeffect property, whose output value is not used,
is removed from the selection DAG at early stages of instruction generation.
If "bad1" is modified to use "var" after the asm block (e.g. return it),
the asm block is not removed.

So, this looks like regular clang behavior, not specific to BPF target.

[1] https://github.com/llvm/llvm-project/blob/166bd4e1f18da221621953bd5943c1a8d17201fe/clang/lib/CodeGen/CGStmt.cpp#L2843
[2]
Alexei Starovoitov Jan. 4, 2024, 9:02 p.m. UTC | #5
On Thu, Jan 4, 2024 at 12:06 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
> [...]
>
> Regarding disappearing asm blocks.
>
> > https://godbolt.org/z/Kqszr6q3v
> >
> > Another issue is llvm removes asm() completely when output "+r"
> > constraint is used. It has to be asm volatile to convince llvm
> > to keep that asm block. That's bad1() case.
> > asm() stays in place when input "r" constraint is used.
> > That's bad2().
> > asm() removal happens with x86 backend too. So maybe it's a feature?
>
> The difference between bad1() and bad2() is small:
>
>                              .---- output operand
>                              v
> bad1:    asm("%[reg] += 1":[reg]"+r"(var));
> bad2:    asm("%[reg] += 1"::[reg]"r"(var));
>                               ^
>                               '--- input operand
>
> This leads to different IR generation at the beginning of translation:
>
>   %1 = call i32 asm "$0 += 1", "=r,0"(i32 %0)
>
> vs.
>
>   call void asm sideeffect "$0 += 1", "r"(i32 %0)
>
> Whether or not to add "sideeffect" property to asm call seem to be
> controlled by the following condition in CodeGenFunction::EmitAsmStmt():
>
>   bool HasSideEffect = S.isVolatile() || S.getNumOutputs() == 0;
>
> See [1].
> This is documented in [2] (assuming that clang and gcc are compatible):
>
> >  asm statements that have no output operands and asm goto statements,
> >  are implicitly volatile.
>
> Asm block w/o sideeffect, output value of which is not used,
> is removed from selection DAG at early stages of instruction generation.
> If "bad1" is modified to use "var" after asm block (e.g. return it)
> the asm block is not removed.
>
> So, this looks like regular clang behavior, not specific to BPF target.

Wow. Thanks for those details.
I didn't know that getNumOutputs() == 0 is equivalent to volatile
in that sense. Sounds like we should always add volatile to
avoid surprises.
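
For example, a minimal sketch of that conclusion (good1 is a hypothetical
counterpart to bad1 from the godbolt link above):

    static inline void good1(int var)
    {
            /* volatile keeps the block even though its output is unused */
            asm volatile("%[reg] += 1" : [reg] "+r"(var));
    }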
Eduard Zingerman Jan. 5, 2024, 9:47 p.m. UTC | #6
On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
[...]
> It turned out there are indeed a bunch of redundant shifts
> when u32 or s32 is passed into "r" asm constraint.
> 
> Strangely the shifts are there when compiled with -mcpu=v3 or v4
> and no shifts with -mcpu=v1 and v2.
> 
> Also weird that u8 and u16 are passed into "r" without redundant shifts.
> Hence I found a "workaround": cast u32 into u16 while passing.
> The truncation of u32 doesn't happen and shifts to zero upper 32-bit
> are gone as well.
> 
> https://godbolt.org/z/Kqszr6q3v

Regarding unnecessary shifts.
Sorry, a long email about minor feature/defect.

So, currently the following C program
(and its variations with implicit casts):

    extern unsigned long bar(void);
    void foo(void) {
      asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar()));
    }

Is translated to the following BPF:

    $ clang -mcpu=v3 -O2 --target=bpf -mcpu=v3 -c -o - t.c | llvm-objdump --no-show-raw-insn -d -
    
    <stdin>:	file format elf64-bpf
    
    Disassembly of section .text:
    
    0000000000000000 <foo>:
           0:	call -0x1
           1:	r0 <<= 0x20
           2:	r0 >>= 0x20
           3:	r0 += 0x1
           4:	exit
           
Note: no additional shifts are generated when "w" (32-bit register)
      constraint is used instead of "r".

First, is this right or wrong?
------------------------------

C language spec [1] paragraph 6.5.4.6 (Cast operators -> Semantics) says
the following:

  If the value of the expression is represented with greater range or
  precision than required by the type named by the cast (6.3.1.8),
  then the cast specifies a conversion even if the type of the
  expression is the same as the named type and removes any extra range
  and precision.                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  ^^^^^^^^^^^^^
  
What other LLVM backends do in such situations?
Consider the following program translated to amd64 [2] and aarch64 [3]:

    void foo(void) {
      asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned long)  bar())); // 1
      asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned int)   bar())); // 2
      asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned short) bar())); // 3
    }

- for amd64 register of proper size is selected for `reg`;
- for aarch64 warnings about wrong operand size are emitted at (2) and (3)
  and 64-bit register is used w/o generating any additional instructions.

(Note, however, that 'arm' silently ignores the issue and uses 32-bit
 registers for all three points).

So, it looks like something of this sort should be done:
- either extra precision should be removed via additional instructions;
- or 32-bit register should be picked for `reg`;
- or warning should be emitted as in aarch64 case.

[1] https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3088.pdf
[2] https://godbolt.org/z/9nKxaMc5j
[3] https://godbolt.org/z/1zxEr5b3f


Second, what to do?
-------------------

I think that the following steps are needed:
- Investigation described in the next section shows that currently two
  shifts are generated accidentally w/o real intent to shed precision.
  I have a patch [6] that removes shifts generation, it should be applied.
- When 32-bit value is passed to "r" constraint:
  - for cpu v3/v4 a 32-bit register should be selected;
  - for cpu v1/v2 a warning should be reported.


Third, why two shifts are generated?
------------------------------------

(Details here might be interesting to Yonghong, regular reader could
 skip this section).

The two shifts are result of interaction between two IR constructs
`trunc` and `asm`. The C program above looks as follows in LLVM IR
before machine code generation:

    declare dso_local i64 @bar()
    define dso_local void @foo(i32 %p) {
    entry:
      %call = call i64 @bar()
      %v32 = trunc i64 %call to i32
      tail call void asm sideeffect "$0 += 1", "r"(i32 %v32)
      ret void
    }

Initial selection DAG:

    $ llc -debug-only=isel -march=bpf -mcpu=v3 --filetype=asm -o - t.ll
    SelectionDAG has 21 nodes:
      ...
      t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
   !     t11: i32 = truncate t10
   !    t15: i64 = zero_extend t11
      t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t15
        t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
                         TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
      ...

Note values t11 and t15 marked with (!).

Optimized lowered selection DAG for this fragment:

    t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
  !   t22: i64 = and t10, Constant:i64<4294967295>
    t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
      t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
                       TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1

Note (zext (truncate ...)) converted to (and ... 0xffff_ffff).

DAG after instruction selection:

    t10: i64,ch,glue = CopyFromReg t8:1, Register:i64 $r0, t8:2
  !     t25: i64 = SLL_ri t10, TargetConstant:i64<32>
  !   t22: i64 = SRL_ri t25, TargetConstant:i64<32>
    t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
      t23: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
                       TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1

Note (and ... 0xffff_ffff) converted to (SRL_ri (SLL_ri ...)).
This happens because of the following pattern from BPFInstrInfo.td:

    // 0xffffFFFF doesn't fit into simm32, optimize common case
    def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
              (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;

So, the two shift instructions are result of translation of (zext (trunc ...)).
However, closer examination shows that zext DAG node was generated
almost by accident. Here is the backtrace for when this node was created:

    Breakpoint 1, llvm::SelectionDAG::getNode (... Opcode=202) ;; 202 is opcode for ZERO_EXTEND
        at .../SelectionDAG.cpp:5605
    (gdb) bt
    #0  llvm::SelectionDAG::getNode (...)
        at ...SelectionDAG.cpp:5605
    #1  0x... in getCopyToParts (..., ExtendKind=llvm::ISD::ZERO_EXTEND)
        at .../SelectionDAGBuilder.cpp:537
    #2  0x... in llvm::RegsForValue::getCopyToRegs (... PreferredExtendType=llvm::ISD::ANY_EXTEND)
        at .../SelectionDAGBuilder.cpp:958
    #3  0x... in llvm::SelectionDAGBuilder::visitInlineAsm(...)
        at .../SelectionDAGBuilder.cpp:9640
        ...

The stack frame #2 is interesting, here is the code for it [4]:

    void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
                                     const SDLoc &dl, SDValue &Chain, SDValue *Glue,
                                     const Value *V,
                                     ISD::NodeType PreferredExtendType) const {
                                                   ^
                                                   '-- this is ANY_EXTEND
      ...
      for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
        ...
                                                   .-- this returns true
                                                   v
        if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
          ExtendKind = ISD::ZERO_EXTEND;
    
                               .-- this is ZERO_EXTEND
                               v
        getCopyToParts(..., ExtendKind);
        Part += NumParts;
      }
      ...
    }

The getCopyToRegs() function was called with ANY_EXTEND preference,
but switched to ZERO_EXTEND because TLI.isZExtFree() currently returns
true for any 32 to 64-bit conversion [5].
However, in this case this is clearly a mistake, as zero extension of 
(zext i64 (truncate i32 ...)) costs two instructions.

The isZExtFree() behavior could be changed to report false for such
situations, as in my patch [6]. This in turn removes zext =>
removes two shifts from final asm.
Here is how DAG/asm look after patch [6]:

    Initial selection DAG:
      ...
      t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
  !   t11: i32 = truncate t10
      t16: ch,glue = CopyToReg t10:1, Register:i64 %1, t10
        t18: ch,glue = inlineasm t16, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
                         TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t16:1
      ...
    
Final asm:

    ...
    # %bb.0:
    	call bar
    	#APP
    	r0 += 1
    	#NO_APP
    	exit
    ...
    
Note that [6] is a very minor change, it does not affect code
generation for selftests at all and I was unable to conjure examples
where it has effect aside from inline asm parameters.

[4] https://github.com/llvm/llvm-project/blob/365fbbfbcfefb8766f7716109b9c3767b58e6058/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp#L937C10-L937C10
[5] https://github.com/llvm/llvm-project/blob/6f4cc1310b12bc59210e4596a895db4cb9ad6075/llvm/lib/Target/BPF/BPFISelLowering.cpp#L213C21-L213C21
[6] https://github.com/llvm/llvm-project/commit/cf8e142e5eac089cc786c671a40fef022d08b0ef
Yonghong Song Jan. 8, 2024, 9:21 p.m. UTC | #7
On 1/5/24 1:47 PM, Eduard Zingerman wrote:
> On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
> [...]
>> It turned out there are indeed a bunch of redundant shifts
>> when u32 or s32 is passed into "r" asm constraint.
>>
>> Strangely the shifts are there when compiled with -mcpu=v3 or v4
>> and no shifts with -mcpu=v1 and v2.
>>
>> Also weird that u8 and u16 are passed into "r" without redundant shifts.
>> Hence I found a "workaround": cast u32 into u16 while passing.
>> The truncation of u32 doesn't happen and shifts to zero upper 32-bit
>> are gone as well.
>>
>> https://godbolt.org/z/Kqszr6q3v
> Regarding unnecessary shifts.
> Sorry, a long email about minor feature/defect.
>
> So, currently the following C program
> (and it's variations with implicit casts):
>
>      extern unsigned long bar(void);
>      void foo(void) {
>        asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar()));
>      }
>
> Is translated to the following BPF:
>
>      $ clang -mcpu=v3 -O2 --target=bpf -mcpu=v3 -c -o - t.c | llvm-objdump --no-show-raw-insn -d -
>      
>      <stdin>:	file format elf64-bpf
>      
>      Disassembly of section .text:
>      
>      0000000000000000 <foo>:
>             0:	call -0x1
>             1:	r0 <<= 0x20
>             2:	r0 >>= 0x20
>             3:	r0 += 0x1
>             4:	exit
>             
> Note: no additional shifts are generated when "w" (32-bit register)
>        constraint is used instead of "r".
>
> First, is this right or wrong?
> ------------------------------
>
> C language spec [1] paragraph 6.5.4.6 (Cast operators -> Semantics) says
> the following:
>
>    If the value of the expression is represented with greater range or
>    precision than required by the type named by the cast (6.3.1.8),
>    then the cast specifies a conversion even if the type of the
>    expression is the same as the named type and removes any extra range
>    and precision.                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
>    ^^^^^^^^^^^^^
>    
> What other LLVM backends do in such situations?
> Consider the following program translated to amd64 [2] and aarch64 [3]:
>
>      void foo(void) {
>        asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned long)  bar())); // 1
>        asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned int)   bar())); // 2
>        asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned short) bar())); // 3
>      }
>
> - for amd64 register of proper size is selected for `reg`;
> - for aarch64 warnings about wrong operand size are emitted at (2) and (3)
>    and 64-bit register is used w/o generating any additional instructions.
>
> (Note, however, that 'arm' silently ignores the issue and uses 32-bit
>   registers for all three points).
>
> So, it looks like that something of this sort should be done:
> - either extra precision should be removed via additional instructions;
> - or 32-bit register should be picked for `reg`;
> - or warning should be emitted as in aarch64 case.
>
> [1] https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3088.pdf
> [2] https://godbolt.org/z/9nKxaMc5j
> [3] https://godbolt.org/z/1zxEr5b3f
>
>
> Second, what to do?
> -------------------
>
> I think that the following steps are needed:
> - Investigation described in the next section shows that currently two
>    shifts are generated accidentally w/o real intent to shed precision.
>    I have a patch [6] that removes shifts generation, it should be applied.
> - When 32-bit value is passed to "r" constraint:
>    - for cpu v3/v4 a 32-bit register should be selected;
>    - for cpu v1/v2 a warning should be reported.
>
>
> Third, why two shifts are generated?
> ------------------------------------
>
> (Details here might be interesting to Yonghong, regular reader could
>   skip this section).
>
> The two shifts are result of interaction between two IR constructs
> `trunc` and `asm`. The C program above looks as follows in LLVM IR
> before machine code generation:
>
>      declare dso_local i64 @bar()
>      define dso_local void @foo(i32 %p) {
>      entry:
>        %call = call i64 @bar()
>        %v32 = trunc i64 %call to i32
>        tail call void asm sideeffect "$0 += 1", "r"(i32 %v32)
>        ret void
>      }
>
> Initial selection DAG:
>
>      $ llc -debug-only=isel -march=bpf -mcpu=v3 --filetype=asm -o - t.ll
>      SelectionDAG has 21 nodes:
>        ...
>        t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>     !     t11: i32 = truncate t10
>     !    t15: i64 = zero_extend t11
>        t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t15
>          t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                           TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>        ...
>
> Note values t11 and t15 marked with (!).
>
> Optimized lowered selection DAG for this fragment:
>
>      t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>    !   t22: i64 = and t10, Constant:i64<4294967295>
>      t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>        t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                         TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>
> Note (zext (truncate ...)) converted to (and ... 0xffff_ffff).
>
> DAG after instruction selection:
>
>      t10: i64,ch,glue = CopyFromReg t8:1, Register:i64 $r0, t8:2
>    !     t25: i64 = SLL_ri t10, TargetConstant:i64<32>
>    !   t22: i64 = SRL_ri t25, TargetConstant:i64<32>
>      t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>        t23: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                         TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>
> Note (and ... 0xffff_ffff) converted to (SRL_ri (SLL_ri ...)).
> This happens because of the following pattern from BPFInstrInfo.td:
>
>      // 0xffffFFFF doesn't fit into simm32, optimize common case
>      def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
>                (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
>
> So, the two shift instructions are result of translation of (zext (trunc ...)).
> However, closer examination shows that zext DAG node was generated
> almost by accident. Here is the backtrace for when this node was created:
>
>      Breakpoint 1, llvm::SelectionDAG::getNode (... Opcode=202) ;; 202 is opcode for ZERO_EXTEND
>          at .../SelectionDAG.cpp:5605
>      (gdb) bt
>      #0  llvm::SelectionDAG::getNode (...)
>          at ...SelectionDAG.cpp:5605
>      #1  0x... in getCopyToParts (..., ExtendKind=llvm::ISD::ZERO_EXTEND)
>          at .../SelectionDAGBuilder.cpp:537
>      #2  0x... in llvm::RegsForValue::getCopyToRegs (... PreferredExtendType=llvm::ISD::ANY_EXTEND)
>          at .../SelectionDAGBuilder.cpp:958
>      #3  0x... in llvm::SelectionDAGBuilder::visitInlineAsm(...)
>          at .../SelectionDAGBuilder.cpp:9640
>          ...
>
> The stack frame #2 is interesting, here is the code for it [4]:
>
>      void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
>                                       const SDLoc &dl, SDValue &Chain, SDValue *Glue,
>                                       const Value *V,
>                                       ISD::NodeType PreferredExtendType) const {
>                                                     ^
>                                                     '-- this is ANY_EXTEND
>        ...
>        for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
>          ...
>                                                     .-- this returns true
>                                                     v
>          if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
>            ExtendKind = ISD::ZERO_EXTEND;
>      
>                                 .-- this is ZERO_EXTEND
>                                 v
>          getCopyToParts(..., ExtendKind);
>          Part += NumParts;
>        }
>        ...
>      }
>
> The getCopyToRegs() function was called with ANY_EXTEND preference,
> but switched to ZERO_EXTEND because TLI.isZExtFree() currently returns
> true for any 32 to 64-bit conversion [5].
> However, in this case this is clearly a mistake, as zero extension of
> (zext i64 (truncate i32 ...)) costs two instructions.
>
> The isZExtFree() behavior could be changed to report false for such
> situations, as in my patch [6]. This in turn removes zext =>
> removes two shifts from final asm.
> Here is how DAG/asm look after patch [6]:
>
>      Initial selection DAG:
>        ...
>        t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>    !   t11: i32 = truncate t10
>        t16: ch,glue = CopyToReg t10:1, Register:i64 %1, t10
>          t18: ch,glue = inlineasm t16, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                           TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t16:1
>        ...
>      
> Final asm:
>
>      ...
>      # %bb.0:
>      	call bar
>      	#APP
>      	r0 += 1
>      	#NO_APP
>      	exit
>      ...

Thanks for the detailed analysis! Previously we intended to do the following:

- When 32-bit value is passed to "r" constraint:
   - for cpu v3/v4 a 32-bit register should be selected;
   - for cpu v1/v2 a warning should be reported.

So in the above, the desired asm code should be

     ...
     # %bb.0:
     	call bar
     	#APP
     	w0 += 1
     	#NO_APP
     	exit
     ...

for cpuv3/cpuv4. I guess some more work in llvm is needed
to achieve that.

On the other hand, for cpuv3/v4, for regular C code,
I think the compiler might already be omitting the conversion and using the w
register. So I am not sure whether the patch [6]
is needed or not. Could you double check?

>      
> Note that [6] is a very minor change, it does not affect code
> generation for selftests at all and I was unable to conjure examples
> where it has effect aside from inline asm parameters.
>
> [4] https://github.com/llvm/llvm-project/blob/365fbbfbcfefb8766f7716109b9c3767b58e6058/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp#L937C10-L937C10
> [5] https://github.com/llvm/llvm-project/blob/6f4cc1310b12bc59210e4596a895db4cb9ad6075/llvm/lib/Target/BPF/BPFISelLowering.cpp#L213C21-L213C21
> [6] https://github.com/llvm/llvm-project/commit/cf8e142e5eac089cc786c671a40fef022d08b0ef
>
Alexei Starovoitov Jan. 8, 2024, 9:33 p.m. UTC | #8
On Fri, Jan 5, 2024 at 1:47 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
> [...]
> > It turned out there are indeed a bunch of redundant shifts
> > when u32 or s32 is passed into "r" asm constraint.
> >
> > Strangely the shifts are there when compiled with -mcpu=v3 or v4
> > and no shifts with -mcpu=v1 and v2.
> >
> > Also weird that u8 and u16 are passed into "r" without redundant shifts.
> > Hence I found a "workaround": cast u32 into u16 while passing.
> > The truncation of u32 doesn't happen and shifts to zero upper 32-bit
> > are gone as well.
> >
> > https://godbolt.org/z/Kqszr6q3v
>
> Regarding unnecessary shifts.
> Sorry, a long email about minor feature/defect.
>
> So, currently the following C program
> (and it's variations with implicit casts):
>
>     extern unsigned long bar(void);
>     void foo(void) {
>       asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar()));
>     }
>
> Is translated to the following BPF:
>
>     $ clang -mcpu=v3 -O2 --target=bpf -mcpu=v3 -c -o - t.c | llvm-objdump --no-show-raw-insn -d -
>
>     <stdin>:    file format elf64-bpf
>
>     Disassembly of section .text:
>
>     0000000000000000 <foo>:
>            0:   call -0x1
>            1:   r0 <<= 0x20
>            2:   r0 >>= 0x20
>            3:   r0 += 0x1
>            4:   exit
>
> Note: no additional shifts are generated when "w" (32-bit register)
>       constraint is used instead of "r".
>
> First, is this right or wrong?
> ------------------------------
>
> C language spec [1] paragraph 6.5.4.6 (Cast operators -> Semantics) says
> the following:
>
>   If the value of the expression is represented with greater range or
>   precision than required by the type named by the cast (6.3.1.8),
>   then the cast specifies a conversion even if the type of the
>   expression is the same as the named type and removes any extra range
>   and precision.                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   ^^^^^^^^^^^^^
>
> What other LLVM backends do in such situations?
> Consider the following program translated to amd64 [2] and aarch64 [3]:
>
>     void foo(void) {
>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned long)  bar())); // 1
>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned int)   bar())); // 2
>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned short) bar())); // 3
>     }
>
> - for amd64 register of proper size is selected for `reg`;
> - for aarch64 warnings about wrong operand size are emitted at (2) and (3)
>   and 64-bit register is used w/o generating any additional instructions.
>
> (Note, however, that 'arm' silently ignores the issue and uses 32-bit
>  registers for all three points).
>
> So, it looks like that something of this sort should be done:
> - either extra precision should be removed via additional instructions;
> - or 32-bit register should be picked for `reg`;
> - or warning should be emitted as in aarch64 case.
>
> [1] https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3088.pdf
> [2] https://godbolt.org/z/9nKxaMc5j
> [3] https://godbolt.org/z/1zxEr5b3f
>
>
> Second, what to do?
> -------------------
>
> I think that the following steps are needed:
> - Investigation described in the next section shows that currently two
>   shifts are generated accidentally w/o real intent to shed precision.
>   I have a patch [6] that removes shifts generation, it should be applied.
> - When 32-bit value is passed to "r" constraint:
>   - for cpu v3/v4 a 32-bit register should be selected;
>   - for cpu v1/v2 a warning should be reported.

Thank you for the detailed analysis.

Agree that llvm fix [6] is a necessary step, then
using 'w' in v3/v4 and warn on v1/v2 makes sense too,
but we have this macro:
#define barrier_var(var) asm volatile("" : "+r"(var))
that is used in various bpf production programs.
tetragon is also a heavy user of inline asm.

Right now a full 64-bit register is allocated,
so switching to 'w' might cause unexpected behavior
including rejection by the verifier.
I think it makes sense to align the bpf backend with arm64 and x86,
but we need to broadcast this change widely.

Also need to align with GCC. (Jose cc-ed)

And, the most importantly, we need a way to go back to old behavior,
since u32 var; asm("...":: "r"(var)); will now
allocate "w" register or warn.

What should be the workaround?

I've tried:
u32 var; asm("...":: "r"((u64)var));

https://godbolt.org/z/n4ejvWj7v

and x86/arm64 generate a 32-bit truncation.
Sounds like the bpf backend has to do it as well.
We should be doing 'wX=wX' in such case (just like x86)
instead of <<=32 >>=32.

I think this should be done as a separate diff.
Our current pattern of using shifts is inefficient and guaranteed
to screw up verifier range analysis while wX=wX is faster
and more verifier friendly.
Yes it's still not going to be 1-1 to old (our current) behavior.

We probably need some macro (like we did for __BPF_CPU_VERSION__)
to identify such fixed llvm, so existing users with '(short)'
workaround and other tricks can detect new vs old compiler.

Looks like we opened a can of worms.
Aligning with x86/arm64 makes sense, but the cost of doing
the right thing is hard to estimate.

>
> Third, why two shifts are generated?
> ------------------------------------
>
> (Details here might be interesting to Yonghong, regular reader could
>  skip this section).
>
> The two shifts are result of interaction between two IR constructs
> `trunc` and `asm`. The C program above looks as follows in LLVM IR
> before machine code generation:
>
>     declare dso_local i64 @bar()
>     define dso_local void @foo(i32 %p) {
>     entry:
>       %call = call i64 @bar()
>       %v32 = trunc i64 %call to i32
>       tail call void asm sideeffect "$0 += 1", "r"(i32 %v32)
>       ret void
>     }
>
> Initial selection DAG:
>
>     $ llc -debug-only=isel -march=bpf -mcpu=v3 --filetype=asm -o - t.ll
>     SelectionDAG has 21 nodes:
>       ...
>       t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>    !     t11: i32 = truncate t10
>    !    t15: i64 = zero_extend t11
>       t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t15
>         t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                          TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>       ...
>
> Note values t11 and t15 marked with (!).
>
> Optimized lowered selection DAG for this fragment:
>
>     t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>   !   t22: i64 = and t10, Constant:i64<4294967295>
>     t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>       t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                        TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>
> Note (zext (truncate ...)) converted to (and ... 0xffff_ffff).
>
> DAG after instruction selection:
>
>     t10: i64,ch,glue = CopyFromReg t8:1, Register:i64 $r0, t8:2
>   !     t25: i64 = SLL_ri t10, TargetConstant:i64<32>
>   !   t22: i64 = SRL_ri t25, TargetConstant:i64<32>
>     t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>       t23: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                        TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>
> Note (and ... 0xffff_ffff) converted to (SRL_ri (SLL_ri ...)).
> This happens because of the following pattern from BPFInstrInfo.td:
>
>     // 0xffffFFFF doesn't fit into simm32, optimize common case
>     def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
>               (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
>
> So, the two shift instructions are result of translation of (zext (trunc ...)).
> However, closer examination shows that zext DAG node was generated
> almost by accident. Here is the backtrace for when this node was created:
>
>     Breakpoint 1, llvm::SelectionDAG::getNode (... Opcode=202) ;; 202 is opcode for ZERO_EXTEND
>         at .../SelectionDAG.cpp:5605
>     (gdb) bt
>     #0  llvm::SelectionDAG::getNode (...)
>         at ...SelectionDAG.cpp:5605
>     #1  0x... in getCopyToParts (..., ExtendKind=llvm::ISD::ZERO_EXTEND)
>         at .../SelectionDAGBuilder.cpp:537
>     #2  0x... in llvm::RegsForValue::getCopyToRegs (... PreferredExtendType=llvm::ISD::ANY_EXTEND)
>         at .../SelectionDAGBuilder.cpp:958
>     #3  0x... in llvm::SelectionDAGBuilder::visitInlineAsm(...)
>         at .../SelectionDAGBuilder.cpp:9640
>         ...
>
> The stack frame #2 is interesting, here is the code for it [4]:
>
>     void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
>                                      const SDLoc &dl, SDValue &Chain, SDValue *Glue,
>                                      const Value *V,
>                                      ISD::NodeType PreferredExtendType) const {
>                                                    ^
>                                                    '-- this is ANY_EXTEND
>       ...
>       for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
>         ...
>                                                    .-- this returns true
>                                                    v
>         if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
>           ExtendKind = ISD::ZERO_EXTEND;
>
>                                .-- this is ZERO_EXTEND
>                                v
>         getCopyToParts(..., ExtendKind);
>         Part += NumParts;
>       }
>       ...
>     }
>
> The getCopyToRegs() function was called with ANY_EXTEND preference,
> but switched to ZERO_EXTEND because TLI.isZExtFree() currently returns
> true for any 32 to 64-bit conversion [5].
> However, in this case this is clearly a mistake, as zero extension of
> (zext i64 (truncate i32 ...)) costs two instructions.
>
> The isZExtFree() behavior could be changed to report false for such
> situations, as in my patch [6]. This in turn removes zext =>
> removes two shifts from final asm.
> Here is how DAG/asm look after patch [6]:
>
>     Initial selection DAG:
>       ...
>       t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>   !   t11: i32 = truncate t10
>       t16: ch,glue = CopyToReg t10:1, Register:i64 %1, t10
>         t18: ch,glue = inlineasm t16, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>                          TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t16:1
>       ...
>
> Final asm:
>
>     ...
>     # %bb.0:
>         call bar
>         #APP
>         r0 += 1
>         #NO_APP
>         exit
>     ...
>
> Note that [6] is a very minor change, it does not affect code
> generation for selftests at all and I was unable to conjure examples
> where it has effect aside from inline asm parameters.
>
> [4] https://github.com/llvm/llvm-project/blob/365fbbfbcfefb8766f7716109b9c3767b58e6058/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp#L937C10-L937C10
> [5] https://github.com/llvm/llvm-project/blob/6f4cc1310b12bc59210e4596a895db4cb9ad6075/llvm/lib/Target/BPF/BPFISelLowering.cpp#L213C21-L213C21
> [6] https://github.com/llvm/llvm-project/commit/cf8e142e5eac089cc786c671a40fef022d08b0ef
>
Eduard Zingerman Jan. 8, 2024, 11:16 p.m. UTC | #9
On Mon, 2024-01-08 at 13:21 -0800, Yonghong Song wrote:
[...]
> Thanks for the detailed analysis! Previously we intend to do the following:
> 
> - When 32-bit value is passed to "r" constraint:
>    - for cpu v3/v4 a 32-bit register should be selected;
>    - for cpu v1/v2 a warning should be reported.
> 
> So in the above, the desired asm code should be
> 
>      ...
>      # %bb.0:
>      	call bar
>      	#APP
>      	w0 += 1
>      	#NO_APP
>      	exit
>      ...
> 
> for cpuv3/cpuv4. I guess some more work in llvm is needed
> to achieve that.

To make clang emit w0 the following modification is needed:

diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index b20e09c7f95f..4c504d587ce6 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -265,6 +265,8 @@ BPFTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     // GCC Constraint Letters
     switch (Constraint[0]) {
     case 'r': // GENERAL_REGS
+      if (HasAlu32 && VT == MVT::i32)
+        return std::make_pair(0U, &BPF::GPR32RegClass);
       return std::make_pair(0U, &BPF::GPRRegClass);
     case 'w':
       if (HasAlu32)

However, as Alexei notes in the sibling thread,
this leads to incompatibility with some existing inline assembly.
E.g. there are two compilation errors in selftests.
I'll write in some more detail in the sibling thread.

> On the other hand, for cpuv3/v4, for regular C code,
> I think the compiler might be already omitting the conversion and use w
> register already. So I am not sure whether the patch [6]
> is needed or not. Could you double check?

Yes, for regular C code, generated assembly uses 32-bit registers as expected:

echo $(cat <<EOF
extern unsigned long bar(unsigned);
void foo(void) {
  bar(bar(7));
}
EOF
) | clang -mcpu=v3 -O2 --target=bpf -mcpu=v3 -x c -S -o - -

...

foo:                                    # @foo
# %bb.0:
	w1 = 7
	call bar
	w1 = w0
	call bar
	exit
Yonghong Song Jan. 8, 2024, 11:22 p.m. UTC | #10
On 1/8/24 1:33 PM, Alexei Starovoitov wrote:
> On Fri, Jan 5, 2024 at 1:47 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>> On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
>> [...]
>>> It turned out there are indeed a bunch of redundant shifts
>>> when u32 or s32 is passed into "r" asm constraint.
>>>
>>> Strangely the shifts are there when compiled with -mcpu=v3 or v4
>>> and no shifts with -mcpu=v1 and v2.
>>>
>>> Also weird that u8 and u16 are passed into "r" without redundant shifts.
>>> Hence I found a "workaround": cast u32 into u16 while passing.
>>> The truncation of u32 doesn't happen and shifts to zero upper 32-bit
>>> are gone as well.
>>>
>>> https://godbolt.org/z/Kqszr6q3v
>> Regarding unnecessary shifts.
>> Sorry, a long email about minor feature/defect.
>>
>> So, currently the following C program
>> (and it's variations with implicit casts):
>>
>>      extern unsigned long bar(void);
>>      void foo(void) {
>>        asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar()));
>>      }
>>
>> Is translated to the following BPF:
>>
>>      $ clang -mcpu=v3 -O2 --target=bpf -mcpu=v3 -c -o - t.c | llvm-objdump --no-show-raw-insn -d -
>>
>>      <stdin>:    file format elf64-bpf
>>
>>      Disassembly of section .text:
>>
>>      0000000000000000 <foo>:
>>             0:   call -0x1
>>             1:   r0 <<= 0x20
>>             2:   r0 >>= 0x20
>>             3:   r0 += 0x1
>>             4:   exit
>>
>> Note: no additional shifts are generated when "w" (32-bit register)
>>        constraint is used instead of "r".
>>
>> First, is this right or wrong?
>> ------------------------------
>>
>> C language spec [1] paragraph 6.5.4.6 (Cast operators -> Semantics) says
>> the following:
>>
>>    If the value of the expression is represented with greater range or
>>    precision than required by the type named by the cast (6.3.1.8),
>>    then the cast specifies a conversion even if the type of the
>>    expression is the same as the named type and removes any extra range
>>    and precision.                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>    ^^^^^^^^^^^^^
>>
>> What other LLVM backends do in such situations?
>> Consider the following program translated to amd64 [2] and aarch64 [3]:
>>
>>      void foo(void) {
>>        asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned long)  bar())); // 1
>>        asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned int)   bar())); // 2
>>        asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned short) bar())); // 3
>>      }
>>
>> - for amd64 register of proper size is selected for `reg`;
>> - for aarch64 warnings about wrong operand size are emitted at (2) and (3)
>>    and 64-bit register is used w/o generating any additional instructions.
>>
>> (Note, however, that 'arm' silently ignores the issue and uses 32-bit
>>   registers for all three points).
>>
>> So, it looks like that something of this sort should be done:
>> - either extra precision should be removed via additional instructions;
>> - or 32-bit register should be picked for `reg`;
>> - or warning should be emitted as in aarch64 case.
>>
>> [1] https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3088.pdf
>> [2] https://godbolt.org/z/9nKxaMc5j
>> [3] https://godbolt.org/z/1zxEr5b3f
>>
>>
>> Second, what to do?
>> -------------------
>>
>> I think that the following steps are needed:
>> - Investigation described in the next section shows that currently two
>>    shifts are generated accidentally w/o real intent to shed precision.
>>    I have a patch [6] that removes shifts generation, it should be applied.
>> - When 32-bit value is passed to "r" constraint:
>>    - for cpu v3/v4 a 32-bit register should be selected;
>>    - for cpu v1/v2 a warning should be reported.
> Thank you for the detailed analysis.
>
> Agree that llvm fix [6] is a necessary step, then

[6] is probably not needed for normal C code; Eduard can help
confirm. But yes, it is needed to simplify asm
that uses 'r' registers.

> using 'w' in v3/v4 and warn on v1/v2 makes sense too,
> but we have this macro:
> #define barrier_var(var) asm volatile("" : "+r"(var))
> that is used in various bpf production programs.
> tetragon is also a heavy user of inline asm.

asm volatile("" : "+r"(var)) is expected to be
used only in compiler and the code will be generated
in final binary. This is true for all the instances
I see. If in unusual cases,
we have code generated like (r1 = r1, w1.= w1),
we can remove that inline asm in bpf backend at
the very late peephole optimization.
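
A minimal sketch of the usage pattern in question (the function and the
bound are made up for illustration):

    #define barrier_var(var) asm volatile("" : "+r"(var))

    static int bounded(unsigned int i)
    {
            /* The asm template is empty: it only pins 'i' to a register
             * for the compiler, so no instruction should survive into
             * the final binary.  Any stray "r1 = r1" / "w1 = w1" no-op
             * move could be dropped by a late bpf peephole pass as
             * described above. */
            barrier_var(i);
            return i < 100 ? (int)i : -1;
    }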

>
> Right now a full 64-bit register is allocated,
> so switching to 'w' might cause unexpected behavior
> including rejection by the verifier.
> I think it makes sense to align the bpf backend with arm64 and x86,
> but we need to broadcast this change widely.

Indeed, [6] alone might cause wrong results in certain cases.
Or, once [6] lands, some code might need to change from 'r' to 'w'
in its source.

Agree we should do something similar to arm64/x86:
for 4-byte values with cpuv3/cpuv4, change from 'r' to 'w';
for 4-byte values on other cpus, or for 1/2-byte values, issue warnings.

>
> Also need to align with GCC. (Jose cc-ed)
>
> And, the most importantly, we need a way to go back to old behavior,
> since u32 var; asm("...":: "r"(var)); will now
> allocate "w" register or warn.
>
> What should be the workaround?
>
> I've tried:
> u32 var; asm("...":: "r"((u64)var));
>
> https://godbolt.org/z/n4ejvWj7v
>
> and x86/arm64 generate 32-bit truction.
> Sounds like the bpf backend has to do it as well.
> We should be doing 'wX=wX' in such case (just like x86)
> instead of <<=32 >>=32.

Indeed, this is preferred.
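
For reference, the two zero-extension sequences being compared, in BPF
pseudo-c asm (the 32-bit value is assumed to be in r0):

    r0 <<= 32    ; current lowering: shift pair to clear the upper 32 bits
    r0 >>= 32
    w0 = w0      ; proposed: a single 32-bit move, which also zero-extends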

>
> I think this should be done as a separate diff.
> Our current pattern of using shifts is inefficient and guaranteed
> to screw up verifier range analysis while wX=wX is faster
> and more verifier friendly.
> Yes it's still not going to be 1-1 to old (our current) behavior.

With [6] and 'r' replaced by 'w' in inline asm, this will be
a new behavior.

>
> We probably need some macro (like we did for __BPF_CPU_VERSION__)
> to identify such fixed llvm, so existing users with '(short)'
> workaround and other tricks can detect new vs old compiler.

We can add a feature macro like
https://github.com/llvm/llvm-project/blob/main/clang/lib/Basic/Targets/BPF.cpp#L55

>
> Looks like we opened a can of worms.
> Aligning with x86/arm64 makes sense, but the cost of doing
> the right thing is hard to estimate.
>
>> Third, why two shifts are generated?
>> ------------------------------------
>>
>> (Details here might be interesting to Yonghong, regular reader could
>>   skip this section).
>>
>> The two shifts are result of interaction between two IR constructs
>> `trunc` and `asm`. The C program above looks as follows in LLVM IR
>> before machine code generation:
>>
>>      declare dso_local i64 @bar()
>>      define dso_local void @foo(i32 %p) {
>>      entry:
>>        %call = call i64 @bar()
>>        %v32 = trunc i64 %call to i32
>>        tail call void asm sideeffect "$0 += 1", "r"(i32 %v32)
>>        ret void
>>      }
>>
>> Initial selection DAG:
>>
>>      $ llc -debug-only=isel -march=bpf -mcpu=v3 --filetype=asm -o - t.ll
>>      SelectionDAG has 21 nodes:
>>        ...
>>        t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>     !     t11: i32 = truncate t10
>>     !    t15: i64 = zero_extend t11
>>        t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t15
>>          t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                           TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>        ...
>>
>> Note values t11 and t15 marked with (!).
>>
>> Optimized lowered selection DAG for this fragment:
>>
>>      t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>    !   t22: i64 = and t10, Constant:i64<4294967295>
>>      t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>>        t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                         TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>
>> Note (zext (truncate ...)) converted to (and ... 0xffff_ffff).
>>
>> DAG after instruction selection:
>>
>>      t10: i64,ch,glue = CopyFromReg t8:1, Register:i64 $r0, t8:2
>>    !     t25: i64 = SLL_ri t10, TargetConstant:i64<32>
>>    !   t22: i64 = SRL_ri t25, TargetConstant:i64<32>
>>      t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>>        t23: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                         TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>
>> Note (and ... 0xffff_ffff) converted to (SRL_ri (SLL_ri ...)).
>> This happens because of the following pattern from BPFInstrInfo.td:
>>
>>      // 0xffffFFFF doesn't fit into simm32, optimize common case
>>      def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
>>                (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
>>
>> So, the two shift instructions are result of translation of (zext (trunc ...)).
>> However, closer examination shows that zext DAG node was generated
>> almost by accident. Here is the backtrace for when this node was created:
>>
>>      Breakpoint 1, llvm::SelectionDAG::getNode (... Opcode=202) ;; 202 is opcode for ZERO_EXTEND
>>          at .../SelectionDAG.cpp:5605
>>      (gdb) bt
>>      #0  llvm::SelectionDAG::getNode (...)
>>          at ...SelectionDAG.cpp:5605
>>      #1  0x... in getCopyToParts (..., ExtendKind=llvm::ISD::ZERO_EXTEND)
>>          at .../SelectionDAGBuilder.cpp:537
>>      #2  0x... in llvm::RegsForValue::getCopyToRegs (... PreferredExtendType=llvm::ISD::ANY_EXTEND)
>>          at .../SelectionDAGBuilder.cpp:958
>>      #3  0x... in llvm::SelectionDAGBuilder::visitInlineAsm(...)
>>          at .../SelectionDAGBuilder.cpp:9640
>>          ...
>>
>> The stack frame #2 is interesting, here is the code for it [4]:
>>
>>      void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
>>                                       const SDLoc &dl, SDValue &Chain, SDValue *Glue,
>>                                       const Value *V,
>>                                       ISD::NodeType PreferredExtendType) const {
>>                                                     ^
>>                                                     '-- this is ANY_EXTEND
>>        ...
>>        for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
>>          ...
>>                                                     .-- this returns true
>>                                                     v
>>          if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
>>            ExtendKind = ISD::ZERO_EXTEND;
>>
>>                                 .-- this is ZERO_EXTEND
>>                                 v
>>          getCopyToParts(..., ExtendKind);
>>          Part += NumParts;
>>        }
>>        ...
>>      }
>>
>> The getCopyToRegs() function was called with ANY_EXTEND preference,
>> but switched to ZERO_EXTEND because TLI.isZExtFree() currently returns
>> true for any 32 to 64-bit conversion [5].
>> However, in this case this is clearly a mistake, as zero extension of
>> (zext i64 (truncate i32 ...)) costs two instructions.
>>
>> The isZExtFree() behavior could be changed to report false for such
>> situations, as in my patch [6]. This in turn removes zext =>
>> removes two shifts from final asm.
>> Here is how DAG/asm look after patch [6]:
>>
>>      Initial selection DAG:
>>        ...
>>        t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>    !   t11: i32 = truncate t10
>>        t16: ch,glue = CopyToReg t10:1, Register:i64 %1, t10
>>          t18: ch,glue = inlineasm t16, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                           TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t16:1
>>        ...
>>
>> Final asm:
>>
>>      ...
>>      # %bb.0:
>>          call bar
>>          #APP
>>          r0 += 1
>>          #NO_APP
>>          exit
>>      ...
>>
>> Note that [6] is a very minor change, it does not affect code
>> generation for selftests at all and I was unable to conjure examples
>> where it has effect aside from inline asm parameters.
>>
>> [4] https://github.com/llvm/llvm-project/blob/365fbbfbcfefb8766f7716109b9c3767b58e6058/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp#L937C10-L937C10
>> [5] https://github.com/llvm/llvm-project/blob/6f4cc1310b12bc59210e4596a895db4cb9ad6075/llvm/lib/Target/BPF/BPFISelLowering.cpp#L213C21-L213C21
>> [6] https://github.com/llvm/llvm-project/commit/cf8e142e5eac089cc786c671a40fef022d08b0ef
>>
Jose E. Marchesi Jan. 9, 2024, 10:49 a.m. UTC | #11
> On Fri, Jan 5, 2024 at 1:47 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>
>> On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
>> [...]
>> > It turned out there are indeed a bunch of redundant shifts
>> > when u32 or s32 is passed into "r" asm constraint.
>> >
>> > Strangely the shifts are there when compiled with -mcpu=v3 or v4
>> > and no shifts with -mcpu=v1 and v2.
>> >
>> > Also weird that u8 and u16 are passed into "r" without redundant shifts.
>> > Hence I found a "workaround": cast u32 into u16 while passing.
>> > The truncation of u32 doesn't happen and shifts to zero upper 32-bit
>> > are gone as well.
>> >
>> > https://godbolt.org/z/Kqszr6q3v
>>
>> Regarding unnecessary shifts.
>> Sorry, a long email about minor feature/defect.
>>
>> So, currently the following C program
>> (and it's variations with implicit casts):
>>
>>     extern unsigned long bar(void);
>>     void foo(void) {
>>       asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar()));
>>     }
>>
>> Is translated to the following BPF:
>>
>>     $ clang -mcpu=v3 -O2 --target=bpf -mcpu=v3 -c -o - t.c | llvm-objdump --no-show-raw-insn -d -
>>
>>     <stdin>:    file format elf64-bpf
>>
>>     Disassembly of section .text:
>>
>>     0000000000000000 <foo>:
>>            0:   call -0x1
>>            1:   r0 <<= 0x20
>>            2:   r0 >>= 0x20
>>            3:   r0 += 0x1
>>            4:   exit
>>
>> Note: no additional shifts are generated when "w" (32-bit register)
>>       constraint is used instead of "r".
>>
>> First, is this right or wrong?
>> ------------------------------
>>
>> C language spec [1] paragraph 6.5.4.6 (Cast operators -> Semantics) says
>> the following:
>>
>>   If the value of the expression is represented with greater range or
>>   precision than required by the type named by the cast (6.3.1.8),
>>   then the cast specifies a conversion even if the type of the
>>   expression is the same as the named type and removes any extra range
>>   and precision.                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>   ^^^^^^^^^^^^^
>>
>> What other LLVM backends do in such situations?
>> Consider the following program translated to amd64 [2] and aarch64 [3]:
>>
>>     void foo(void) {
>>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned long)  bar())); // 1
>>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned int)   bar())); // 2
>>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned short) bar())); // 3
>>     }
>>
>> - for amd64 register of proper size is selected for `reg`;
>> - for aarch64 warnings about wrong operand size are emitted at (2) and (3)
>>   and 64-bit register is used w/o generating any additional instructions.
>>
>> (Note, however, that 'arm' silently ignores the issue and uses 32-bit
>>  registers for all three points).
>>
>> So, it looks like that something of this sort should be done:
>> - either extra precision should be removed via additional instructions;
>> - or 32-bit register should be picked for `reg`;
>> - or warning should be emitted as in aarch64 case.
>>
>> [1] https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3088.pdf
>> [2] https://godbolt.org/z/9nKxaMc5j
>> [3] https://godbolt.org/z/1zxEr5b3f
>>
>>
>> Second, what to do?
>> -------------------
>>
>> I think that the following steps are needed:
>> - Investigation described in the next section shows that currently two
>>   shifts are generated accidentally w/o real intent to shed precision.
>>   I have a patch [6] that removes shifts generation, it should be applied.
>> - When 32-bit value is passed to "r" constraint:
>>   - for cpu v3/v4 a 32-bit register should be selected;
>>   - for cpu v1/v2 a warning should be reported.
>
> Thank you for the detailed analysis.
>
> Agree that llvm fix [6] is a necessary step, then
> using 'w' in v3/v4 and warn on v1/v2 makes sense too,
> but we have this macro:
> #define barrier_var(var) asm volatile("" : "+r"(var))
> that is used in various bpf production programs.
> tetragon is also a heavy user of inline asm.
>
> Right now a full 64-bit register is allocated,
> so switching to 'w' might cause unexpected behavior
> including rejection by the verifier.
> I think it makes sense to align the bpf backend with arm64 and x86,
> but we need to broadcast this change widely.
>
> Also need to align with GCC. (Jose cc-ed)

GCC doesn't have an integrated assembler, so using -masm=pseudoc it just
compiles the program above to:

  foo:
  	call bar
  	r0 += 1
  	exit

Also, at the moment we don't support a "w" constraint, because the
assembly-like assembly syntax we started with implies different
instructions that interpret the values stored in the BPF 64-bit
registers as 32-bit or 64-bit values, i.e.

  mov %r1, 1
  mov32 %r1, 1

But then the pseudo-c assembly syntax (that we also support) translates
some of the semantics of the instructions to the register names,
creating the notion that BPF actually has both 32-bit registers and
64-bit registers, i.e.

  r1 += 1
  w1 += 1

In GCC we support both assembly syntaxes and currently we lack the
ability to emit 32-bit variants in templates like "%[reg] += 1", so I
suppose we can introduce a "w" constraint to:

1. When assembly-like assembly syntax is used, expect a 32-bit mode to
   match the operand and warn about operand size overflow whenever
   necessary.  Always emit "%r" as the register name.

2. When pseudo-c assembly syntax is used, expect a 32-bit mode to match
   the operand and warn about operand size overflow whenever necessary,
   and then emit "w" instead of "r" as the register name.
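
To illustrate, a sketch of the kind of code these checks would flag
(assuming the proposed "w" handling above; this is not current GCC
behavior):

    extern unsigned long bar(void);

    void foo(void)
    {
            /* 64-bit value passed to a 32-bit "w" operand: the proposal is
             * to warn about operand size overflow here and substitute wN
             * (pseudo-c syntax) or %rN (assembly-like syntax). */
            asm volatile ("%[reg] += 1" :: [reg] "w" (bar()));
    }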

> And, the most importantly, we need a way to go back to old behavior,
> since u32 var; asm("...":: "r"(var)); will now
> allocate "w" register or warn.

Is it really necessary to change the meaning of "r"?  You can write
templates like the one triggering this problem like:

  asm volatile ("%[reg] += 1"::[reg]"w"((unsigned)bar()));

Then the checks above will be performed, driven by the particular
constraint explicitly specified by the user, not driven by the type of
the value passed as the operand.

Or am I misunderstanding?

> What should be the workaround?
>
> I've tried:
> u32 var; asm("...":: "r"((u64)var));
>
> https://godbolt.org/z/n4ejvWj7v
>
> and x86/arm64 generate 32-bit truction.
> Sounds like the bpf backend has to do it as well.
> We should be doing 'wX=wX' in such case (just like x86)
> instead of <<=32 >>=32.
>
> I think this should be done as a separate diff.
> Our current pattern of using shifts is inefficient and guaranteed
> to screw up verifier range analysis while wX=wX is faster
> and more verifier friendly.
> Yes it's still not going to be 1-1 to old (our current) behavior.
>
> We probably need some macro (like we did for __BPF_CPU_VERSION__)
> to identify such fixed llvm, so existing users with '(short)'
> workaround and other tricks can detect new vs old compiler.
>
> Looks like we opened a can of worms.
> Aligning with x86/arm64 makes sense, but the cost of doing
> the right thing is hard to estimate.
>
>>
>> Third, why two shifts are generated?
>> ------------------------------------
>>
>> (Details here might be interesting to Yonghong, regular reader could
>>  skip this section).
>>
>> The two shifts are result of interaction between two IR constructs
>> `trunc` and `asm`. The C program above looks as follows in LLVM IR
>> before machine code generation:
>>
>>     declare dso_local i64 @bar()
>>     define dso_local void @foo(i32 %p) {
>>     entry:
>>       %call = call i64 @bar()
>>       %v32 = trunc i64 %call to i32
>>       tail call void asm sideeffect "$0 += 1", "r"(i32 %v32)
>>       ret void
>>     }
>>
>> Initial selection DAG:
>>
>>     $ llc -debug-only=isel -march=bpf -mcpu=v3 --filetype=asm -o - t.ll
>>     SelectionDAG has 21 nodes:
>>       ...
>>       t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>    !     t11: i32 = truncate t10
>>    !    t15: i64 = zero_extend t11
>>       t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t15
>>         t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                          TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>       ...
>>
>> Note values t11 and t15 marked with (!).
>>
>> Optimized lowered selection DAG for this fragment:
>>
>>     t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>   !   t22: i64 = and t10, Constant:i64<4294967295>
>>     t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>>       t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                        TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>
>> Note (zext (truncate ...)) converted to (and ... 0xffff_ffff).
>>
>> DAG after instruction selection:
>>
>>     t10: i64,ch,glue = CopyFromReg t8:1, Register:i64 $r0, t8:2
>>   !     t25: i64 = SLL_ri t10, TargetConstant:i64<32>
>>   !   t22: i64 = SRL_ri t25, TargetConstant:i64<32>
>>     t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>>       t23: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                        TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>
>> Note (and ... 0xffff_ffff) converted to (SRL_ri (SLL_ri ...)).
>> This happens because of the following pattern from BPFInstrInfo.td:
>>
>>     // 0xffffFFFF doesn't fit into simm32, optimize common case
>>     def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
>>               (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
>>
>> So, the two shift instructions are result of translation of (zext (trunc ...)).
>> However, closer examination shows that zext DAG node was generated
>> almost by accident. Here is the backtrace for when this node was created:
>>
>>     Breakpoint 1, llvm::SelectionDAG::getNode (... Opcode=202) ;; 202 is opcode for ZERO_EXTEND
>>         at .../SelectionDAG.cpp:5605
>>     (gdb) bt
>>     #0  llvm::SelectionDAG::getNode (...)
>>         at ...SelectionDAG.cpp:5605
>>     #1  0x... in getCopyToParts (..., ExtendKind=llvm::ISD::ZERO_EXTEND)
>>         at .../SelectionDAGBuilder.cpp:537
>>     #2  0x... in llvm::RegsForValue::getCopyToRegs (... PreferredExtendType=llvm::ISD::ANY_EXTEND)
>>         at .../SelectionDAGBuilder.cpp:958
>>     #3  0x... in llvm::SelectionDAGBuilder::visitInlineAsm(...)
>>         at .../SelectionDAGBuilder.cpp:9640
>>         ...
>>
>> The stack frame #2 is interesting, here is the code for it [4]:
>>
>>     void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
>>                                      const SDLoc &dl, SDValue &Chain, SDValue *Glue,
>>                                      const Value *V,
>>                                      ISD::NodeType PreferredExtendType) const {
>>                                                    ^
>>                                                    '-- this is ANY_EXTEND
>>       ...
>>       for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
>>         ...
>>                                                    .-- this returns true
>>                                                    v
>>         if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
>>           ExtendKind = ISD::ZERO_EXTEND;
>>
>>                                .-- this is ZERO_EXTEND
>>                                v
>>         getCopyToParts(..., ExtendKind);
>>         Part += NumParts;
>>       }
>>       ...
>>     }
>>
>> The getCopyToRegs() function was called with ANY_EXTEND preference,
>> but switched to ZERO_EXTEND because TLI.isZExtFree() currently returns
>> true for any 32 to 64-bit conversion [5].
>> However, in this case this is clearly a mistake, as zero extension of
>> (zext i64 (truncate i32 ...)) costs two instructions.
>>
>> The isZExtFree() behavior could be changed to report false for such
>> situations, as in my patch [6]. This in turn removes zext =>
>> removes two shifts from final asm.
>> Here is how DAG/asm look after patch [6]:
>>
>>     Initial selection DAG:
>>       ...
>>       t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>   !   t11: i32 = truncate t10
>>       t16: ch,glue = CopyToReg t10:1, Register:i64 %1, t10
>>         t18: ch,glue = inlineasm t16, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>                          TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t16:1
>>       ...
>>
>> Final asm:
>>
>>     ...
>>     # %bb.0:
>>         call bar
>>         #APP
>>         r0 += 1
>>         #NO_APP
>>         exit
>>     ...
>>
>> Note that [6] is a very minor change, it does not affect code
>> generation for selftests at all and I was unable to conjure examples
>> where it has effect aside from inline asm parameters.
>>
>> [4] https://github.com/llvm/llvm-project/blob/365fbbfbcfefb8766f7716109b9c3767b58e6058/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp#L937C10-L937C10
>> [5] https://github.com/llvm/llvm-project/blob/6f4cc1310b12bc59210e4596a895db4cb9ad6075/llvm/lib/Target/BPF/BPFISelLowering.cpp#L213C21-L213C21
>> [6] https://github.com/llvm/llvm-project/commit/cf8e142e5eac089cc786c671a40fef022d08b0ef
>>
Jose E. Marchesi Jan. 9, 2024, 12:09 p.m. UTC | #12
>> On Fri, Jan 5, 2024 at 1:47 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>>
>>> On Mon, 2023-12-25 at 12:33 -0800, Alexei Starovoitov wrote:
>>> [...]
>>> > It turned out there are indeed a bunch of redundant shifts
>>> > when u32 or s32 is passed into "r" asm constraint.
>>> >
>>> > Strangely the shifts are there when compiled with -mcpu=v3 or v4
>>> > and no shifts with -mcpu=v1 and v2.
>>> >
>>> > Also weird that u8 and u16 are passed into "r" without redundant shifts.
>>> > Hence I found a "workaround": cast u32 into u16 while passing.
>>> > The truncation of u32 doesn't happen and shifts to zero upper 32-bit
>>> > are gone as well.
>>> >
>>> > https://godbolt.org/z/Kqszr6q3v
>>>
>>> Regarding unnecessary shifts.
>>> Sorry, a long email about minor feature/defect.
>>>
>>> So, currently the following C program
>>> (and it's variations with implicit casts):
>>>
>>>     extern unsigned long bar(void);
>>>     void foo(void) {
>>>       asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar()));
>>>     }
>>>
>>> Is translated to the following BPF:
>>>
>>>     $ clang -mcpu=v3 -O2 --target=bpf -mcpu=v3 -c -o - t.c | llvm-objdump --no-show-raw-insn -d -
>>>
>>>     <stdin>:    file format elf64-bpf
>>>
>>>     Disassembly of section .text:
>>>
>>>     0000000000000000 <foo>:
>>>            0:   call -0x1
>>>            1:   r0 <<= 0x20
>>>            2:   r0 >>= 0x20
>>>            3:   r0 += 0x1
>>>            4:   exit
>>>
>>> Note: no additional shifts are generated when "w" (32-bit register)
>>>       constraint is used instead of "r".
>>>
>>> First, is this right or wrong?
>>> ------------------------------
>>>
>>> C language spec [1] paragraph 6.5.4.6 (Cast operators -> Semantics) says
>>> the following:
>>>
>>>   If the value of the expression is represented with greater range or
>>>   precision than required by the type named by the cast (6.3.1.8),
>>>   then the cast specifies a conversion even if the type of the
>>>   expression is the same as the named type and removes any extra range
>>>   and precision.                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>   ^^^^^^^^^^^^^
>>>
>>> What other LLVM backends do in such situations?
>>> Consider the following program translated to amd64 [2] and aarch64 [3]:
>>>
>>>     void foo(void) {
>>>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned long)  bar())); // 1
>>>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned int)   bar())); // 2
>>>       asm volatile("mov %[reg],%[reg]"::[reg]"r"((unsigned short) bar())); // 3
>>>     }
>>>
>>> - for amd64 register of proper size is selected for `reg`;
>>> - for aarch64 warnings about wrong operand size are emitted at (2) and (3)
>>>   and 64-bit register is used w/o generating any additional instructions.
>>>
>>> (Note, however, that 'arm' silently ignores the issue and uses 32-bit
>>>  registers for all three points).
>>>
>>> So, it looks like that something of this sort should be done:
>>> - either extra precision should be removed via additional instructions;
>>> - or 32-bit register should be picked for `reg`;
>>> - or warning should be emitted as in aarch64 case.
>>>
>>> [1] https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3088.pdf
>>> [2] https://godbolt.org/z/9nKxaMc5j
>>> [3] https://godbolt.org/z/1zxEr5b3f
>>>
>>>
>>> Second, what to do?
>>> -------------------
>>>
>>> I think that the following steps are needed:
>>> - Investigation described in the next section shows that currently two
>>>   shifts are generated accidentally w/o real intent to shed precision.
>>>   I have a patch [6] that removes shifts generation, it should be applied.
>>> - When 32-bit value is passed to "r" constraint:
>>>   - for cpu v3/v4 a 32-bit register should be selected;
>>>   - for cpu v1/v2 a warning should be reported.
>>
>> Thank you for the detailed analysis.
>>
>> Agree that llvm fix [6] is a necessary step, then
>> using 'w' in v3/v4 and warn on v1/v2 makes sense too,
>> but we have this macro:
>> #define barrier_var(var) asm volatile("" : "+r"(var))
>> that is used in various bpf production programs.
>> tetragon is also a heavy user of inline asm.
>>
>> Right now a full 64-bit register is allocated,
>> so switching to 'w' might cause unexpected behavior
>> including rejection by the verifier.
>> I think it makes sense to align the bpf backend with arm64 and x86,
>> but we need to broadcast this change widely.
>>
>> Also need to align with GCC. (Jose cc-ed)
>
> GCC doesn't have an integrated assembler, so using -masm=pseudoc it just
> compiles the program above to:
>
>   foo:
>   	call bar
>   	r0 += 1
> 	exit
>
> Also, at the moment we don't support a "w" constraint, because the
> assembly-like assembly syntax we started with implies different
> instructions that interpret the values stored in the BPF 64-bit
> registers as 32-bit or 64-bit values, i.e.
>
>   mov %r1, 1
>   mov32 %r1, 1
>
> But then the pseudo-c assembly syntax (that we also support) translates
> some of the semantics of the instructions to the register names,
> creating the notion that BPF actually has both 32-bit registers and
> 64-bit registers, i.e.
>
>   r1 += 1
>   w1 += 1
>
> In GCC we support both assembly syntaxes and currently we lack the
> ability to emit 32-bit variants in templates like "%[reg] += 1", so I
> suppose we can introduce a "w" constraint to:
>
> 1. When assembly-like assembly syntax is used, expect a 32-bit mode to
>    match the operand and warn about operand size overflow whenever
>    necessary.  Always emit "%r" as the register name.
>
> 2. When pseudo-c assembly syntax is used, expect a 32-bit mode to match
>    the operand and warn about operand size overflow whenever necessary,
>    and then emit "w" instead of "r" as the register name.
>
>> And, the most importantly, we need a way to go back to old behavior,
>> since u32 var; asm("...":: "r"(var)); will now
>> allocate "w" register or warn.
>
> Is it really necessary to change the meaning of "r"?  You can write
> templates like the one triggering this problem like:
>
>   asm volatile ("%[reg] += 1"::[reg]"w"((unsigned)bar()));
>
> Then the checks above will be performed, driven by the particular
> constraint explicitly specified by the user, not driven by the type of
> the value passed as the operand.
>
> Or am I misunderstanding?

[I have just added a proposal for an agenda item to this week's BPF
 Office Hours so we can discuss BPF sub-registers and compiler
 constraints, to complement this thread.]

>
>> What should be the workaround?
>>
>> I've tried:
>> u32 var; asm("...":: "r"((u64)var));
>>
>> https://godbolt.org/z/n4ejvWj7v
>>
>> and x86/arm64 generate 32-bit truction.
>> Sounds like the bpf backend has to do it as well.
>> We should be doing 'wX=wX' in such case (just like x86)
>> instead of <<=32 >>=32.
>>
>> I think this should be done as a separate diff.
>> Our current pattern of using shifts is inefficient and guaranteed
>> to screw up verifier range analysis while wX=wX is faster
>> and more verifier friendly.
>> Yes it's still not going to be 1-1 to old (our current) behavior.
>>
>> We probably need some macro (like we did for __BPF_CPU_VERSION__)
>> to identify such fixed llvm, so existing users with '(short)'
>> workaround and other tricks can detect new vs old compiler.
>>
>> Looks like we opened a can of worms.
>> Aligning with x86/arm64 makes sense, but the cost of doing
>> the right thing is hard to estimate.
>>
>>>
>>> Third, why two shifts are generated?
>>> ------------------------------------
>>>
>>> (Details here might be interesting to Yonghong, regular reader could
>>>  skip this section).
>>>
>>> The two shifts are result of interaction between two IR constructs
>>> `trunc` and `asm`. The C program above looks as follows in LLVM IR
>>> before machine code generation:
>>>
>>>     declare dso_local i64 @bar()
>>>     define dso_local void @foo(i32 %p) {
>>>     entry:
>>>       %call = call i64 @bar()
>>>       %v32 = trunc i64 %call to i32
>>>       tail call void asm sideeffect "$0 += 1", "r"(i32 %v32)
>>>       ret void
>>>     }
>>>
>>> Initial selection DAG:
>>>
>>>     $ llc -debug-only=isel -march=bpf -mcpu=v3 --filetype=asm -o - t.ll
>>>     SelectionDAG has 21 nodes:
>>>       ...
>>>       t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>>    !     t11: i32 = truncate t10
>>>    !    t15: i64 = zero_extend t11
>>>       t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t15
>>>         t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>>                          TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>>       ...
>>>
>>> Note values t11 and t15 marked with (!).
>>>
>>> Optimized lowered selection DAG for this fragment:
>>>
>>>     t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>>   !   t22: i64 = and t10, Constant:i64<4294967295>
>>>     t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>>>       t19: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>>                        TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>>
>>> Note (zext (truncate ...)) converted to (and ... 0xffff_ffff).
>>>
>>> DAG after instruction selection:
>>>
>>>     t10: i64,ch,glue = CopyFromReg t8:1, Register:i64 $r0, t8:2
>>>   !     t25: i64 = SLL_ri t10, TargetConstant:i64<32>
>>>   !   t22: i64 = SRL_ri t25, TargetConstant:i64<32>
>>>     t17: ch,glue = CopyToReg t10:1, Register:i64 %1, t22
>>>       t23: ch,glue = inlineasm t17, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>>                        TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t17:1
>>>
>>> Note (and ... 0xffff_ffff) converted to (SRL_ri (SLL_ri ...)).
>>> This happens because of the following pattern from BPFInstrInfo.td:
>>>
>>>     // 0xffffFFFF doesn't fit into simm32, optimize common case
>>>     def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
>>>               (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
>>>
>>> So, the two shift instructions are result of translation of (zext (trunc ...)).
>>> However, closer examination shows that zext DAG node was generated
>>> almost by accident. Here is the backtrace for when this node was created:
>>>
>>>     Breakpoint 1, llvm::SelectionDAG::getNode (... Opcode=202) ;; 202 is opcode for ZERO_EXTEND
>>>         at .../SelectionDAG.cpp:5605
>>>     (gdb) bt
>>>     #0  llvm::SelectionDAG::getNode (...)
>>>         at ...SelectionDAG.cpp:5605
>>>     #1  0x... in getCopyToParts (..., ExtendKind=llvm::ISD::ZERO_EXTEND)
>>>         at .../SelectionDAGBuilder.cpp:537
>>>     #2  0x... in llvm::RegsForValue::getCopyToRegs (... PreferredExtendType=llvm::ISD::ANY_EXTEND)
>>>         at .../SelectionDAGBuilder.cpp:958
>>>     #3  0x... in llvm::SelectionDAGBuilder::visitInlineAsm(...)
>>>         at .../SelectionDAGBuilder.cpp:9640
>>>         ...
>>>
>>> The stack frame #2 is interesting, here is the code for it [4]:
>>>
>>>     void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
>>>                                      const SDLoc &dl, SDValue &Chain, SDValue *Glue,
>>>                                      const Value *V,
>>>                                      ISD::NodeType PreferredExtendType) const {
>>>                                                    ^
>>>                                                    '-- this is ANY_EXTEND
>>>       ...
>>>       for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
>>>         ...
>>>                                                    .-- this returns true
>>>                                                    v
>>>         if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
>>>           ExtendKind = ISD::ZERO_EXTEND;
>>>
>>>                                .-- this is ZERO_EXTEND
>>>                                v
>>>         getCopyToParts(..., ExtendKind);
>>>         Part += NumParts;
>>>       }
>>>       ...
>>>     }
>>>
>>> The getCopyToRegs() function was called with ANY_EXTEND preference,
>>> but switched to ZERO_EXTEND because TLI.isZExtFree() currently returns
>>> true for any 32 to 64-bit conversion [5].
>>> However, in this case this is clearly a mistake, as zero extension of
>>> (zext i64 (truncate i32 ...)) costs two instructions.
>>>
>>> The isZExtFree() behavior could be changed to report false for such
>>> situations, as in my patch [6]. This in turn removes zext =>
>>> removes two shifts from final asm.
>>> Here is how DAG/asm look after patch [6]:
>>>
>>>     Initial selection DAG:
>>>       ...
>>>       t10: i64,ch,glue = CopyFromReg t8, Register:i64 $r0, t8:1
>>>   !   t11: i32 = truncate t10
>>>       t16: ch,glue = CopyToReg t10:1, Register:i64 %1, t10
>>>         t18: ch,glue = inlineasm t16, TargetExternalSymbol:i64'$0 += 1', MDNode:ch<null>,
>>>                          TargetConstant:i64<1>, TargetConstant:i32<131081>, Register:i64 %1, t16:1
>>>       ...
>>>
>>> Final asm:
>>>
>>>     ...
>>>     # %bb.0:
>>>         call bar
>>>         #APP
>>>         r0 += 1
>>>         #NO_APP
>>>         exit
>>>     ...
>>>
>>> Note that [6] is a very minor change, it does not affect code
>>> generation for selftests at all and I was unable to conjure examples
>>> where it has effect aside from inline asm parameters.
>>>
>>> [4] https://github.com/llvm/llvm-project/blob/365fbbfbcfefb8766f7716109b9c3767b58e6058/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp#L937C10-L937C10
>>> [5] https://github.com/llvm/llvm-project/blob/6f4cc1310b12bc59210e4596a895db4cb9ad6075/llvm/lib/Target/BPF/BPFISelLowering.cpp#L213C21-L213C21
>>> [6] https://github.com/llvm/llvm-project/commit/cf8e142e5eac089cc786c671a40fef022d08b0ef
>>>
Alexei Starovoitov Jan. 11, 2024, 2:46 a.m. UTC | #13
On Tue, Jan 9, 2024 at 3:00 AM Jose E. Marchesi
<jose.marchesi@oracle.com> wrote:
>
> >
> > Also need to align with GCC. (Jose cc-ed)
>
> GCC doesn't have an integrated assembler, so using -masm=pseudoc it just
> compiles the program above to:
>
>   foo:
>         call bar
>         r0 += 1
>         exit
>
> Also, at the moment we don't support a "w" constraint, because the
> assembly-like assembly syntax we started with implies different
> instructions that interpret the values stored in the BPF 64-bit
> registers as 32-bit or 64-bit values, i.e.
>
>   mov %r1, 1
>   mov32 %r1, 1

Heh. gcc tried to invent a traditional-looking asm for bpf and instead
invented the above :)
x86 and arm64 use a single 'mov' and encode sub-registers as rax/eax or x0/w0.

imo support of a gcc-only asm style is an obstacle to gcc-bpf adoption.
It's not too late to reconsider supporting it. You can easily
remove the support and it will reduce your maintenance/support work.
It's a bit of a distraction in this thread too.

> But then the pseudo-c assembly syntax (that we also support) translates
> some of the semantics of the instructions to the register names,
> creating the notion that BPF actually has both 32-bit registers and
> 64-bit registers, i.e.
>
>   r1 += 1
>   w1 += 1
>
> In GCC we support both assembly syntaxes and currently we lack the
> ability to emit 32-bit variants in templates like "%[reg] += 1", so I
> suppose we can introduce a "w" constraint to:
>
> 2. When pseudo-c assembly syntax is used, expect a 32-bit mode to match
>    the operand and warn about operand size overflow whenever necessary,
>    and then emit "w" instead of "r" as the register name.

clang supports the "w" constraint with -mcpu=v3,v4 and emits 'w'
as the register name.

> > And, the most importantly, we need a way to go back to old behavior,
> > since u32 var; asm("...":: "r"(var)); will now
> > allocate "w" register or warn.
>
> Is it really necessary to change the meaning of "r"?  You can write
> templates like the one triggering this problem like:
>
>   asm volatile ("%[reg] += 1"::[reg]"w"((unsigned)bar()));
>
> Then the checks above will be performed, driven by the particular
> constraint explicitly specified by the user, not driven by the type of
> the value passed as the operand.

That's a good question.
For x86 "r" constraint means 8, 16, 32, or 64 bit integer.
For arm64 "r" constraint means 32 or 64 bit integer.

and this is traditional behavior of "r" in other asms too:
AMDGPU - 32 or 64
Hexagon - 32 or 64
powerpc - 32 or 64
risc-v - 32 or 64

imo it makes sense for bpf asm to align with the rest so that:

asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar())); would generate
w0 += 1, NO warn (with -mcpu=v3,v4; and a warn with -mcpu=v1,v2)

asm volatile ("%[reg] += 1"::[reg]"r"((unsigned long)bar()));
r0 += 1, NO warn

asm volatile ("%[reg] += 1"::[reg]"w"((unsigned)bar()));
w0 += 1, NO warn

asm volatile ("%[reg] += 1"::[reg]"w"((unsigned long)bar()));
w0 += 1 and a warn (currently there is none in clang)

I think we can add "R" constraint to mean 64-bit register only:

asm volatile ("%[reg] += 1"::[reg]"R"((unsigned)bar()));
r0 += 1 and a warn

asm volatile ("%[reg] += 1"::[reg]"R"((unsigned long)bar()));
r0 += 1, NO warn
Jose E. Marchesi Jan. 11, 2024, 10:34 a.m. UTC | #14
> On Tue, Jan 9, 2024 at 3:00 AM Jose E. Marchesi
> <jose.marchesi@oracle.com> wrote:
>>
>> >
>> > Also need to align with GCC. (Jose cc-ed)
>>
>> GCC doesn't have an integrated assembler, so using -masm=pseudoc it just
>> compiles the program above to:
>>
>>   foo:
>>         call bar
>>         r0 += 1
>>         exit
>>
>> Also, at the moment we don't support a "w" constraint, because the
>> assembly-like assembly syntax we started with implies different
>> instructions that interpret the values stored in the BPF 64-bit
>> registers as 32-bit or 64-bit values, i.e.
>>
>>   mov %r1, 1
>>   mov32 %r1, 1
>
> Heh. gcc tried to invent a traditional looking asm for bpf and instead
> invented the above :)

Very funny, but we didn't invent it.  We took it from ubpf.

> x86 and arm64 use single 'mov' and encode sub-registers as rax/eax or
> x0/w0.

Yes, both targets support specifying portions of the 64-bit registers
using pseudo-register names, which is a better approach than using
explicit mnemonics for the 32-bit operations (mov32, add32, etc) because
it makes it possible to specify which instruction to use on a
per-operand basis, e.g. letting the mode of the arguments actually
passed to inline assembly influence the operation to be performed.

It is nice to have that in BPF as well.

> imo support of gcc-only asm style is an obstacle in gcc-bpf adoption.
> It's not too far to reconsider supporting this. You can easily
> remove the support and it will reduce your maintenance/support work.
> It's a bit of a distraction in this thread too.
>
>> But then the pseudo-c assembly syntax (that we also support) translates
>> some of the semantics of the instructions to the register names,
>> creating the notion that BPF actually has both 32-bit registers and
>> 64-bit registers, i.e.
>>
>>   r1 += 1
>>   w1 += 1
>>
>> In GCC we support both assembly syntaxes and currently we lack the
>> ability to emit 32-bit variants in templates like "%[reg] += 1", so I
>> suppose we can introduce a "w" constraint to:
>>
>> 2. When pseudo-c assembly syntax is used, expect a 32-bit mode to match
>>    the operand and warn about operand size overflow whenever necessary,
>>    and then emit "w" instead of "r" as the register name.
>
> clang supports "w" constraint with -mcpu=v3,v4 and emits 'w'
> as register name.
>
>> > And, the most importantly, we need a way to go back to old behavior,
>> > since u32 var; asm("...":: "r"(var)); will now
>> > allocate "w" register or warn.
>>
>> Is it really necessary to change the meaning of "r"?  You can write
>> templates like the one triggering this problem like:
>>
>>   asm volatile ("%[reg] += 1"::[reg]"w"((unsigned)bar()));
>>
>> Then the checks above will be performed, driven by the particular
>> constraint explicitly specified by the user, not driven by the type of
>> the value passed as the operand.
>
> That's a good question.
> For x86 "r" constraint means 8, 16, 32, or 64 bit integer.
> For arm64 "r" constraint means 32 or 64 bit integer.
>
> and this is traditional behavior of "r" in other asms too:
> AMDGPU - 32 or 64
> Hexagon - 32 or 64
> powerpc - 32 or 64
> risc-v - 32 or 64
> imo it makes sense for bpf asm to align with the rest so that:

Yes you are right and I agree.  It makes sense to follow the established
practice where "r" can lead to any pseudo-register name depending on the
mode of the operand, like in x86_64:

   char     -> %al
   short    -> %ax
   int      -> %eax
   long int -> %rax

And then add diagnostics conditioned on the availability of 32-bit
instructions (alu32).

>
> asm volatile ("%[reg] += 1"::[reg]"r"((unsigned)bar())); would generate
> w0 += 1, NO warn (with -mcpu=v3,v4; and a warn with -mcpu=v1,v2)
>
> asm volatile ("%[reg] += 1"::[reg]"r"((unsigned long)bar()));
> r0 += 1, NO warn
>
> asm volatile ("%[reg] += 1"::[reg]"w"((unsigned)bar()));
> w0 += 1, NO warn
>
> asm volatile ("%[reg] += 1"::[reg]"w"((unsigned long)bar()));
> w0 += 1 and a warn (currently there is none in clang)

Makes sense to me.

> I think we can add "R" constraint to mean 64-bit register only:
>
> asm volatile ("%[reg] += 1"::[reg]"R"((unsigned)bar()));
> r0 += 1 and a warn
>
> asm volatile ("%[reg] += 1"::[reg]"R"((unsigned long)bar()));
> r0 += 1, NO warn

The x86 target has similar constraints "q" (for %Rl registers) and "Q"
(for %Rh registers) but, as far as I can see, none specific to the
32-bit and 64-bit pseudo-registers.
Eduard Zingerman Jan. 11, 2024, 4:46 p.m. UTC | #15
On Mon, 2024-01-08 at 13:33 -0800, Alexei Starovoitov wrote:
[...]

> Agree that llvm fix [6] is a necessary step, then
> using 'w' in v3/v4 and warn on v1/v2 makes sense too,
> but we have this macro:
> #define barrier_var(var) asm volatile("" : "+r"(var))
> that is used in various bpf production programs.
> tetragon is also a heavy user of inline asm.

I have an llvm version [1] that implements the 32-bit/64-bit register
selection, and it indeed breaks some code when tested on BPF selftests.
The main pain point is that the code base is compiled both with and
without ALU32, so the 'r' constraint cannot be used with 'int'
(and constants are 'int' by default), hence fixes like:

        size_t off = (size_t)buf->head;
-       asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
+       asm("%0 -= %1" : "+r"(off) : "r"((__u64)buf->skb->data));
        return off;

or

 #ifndef bpf_nop_mov
 #define bpf_nop_mov(var) \
-       asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var))
+       asm volatile("%[reg]=%[reg]"::[reg]"r"((unsigned long)var))
 #endif

or

        int save_syn = 1;
        int rv = -1;
        int v = 0;
-       int op;
+       long op;
        ...
        asm volatile (
            "%[op] = *(u32 *)(%[skops] +96)"
            : [op] "+r"(op)
            : [skops] "r"(skops)
            :);

[1] https://github.com/eddyz87/llvm-project/tree/bpf-inline-asm-polymorphic-r

Need a bit more time to share the branch with updates for selftests.

> And, the most importantly, we need a way to go back to old behavior,
> since u32 var; asm("...":: "r"(var)); will now
> allocate "w" register or warn.
>
> What should be the workaround?

The u64 cast is the workaround.

> I've tried:
> u32 var; asm("...":: "r"((u64)var));
>
> https://godbolt.org/z/n4ejvWj7v
>
> and x86/arm64 generate 32-bit truction.
> Sounds like the bpf backend has to do it as well.

Here is godbolt output:

    callq   bar()@PLT
    movl    %eax, %eax ; <-- (u64)var is translated as zero extension
    movq    %rax, %rax ; <-- inline asm block uses 64-bit register

Here is LLVM IR for this example before code generation:

    %call = tail call i64 @bar() #2
    %conv1 = and i64 %call, 4294967295 ; <------- `(u64)var` translation
    tail call void asm sideeffect "mov $0, $0",
              "r,~{dirflag},~{fpsr},~{flags}"(i64 %conv1) #2
    ret void

> We should be doing 'wX=wX' in such case (just like x86)
> instead of <<=32 >>=32.

Opened pull request for this:
https://github.com/llvm/llvm-project/pull/77501

> We probably need some macro (like we did for __BPF_CPU_VERSION__)
> to identify such fixed llvm, so existing users with '(short)'
> workaround and other tricks can detect new vs old compiler.
> 
> Looks like we opened a can of worms.
> Aligning with x86/arm64 makes sense, but the cost of doing
> the right thing is hard to estimate.

Indeed.
Jose E. Marchesi Jan. 11, 2024, 6:33 p.m. UTC | #16
> [I have just added a proposal for an agenda item to this week's BPF
>  Office Hours so we can discuss about BPF sub-registers and compiler
>  constraints, to complement this thread.]

Notes from the office hours:

Availability of 32-bit arithmetic instructions:

  (cpu >= v3 AND not disabled with -mno-alu32) OR -malu32

Compiler constraints:

  "r"

     64-bit register (rN) or 32-bit sub-register (wN), based on the mode of
     the operand.

     If 32-bit arithmetic available
        char, short -> wN and warning
        int -> wN
        long int -> rN
     Else
        char, short, int -> rN and warning
        long int -> rN

  "w"

     32-bit sub-register (wN) regardless of the mode of the operand.

     If 32-bit arithmetic available
       char, short -> wN and warning
       int -> wN
       long int -> wN and warning
     Else
       char, short, int, long int -> wN and warning

  "R"

     64-bit register (rN) regardless of the mode of the operand.

     char, short, int -> rN and warn
     long int -> rN

Additional constraints for instruction immediates:

  "I" imm32
  "i" imm64  (already exists as "i" is standard.)
  "O" off16

  warning if not in range.
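
A small sketch pulling the register-constraint rows above together
(assuming 32-bit arithmetic is available, i.e. cpu >= v3 without
-mno-alu32; "R" is only proposed, so it is left commented out):

    void constraints_demo(void)
    {
            unsigned int  v32 = 1;
            unsigned long v64 = 1;

            asm volatile ("%0 += 1" :: "r" (v32)); /* int  -> wN          */
            asm volatile ("%0 += 1" :: "r" (v64)); /* long -> rN          */
            asm volatile ("%0 += 1" :: "w" (v32)); /* always wN           */
            asm volatile ("%0 += 1" :: "w" (v64)); /* wN, plus a warning  */
            /* asm volatile ("%0 += 1" :: "R" (v64));  proposed: always rN */
    }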
Eduard Zingerman Jan. 15, 2024, 4:33 p.m. UTC | #17
On Thu, 2024-01-11 at 19:33 +0100, Jose E. Marchesi wrote:
> 
> > [I have just added a proposal for an agenda item to this week's BPF
> >  Office Hours so we can discuss about BPF sub-registers and compiler
> >  constraints, to complement this thread.]
> 
> Notes from the office hours:
> 
> Availability of 32-bit arithmetic instructions:
> 
>   (cpu >= v3 AND not disabled with -mno-alu32) OR -malu32
> 
> Compiler constraints:
> 
>   "r"
> 
>      64-bit register (rN) or 32-bit sub-register (wN), based on the mode of
>      the operand.
> 
>      If 32-bit arithmetic available
>         char, short -> wN and warning
>         int -> wN
>         long int -> rN
>      Else
>         char, short, int -> rN and warning
>         long int -> rN
> 
>   "w"
> 
>      32-bit sub-register (wN) regardless of the mode of the operand.
> 
>      If 32-bit arithmetic available
>        char, short -> wN and warning
>        int -> wN
>        long int -> wN and warning
>      Else
>        char, short, int, long int -> wN and warning

Hi All,

I have a preliminary implementation of the agreed logic for the "r" and
"w" inline asm constraints in an LLVM branch [0]:

- Constraint "r":
| Operand type | Has ALU32          | No ALU32           |
|--------------+--------------------+--------------------|
| char         | warning            | warning            |
| short        | warning            | warning            |
| int          | use 32-bit "w" reg | warning            |
| long         | use 64-bit "r" reg | use 64-bit "r" reg |

- Constraint "w":
| Operand type | Has ALU32          | No ALU32 |
|--------------+--------------------+----------|
| char         | warning            | warning  |
| short        | warning            | warning  |
| int          | use 32-bit "w" reg | warning  |
| long         | warning            | warning  |

The following constraints are not implemented for now:
"R", "I", "i", "O".

I also have patches adapting BPF selftests [1], Cilium [2] and
Tetragon [3] to compile with both current LLVM main and [0].
After porting, selftests [1] pass, and there are no verification
pass/fail differences when loading [2,3] object files using veristat.

The main take-away from the porting exercise is that all three
projects still use CPUv2, either fully or in parts of the code base.

The issue with CPUv2 is that an 'int' value cannot be passed to an
input "r" constraint w/o an explicit cast, and cannot be passed to an
output constraint at all. Note that by default integer literals in C
code have type 'int', hence passing a constant to an "ri" constraint
might require a cast as well.
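
For the constant case, a hypothetical one-liner (illustration only, not
taken from any of the ported projects):

    static void add_one(void)
    {
            long x = 0;

            /* '1' has type int, so the "r" alternative of "ri" might warn
             * under CPUv2; spelling the constant as 1L sidesteps that. */
            asm volatile ("%0 += %1" : "+r" (x) : "ri" (1L));
    }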

E.g. changes for input constraints, necessary for CPUv2:

-   asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
+   asm("%0 -= %1" : "+r"(off) : "r"((__u64)buf->skb->data));

E.g. changes for output (input/output) constraints, necessary for CPUv2:

-   int am;
+   long am;
    ...
    asm volatile("%[am] &= 0xffff;\n" ::[am] "+r"(am)

Also, selftests use a constraint named "g" in one place:

  #define __sink(expr) asm volatile("" : "+g"(expr))

This constraint is documented in [4] as follows:

  "g" Any register, memory or immediate integer operand is allowed,
      except for registers that are not general registers.

In [0] I apply the same checks to "g" as to "r".
This is not fully correct, as it warns about e.g. operand 10 (but not 10L)
when built with CPUv2. It looks like some internal Clang interfaces
(shared between targets) would need adjustment to allow different
semantic actions for a value vs. a constant passed to the "g" constraint.

The only instance where I had to analyze verifier behavior while
porting the selftests concerns usage of the barrier_var() macro.
Test case verif_scale_pyperf600_bpf_loop contains a code sequence
similar to the following:

    #define barrier_var(var) asm volatile("" : "+r"(var))
    ...
    void foo(unsigned int i, unsigned char *ptr) {
      ...
      barrier_var(i);
      if (i >= 100)
        return;
      ... ptr[i] ...;
    }

Previously such code was translated as follows:

    w1 = w1                ; zero extension for inline asm operand
    if w1 > 0x63 goto ...
    ...
    r2 += r1

But now the zero extension is gone.
In the test case the range of 'i' was unknown to the verifier prior to
the comparison, and a 32-bit comparison only produces a range for the
lower half of the register. Thus the verifier reported an error:

  math between map_value pointer and register with
  unbounded min value is not allowed

Unfortunately, I'm not aware of a simple way in C to force this
comparison to be done in 64 bits w/o changing 'i' to a 64-bit type:
InstCombinePass demotes it to a 32-bit comparison.
Hence, I changed the type of 'i' from __u32 to __u64.
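
For completeness, a sketch of the fragment after that change ('i' is now
64-bit; names simplified from the selftest):

    #define barrier_var(var) asm volatile("" : "+r"(var))

    void foo(unsigned long i, unsigned char *ptr)   /* was: unsigned int i */
    {
            barrier_var(i);
            if (i >= 100)
                    return;
            /* the bound check is now on the full 64-bit register, so the
             * verifier has an upper bound for the access below */
            (void)ptr[i];
    }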

There are 18 selftest object files with slight differences in code
generation when the new constraint handling logic of [0] is applied.
I'll report on these differences a bit later.

Please let me know what you think about the impact of the compiler
changes on the code base.

Thanks,
Eduard

[0] Updated LLVM
    https://github.com/eddyz87/llvm-project/tree/bpf-inline-asm-polymorphic-r
[1] selftests
    https://gist.github.com/eddyz87/276f1ecc51930017dcddbb56e37f57ad
[2] Cilium
    https://gist.github.com/eddyz87/4a485573556012ec730c2de0256a79db
    Note: this is based upon branch 'libbpf-friendliness'
          from https://github.com/anakryiko/cilium
[3] Tetragon
    https://gist.github.com/eddyz87/ca9a4b68007c72469307f2cce3f83bb1
[4] https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#index-g-in-constraint
Alexei Starovoitov Jan. 16, 2024, 5:47 p.m. UTC | #18
On Mon, Jan 15, 2024 at 8:33 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
>
> [0] Updated LLVM
>     https://github.com/eddyz87/llvm-project/tree/bpf-inline-asm-polymorphic-r

1.
// Use sequence 'wX = wX' if 32-bits ops are available.
let Predicates = [BPFHasALU32] in {

This is unnecessarily conservative.
The wX = wX instruction has existed from day one.
The very first commit of the interpreter and the verifier recognized it.
No need to gate it on BPFHasALU32.

2.
case 'w':
if (Size == 32 && HasAlu32)

This is probably unnecessary as well.
When a bpf programmer specifies the 'w' constraint, llvm should probably
honor it regardless of the alu32 flag.

aarch64 has this comment:
    case 'x':
    case 'w':
      // For now assume that the person knows what they're
      // doing with the modifier.
      return true;

I'm reading it as: the constraint is the final decision.
Inline assembly shouldn't change with -mcpu flags.
Alexei Starovoitov Jan. 16, 2024, 6:40 p.m. UTC | #19
On Mon, Jan 15, 2024 at 8:33 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
> [1] selftests
>     https://gist.github.com/eddyz87/276f1ecc51930017dcddbb56e37f57ad
> [2] Cilium
>     https://gist.github.com/eddyz87/4a485573556012ec730c2de0256a79db
>     Note: this is based upon branch 'libbpf-friendliness'
>           from https://github.com/anakryiko/cilium
> [3] Tetragon
>     https://gist.github.com/eddyz87/ca9a4b68007c72469307f2cce3f83bb1


The changes to all three make sense, but they might cause regressions
if they are not synchronized with the new llvm.
cilium/tetragon can control the llvm version to some degree, but the
selftests cannot.
Should we add a clang macro like __BPF_CPU_VERSION__ and ifdef the
asm style depending on it?
I suspect this "(short)" workaround will still be needed for quite
some time while people upgrade to the latest llvm.
Something like __BPF_STRICT_ASM_CONSTRAINT__ ?
Maybe also a flag that can revert to the old behavior without warnings?
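
A sketch of how such a macro could gate the asm style (the feature-macro
name is the one floated above and does not exist yet; bpf_nop_mov is the
macro quoted earlier in the thread):

    #ifdef __BPF_STRICT_ASM_CONSTRAINT__   /* hypothetical feature macro */
    /* new compiler: pass the full 64-bit value, so 'r' stays a 64-bit reg */
    #define bpf_nop_mov(var) \
            asm volatile("%[reg] = %[reg]" :: [reg] "r" ((unsigned long)(var)))
    #else
    /* old compiler: keep the '(short)' trick that avoids the shift pair */
    #define bpf_nop_mov(var) \
            asm volatile("%[reg] = %[reg]" :: [reg] "r" ((short)(var)))
    #endif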
Yonghong Song Jan. 16, 2024, 7:07 p.m. UTC | #20
On 1/16/24 9:47 AM, Alexei Starovoitov wrote:
> On Mon, Jan 15, 2024 at 8:33 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>
>> [0] Updated LLVM
>>      https://github.com/eddyz87/llvm-project/tree/bpf-inline-asm-polymorphic-r
> 1.
> // Use sequence 'wX = wX' if 32-bits ops are available.
> let Predicates = [BPFHasALU32] in {
>
> This is unnecessary conservative.
> wX = wX instructions existed from day one.
> The very first commit of the interpreter and the verifier recognized it.
> No need to gate it by BPFHasALU32.

Actually this is not true from the llvm perspective.
wX = wX has been available in the bpf ISA from day one, but the
wX register was only introduced in llvm in 2017, and alu32 was
added at the same time to facilitate its usage.

>
> 2.
> case 'w':
> if (Size == 32 && HasAlu32)
>
> This is probably unnecessary as well.
> When bpf programmer specifies 'w' constraint, llvm should probably use it
> regardless of alu32 flag.
>
> aarch64 has this comment:
>      case 'x':
>      case 'w':
>        // For now assume that the person knows what they're
>        // doing with the modifier.
>        return true;
>
> I'm reading it as the constraint is a final decision.
> Inline assembly shouldn't change with -mcpu flags.
Alexei Starovoitov Jan. 16, 2024, 7:34 p.m. UTC | #21
On Tue, Jan 16, 2024 at 11:07 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 1/16/24 9:47 AM, Alexei Starovoitov wrote:
> > On Mon, Jan 15, 2024 at 8:33 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >>
> >> [0] Updated LLVM
> >>      https://github.com/eddyz87/llvm-project/tree/bpf-inline-asm-polymorphic-r
> > 1.
> > // Use sequence 'wX = wX' if 32-bits ops are available.
> > let Predicates = [BPFHasALU32] in {
> >
> > This is unnecessary conservative.
> > wX = wX instructions existed from day one.
> > The very first commit of the interpreter and the verifier recognized it.
> > No need to gate it by BPFHasALU32.
>
> Actually this is not true from llvm perspective.
> wX = wX is available in bpf ISA from day one, but
> wX register is only introduced in llvm in 2017
> and at the same time alu32 is added to facilitate
> its usage.

Not quite. At that time we added general support in the verifier
for the majority of alu32 insns. The insns worked in the interpreter
earlier, but the verifier didn't handle them,
while wX = wX was supported by the verifier from the start.
So this particular single insn shouldn't be gated by the alu32 flag.
It didn't need to be back in 2017 and doesn't need to be now.
Yonghong Song Jan. 16, 2024, 11:14 p.m. UTC | #22
On 1/16/24 11:34 AM, Alexei Starovoitov wrote:

> On Tue, Jan 16, 2024 at 11:07 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> On 1/16/24 9:47 AM, Alexei Starovoitov wrote:
>>> On Mon, Jan 15, 2024 at 8:33 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>>> [0] Updated LLVM
>>>>       https://github.com/eddyz87/llvm-project/tree/bpf-inline-asm-polymorphic-r
>>> 1.
>>> // Use sequence 'wX = wX' if 32-bits ops are available.
>>> let Predicates = [BPFHasALU32] in {
>>>
>>> This is unnecessary conservative.
>>> wX = wX instructions existed from day one.
>>> The very first commit of the interpreter and the verifier recognized it.
>>> No need to gate it by BPFHasALU32.
>> Actually this is not true from llvm perspective.
>> wX = wX is available in bpf ISA from day one, but
>> wX register is only introduced in llvm in 2017
>> and at the same time alu32 is added to facilitate
>> its usage.
> Not quite. At that time we added general support in the verifier
> for the majority of alu32 insns. The insns worked in the interpreter
> earlier, but the verifier didn't handle them.
> While wX=wX was supported by the verifier from the start.
> So this particular single insn shouldn't be part of alu32 flag
> It didn't need to be back in 2017 and doesn't need to be now.

Okay, IIUC, currently the 32-bit subreg is enabled
only if alu32 is enabled:
   if (STI.getHasAlu32())
     addRegisterClass(MVT::i32, &BPF::GPR32RegClass);

We should unconditionally enable the 32-bit subreg with:
   addRegisterClass(MVT::i32, &BPF::GPR32RegClass);

We may need to add Alu32 control in some other
places which try to access the 32-bit subreg.
But for the wX = wX thing, we do not need Alu32 control,
and the following is okay:
  def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
           (INSERT_SUBREG
             (i64 (IMPLICIT_DEF)),
             (MOV_rr_32 (i32 (EXTRACT_SUBREG GPR:$src, sub_32))),
             sub_32)>;

I tried the above change, unconditionally
doing addRegisterClass(MVT::i32, &BPF::GPR32RegClass).

$ cat t1.c
unsigned long test1(unsigned long x) {
         return (unsigned)x;
}
unsigned long test2(unsigned x) {
         return x;
}
#if 0
unsigned test3(unsigned x, unsigned y) {
         return x + y;
}
#endif
$ clang --target=bpf -mcpu=v1 -O2 -c t1.c && llvm-objdump -d t1.o

t1.o:   file format elf64-bpf

Disassembly of section .text:
         
0000000000000000 <test1>:
        0:       bc 10 00 00 00 00 00 00 w0 = w1
        1:       95 00 00 00 00 00 00 00 exit
         
0000000000000010 <test2>:
        2:       bc 10 00 00 00 00 00 00 w0 = w1
        3:       95 00 00 00 00 00 00 00 exit
$

More changes in BPFInstrInfo.td and possibly other
places are needed to make the above test3() function work
at -mcpu=v1.
Eduard Zingerman Jan. 16, 2024, 11:15 p.m. UTC | #23
On Tue, 2024-01-16 at 10:40 -0800, Alexei Starovoitov wrote:
[...]
> The changes to all three make sense, but they might cause regressions
> if they are not synchronized with new llvm.
> cilium/tetragon can control the llvm version to some degree, but not selftests.
> Should we add clang macro like __BPF_CPU_VERSION__ and ifdef
> different asm style depending on that?
> I suspect this "(short)" workaround will still be needed for quite
> some time while people upgrade to the latest llvm.
> something like __BPF_STRICT_ASM_CONSTRAINT__ ?
> Maybe a flag too that can revert to old behavior without warnings?

After my changes the selftests pass with both the old and new
constraint semantics, so such macro definitions / compiler flags
are not necessary for selftests.
(Although I have not yet checked the codegen difference,
 so the absence of the "(short)" workaround might be visible there.)

As for Cilium / Tetragon: I checked verification of the object files
with both LLVM versions, but adding a compiler flag might make sense.
Maybe compiler users should comment?
Eduard Zingerman Jan. 16, 2024, 11:55 p.m. UTC | #24
On Tue, 2024-01-16 at 09:47 -0800, Alexei Starovoitov wrote:

[...]

> 2.
> case 'w':
> if (Size == 32 && HasAlu32)
> 
> This is probably unnecessary as well.
> When bpf programmer specifies 'w' constraint, llvm should probably use it
> regardless of alu32 flag.
> 
> aarch64 has this comment:
>     case 'x':
>     case 'w':
>       // For now assume that the person knows what they're
>       // doing with the modifier.
>       return true;
> 
> I'm reading it as the constraint is a final decision.
> Inline assembly shouldn't change with -mcpu flags.

llvm-mc -arch=bpf -mcpu=v2 compiles the asm below w/o warnings:

	.text
	.file	"test-asm-shifts.c"
	.globl	foo
	.p2align	3
	.type	foo,@function
foo:
	call bar
	w0 += 1
	exit
.Lfunc_end0:
	.size	foo, .Lfunc_end0-foo

However, clang complains about the "w" constraint when it is used
in inline assembly at CPUv2:

  error: couldn't allocate input reg for constraint 'w'

As Yonghong notes in the sibling thread, this would probably require
enabling 32-bit sub-registers for CPUv2.
Will need some time to investigate.
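For context, a minimal reproducer of that error (hypothetical, not part of
the series): with the updated LLVM from [0], building it with
"clang --target=bpf -mcpu=v2 -O2 -c repro.c" produces the constraint error
above, while -mcpu=v3 (alu32 enabled) accepts it:

/* repro.c: asks for a 32-bit sub-register operand via the 'w' constraint */
int foo(unsigned int x)
{
	asm volatile goto("if %[x] == 0 goto %l[l_true]"
			  :: [x] "w"(x) :: l_true);
	return 0;
l_true:
	return 1;
}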
Alexei Starovoitov Jan. 17, 2024, 10:43 p.m. UTC | #25
On Tue, Jan 16, 2024 at 3:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> On 1/16/24 11:34 AM, Alexei Starovoitov wrote:
>
> > On Tue, Jan 16, 2024 at 11:07 AM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>
> >> On 1/16/24 9:47 AM, Alexei Starovoitov wrote:
> >>> On Mon, Jan 15, 2024 at 8:33 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >>>> [0] Updated LLVM
> >>>>       https://github.com/eddyz87/llvm-project/tree/bpf-inline-asm-polymorphic-r
> >>> 1.
> >>> // Use sequence 'wX = wX' if 32-bits ops are available.
> >>> let Predicates = [BPFHasALU32] in {
> >>>
> >>> This is unnecessary conservative.
> >>> wX = wX instructions existed from day one.
> >>> The very first commit of the interpreter and the verifier recognized it.
> >>> No need to gate it by BPFHasALU32.
> >> Actually this is not true from llvm perspective.
> >> wX = wX is available in bpf ISA from day one, but
> >> wX register is only introduced in llvm in 2017
> >> and at the same time alu32 is added to facilitate
> >> its usage.
> > Not quite. At that time we added general support in the verifier
> > for the majority of alu32 insns. The insns worked in the interpreter
> > earlier, but the verifier didn't handle them.
> > While wX=wX was supported by the verifier from the start.
> > So this particular single insn shouldn't be part of alu32 flag
> > It didn't need to be back in 2017 and doesn't need to be now.
>
> Okay, IIUC, currently 32-bit subreg is enabled
> only if alu32 is enabled.
>    if (STI.getHasAlu32())
>      addRegisterClass(MVT::i32, &BPF::GPR32RegClass);
>
> We should unconditionally enable 32-bit subreg with.
>    addRegisterClass(MVT::i32, &BPF::GPR32RegClass);

Makes sense to me.
It should be fine with -mcpu=v1,v2.

> We may need to add Alu32 control in some other
> places which trying to access 32-bit subreg.
> But for wX = wX thing, we do not need Alu32 control
> and the following is okay:
>   def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
>            (INSERT_SUBREG
>              (i64 (IMPLICIT_DEF)),
>              (MOV_rr_32 (i32 (EXTRACT_SUBREG GPR:$src, sub_32))),
>              sub_32)>;
>
> I tried with the above change with unconditionally
> doing addRegisterClass(MVT::i32, &BPF::GPR32RegClass).
>
> $ cat t1.c
> unsigned long test1(unsigned long x) {
>          return (unsigned)x;
> }
> unsigned long test2(unsigned x) {
>          return x;
> }
> #if 0
> unsigned test3(unsigned x, unsigned y) {
>          return x + y;
> }
> #endif
> $ clang --target=bpf -mcpu=v1 -O2 -c t1.c && llvm-objdump -d t1.o
>
> t1.o:   file format elf64-bpf
>
> Disassembly of section .text:
>
> 0000000000000000 <test1>:
>         0:       bc 10 00 00 00 00 00 00 w0 = w1
>         1:       95 00 00 00 00 00 00 00 exit
>
> 0000000000000010 <test2>:
>         2:       bc 10 00 00 00 00 00 00 w0 = w1
>         3:       95 00 00 00 00 00 00 00 exit
> $
>
> More changes in BPFInstrInfo.td and possible other
> places need to make the above test3() function work
> at -mcpu=v1.

All makes sense to me.
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 1386baf9ae4a..b78a9449793d 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -254,6 +254,50 @@  extern void bpf_throw(u64 cookie) __ksym;
 		}									\
 	 })
 
+#define __cmp_cannot_be_signed(x) \
+	__builtin_strcmp(#x, "==") == 0 || __builtin_strcmp(#x, "!=") == 0 || \
+	__builtin_strcmp(#x, "&") == 0
+
+#define __is_signed_type(type) (((type)(-1)) < (type)1)
+
+#define __bpf_cmp(LHS, OP, SIGN, PRED, RHS) \
+	({ \
+		__label__ l_true; \
+		bool ret = true; \
+		asm volatile goto("if %[lhs] " SIGN #OP " %[rhs] goto %l[l_true]" \
+				  :: [lhs] "r"(LHS), [rhs] PRED (RHS) :: l_true); \
+		ret = false; \
+l_true:\
+		ret;\
+       })
+
+/* C type conversions coupled with a comparison operator are tricky.
+ * Make sure the BPF program is compiled with -Wsign-compare, then
+ * __lhs OP __rhs below will catch the mistake.
+ * Be aware that we check only __lhs to figure out the sign of the compare.
+ */
+#ifndef bpf_cmp
+#define bpf_cmp(LHS, OP, RHS) \
+	({ \
+		typeof(LHS) __lhs = (LHS); \
+		typeof(RHS) __rhs = (RHS); \
+		bool ret; \
+		(void)(__lhs OP __rhs); \
+		if (__cmp_cannot_be_signed(OP) || !__is_signed_type(typeof(__lhs))) {\
+			if (sizeof(__rhs) == 8) \
+				ret = __bpf_cmp(__lhs, OP, "", "r", __rhs); \
+			else \
+				ret = __bpf_cmp(__lhs, OP, "", "i", __rhs); \
+		} else { \
+			if (sizeof(__rhs) == 8) \
+				ret = __bpf_cmp(__lhs, OP, "s", "r", __rhs); \
+			else \
+				ret = __bpf_cmp(__lhs, OP, "s", "i", __rhs); \
+		} \
+		ret; \
+       })
+#endif
+
 /* Description
  *	Assert that a conditional expression is true.
  * Returns
diff --git a/tools/testing/selftests/bpf/progs/exceptions.c b/tools/testing/selftests/bpf/progs/exceptions.c
index 2811ee842b01..7acb859f57c7 100644
--- a/tools/testing/selftests/bpf/progs/exceptions.c
+++ b/tools/testing/selftests/bpf/progs/exceptions.c
@@ -210,7 +210,7 @@  __noinline int assert_zero_gfunc(u64 c)
 {
 	volatile u64 cookie = c;
 
-	bpf_assert_eq(cookie, 0);
+	bpf_assert(bpf_cmp(cookie, ==, 0));
 	return 0;
 }
 
@@ -218,7 +218,7 @@  __noinline int assert_neg_gfunc(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_lt(cookie, 0);
+	bpf_assert(bpf_cmp(cookie, <, 0));
 	return 0;
 }
 
@@ -226,7 +226,7 @@  __noinline int assert_pos_gfunc(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_gt(cookie, 0);
+	bpf_assert(bpf_cmp(cookie, >, 0));
 	return 0;
 }
 
@@ -234,7 +234,7 @@  __noinline int assert_negeq_gfunc(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_le(cookie, -1);
+	bpf_assert(bpf_cmp(cookie, <=, -1));
 	return 0;
 }
 
@@ -242,7 +242,7 @@  __noinline int assert_poseq_gfunc(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_ge(cookie, 1);
+	bpf_assert(bpf_cmp(cookie, >=, 1));
 	return 0;
 }
 
@@ -258,7 +258,7 @@  __noinline int assert_zero_gfunc_with(u64 c)
 {
 	volatile u64 cookie = c;
 
-	bpf_assert_eq_with(cookie, 0, cookie + 100);
+	bpf_assert_with(bpf_cmp(cookie, ==, 0), cookie + 100);
 	return 0;
 }
 
@@ -266,7 +266,7 @@  __noinline int assert_neg_gfunc_with(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_lt_with(cookie, 0, cookie + 100);
+	bpf_assert_with(bpf_cmp(cookie, <, 0), cookie + 100);
 	return 0;
 }
 
@@ -274,7 +274,7 @@  __noinline int assert_pos_gfunc_with(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_gt_with(cookie, 0, cookie + 100);
+	bpf_assert_with(bpf_cmp(cookie, >, 0), cookie + 100);
 	return 0;
 }
 
@@ -282,7 +282,7 @@  __noinline int assert_negeq_gfunc_with(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_le_with(cookie, -1, cookie + 100);
+	bpf_assert_with(bpf_cmp(cookie, <=, -1), cookie + 100);
 	return 0;
 }
 
@@ -290,7 +290,7 @@  __noinline int assert_poseq_gfunc_with(s64 c)
 {
 	volatile s64 cookie = c;
 
-	bpf_assert_ge_with(cookie, 1, cookie + 100);
+	bpf_assert_with(bpf_cmp(cookie, >=, 1), cookie + 100);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma.c b/tools/testing/selftests/bpf/progs/iters_task_vma.c
index e085a51d153e..7af9d6bca03b 100644
--- a/tools/testing/selftests/bpf/progs/iters_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/iters_task_vma.c
@@ -28,9 +28,8 @@  int iter_task_vma_for_each(const void *ctx)
 		return 0;
 
 	bpf_for_each(task_vma, vma, task, 0) {
-		if (seen >= 1000)
+		if (bpf_cmp(seen, >=, 1000))
 			break;
-		barrier_var(seen);
 
 		vm_ranges[seen].vm_start = vma->vm_start;
 		vm_ranges[seen].vm_end = vma->vm_end;