diff mbox series

[bpf] riscv, bpf: make some atomic operations fully ordered

Message ID 20240505201633.123115-1-puranjay@kernel.org (mailing list archive)
State Accepted
Commit 20a759df3bba35bf5c3ddec0c02ad69b603b584c
Delegated to: BPF
Headers show
Series [bpf] riscv, bpf: make some atomic operations fully ordered | expand

Checks

Context Check Description
bpf/vmtest-bpf-PR success PR summary
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 20 of 20 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 43 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Puranjay Mohan May 5, 2024, 8:16 p.m. UTC
The BPF atomic operations with the BPF_FETCH modifier along with
BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT implements
all atomic operations except BPF_CMPXCHG with relaxed ordering.

Section 8.1 of the "The RISC-V Instruction Set Manual Volume I:
Unprivileged ISA" [1], titled, "Specifying Ordering of Atomic
Instructions" says:

| To provide more efficient support for release consistency [5], each
| atomic instruction has two bits, aq and rl, used to specify additional
| memory ordering constraints as viewed by other RISC-V harts.

and

| If only the aq bit is set, the atomic memory operation is treated as
| an acquire access.
| If only the rl bit is set, the atomic memory operation is treated as a
| release access.
|
| If both the aq and rl bits are set, the atomic memory operation is
| sequentially consistent.

Fix this by setting both aq and rl bits as 1 for operations with
BPF_FETCH and BPF_XCHG.

[1] https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

Fixes: dd642ccb45ec ("riscv, bpf: Implement more atomic operations for RV64")
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 arch/riscv/net/bpf_jit_comp64.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

Comments

Puranjay Mohan May 5, 2024, 10:40 p.m. UTC | #1
Puranjay Mohan <puranjay@kernel.org> writes:

> The BPF atomic operations with the BPF_FETCH modifier along with
> BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT implements
> all atomic operations except BPF_CMPXCHG with relaxed ordering.

I know that the BPF memory model is in the works and we currently don't
have a way to make all the JITs consistent. But as far as atomic
operations are concerned here are my observations:

1. ARM64 and x86
   -------------

JITs are following the LKMM, where:

Any operation with BPF_FETCH, BPF_CMPXCHG, and BPF_XCHG is fully
ordered.

On x86, this is by the virtue of its memory model where locked
instructions are fully ordered. ARM64 is emitting explicit instructions
to make sure the above are fully ordered.


2. RISCV64
   -------

JIT was emitting all atomic instructions with relaxed ordering, the
above patch fixes it to follow LKMM.


3. POWERPC
   -------

JIT is emitting all atomic instructions with relaxed ordering. It
implements atomic operations using LL and SC instructions, we need to
emit "sync" instructions before and after this sequence to make it
follow the LKMM. This is how the kernel is doing it.

Naveen, can you ack this? If this is the correct thing to do, I will
send a patch.


4. S390
   ----

Ilya, can you help with this?

I see that the kernel is emitting "bcr 14,0" after "laal|laalg" but the
JIT is not.


5. Loongarch
   ---------
   
Tiezhu, can you help with this?

I see that the JIT is using am*.{w/d} instructions for all atomic
accesses. I see that there are am*_db.{w/d} instructions in the ISA that
also implement the data barrier function with the atomic op. Maybe
these need to be used for BPF_FETCH, BPF_XCHG, and BPF_CMPXCHG as the
kernel is using them for arch_atomic_fetch_add() etc.


Thanks,
Puranjay
Ilya Leoshkevich May 6, 2024, 12:28 p.m. UTC | #2
> Puranjay Mohan <puranjay@kernel.org> writes:
> 
> > The BPF atomic operations with the BPF_FETCH modifier along with
> > BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT
> > implements
> > all atomic operations except BPF_CMPXCHG with relaxed ordering.
> 
> I know that the BPF memory model is in the works and we currently
> don't
> have a way to make all the JITs consistent. But as far as atomic
> operations are concerned here are my observations:

[...]

> 4. S390
>    ----
> 
> Ilya, can you help with this?
> 
> I see that the kernel is emitting "bcr 14,0" after "laal|laalg" but
> the
> JIT is not.

Hi,

Here are two relevant paragraphs from the Principles of Operation:

  Relation between Operand Accesses
  =================================
  As observed by other CPUs and by channel pro-
  grams, storage-operand fetches associated with one
  instruction execution appear to precede all storage-
  operand references for conceptually subsequent
  instructions. A storage-operand store specified by
  one instruction appears to precede all storage-oper-
  and stores specified by conceptually subsequent
  instructions, but it does not necessarily precede stor-
  age-operand fetches specified by conceptually sub-
  sequent instructions. However, a storage-operand
  store appears to precede a conceptually subsequent
  storage-operand fetch from the same main-storage
  location.

In short, all memory accesses are fully ordered except for
stores followed by fetches from different addresses.

  LAALG R1,R3,D2(B2)
  ==================
  [...]
  All accesses to the second-operand location appear
  to be a block-concurrent interlocked-update refer-
  ence as observed by other CPUs and the I/O subsys-
  tem. A specific-operand-serialization operation is
  performed.

Specific-operand-serialization is weaker than full serialization,
which means that, even though s390x provides very strong ordering
guarantees, strictly speaking, as architected, s390x atomics are not
fully ordered.

I have a hard time thinking of a situation where a store-fetch
reordering for different addresses could matter, but to be on the safe
side we should probably just do what the kernel does and add a
"bcr 14,0". I will send a patch.

[...]

> Thanks,
> Puranjay
Puranjay Mohan May 6, 2024, 2:46 p.m. UTC | #3
Ilya Leoshkevich <iii@linux.ibm.com> writes:

>> Puranjay Mohan <puranjay@kernel.org> writes:
>> 
>> > The BPF atomic operations with the BPF_FETCH modifier along with
>> > BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT
>> > implements
>> > all atomic operations except BPF_CMPXCHG with relaxed ordering.
>> 
>> I know that the BPF memory model is in the works and we currently
>> don't
>> have a way to make all the JITs consistent. But as far as atomic
>> operations are concerned here are my observations:
>
> [...]
>
>> 4. S390
>>    ----
>> 
>> Ilya, can you help with this?
>> 
>> I see that the kernel is emitting "bcr 14,0" after "laal|laalg" but
>> the
>> JIT is not.
>
> Hi,
>
> Here are two relevant paragraphs from the Principles of Operation:
>
>   Relation between Operand Accesses
>   =================================
>   As observed by other CPUs and by channel pro-
>   grams, storage-operand fetches associated with one
>   instruction execution appear to precede all storage-
>   operand references for conceptually subsequent
>   instructions. A storage-operand store specified by
>   one instruction appears to precede all storage-oper-
>   and stores specified by conceptually subsequent
>   instructions, but it does not necessarily precede stor-
>   age-operand fetches specified by conceptually sub-
>   sequent instructions. However, a storage-operand
>   store appears to precede a conceptually subsequent
>   storage-operand fetch from the same main-storage
>   location.
>
> In short, all memory accesses are fully ordered except for
> stores followed by fetches from different addresses.

Thanks for sharing the details.

So, this is TSO like x86

>   LAALG R1,R3,D2(B2)
>   ==================
>   [...]
>   All accesses to the second-operand location appear
>   to be a block-concurrent interlocked-update refer-
>   ence as observed by other CPUs and the I/O subsys-
>   tem. A specific-operand-serialization operation is
>   performed.
>
> Specific-operand-serialization is weaker than full serialization,
> which means that, even though s390x provides very strong ordering
> guarantees, strictly speaking, as architected, s390x atomics are not
> fully ordered.
>
> I have a hard time thinking of a situation where a store-fetch
> reordering for different addresses could matter, but to be on the safe
> side we should probably just do what the kernel does and add a
> "bcr 14,0". I will send a patch.

Thanks,

IMO, bcr 14,0 would be needed because s390x has both

  int __atomic_add(int i, int *v);

and

  int __atomic_add_barrier(int i, int *v);

both of these do the fetch operation but the second one adds a barrier
(bcr 14, 0)

JIT was using the first one (without barrier) to implement the arch_atomic_fetch_add

So, assuming that without this barrier, store->fetch reordering would be
allowed as you said.

It would mean:
This litmus test would fail for the s390 JIT:

  C SB+atomic_fetch
  
  (*
   * Result: Never
   *
   * This litmus test demonstrates that LKMM expects total ordering from
   * atomic_*() operations with fetch or return.
   *)
  
  {
          atomic_t dummy1 = ATOMIC_INIT(1);
          atomic_t dummy2 = ATOMIC_INIT(1);
  }
  
  P0(int *x, int *y, atomic_t *dummy1)
  {
          WRITE_ONCE(*x, 1);
          rd = atomic_fetch_add(1, dummy1);     /* assuming this is implemented without barrier */ 
          r0 = READ_ONCE(*y);
  }
  
  P1(int *x, int *y, atomic_t *dummy2)
  {
          WRITE_ONCE(*y, 1);
          rd = atomic_fetch_add(1, dummy2);    /* assuming this is implemented without barrier */
          r1 = READ_ONCE(*x);
  }
  
  exists (0:r0=0 /\ 1:r1=0)


P.S. - It would be nice to have a tool that can convert litmus tests
into BPF assembly programs and then we can test them on hardware after JITing.

Thanks,
Puranjay
Pu Lehui May 6, 2024, 3:38 p.m. UTC | #4
On 2024/5/6 4:16, Puranjay Mohan wrote:
> The BPF atomic operations with the BPF_FETCH modifier along with
> BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT implements
> all atomic operations except BPF_CMPXCHG with relaxed ordering.
> 
> Section 8.1 of the "The RISC-V Instruction Set Manual Volume I:
> Unprivileged ISA" [1], titled, "Specifying Ordering of Atomic
> Instructions" says:
> 
> | To provide more efficient support for release consistency [5], each
> | atomic instruction has two bits, aq and rl, used to specify additional
> | memory ordering constraints as viewed by other RISC-V harts.
> 
> and
> 
> | If only the aq bit is set, the atomic memory operation is treated as
> | an acquire access.
> | If only the rl bit is set, the atomic memory operation is treated as a
> | release access.
> |
> | If both the aq and rl bits are set, the atomic memory operation is
> | sequentially consistent.
> 
> Fix this by setting both aq and rl bits as 1 for operations with
> BPF_FETCH and BPF_XCHG.
> 
> [1] https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
> 
> Fixes: dd642ccb45ec ("riscv, bpf: Implement more atomic operations for RV64")
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
>   arch/riscv/net/bpf_jit_comp64.c | 20 ++++++++++----------
>   1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
> index ec9d692838fc..fb5d1950042b 100644
> --- a/arch/riscv/net/bpf_jit_comp64.c
> +++ b/arch/riscv/net/bpf_jit_comp64.c
> @@ -498,33 +498,33 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
>   		break;
>   	/* src_reg = atomic_fetch_<op>(dst_reg + off16, src_reg) */
>   	case BPF_ADD | BPF_FETCH:
> -		emit(is64 ? rv_amoadd_d(rs, rs, rd, 0, 0) :
> -		     rv_amoadd_w(rs, rs, rd, 0, 0), ctx);
> +		emit(is64 ? rv_amoadd_d(rs, rs, rd, 1, 1) :
> +		     rv_amoadd_w(rs, rs, rd, 1, 1), ctx);
>   		if (!is64)
>   			emit_zextw(rs, rs, ctx);
>   		break;
>   	case BPF_AND | BPF_FETCH:
> -		emit(is64 ? rv_amoand_d(rs, rs, rd, 0, 0) :
> -		     rv_amoand_w(rs, rs, rd, 0, 0), ctx);
> +		emit(is64 ? rv_amoand_d(rs, rs, rd, 1, 1) :
> +		     rv_amoand_w(rs, rs, rd, 1, 1), ctx);
>   		if (!is64)
>   			emit_zextw(rs, rs, ctx);
>   		break;
>   	case BPF_OR | BPF_FETCH:
> -		emit(is64 ? rv_amoor_d(rs, rs, rd, 0, 0) :
> -		     rv_amoor_w(rs, rs, rd, 0, 0), ctx);
> +		emit(is64 ? rv_amoor_d(rs, rs, rd, 1, 1) :
> +		     rv_amoor_w(rs, rs, rd, 1, 1), ctx);
>   		if (!is64)
>   			emit_zextw(rs, rs, ctx);
>   		break;
>   	case BPF_XOR | BPF_FETCH:
> -		emit(is64 ? rv_amoxor_d(rs, rs, rd, 0, 0) :
> -		     rv_amoxor_w(rs, rs, rd, 0, 0), ctx);
> +		emit(is64 ? rv_amoxor_d(rs, rs, rd, 1, 1) :
> +		     rv_amoxor_w(rs, rs, rd, 1, 1), ctx);
>   		if (!is64)
>   			emit_zextw(rs, rs, ctx);
>   		break;
>   	/* src_reg = atomic_xchg(dst_reg + off16, src_reg); */
>   	case BPF_XCHG:
> -		emit(is64 ? rv_amoswap_d(rs, rs, rd, 0, 0) :
> -		     rv_amoswap_w(rs, rs, rd, 0, 0), ctx);
> +		emit(is64 ? rv_amoswap_d(rs, rs, rd, 1, 1) :
> +		     rv_amoswap_w(rs, rs, rd, 1, 1), ctx);
>   		if (!is64)
>   			emit_zextw(rs, rs, ctx);
>   		break;

Reviewed-by: Pu Lehui <pulehui@huawei.com>
Ilya Leoshkevich May 6, 2024, 10:56 p.m. UTC | #5
On Mon, 2024-05-06 at 14:46 +0000, Puranjay Mohan wrote:
> Ilya Leoshkevich <iii@linux.ibm.com> writes:
> 
> > > Puranjay Mohan <puranjay@kernel.org> writes:
> > > 
> > > > The BPF atomic operations with the BPF_FETCH modifier along
> > > > with
> > > > BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT
> > > > implements
> > > > all atomic operations except BPF_CMPXCHG with relaxed ordering.
> > > 
> > > I know that the BPF memory model is in the works and we currently
> > > don't
> > > have a way to make all the JITs consistent. But as far as atomic
> > > operations are concerned here are my observations:
> > 
> > [...]
> > 
> > > 4. S390
> > >    ----
> > > 
> > > Ilya, can you help with this?
> > > 
> > > I see that the kernel is emitting "bcr 14,0" after "laal|laalg"
> > > but
> > > the
> > > JIT is not.
> > 
> > Hi,
> > 
> > Here are two relevant paragraphs from the Principles of Operation:
> > 
> >   Relation between Operand Accesses
> >   =================================
> >   As observed by other CPUs and by channel pro-
> >   grams, storage-operand fetches associated with one
> >   instruction execution appear to precede all storage-
> >   operand references for conceptually subsequent
> >   instructions. A storage-operand store specified by
> >   one instruction appears to precede all storage-oper-
> >   and stores specified by conceptually subsequent
> >   instructions, but it does not necessarily precede stor-
> >   age-operand fetches specified by conceptually sub-
> >   sequent instructions. However, a storage-operand
> >   store appears to precede a conceptually subsequent
> >   storage-operand fetch from the same main-storage
> >   location.
> > 
> > In short, all memory accesses are fully ordered except for
> > stores followed by fetches from different addresses.
> 
> Thanks for sharing the details.
> 
> So, this is TSO like x86
> 
> >   LAALG R1,R3,D2(B2)
> >   ==================
> >   [...]
> >   All accesses to the second-operand location appear
> >   to be a block-concurrent interlocked-update refer-
> >   ence as observed by other CPUs and the I/O subsys-
> >   tem. A specific-operand-serialization operation is
> >   performed.
> > 
> > Specific-operand-serialization is weaker than full serialization,
> > which means that, even though s390x provides very strong ordering
> > guarantees, strictly speaking, as architected, s390x atomics are
> > not
> > fully ordered.
> > 
> > I have a hard time thinking of a situation where a store-fetch
> > reordering for different addresses could matter, but to be on the
> > safe
> > side we should probably just do what the kernel does and add a
> > "bcr 14,0". I will send a patch.
> 
> Thanks,
> 
> IMO, bcr 14,0 would be needed because, s390x has both
> 
>   int __atomic_add(int i, int *v);
> 
> and
> 
>   int __atomic_add_barrier(int i, int *v);
> 
> both of these do the fetch operation but the second one adds a
> barrier
> (bcr 14, 0)
> 
> JIT was using the first one (without barrier) to implement the
> arch_atomic_fetch_add
> 
> So, assuming that without this barrier, store->fetch reordering would
> be
> allowed as you said.
> 
> It would mean:
> This litmus test would fail for the s390 JIT:
> 
>   C SB+atomic_fetch
>   
>   (*
>    * Result: Never
>    *
>    * This litmus test demonstrates that LKMM expects total ordering
> from
>    * atomic_*() operations with fetch or return.
>    *)
>   
>   {
>           atomic_t dummy1 = ATOMIC_INIT(1);
>           atomic_t dummy2 = ATOMIC_INIT(1);
>   }
>   
>   P0(int *x, int *y, atomic_t *dummy1)
>   {
>           WRITE_ONCE(*x, 1);
>           rd = atomic_fetch_add(1, dummy1);     /* assuming this is
> implemented without barrier */ 
>           r0 = READ_ONCE(*y);
>   }
>   
>   P1(int *x, int *y, atomic_t *dummy2)
>   {
>           WRITE_ONCE(*y, 1);
>           rd = atomic_fetch_add(1, dummy2);    /* assuming this is
> implemented without barrier */
>           r1 = READ_ONCE(*x);
>   }
>   
>   exists (0:r0=0 /\ 1:r1=0)
> 
> 
> P.S. - It would be nice to have a tool that can convert litmus tests
> into BPF assembly programs and then we can test them on hardware
> after JITing.
> 
> Thanks,
> Puranjay

Thanks for providing an example! So unrelated memory accesses may rely
on atomics being barriers.

I will adjust my commit message, since now I claim that we are doing
the change "just in case", but apparently, if the hardware chooses to
exercise all the freedoms allowed by the architecture, there can occur
real problems.
Naveen N Rao May 7, 2024, 9:52 a.m. UTC | #6
Hi Puranjay,

On Sun, May 05, 2024 at 10:40:00PM GMT, Puranjay Mohan wrote:
> Puranjay Mohan <puranjay@kernel.org> writes:
> 
> > The BPF atomic operations with the BPF_FETCH modifier along with
> > BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT implements
> > all atomic operations except BPF_CMPXCHG with relaxed ordering.
> 
> I know that the BPF memory model is in the works and we currently don't
> have a way to make all the JITs consistent. But as far as atomic
> operations are concerned here are my observations:
> 
...
> 
> 
> 3. POWERPC
>    -------
> 
> JIT is emitting all atomic instructions with relaxed ordering. It
> implements atomic operations using LL and SC instructions, we need to
> emit "sync" instructions before and after this sequence to make it
> follow the LKMM. This is how the kernel is doing it.

Indeed - good find!

> 
> Naveen, can you ack this? if this is the correct thing to do, I will
> send a patch.

Please do.


Thanks,
Naveen
Puranjay Mohan May 7, 2024, 5:58 p.m. UTC | #7
Naveen N Rao <naveen@kernel.org> writes:

> Hi Puranjay,
>
> On Sun, May 05, 2024 at 10:40:00PM GMT, Puranjay Mohan wrote:
>> Puranjay Mohan <puranjay@kernel.org> writes:
>> 
>> > The BPF atomic operations with the BPF_FETCH modifier along with
>> > BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT implements
>> > all atomic operations except BPF_CMPXCHG with relaxed ordering.
>> 
>> I know that the BPF memory model is in the works and we currently don't
>> have a way to make all the JITs consistent. But as far as atomic
>> operations are concerned here are my observations:
>> 
> ...
>> 
>> 
>> 3. POWERPC
>>    -------
>> 
>> JIT is emitting all atomic instructions with relaxed ordering. It
>> implements atomic operations using LL and SC instructions, we need to
>> emit "sync" instructions before and after this sequence to make it
>> follow the LKMM. This is how the kernel is doing it.
>
> Indeed - good find!
>
>> 
>> Naveen, can you ack this? if this is the correct thing to do, I will
>> send a patch.
>
> Please do.
>

Hi Naveen,
I have sent a patch fixing both ppc32 and ppc64. But I don't have a way
to test this or even compile it:
https://lore.kernel.org/all/20240507175439.119467-1-puranjay@kernel.org/

Can you help me test this? the change is trivial.

Thanks,
Puranjay
patchwork-bot+netdevbpf@kernel.org May 13, 2024, midnight UTC | #8
Hello:

This patch was applied to bpf/bpf-next.git (master)
by Alexei Starovoitov <ast@kernel.org>:

On Sun,  5 May 2024 20:16:33 +0000 you wrote:
> The BPF atomic operations with the BPF_FETCH modifier along with
> BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT implements
> all atomic operations except BPF_CMPXCHG with relaxed ordering.
> 
> Section 8.1 of the "The RISC-V Instruction Set Manual Volume I:
> Unprivileged ISA" [1], titled, "Specifying Ordering of Atomic
> Instructions" says:
> 
> [...]

Here is the summary with links:
  - [bpf] riscv, bpf: make some atomic operations fully ordered
    https://git.kernel.org/bpf/bpf-next/c/20a759df3bba

You are awesome, thank you!
diff mbox series

Patch

diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index ec9d692838fc..fb5d1950042b 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -498,33 +498,33 @@  static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
 		break;
 	/* src_reg = atomic_fetch_<op>(dst_reg + off16, src_reg) */
 	case BPF_ADD | BPF_FETCH:
-		emit(is64 ? rv_amoadd_d(rs, rs, rd, 0, 0) :
-		     rv_amoadd_w(rs, rs, rd, 0, 0), ctx);
+		emit(is64 ? rv_amoadd_d(rs, rs, rd, 1, 1) :
+		     rv_amoadd_w(rs, rs, rd, 1, 1), ctx);
 		if (!is64)
 			emit_zextw(rs, rs, ctx);
 		break;
 	case BPF_AND | BPF_FETCH:
-		emit(is64 ? rv_amoand_d(rs, rs, rd, 0, 0) :
-		     rv_amoand_w(rs, rs, rd, 0, 0), ctx);
+		emit(is64 ? rv_amoand_d(rs, rs, rd, 1, 1) :
+		     rv_amoand_w(rs, rs, rd, 1, 1), ctx);
 		if (!is64)
 			emit_zextw(rs, rs, ctx);
 		break;
 	case BPF_OR | BPF_FETCH:
-		emit(is64 ? rv_amoor_d(rs, rs, rd, 0, 0) :
-		     rv_amoor_w(rs, rs, rd, 0, 0), ctx);
+		emit(is64 ? rv_amoor_d(rs, rs, rd, 1, 1) :
+		     rv_amoor_w(rs, rs, rd, 1, 1), ctx);
 		if (!is64)
 			emit_zextw(rs, rs, ctx);
 		break;
 	case BPF_XOR | BPF_FETCH:
-		emit(is64 ? rv_amoxor_d(rs, rs, rd, 0, 0) :
-		     rv_amoxor_w(rs, rs, rd, 0, 0), ctx);
+		emit(is64 ? rv_amoxor_d(rs, rs, rd, 1, 1) :
+		     rv_amoxor_w(rs, rs, rd, 1, 1), ctx);
 		if (!is64)
 			emit_zextw(rs, rs, ctx);
 		break;
 	/* src_reg = atomic_xchg(dst_reg + off16, src_reg); */
 	case BPF_XCHG:
-		emit(is64 ? rv_amoswap_d(rs, rs, rd, 0, 0) :
-		     rv_amoswap_w(rs, rs, rd, 0, 0), ctx);
+		emit(is64 ? rv_amoswap_d(rs, rs, rd, 1, 1) :
+		     rv_amoswap_w(rs, rs, rd, 1, 1), ctx);
 		if (!is64)
 			emit_zextw(rs, rs, ctx);
 		break;