Message ID | 20220505035526.2974382-6-guoren@kernel.org (mailing list archive)
---|---
State | New, archived
Series | riscv: Optimize atomic implementation
On Wed, 04 May 2022 20:55:26 PDT (-0700), guoren@kernel.org wrote: > From: Guo Ren <guoren@linux.alibaba.com> > > The current implementation is the same with 8e86f0b409a4 > ("arm64: atomics: fix use of acquire + release for full barrier > semantics"). RISC-V could combine acquire and release into the SC > instructions and it could reduce a fence instruction to gain better > performance. Here is related descriptio from RISC-V ISA 10.2 > Load-Reserved/Store-Conditional Instructions: > > - .aq: The LR/SC sequence can be given acquire semantics by > setting the aq bit on the LR instruction. > - .rl: The LR/SC sequence can be given release semantics by > setting the rl bit on the SC instruction. > - .aqrl: Setting the aq bit on the LR instruction, and setting > both the aq and the rl bit on the SC instruction makes > the LR/SC sequence sequentially consistent, meaning that > it cannot be reordered with earlier or later memory > operations from the same hart. > > Software should not set the rl bit on an LR instruction unless > the aq bit is also set, nor should software set the aq bit on an > SC instruction unless the rl bit is also set. LR.rl and SC.aq > instructions are not guaranteed to provide any stronger ordering > than those with both bits clear, but may result in lower > performance. > > The only difference is when sc.w/d.aqrl failed, it would cause .aq > effect than before. But it's okay for sematic because overlap > address LR couldn't beyond relating SC. IIUC that's not accurate, or at least wasn't in 2018. The ISA tends to drift around a bit, so it's possible things have changed since then. 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") describes the issue more specifically, that's when we added these fences. There have certainly been complains that these fences are too heavyweight for the HW to go fast, but IIUC it's the best option we have given the current set of memory model primitives we implement in the ISA (ie, there's more in RVWMO but no way to encode that). The others all look good, though, and as these are really all independent cleanups I'm going to go ahead and put those three on for-next. There's also a bunch of checkpatch errors. The ones about "*" seem spurious, but the alignment ones aren't. Here's my fixups: diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 34f757dfc8f2..0bde499fa8bc 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -86,9 +86,9 @@ ATOMIC_OPS(xor, xor, i) * versions, while the logical ops only have fetch versions. 
*/ #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ -static __always_inline \ -c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ - atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ + atomic##prefix##_t *v) \ { \ register c_type ret; \ __asm__ __volatile__ ( \ @@ -98,9 +98,9 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ : "memory"); \ return ret; \ } \ -static __always_inline \ -c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ - atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ + atomic##prefix##_t *v) \ { \ register c_type ret; \ __asm__ __volatile__ ( \ @@ -110,9 +110,9 @@ c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ : "memory"); \ return ret; \ } \ -static __always_inline \ -c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ - atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_fetch_##op##_release(c_type i, \ + atomic##prefix##_t *v) \ { \ register c_type ret; \ __asm__ __volatile__ ( \ @@ -122,8 +122,8 @@ c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ : "memory"); \ return ret; \ } \ -static __always_inline \ -c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ { \ register c_type ret; \ __asm__ __volatile__ ( \ @@ -135,28 +135,28 @@ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ } #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ -static __always_inline \ -c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ - atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ + atomic##prefix##_t *v) \ { \ - return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ + return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ } \ -static __always_inline \ -c_type arch_atomic##prefix##_##op##_return_acquire(c_type i, \ - atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_##op##_return_acquire(c_type i, \ + atomic##prefix##_t *v) \ { \ - return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ + return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ } \ -static __always_inline \ -c_type arch_atomic##prefix##_##op##_return_release(c_type i, \ - atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_##op##_return_release(c_type i, \ + atomic##prefix##_t *v) \ { \ - return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ + return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ } \ -static __always_inline \ -c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ +static __always_inline c_type \ +arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ { \ - return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ + return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ } #ifdef CONFIG_GENERIC_ATOMIC64 > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > Signed-off-by: Guo Ren <guoren@kernel.org> > Cc: Palmer Dabbelt <palmer@dabbelt.com> > Cc: Mark Rutland <mark.rutland@arm.com> > Cc: Dan Lustig <dlustig@nvidia.com> > Cc: Andrea Parri <parri.andrea@gmail.com> > --- > arch/riscv/include/asm/atomic.h | 24 ++++++++---------------- > 
arch/riscv/include/asm/cmpxchg.h | 6 ++---- > 2 files changed, 10 insertions(+), 20 deletions(-) > > diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > index 34f757dfc8f2..aef8aa9ac4f4 100644 > --- a/arch/riscv/include/asm/atomic.h > +++ b/arch/riscv/include/asm/atomic.h > @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int > "0: lr.w %[p], %[c]\n" > " beq %[p], %[u], 1f\n" > " add %[rc], %[p], %[a]\n" > - " sc.w.rl %[rc], %[rc], %[c]\n" > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : [a]"r" (a), [u]"r" (u) > @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, > "0: lr.d %[p], %[c]\n" > " beq %[p], %[u], 1f\n" > " add %[rc], %[p], %[a]\n" > - " sc.d.rl %[rc], %[rc], %[c]\n" > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : [a]"r" (a), [u]"r" (u) > @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) > "0: lr.w %[p], %[c]\n" > " bltz %[p], 1f\n" > " addi %[rc], %[p], 1\n" > - " sc.w.rl %[rc], %[rc], %[c]\n" > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : > @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) > "0: lr.w %[p], %[c]\n" > " bgtz %[p], 1f\n" > " addi %[rc], %[p], -1\n" > - " sc.w.rl %[rc], %[rc], %[c]\n" > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : > @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) > "0: lr.w %[p], %[c]\n" > " addi %[rc], %[p], -1\n" > " bltz %[rc], 1f\n" > - " sc.w.rl %[rc], %[rc], %[c]\n" > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : > @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) > "0: lr.d %[p], %[c]\n" > " bltz %[p], 1f\n" > " addi %[rc], %[p], 1\n" > - " sc.d.rl %[rc], %[rc], %[c]\n" > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : > @@ -465,9 +459,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) > "0: lr.d %[p], %[c]\n" > " bgtz %[p], 1f\n" > " addi %[rc], %[p], -1\n" > - " sc.d.rl %[rc], %[rc], %[c]\n" > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : > @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) > "0: lr.d %[p], %[c]\n" > " addi %[rc], %[p], -1\n" > " bltz %[rc], 1f\n" > - " sc.d.rl %[rc], %[rc], %[c]\n" > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" > "1:\n" > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > : > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h > index 1af8db92250b..9269fceb86e0 100644 > --- a/arch/riscv/include/asm/cmpxchg.h > +++ b/arch/riscv/include/asm/cmpxchg.h > @@ -307,9 +307,8 @@ > __asm__ __volatile__ ( \ > "0: lr.w %0, %2\n" \ > " bne %0, %z3, 1f\n" \ > - " 
sc.w.rl %1, %z4, %2\n" \ > + " sc.w.aqrl %1, %z4, %2\n" \ > " bnez %1, 0b\n" \ > - " fence rw, rw\n" \ > "1:\n" \ > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > : "rJ" ((long)__old), "rJ" (__new) \ > @@ -319,9 +318,8 @@ > __asm__ __volatile__ ( \ > "0: lr.d %0, %2\n" \ > " bne %0, %z3, 1f\n" \ > - " sc.d.rl %1, %z4, %2\n" \ > + " sc.d.aqrl %1, %z4, %2\n" \ > " bnez %1, 0b\n" \ > - " fence rw, rw\n" \ > "1:\n" \ > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > : "rJ" (__old), "rJ" (__new) \
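For reference, the fully ordered 32-bit cmpxchg mapping that mainline currently generates (the pre-patch form visible in the quoted diff) can be written out as a standalone sketch. The function name below is illustrative only; the kernel expresses this through its __cmpxchg() macros rather than a plain function.

static inline int riscv_cmpxchg32_fully_ordered(int *ptr, int old, int new)
{
	int prev, rc;

	__asm__ __volatile__ (
		"0:	lr.w	%0, %2\n"	/* load-reserved old value          */
		"	bne	%0, %z3, 1f\n"	/* bail out on compare mismatch     */
		"	sc.w.rl	%1, %z4, %2\n"	/* store-conditional, release       */
		"	bnez	%1, 0b\n"	/* retry if the reservation was lost */
		"	fence	rw, rw\n"	/* upgrade success path to full ordering */
		"1:\n"
		: "=&r" (prev), "=&r" (rc), "+A" (*ptr)
		: "rJ" (old), "rJ" (new)
		: "memory");

	return prev;
}

The change under discussion replaces the "sc.w.rl" plus the trailing "fence rw, rw" with a single "sc.w.aqrl".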
On Sun, May 22, 2022 at 4:46 AM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Wed, 04 May 2022 20:55:26 PDT (-0700), guoren@kernel.org wrote: > > From: Guo Ren <guoren@linux.alibaba.com> > > > > The current implementation is the same with 8e86f0b409a4 > > ("arm64: atomics: fix use of acquire + release for full barrier > > semantics"). RISC-V could combine acquire and release into the SC > > instructions and it could reduce a fence instruction to gain better > > performance. Here is related descriptio from RISC-V ISA 10.2 > > Load-Reserved/Store-Conditional Instructions: > > > > - .aq: The LR/SC sequence can be given acquire semantics by > > setting the aq bit on the LR instruction. > > - .rl: The LR/SC sequence can be given release semantics by > > setting the rl bit on the SC instruction. > > - .aqrl: Setting the aq bit on the LR instruction, and setting > > both the aq and the rl bit on the SC instruction makes > > the LR/SC sequence sequentially consistent, meaning that > > it cannot be reordered with earlier or later memory > > operations from the same hart. > > > > Software should not set the rl bit on an LR instruction unless > > the aq bit is also set, nor should software set the aq bit on an > > SC instruction unless the rl bit is also set. LR.rl and SC.aq > > instructions are not guaranteed to provide any stronger ordering > > than those with both bits clear, but may result in lower > > performance. > > > > The only difference is when sc.w/d.aqrl failed, it would cause .aq > > effect than before. But it's okay for sematic because overlap > > address LR couldn't beyond relating SC. > > IIUC that's not accurate, or at least wasn't in 2018. The ISA tends to > drift around a bit, so it's possible things have changed since then. > 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") Thx for explaining, that why I suspected with the sentence in the comment: "which do not give full-ordering with .aqrl" > describes the issue more specifically, that's when we added these > fences. There have certainly been complains that these fences are too > heavyweight for the HW to go fast, but IIUC it's the best option we have Yeah, it would reduce the performance on D1 and our next-generation processor has optimized fence performance a lot. > given the current set of memory model primitives we implement in the > ISA (ie, there's more in RVWMO but no way to encode that). > > The others all look good, though, and as these are really all > independent cleanups I'm going to go ahead and put those three on > for-next. > > There's also a bunch of checkpatch errors. The ones about "*" seem > spurious, but the alignment ones aren't. Here's my fixups: Thx for the fixup. > > diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > index 34f757dfc8f2..0bde499fa8bc 100644 > --- a/arch/riscv/include/asm/atomic.h > +++ b/arch/riscv/include/asm/atomic.h > @@ -86,9 +86,9 @@ ATOMIC_OPS(xor, xor, i) > * versions, while the logical ops only have fetch versions. 
> */ > #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ > -static __always_inline \ > -c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > - atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > + atomic##prefix##_t *v) \ > { \ > register c_type ret; \ > __asm__ __volatile__ ( \ > @@ -98,9 +98,9 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > : "memory"); \ > return ret; \ > } \ > -static __always_inline \ > -c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > - atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > + atomic##prefix##_t *v) \ > { \ > register c_type ret; \ > __asm__ __volatile__ ( \ > @@ -110,9 +110,9 @@ c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > : "memory"); \ > return ret; \ > } \ > -static __always_inline \ > -c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > - atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > + atomic##prefix##_t *v) \ > { \ > register c_type ret; \ > __asm__ __volatile__ ( \ > @@ -122,8 +122,8 @@ c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > : "memory"); \ > return ret; \ > } \ > -static __always_inline \ > -c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > { \ > register c_type ret; \ > __asm__ __volatile__ ( \ > @@ -135,28 +135,28 @@ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > } > > #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ > -static __always_inline \ > -c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > - atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > + atomic##prefix##_t *v) \ > { \ > - return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > + return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > } \ > -static __always_inline \ > -c_type arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > - atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > + atomic##prefix##_t *v) \ > { \ > - return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > + return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > } \ > -static __always_inline \ > -c_type arch_atomic##prefix##_##op##_return_release(c_type i, \ > - atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_##op##_return_release(c_type i, \ > + atomic##prefix##_t *v) \ > { \ > - return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > + return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > } \ > -static __always_inline \ > -c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > +static __always_inline c_type \ > +arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > { \ > - return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > + return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > } > > #ifdef CONFIG_GENERIC_ATOMIC64 > > > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > > Signed-off-by: Guo Ren <guoren@kernel.org> > > Cc: Palmer Dabbelt <palmer@dabbelt.com> > > Cc: Mark Rutland 
<mark.rutland@arm.com> > > Cc: Dan Lustig <dlustig@nvidia.com> > > Cc: Andrea Parri <parri.andrea@gmail.com> > > --- > > arch/riscv/include/asm/atomic.h | 24 ++++++++---------------- > > arch/riscv/include/asm/cmpxchg.h | 6 ++---- > > 2 files changed, 10 insertions(+), 20 deletions(-) > > > > diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > > index 34f757dfc8f2..aef8aa9ac4f4 100644 > > --- a/arch/riscv/include/asm/atomic.h > > +++ b/arch/riscv/include/asm/atomic.h > > @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int > > "0: lr.w %[p], %[c]\n" > > " beq %[p], %[u], 1f\n" > > " add %[rc], %[p], %[a]\n" > > - " sc.w.rl %[rc], %[rc], %[c]\n" > > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : [a]"r" (a), [u]"r" (u) > > @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, > > "0: lr.d %[p], %[c]\n" > > " beq %[p], %[u], 1f\n" > > " add %[rc], %[p], %[a]\n" > > - " sc.d.rl %[rc], %[rc], %[c]\n" > > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : [a]"r" (a), [u]"r" (u) > > @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) > > "0: lr.w %[p], %[c]\n" > > " bltz %[p], 1f\n" > > " addi %[rc], %[p], 1\n" > > - " sc.w.rl %[rc], %[rc], %[c]\n" > > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : > > @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) > > "0: lr.w %[p], %[c]\n" > > " bgtz %[p], 1f\n" > > " addi %[rc], %[p], -1\n" > > - " sc.w.rl %[rc], %[rc], %[c]\n" > > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : > > @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) > > "0: lr.w %[p], %[c]\n" > > " addi %[rc], %[p], -1\n" > > " bltz %[rc], 1f\n" > > - " sc.w.rl %[rc], %[rc], %[c]\n" > > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : > > @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) > > "0: lr.d %[p], %[c]\n" > > " bltz %[p], 1f\n" > > " addi %[rc], %[p], 1\n" > > - " sc.d.rl %[rc], %[rc], %[c]\n" > > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : > > @@ -465,9 +459,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) > > "0: lr.d %[p], %[c]\n" > > " bgtz %[p], 1f\n" > > " addi %[rc], %[p], -1\n" > > - " sc.d.rl %[rc], %[rc], %[c]\n" > > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : > > @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) > > "0: lr.d %[p], %[c]\n" > > " addi %[rc], %[p], -1\n" > > " bltz %[rc], 1f\n" > > - " sc.d.rl %[rc], %[rc], %[c]\n" > > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > "1:\n" > > : 
[p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > : > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h > > index 1af8db92250b..9269fceb86e0 100644 > > --- a/arch/riscv/include/asm/cmpxchg.h > > +++ b/arch/riscv/include/asm/cmpxchg.h > > @@ -307,9 +307,8 @@ > > __asm__ __volatile__ ( \ > > "0: lr.w %0, %2\n" \ > > " bne %0, %z3, 1f\n" \ > > - " sc.w.rl %1, %z4, %2\n" \ > > + " sc.w.aqrl %1, %z4, %2\n" \ > > " bnez %1, 0b\n" \ > > - " fence rw, rw\n" \ > > "1:\n" \ > > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > > : "rJ" ((long)__old), "rJ" (__new) \ > > @@ -319,9 +318,8 @@ > > __asm__ __volatile__ ( \ > > "0: lr.d %0, %2\n" \ > > " bne %0, %z3, 1f\n" \ > > - " sc.d.rl %1, %z4, %2\n" \ > > + " sc.d.aqrl %1, %z4, %2\n" \ > > " bnez %1, 0b\n" \ > > - " fence rw, rw\n" \ > > "1:\n" \ > > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > > : "rJ" (__old), "rJ" (__new) \
On Sun, 22 May 2022 06:12:56 PDT (-0700), guoren@kernel.org wrote: > On Sun, May 22, 2022 at 4:46 AM Palmer Dabbelt <palmer@dabbelt.com> wrote: >> >> On Wed, 04 May 2022 20:55:26 PDT (-0700), guoren@kernel.org wrote: >> > From: Guo Ren <guoren@linux.alibaba.com> >> > >> > The current implementation is the same with 8e86f0b409a4 >> > ("arm64: atomics: fix use of acquire + release for full barrier >> > semantics"). RISC-V could combine acquire and release into the SC >> > instructions and it could reduce a fence instruction to gain better >> > performance. Here is related descriptio from RISC-V ISA 10.2 >> > Load-Reserved/Store-Conditional Instructions: >> > >> > - .aq: The LR/SC sequence can be given acquire semantics by >> > setting the aq bit on the LR instruction. >> > - .rl: The LR/SC sequence can be given release semantics by >> > setting the rl bit on the SC instruction. >> > - .aqrl: Setting the aq bit on the LR instruction, and setting >> > both the aq and the rl bit on the SC instruction makes >> > the LR/SC sequence sequentially consistent, meaning that >> > it cannot be reordered with earlier or later memory >> > operations from the same hart. >> > >> > Software should not set the rl bit on an LR instruction unless >> > the aq bit is also set, nor should software set the aq bit on an >> > SC instruction unless the rl bit is also set. LR.rl and SC.aq >> > instructions are not guaranteed to provide any stronger ordering >> > than those with both bits clear, but may result in lower >> > performance. >> > >> > The only difference is when sc.w/d.aqrl failed, it would cause .aq >> > effect than before. But it's okay for sematic because overlap >> > address LR couldn't beyond relating SC. >> >> IIUC that's not accurate, or at least wasn't in 2018. The ISA tends to >> drift around a bit, so it's possible things have changed since then. >> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > Thx for explaining, that why I suspected with the sentence in the comment: > "which do not give full-ordering with .aqrl" Sorry, I'm not quite sure what you're saying here. My understanding is that this change results in mappings that violate LKMM, based on the rationale in that previous commit. IIUC that all still holds, but I'm not really a memory model person so I frequently miss stuff around there. >> describes the issue more specifically, that's when we added these >> fences. There have certainly been complains that these fences are too >> heavyweight for the HW to go fast, but IIUC it's the best option we have > Yeah, it would reduce the performance on D1 and our next-generation > processor has optimized fence performance a lot. Definately a bummer that the fences make the HW go slow, but I don't really see any other way to go about this. If you think these mappings are valid for LKMM and RVWMO then we should figure this out, but trying to drop fences to make HW go faster in ways that violate the memory model is going to lead to insanity. I can definately buy the argument that we have mappings of various application memory models that are difficult to make fast given the ISA encodings of RVWMO we have, but that's not really an issue we can fix in the atomic mappings. >> given the current set of memory model primitives we implement in the >> ISA (ie, there's more in RVWMO but no way to encode that). >> >> The others all look good, though, and as these are really all >> independent cleanups I'm going to go ahead and put those three on >> for-next. 
>> >> There's also a bunch of checkpatch errors. The ones about "*" seem >> spurious, but the alignment ones aren't. Here's my fixups: > Thx for the fixup. > >> >> diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h >> index 34f757dfc8f2..0bde499fa8bc 100644 >> --- a/arch/riscv/include/asm/atomic.h >> +++ b/arch/riscv/include/asm/atomic.h >> @@ -86,9 +86,9 @@ ATOMIC_OPS(xor, xor, i) >> * versions, while the logical ops only have fetch versions. >> */ >> #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ >> -static __always_inline \ >> -c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ >> - atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ >> + atomic##prefix##_t *v) \ >> { \ >> register c_type ret; \ >> __asm__ __volatile__ ( \ >> @@ -98,9 +98,9 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ >> : "memory"); \ >> return ret; \ >> } \ >> -static __always_inline \ >> -c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ >> - atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ >> + atomic##prefix##_t *v) \ >> { \ >> register c_type ret; \ >> __asm__ __volatile__ ( \ >> @@ -110,9 +110,9 @@ c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ >> : "memory"); \ >> return ret; \ >> } \ >> -static __always_inline \ >> -c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ >> - atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_fetch_##op##_release(c_type i, \ >> + atomic##prefix##_t *v) \ >> { \ >> register c_type ret; \ >> __asm__ __volatile__ ( \ >> @@ -122,8 +122,8 @@ c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ >> : "memory"); \ >> return ret; \ >> } \ >> -static __always_inline \ >> -c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ >> { \ >> register c_type ret; \ >> __asm__ __volatile__ ( \ >> @@ -135,28 +135,28 @@ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ >> } >> >> #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ >> -static __always_inline \ >> -c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ >> - atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ >> + atomic##prefix##_t *v) \ >> { \ >> - return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ >> + return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ >> } \ >> -static __always_inline \ >> -c_type arch_atomic##prefix##_##op##_return_acquire(c_type i, \ >> - atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_##op##_return_acquire(c_type i, \ >> + atomic##prefix##_t *v) \ >> { \ >> - return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ >> + return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ >> } \ >> -static __always_inline \ >> -c_type arch_atomic##prefix##_##op##_return_release(c_type i, \ >> - atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_##op##_return_release(c_type i, \ >> + atomic##prefix##_t *v) \ >> { \ >> - return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ >> + return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ >> } \ >> 
-static __always_inline \ >> -c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ >> +static __always_inline c_type \ >> +arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ >> { \ >> - return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ >> + return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ >> } >> >> #ifdef CONFIG_GENERIC_ATOMIC64 >> >> >> > >> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> >> > Signed-off-by: Guo Ren <guoren@kernel.org> >> > Cc: Palmer Dabbelt <palmer@dabbelt.com> >> > Cc: Mark Rutland <mark.rutland@arm.com> >> > Cc: Dan Lustig <dlustig@nvidia.com> >> > Cc: Andrea Parri <parri.andrea@gmail.com> >> > --- >> > arch/riscv/include/asm/atomic.h | 24 ++++++++---------------- >> > arch/riscv/include/asm/cmpxchg.h | 6 ++---- >> > 2 files changed, 10 insertions(+), 20 deletions(-) >> > >> > diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h >> > index 34f757dfc8f2..aef8aa9ac4f4 100644 >> > --- a/arch/riscv/include/asm/atomic.h >> > +++ b/arch/riscv/include/asm/atomic.h >> > @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int >> > "0: lr.w %[p], %[c]\n" >> > " beq %[p], %[u], 1f\n" >> > " add %[rc], %[p], %[a]\n" >> > - " sc.w.rl %[rc], %[rc], %[c]\n" >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : [a]"r" (a), [u]"r" (u) >> > @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, >> > "0: lr.d %[p], %[c]\n" >> > " beq %[p], %[u], 1f\n" >> > " add %[rc], %[p], %[a]\n" >> > - " sc.d.rl %[rc], %[rc], %[c]\n" >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : [a]"r" (a), [u]"r" (u) >> > @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) >> > "0: lr.w %[p], %[c]\n" >> > " bltz %[p], 1f\n" >> > " addi %[rc], %[p], 1\n" >> > - " sc.w.rl %[rc], %[rc], %[c]\n" >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : >> > @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) >> > "0: lr.w %[p], %[c]\n" >> > " bgtz %[p], 1f\n" >> > " addi %[rc], %[p], -1\n" >> > - " sc.w.rl %[rc], %[rc], %[c]\n" >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : >> > @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) >> > "0: lr.w %[p], %[c]\n" >> > " addi %[rc], %[p], -1\n" >> > " bltz %[rc], 1f\n" >> > - " sc.w.rl %[rc], %[rc], %[c]\n" >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : >> > @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) >> > "0: lr.d %[p], %[c]\n" >> > " bltz %[p], 1f\n" >> > " addi %[rc], %[p], 1\n" >> > - " sc.d.rl %[rc], %[rc], %[c]\n" >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : >> > @@ -465,9 +459,8 @@ static __always_inline bool 
arch_atomic64_dec_unless_positive(atomic64_t *v) >> > "0: lr.d %[p], %[c]\n" >> > " bgtz %[p], 1f\n" >> > " addi %[rc], %[p], -1\n" >> > - " sc.d.rl %[rc], %[rc], %[c]\n" >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : >> > @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) >> > "0: lr.d %[p], %[c]\n" >> > " addi %[rc], %[p], -1\n" >> > " bltz %[rc], 1f\n" >> > - " sc.d.rl %[rc], %[rc], %[c]\n" >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" >> > " bnez %[rc], 0b\n" >> > - " fence rw, rw\n" >> > "1:\n" >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) >> > : >> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h >> > index 1af8db92250b..9269fceb86e0 100644 >> > --- a/arch/riscv/include/asm/cmpxchg.h >> > +++ b/arch/riscv/include/asm/cmpxchg.h >> > @@ -307,9 +307,8 @@ >> > __asm__ __volatile__ ( \ >> > "0: lr.w %0, %2\n" \ >> > " bne %0, %z3, 1f\n" \ >> > - " sc.w.rl %1, %z4, %2\n" \ >> > + " sc.w.aqrl %1, %z4, %2\n" \ >> > " bnez %1, 0b\n" \ >> > - " fence rw, rw\n" \ >> > "1:\n" \ >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ >> > : "rJ" ((long)__old), "rJ" (__new) \ >> > @@ -319,9 +318,8 @@ >> > __asm__ __volatile__ ( \ >> > "0: lr.d %0, %2\n" \ >> > " bne %0, %z3, 1f\n" \ >> > - " sc.d.rl %1, %z4, %2\n" \ >> > + " sc.d.aqrl %1, %z4, %2\n" \ >> > " bnez %1, 0b\n" \ >> > - " fence rw, rw\n" \ >> > "1:\n" \ >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ >> > : "rJ" (__old), "rJ" (__new) \
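To make the LKMM concern concrete: value-returning RMWs such as atomic_fetch_add() are documented as fully ordered, so a store-buffering shaped test like the sketch below (written in the tools/memory-model litmus style; untested here) must keep its "exists" outcome forbidden under whatever instruction mapping riscv picks.

C SB+atomic_fetch_add+mb

(* LKMM requires: Never *)

{}

P0(int *x, int *y, atomic_t *v)
{
	int r0;
	int r1;

	WRITE_ONCE(*x, 1);
	r0 = atomic_fetch_add(1, v);	/* fully ordered RMW */
	r1 = READ_ONCE(*y);
}

P1(int *x, int *y)
{
	int r2;

	WRITE_ONCE(*y, 1);
	smp_mb();
	r2 = READ_ONCE(*x);
}

exists (0:r1=0 /\ 1:r2=0)

If the proposed .aqrl mapping ever allows that outcome on real hardware, it is too weak for LKMM regardless of what the ISA text says.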
On Thu, Jun 2, 2022 at 1:59 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Sun, 22 May 2022 06:12:56 PDT (-0700), guoren@kernel.org wrote: > > On Sun, May 22, 2022 at 4:46 AM Palmer Dabbelt <palmer@dabbelt.com> wrote: > >> > >> On Wed, 04 May 2022 20:55:26 PDT (-0700), guoren@kernel.org wrote: > >> > From: Guo Ren <guoren@linux.alibaba.com> > >> > > >> > The current implementation is the same with 8e86f0b409a4 > >> > ("arm64: atomics: fix use of acquire + release for full barrier > >> > semantics"). RISC-V could combine acquire and release into the SC > >> > instructions and it could reduce a fence instruction to gain better > >> > performance. Here is related descriptio from RISC-V ISA 10.2 > >> > Load-Reserved/Store-Conditional Instructions: > >> > > >> > - .aq: The LR/SC sequence can be given acquire semantics by > >> > setting the aq bit on the LR instruction. > >> > - .rl: The LR/SC sequence can be given release semantics by > >> > setting the rl bit on the SC instruction. > >> > - .aqrl: Setting the aq bit on the LR instruction, and setting > >> > both the aq and the rl bit on the SC instruction makes > >> > the LR/SC sequence sequentially consistent, meaning that > >> > it cannot be reordered with earlier or later memory > >> > operations from the same hart. > >> > > >> > Software should not set the rl bit on an LR instruction unless > >> > the aq bit is also set, nor should software set the aq bit on an > >> > SC instruction unless the rl bit is also set. LR.rl and SC.aq > >> > instructions are not guaranteed to provide any stronger ordering > >> > than those with both bits clear, but may result in lower > >> > performance. > >> > > >> > The only difference is when sc.w/d.aqrl failed, it would cause .aq > >> > effect than before. But it's okay for sematic because overlap > >> > address LR couldn't beyond relating SC. > >> > >> IIUC that's not accurate, or at least wasn't in 2018. The ISA tends to > >> drift around a bit, so it's possible things have changed since then. > >> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > Thx for explaining, that why I suspected with the sentence in the comment: > > "which do not give full-ordering with .aqrl" > > Sorry, I'm not quite sure what you're saying here. My understanding is > that this change results in mappings that violate LKMM, based on the > rationale in that previous commit. IIUC that all still holds, but I'm > not really a memory model person so I frequently miss stuff around > there. 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") is about fixup wrong spinlock/unlock implementation and not relate to this patch. Actually, sc.w.aqrl is very strong and the same with: fence rw, rw sc.w fence rw,rw So "which do not give full-ordering with .aqrl" is not writen in RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > > >> describes the issue more specifically, that's when we added these > >> fences. There have certainly been complains that these fences are too > >> heavyweight for the HW to go fast, but IIUC it's the best option we have > > Yeah, it would reduce the performance on D1 and our next-generation > > processor has optimized fence performance a lot. > > Definately a bummer that the fences make the HW go slow, but I don't > really see any other way to go about this. If you think these mappings > are valid for LKMM and RVWMO then we should figure this out, but trying > to drop fences to make HW go faster in ways that violate the memory > model is going to lead to insanity. 
Actually, this patch is okay with the ISA spec, and Dan also thought it was valid. ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw ------ > 2. Using .aqrl to replace the fence rw, rw is okay to ISA manual, > right? And reducing a fence instruction to gain better performance: > "0: lr.w %[p], %[c]\n" > " sub %[rc], %[p], %[o]\n" > " bltz %[rc], 1f\n". > - " sc.w.rl %[rc], %[rc], %[c]\n" > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > " bnez %[rc], 0b\n" > - " fence rw, rw\n" Yes, using .aqrl is valid. Dan ------ > > I can definately buy the argument that we have mappings of various > application memory models that are difficult to make fast given the ISA > encodings of RVWMO we have, but that's not really an issue we can fix in > the atomic mappings. > > >> given the current set of memory model primitives we implement in the > >> ISA (ie, there's more in RVWMO but no way to encode that). > >> > >> The others all look good, though, and as these are really all > >> independent cleanups I'm going to go ahead and put those three on > >> for-next. > >> > >> There's also a bunch of checkpatch errors. The ones about "*" seem > >> spurious, but the alignment ones aren't. Here's my fixups: > > Thx for the fixup. > > > >> > >> diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > >> index 34f757dfc8f2..0bde499fa8bc 100644 > >> --- a/arch/riscv/include/asm/atomic.h > >> +++ b/arch/riscv/include/asm/atomic.h > >> @@ -86,9 +86,9 @@ ATOMIC_OPS(xor, xor, i) > >> * versions, while the logical ops only have fetch versions. > >> */ > >> #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ > >> -static __always_inline \ > >> -c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > >> - atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > >> + atomic##prefix##_t *v) \ > >> { \ > >> register c_type ret; \ > >> __asm__ __volatile__ ( \ > >> @@ -98,9 +98,9 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > >> : "memory"); \ > >> return ret; \ > >> } \ > >> -static __always_inline \ > >> -c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > >> - atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > >> + atomic##prefix##_t *v) \ > >> { \ > >> register c_type ret; \ > >> __asm__ __volatile__ ( \ > >> @@ -110,9 +110,9 @@ c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > >> : "memory"); \ > >> return ret; \ > >> } \ > >> -static __always_inline \ > >> -c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > >> - atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > >> + atomic##prefix##_t *v) \ > >> { \ > >> register c_type ret; \ > >> __asm__ __volatile__ ( \ > >> @@ -122,8 +122,8 @@ c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > >> : "memory"); \ > >> return ret; \ > >> } \ > >> -static __always_inline \ > >> -c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > >> { \ > >> register c_type ret; \ > >> __asm__ __volatile__ ( \ > >> @@ -135,28 +135,28 @@ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > >> } > >> > >> #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ > >> -static 
__always_inline \ > >> -c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > >> - atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > >> + atomic##prefix##_t *v) \ > >> { \ > >> - return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > >> + return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > >> } \ > >> -static __always_inline \ > >> -c_type arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > >> - atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > >> + atomic##prefix##_t *v) \ > >> { \ > >> - return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > >> + return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > >> } \ > >> -static __always_inline \ > >> -c_type arch_atomic##prefix##_##op##_return_release(c_type i, \ > >> - atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_##op##_return_release(c_type i, \ > >> + atomic##prefix##_t *v) \ > >> { \ > >> - return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > >> + return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > >> } \ > >> -static __always_inline \ > >> -c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > >> +static __always_inline c_type \ > >> +arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > >> { \ > >> - return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > >> + return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > >> } > >> > >> #ifdef CONFIG_GENERIC_ATOMIC64 > >> > >> > >> > > >> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > >> > Signed-off-by: Guo Ren <guoren@kernel.org> > >> > Cc: Palmer Dabbelt <palmer@dabbelt.com> > >> > Cc: Mark Rutland <mark.rutland@arm.com> > >> > Cc: Dan Lustig <dlustig@nvidia.com> > >> > Cc: Andrea Parri <parri.andrea@gmail.com> > >> > --- > >> > arch/riscv/include/asm/atomic.h | 24 ++++++++---------------- > >> > arch/riscv/include/asm/cmpxchg.h | 6 ++---- > >> > 2 files changed, 10 insertions(+), 20 deletions(-) > >> > > >> > diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > >> > index 34f757dfc8f2..aef8aa9ac4f4 100644 > >> > --- a/arch/riscv/include/asm/atomic.h > >> > +++ b/arch/riscv/include/asm/atomic.h > >> > @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int > >> > "0: lr.w %[p], %[c]\n" > >> > " beq %[p], %[u], 1f\n" > >> > " add %[rc], %[p], %[a]\n" > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : [a]"r" (a), [u]"r" (u) > >> > @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, > >> > "0: lr.d %[p], %[c]\n" > >> > " beq %[p], %[u], 1f\n" > >> > " add %[rc], %[p], %[a]\n" > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : [a]"r" (a), [u]"r" (u) > >> > @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) > >> > "0: lr.w %[p], %[c]\n" > >> > " bltz %[p], 1f\n" > >> > " addi %[rc], %[p], 1\n" > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > >> > + " 
sc.w.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : > >> > @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) > >> > "0: lr.w %[p], %[c]\n" > >> > " bgtz %[p], 1f\n" > >> > " addi %[rc], %[p], -1\n" > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : > >> > @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) > >> > "0: lr.w %[p], %[c]\n" > >> > " addi %[rc], %[p], -1\n" > >> > " bltz %[rc], 1f\n" > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : > >> > @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) > >> > "0: lr.d %[p], %[c]\n" > >> > " bltz %[p], 1f\n" > >> > " addi %[rc], %[p], 1\n" > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : > >> > @@ -465,9 +459,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) > >> > "0: lr.d %[p], %[c]\n" > >> > " bgtz %[p], 1f\n" > >> > " addi %[rc], %[p], -1\n" > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : > >> > @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) > >> > "0: lr.d %[p], %[c]\n" > >> > " addi %[rc], %[p], -1\n" > >> > " bltz %[rc], 1f\n" > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > >> > " bnez %[rc], 0b\n" > >> > - " fence rw, rw\n" > >> > "1:\n" > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > >> > : > >> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h > >> > index 1af8db92250b..9269fceb86e0 100644 > >> > --- a/arch/riscv/include/asm/cmpxchg.h > >> > +++ b/arch/riscv/include/asm/cmpxchg.h > >> > @@ -307,9 +307,8 @@ > >> > __asm__ __volatile__ ( \ > >> > "0: lr.w %0, %2\n" \ > >> > " bne %0, %z3, 1f\n" \ > >> > - " sc.w.rl %1, %z4, %2\n" \ > >> > + " sc.w.aqrl %1, %z4, %2\n" \ > >> > " bnez %1, 0b\n" \ > >> > - " fence rw, rw\n" \ > >> > "1:\n" \ > >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > >> > : "rJ" ((long)__old), "rJ" (__new) \ > >> > @@ -319,9 +318,8 @@ > >> > __asm__ __volatile__ ( \ > >> > "0: lr.d %0, %2\n" \ > >> > " bne %0, %z3, 1f\n" \ > >> > - " sc.d.rl %1, %z4, %2\n" \ > >> > + " sc.d.aqrl %1, %z4, %2\n" \ > >> > " bnez %1, 0b\n" \ > >> > - " fence rw, rw\n" \ > >> > "1:\n" \ > >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > >> > : "rJ" (__old), "rJ" (__new) \
Guo, On Mon, Jun 13, 2022 at 07:49:50PM +0800, Guo Ren wrote: > On Thu, Jun 2, 2022 at 1:59 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > > On Sun, 22 May 2022 06:12:56 PDT (-0700), guoren@kernel.org wrote: > > > On Sun, May 22, 2022 at 4:46 AM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > >> > > >> On Wed, 04 May 2022 20:55:26 PDT (-0700), guoren@kernel.org wrote: > > >> > From: Guo Ren <guoren@linux.alibaba.com> > > >> > > > >> > The current implementation is the same with 8e86f0b409a4 > > >> > ("arm64: atomics: fix use of acquire + release for full barrier > > >> > semantics"). RISC-V could combine acquire and release into the SC > > >> > instructions and it could reduce a fence instruction to gain better > > >> > performance. Here is related descriptio from RISC-V ISA 10.2 > > >> > Load-Reserved/Store-Conditional Instructions: > > >> > > > >> > - .aq: The LR/SC sequence can be given acquire semantics by > > >> > setting the aq bit on the LR instruction. > > >> > - .rl: The LR/SC sequence can be given release semantics by > > >> > setting the rl bit on the SC instruction. > > >> > - .aqrl: Setting the aq bit on the LR instruction, and setting > > >> > both the aq and the rl bit on the SC instruction makes > > >> > the LR/SC sequence sequentially consistent, meaning that > > >> > it cannot be reordered with earlier or later memory > > >> > operations from the same hart. > > >> > > > >> > Software should not set the rl bit on an LR instruction unless > > >> > the aq bit is also set, nor should software set the aq bit on an > > >> > SC instruction unless the rl bit is also set. LR.rl and SC.aq > > >> > instructions are not guaranteed to provide any stronger ordering > > >> > than those with both bits clear, but may result in lower > > >> > performance. > > >> > > > >> > The only difference is when sc.w/d.aqrl failed, it would cause .aq > > >> > effect than before. But it's okay for sematic because overlap > > >> > address LR couldn't beyond relating SC. > > >> > > >> IIUC that's not accurate, or at least wasn't in 2018. The ISA tends to > > >> drift around a bit, so it's possible things have changed since then. > > >> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > > Thx for explaining, that why I suspected with the sentence in the comment: > > > "which do not give full-ordering with .aqrl" > > > > Sorry, I'm not quite sure what you're saying here. My understanding is > > that this change results in mappings that violate LKMM, based on the > > rationale in that previous commit. IIUC that all still holds, but I'm > > not really a memory model person so I frequently miss stuff around > > there. > 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > is about fixup wrong spinlock/unlock implementation and not relate to > this patch. No. The commit in question is evidence of the fact that the changes you are presenting here (as an optimization) were buggy/incorrect at the time in which that commit was worked out. > Actually, sc.w.aqrl is very strong and the same with: > fence rw, rw > sc.w > fence rw,rw > > So "which do not give full-ordering with .aqrl" is not writen in > RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > > > > > >> describes the issue more specifically, that's when we added these > > >> fences. 
There have certainly been complains that these fences are too > > >> heavyweight for the HW to go fast, but IIUC it's the best option we have > > > Yeah, it would reduce the performance on D1 and our next-generation > > > processor has optimized fence performance a lot. > > > > Definately a bummer that the fences make the HW go slow, but I don't > > really see any other way to go about this. If you think these mappings > > are valid for LKMM and RVWMO then we should figure this out, but trying > > to drop fences to make HW go faster in ways that violate the memory > > model is going to lead to insanity. > Actually, this patch is okay with the ISA spec, and Dan also thought > it was valid. > > ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw "Thoughts" on this regard have _changed_. Please compare that quote with, e.g. https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ So here's a suggestion: Reviewers of your patches have asked: How come that code we used to consider as buggy is now considered "an optimization" (correct)? Denying the evidence or going around it is not making their job (and this upstreaming) easier, so why don't you address it? Take time to review previous works and discussions in this area, understand them, and integrate such knowledge in future submissions. Andrea > ------ > > 2. Using .aqrl to replace the fence rw, rw is okay to ISA manual, > > right? And reducing a fence instruction to gain better performance: > > "0: lr.w %[p], %[c]\n" > > " sub %[rc], %[p], %[o]\n" > > " bltz %[rc], 1f\n". > > - " sc.w.rl %[rc], %[rc], %[c]\n" > > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > " bnez %[rc], 0b\n" > > - " fence rw, rw\n" > > Yes, using .aqrl is valid. > > Dan > ------ > > > > > > I can definately buy the argument that we have mappings of various > > application memory models that are difficult to make fast given the ISA > > encodings of RVWMO we have, but that's not really an issue we can fix in > > the atomic mappings. > > > > >> given the current set of memory model primitives we implement in the > > >> ISA (ie, there's more in RVWMO but no way to encode that). > > >> > > >> The others all look good, though, and as these are really all > > >> independent cleanups I'm going to go ahead and put those three on > > >> for-next. > > >> > > >> There's also a bunch of checkpatch errors. The ones about "*" seem > > >> spurious, but the alignment ones aren't. Here's my fixups: > > > Thx for the fixup. > > > > > >> > > >> diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > > >> index 34f757dfc8f2..0bde499fa8bc 100644 > > >> --- a/arch/riscv/include/asm/atomic.h > > >> +++ b/arch/riscv/include/asm/atomic.h > > >> @@ -86,9 +86,9 @@ ATOMIC_OPS(xor, xor, i) > > >> * versions, while the logical ops only have fetch versions. 
> > >> */ > > >> #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > > >> - atomic##prefix##_t *v) \ > > >> +static __always_inline c_type \ > > >> +arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > > >> + atomic##prefix##_t *v) \ > > >> { \ > > >> register c_type ret; \ > > >> __asm__ __volatile__ ( \ > > >> @@ -98,9 +98,9 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > > >> : "memory"); \ > > >> return ret; \ > > >> } \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > > >> - atomic##prefix##_t *v) \ > > >> +static __always_inline c_type \ > > >> +arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > > >> + atomic##prefix##_t *v) \ > > >> { \ > > >> register c_type ret; \ > > >> __asm__ __volatile__ ( \ > > >> @@ -110,9 +110,9 @@ c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > > >> : "memory"); \ > > >> return ret; \ > > >> } \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > > >> - atomic##prefix##_t *v) \ > > >> +static __always_inline c_type \ > > >> +arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > > >> + atomic##prefix##_t *v) \ > > >> { \ > > >> register c_type ret; \ > > >> __asm__ __volatile__ ( \ > > >> @@ -122,8 +122,8 @@ c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > > >> : "memory"); \ > > >> return ret; \ > > >> } \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > > >> +static __always_inline c_type \ > > >> +arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > > >> { \ > > >> register c_type ret; \ > > >> __asm__ __volatile__ ( \ > > >> @@ -135,28 +135,28 @@ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > > >> } > > >> > > >> #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > > >> - atomic##prefix##_t *v) \ > > >> +static __always_inline c_type \ > > >> +arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > > >> + atomic##prefix##_t *v) \ > > >> { \ > > >> - return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > > >> + return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > > >> } \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > > >> - atomic##prefix##_t *v) \ > > >> +static __always_inline c_type \ > > >> +arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > > >> + atomic##prefix##_t *v) \ > > >> { \ > > >> - return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > > >> + return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > > >> } \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_##op##_return_release(c_type i, \ > > >> - atomic##prefix##_t *v) \ > > >> +static __always_inline c_type \ > > >> +arch_atomic##prefix##_##op##_return_release(c_type i, \ > > >> + atomic##prefix##_t *v) \ > > >> { \ > > >> - return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > > >> + return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > > >> } \ > > >> -static __always_inline \ > > >> -c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > > >> +static __always_inline 
c_type \ > > >> +arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > > >> { \ > > >> - return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > > >> + return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > > >> } > > >> > > >> #ifdef CONFIG_GENERIC_ATOMIC64 > > >> > > >> > > >> > > > >> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > > >> > Signed-off-by: Guo Ren <guoren@kernel.org> > > >> > Cc: Palmer Dabbelt <palmer@dabbelt.com> > > >> > Cc: Mark Rutland <mark.rutland@arm.com> > > >> > Cc: Dan Lustig <dlustig@nvidia.com> > > >> > Cc: Andrea Parri <parri.andrea@gmail.com> > > >> > --- > > >> > arch/riscv/include/asm/atomic.h | 24 ++++++++---------------- > > >> > arch/riscv/include/asm/cmpxchg.h | 6 ++---- > > >> > 2 files changed, 10 insertions(+), 20 deletions(-) > > >> > > > >> > diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > > >> > index 34f757dfc8f2..aef8aa9ac4f4 100644 > > >> > --- a/arch/riscv/include/asm/atomic.h > > >> > +++ b/arch/riscv/include/asm/atomic.h > > >> > @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int > > >> > "0: lr.w %[p], %[c]\n" > > >> > " beq %[p], %[u], 1f\n" > > >> > " add %[rc], %[p], %[a]\n" > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : [a]"r" (a), [u]"r" (u) > > >> > @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, > > >> > "0: lr.d %[p], %[c]\n" > > >> > " beq %[p], %[u], 1f\n" > > >> > " add %[rc], %[p], %[a]\n" > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : [a]"r" (a), [u]"r" (u) > > >> > @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) > > >> > "0: lr.w %[p], %[c]\n" > > >> > " bltz %[p], 1f\n" > > >> > " addi %[rc], %[p], 1\n" > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : > > >> > @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) > > >> > "0: lr.w %[p], %[c]\n" > > >> > " bgtz %[p], 1f\n" > > >> > " addi %[rc], %[p], -1\n" > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : > > >> > @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) > > >> > "0: lr.w %[p], %[c]\n" > > >> > " addi %[rc], %[p], -1\n" > > >> > " bltz %[rc], 1f\n" > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : > > >> > @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) > > >> > "0: lr.d %[p], %[c]\n" > > >> > " bltz %[p], 1f\n" > > >> > " addi %[rc], %[p], 1\n" > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > >> > + " 
sc.d.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : > > >> > @@ -465,9 +459,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) > > >> > "0: lr.d %[p], %[c]\n" > > >> > " bgtz %[p], 1f\n" > > >> > " addi %[rc], %[p], -1\n" > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : > > >> > @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) > > >> > "0: lr.d %[p], %[c]\n" > > >> > " addi %[rc], %[p], -1\n" > > >> > " bltz %[rc], 1f\n" > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > >> > " bnez %[rc], 0b\n" > > >> > - " fence rw, rw\n" > > >> > "1:\n" > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > >> > : > > >> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h > > >> > index 1af8db92250b..9269fceb86e0 100644 > > >> > --- a/arch/riscv/include/asm/cmpxchg.h > > >> > +++ b/arch/riscv/include/asm/cmpxchg.h > > >> > @@ -307,9 +307,8 @@ > > >> > __asm__ __volatile__ ( \ > > >> > "0: lr.w %0, %2\n" \ > > >> > " bne %0, %z3, 1f\n" \ > > >> > - " sc.w.rl %1, %z4, %2\n" \ > > >> > + " sc.w.aqrl %1, %z4, %2\n" \ > > >> > " bnez %1, 0b\n" \ > > >> > - " fence rw, rw\n" \ > > >> > "1:\n" \ > > >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > > >> > : "rJ" ((long)__old), "rJ" (__new) \ > > >> > @@ -319,9 +318,8 @@ > > >> > __asm__ __volatile__ ( \ > > >> > "0: lr.d %0, %2\n" \ > > >> > " bne %0, %z3, 1f\n" \ > > >> > - " sc.d.rl %1, %z4, %2\n" \ > > >> > + " sc.d.aqrl %1, %z4, %2\n" \ > > >> > " bnez %1, 0b\n" \ > > >> > - " fence rw, rw\n" \ > > >> > "1:\n" \ > > >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > > >> > : "rJ" (__old), "rJ" (__new) \ > > > > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/
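For readers following the hunks above, here is a minimal standalone sketch of the two mappings being compared for a fully-ordered 32-bit cmpxchg. These are illustrative functions, not the kernel code itself; the names are made up, and the constraints ("A", "rJ", "%z") mirror the quoted cmpxchg.h, assuming a RISC-V GCC/Clang toolchain:

/* Current in-tree mapping: release on the SC plus a trailing full fence. */
static inline int cmpxchg32_rl_fence(int *ptr, int old, int new_val)
{
	int ret, rc;

	__asm__ __volatile__ (
		"0:	lr.w      %0, %2\n"
		"	bne       %0, %z3, 1f\n"
		"	sc.w.rl   %1, %z4, %2\n"
		"	bnez      %1, 0b\n"
		"	fence     rw, rw\n"
		"1:\n"
		: "=&r" (ret), "=&r" (rc), "+A" (*ptr)
		: "rJ" (old), "rJ" (new_val)
		: "memory");
	return ret;
}

/* Proposed mapping: drop the trailing fence, rely on sc.w.aqrl. */
static inline int cmpxchg32_aqrl(int *ptr, int old, int new_val)
{
	int ret, rc;

	__asm__ __volatile__ (
		"0:	lr.w      %0, %2\n"
		"	bne       %0, %z3, 1f\n"
		"	sc.w.aqrl %1, %z4, %2\n"
		"	bnez      %1, 0b\n"
		"1:\n"
		: "=&r" (ret), "=&r" (rc), "+A" (*ptr)
		: "rJ" (old), "rJ" (new_val)
		: "memory");
	return ret;
}

Whether the second form is as strong as the first under RVWMO and LKMM is exactly what the rest of the thread debates.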
Hi, On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: [...] > > 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > is about fixup wrong spinlock/unlock implementation and not relate to > > this patch. > > No. The commit in question is evidence of the fact that the changes > you are presenting here (as an optimization) were buggy/incorrect at > the time in which that commit was worked out. > > > > Actually, sc.w.aqrl is very strong and the same with: > > fence rw, rw > > sc.w > > fence rw,rw > > > > So "which do not give full-ordering with .aqrl" is not writen in > > RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > > > > > > > > >> describes the issue more specifically, that's when we added these > > > >> fences. There have certainly been complains that these fences are too > > > >> heavyweight for the HW to go fast, but IIUC it's the best option we have > > > > Yeah, it would reduce the performance on D1 and our next-generation > > > > processor has optimized fence performance a lot. > > > > > > Definately a bummer that the fences make the HW go slow, but I don't > > > really see any other way to go about this. If you think these mappings > > > are valid for LKMM and RVWMO then we should figure this out, but trying > > > to drop fences to make HW go faster in ways that violate the memory > > > model is going to lead to insanity. > > Actually, this patch is okay with the ISA spec, and Dan also thought > > it was valid. > > > > ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > > "Thoughts" on this regard have _changed_. Please compare that quote > with, e.g. > > https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > > So here's a suggestion: > > Reviewers of your patches have asked: How come that code we used to > consider as buggy is now considered "an optimization" (correct)? > > Denying the evidence or going around it is not making their job (and > this upstreaming) easier, so why don't you address it? Take time to > review previous works and discussions in this area, understand them, > and integrate such knowledge in future submissions. > I agree with Andrea. And I actually took a look into this, and I think I find some explanation. There are two versions of RISV memory model here: Model 2017: released at Dec 1, 2017 as a draft https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ Model 2018: released at May 2, 2018 https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ Noted that previous conversation about commit 5ce6c1f3535f happened at March 2018. So the timeline is roughly: Model 2017 -> commit 5ce6c1f3535f -> Model 2018 And in the email thread of Model 2018, the commit related to model changes also got mentioned: https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a in that commit, we can see the changes related to sc.aqrl are: to have occurred between the LR and a successful SC. The LR/SC sequence can be given acquire semantics by setting the {\em aq} bit on -the SC instruction. The LR/SC sequence can be given release semantics -by setting the {\em rl} bit on the LR instruction. Setting both {\em - aq} and {\em rl} bits on the LR instruction, and setting the {\em - aq} bit on the SC instruction makes the LR/SC sequence sequentially -consistent with respect to other sequentially consistent atomic -operations. +the LR instruction. 
The LR/SC sequence can be given release semantics +by setting the {\em rl} bit on the SC instruction. Setting the {\em + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em + rl} bit on the SC instruction makes the LR/SC sequence sequentially +consistent, meaning that it cannot be reordered with earlier or +later memory operations from the same hart. note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered against "earlier or later memory operations from the same hart", and this statement was not in Model 2017. So my understanding of the story is that at some point between March and May 2018, RISV memory model folks decided to add this rule, which does look more consistent with other parts of the model and is useful. And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered barrier ;-) Now if my understanding is correct, to move forward, it's better that 1) this patch gets resend with the above information (better rewording a bit), and 2) gets an Acked-by from Dan to confirm this is a correct history ;-) Regards, Boqun > Andrea > > [...]
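As a concrete illustration of the 2018 wording quoted above, the pattern it calls sequentially consistent puts .aq on the LR and .aqrl on the SC. A hedged sketch of a fully-ordered fetch-and-increment built that way (hypothetical helper, not kernel code; note the patch under discussion uses a plain lr.w, without .aq, before sc.w.aqrl, which is the variant debated later in the thread):

static inline int fetch_inc_fully_ordered(int *p)
{
	int old, tmp;

	__asm__ __volatile__ (
		"0:	lr.w.aq   %0, %2\n"
		"	addi      %1, %0, 1\n"
		"	sc.w.aqrl %1, %1, %2\n"
		"	bnez      %1, 0b\n"
		: "=&r" (old), "=&r" (tmp), "+A" (*p)
		:
		: "memory");
	return old;
}

Per the quoted text, earlier and later memory operations from the same hart cannot be reordered across the successful LR/SC pair, which is what lets such a sequence stand in for a fully-ordered atomic.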
On 6/22/2022 11:31 PM, Boqun Feng wrote: > Hi, > > On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > [...] >>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") >>> is about fixup wrong spinlock/unlock implementation and not relate to >>> this patch. >> >> No. The commit in question is evidence of the fact that the changes >> you are presenting here (as an optimization) were buggy/incorrect at >> the time in which that commit was worked out. >> >> >>> Actually, sc.w.aqrl is very strong and the same with: >>> fence rw, rw >>> sc.w >>> fence rw,rw >>> >>> So "which do not give full-ordering with .aqrl" is not writen in >>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. >>> >>>> >>>>>> describes the issue more specifically, that's when we added these >>>>>> fences. There have certainly been complains that these fences are too >>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have >>>>> Yeah, it would reduce the performance on D1 and our next-generation >>>>> processor has optimized fence performance a lot. >>>> >>>> Definately a bummer that the fences make the HW go slow, but I don't >>>> really see any other way to go about this. If you think these mappings >>>> are valid for LKMM and RVWMO then we should figure this out, but trying >>>> to drop fences to make HW go faster in ways that violate the memory >>>> model is going to lead to insanity. >>> Actually, this patch is okay with the ISA spec, and Dan also thought >>> it was valid. >>> >>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw >> >> "Thoughts" on this regard have _changed_. Please compare that quote >> with, e.g. >> >> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ >> >> So here's a suggestion: >> >> Reviewers of your patches have asked: How come that code we used to >> consider as buggy is now considered "an optimization" (correct)? >> >> Denying the evidence or going around it is not making their job (and >> this upstreaming) easier, so why don't you address it? Take time to >> review previous works and discussions in this area, understand them, >> and integrate such knowledge in future submissions. >> > > I agree with Andrea. > > And I actually took a look into this, and I think I find some > explanation. There are two versions of RISV memory model here: > > Model 2017: released at Dec 1, 2017 as a draft > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > > Model 2018: released at May 2, 2018 > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > > Noted that previous conversation about commit 5ce6c1f3535f happened at > March 2018. So the timeline is roughly: > > Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > > And in the email thread of Model 2018, the commit related to model > changes also got mentioned: > > https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > > in that commit, we can see the changes related to sc.aqrl are: > > to have occurred between the LR and a successful SC. The LR/SC > sequence can be given acquire semantics by setting the {\em aq} bit on > -the SC instruction. The LR/SC sequence can be given release semantics > -by setting the {\em rl} bit on the LR instruction. 
Setting both {\em > - aq} and {\em rl} bits on the LR instruction, and setting the {\em > - aq} bit on the SC instruction makes the LR/SC sequence sequentially > -consistent with respect to other sequentially consistent atomic > -operations. > +the LR instruction. The LR/SC sequence can be given release semantics > +by setting the {\em rl} bit on the SC instruction. Setting the {\em > + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > + rl} bit on the SC instruction makes the LR/SC sequence sequentially > +consistent, meaning that it cannot be reordered with earlier or > +later memory operations from the same hart. > > note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > against "earlier or later memory operations from the same hart", and > this statement was not in Model 2017. > > So my understanding of the story is that at some point between March and > May 2018, RISV memory model folks decided to add this rule, which does > look more consistent with other parts of the model and is useful. > > And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > barrier ;-) > > Now if my understanding is correct, to move forward, it's better that 1) > this patch gets resend with the above information (better rewording a > bit), and 2) gets an Acked-by from Dan to confirm this is a correct > history ;-) I'm a bit lost as to why digging into RISC-V mailing list history is relevant here...what's relevant is what was ratified in the RVWMO chapter of the RISC-V spec, and whether the code you're proposing is the most optimized code that is correct wrt RVWMO. Is your claim that the code you're proposing to fix was based on a pre-RVWMO RISC-V memory model definition, and you're updating it to be more RVWMO-compliant? Dan > Regards, > Boqun > >> Andrea >> >> > [...]
On Thu, Jun 23, 2022 at 01:09:23PM -0400, Dan Lustig wrote: > On 6/22/2022 11:31 PM, Boqun Feng wrote: > > Hi, > > > > On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > > [...] > >>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > >>> is about fixup wrong spinlock/unlock implementation and not relate to > >>> this patch. > >> > >> No. The commit in question is evidence of the fact that the changes > >> you are presenting here (as an optimization) were buggy/incorrect at > >> the time in which that commit was worked out. > >> > >> > >>> Actually, sc.w.aqrl is very strong and the same with: > >>> fence rw, rw > >>> sc.w > >>> fence rw,rw > >>> > >>> So "which do not give full-ordering with .aqrl" is not writen in > >>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > >>> > >>>> > >>>>>> describes the issue more specifically, that's when we added these > >>>>>> fences. There have certainly been complains that these fences are too > >>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have > >>>>> Yeah, it would reduce the performance on D1 and our next-generation > >>>>> processor has optimized fence performance a lot. > >>>> > >>>> Definately a bummer that the fences make the HW go slow, but I don't > >>>> really see any other way to go about this. If you think these mappings > >>>> are valid for LKMM and RVWMO then we should figure this out, but trying > >>>> to drop fences to make HW go faster in ways that violate the memory > >>>> model is going to lead to insanity. > >>> Actually, this patch is okay with the ISA spec, and Dan also thought > >>> it was valid. > >>> > >>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > >> > >> "Thoughts" on this regard have _changed_. Please compare that quote > >> with, e.g. > >> > >> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > >> > >> So here's a suggestion: > >> > >> Reviewers of your patches have asked: How come that code we used to > >> consider as buggy is now considered "an optimization" (correct)? > >> > >> Denying the evidence or going around it is not making their job (and > >> this upstreaming) easier, so why don't you address it? Take time to > >> review previous works and discussions in this area, understand them, > >> and integrate such knowledge in future submissions. > >> > > > > I agree with Andrea. > > > > And I actually took a look into this, and I think I find some > > explanation. There are two versions of RISV memory model here: > > > > Model 2017: released at Dec 1, 2017 as a draft > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > > > > Model 2018: released at May 2, 2018 > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > > > > Noted that previous conversation about commit 5ce6c1f3535f happened at > > March 2018. So the timeline is roughly: > > > > Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > > > > And in the email thread of Model 2018, the commit related to model > > changes also got mentioned: > > > > https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > > > > in that commit, we can see the changes related to sc.aqrl are: > > > > to have occurred between the LR and a successful SC. The LR/SC > > sequence can be given acquire semantics by setting the {\em aq} bit on > > -the SC instruction. 
The LR/SC sequence can be given release semantics > > -by setting the {\em rl} bit on the LR instruction. Setting both {\em > > - aq} and {\em rl} bits on the LR instruction, and setting the {\em > > - aq} bit on the SC instruction makes the LR/SC sequence sequentially > > -consistent with respect to other sequentially consistent atomic > > -operations. > > +the LR instruction. The LR/SC sequence can be given release semantics > > +by setting the {\em rl} bit on the SC instruction. Setting the {\em > > + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > > + rl} bit on the SC instruction makes the LR/SC sequence sequentially > > +consistent, meaning that it cannot be reordered with earlier or > > +later memory operations from the same hart. > > > > note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > > against "earlier or later memory operations from the same hart", and > > this statement was not in Model 2017. > > > > So my understanding of the story is that at some point between March and > > May 2018, RISV memory model folks decided to add this rule, which does > > look more consistent with other parts of the model and is useful. > > > > And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > > barrier ;-) > > > > Now if my understanding is correct, to move forward, it's better that 1) > > this patch gets resend with the above information (better rewording a > > bit), and 2) gets an Acked-by from Dan to confirm this is a correct > > history ;-) > > I'm a bit lost as to why digging into RISC-V mailing list history is > relevant here...what's relevant is what was ratified in the RVWMO > chapter of the RISC-V spec, and whether the code you're proposing > is the most optimized code that is correct wrt RVWMO. > > Is your claim that the code you're proposing to fix was based on a > pre-RVWMO RISC-V memory model definition, and you're updating it to > be more RVWMO-compliant? > Well, first it's not my code ;-) The thing is that this patch proposed by Guo Ren kinda fixes/revertes a previous commit 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences"). It looks to me that Guo Ren's patch fits the current RISCV memory model and Linux kernel memory model, but the question is that commit 5ce6c1f3535f was also a fix, so why do we change things back and forth? If I understand correctly, this is also what Palmer and Andrea asked for. My understanding is that commit 5ce6c1f3535f was based on a draft RVWMO that was different than the current one. I'd love to record this difference in the commit log of Guo Ren's patch, so that later on we know why we changed things back and forth. To do so, the confirmation from RVWMO authors is helpful. Hope that I make things more clear ;-) Regards, Boqun > Dan > > > Regards, > > Boqun > > > >> Andrea > >> > >> > > [...]
On Thu, 23 Jun 2022 10:55:11 PDT (-0700), boqun.feng@gmail.com wrote: > On Thu, Jun 23, 2022 at 01:09:23PM -0400, Dan Lustig wrote: >> On 6/22/2022 11:31 PM, Boqun Feng wrote: >> > Hi, >> > >> > On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: >> > [...] >> >>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") >> >>> is about fixup wrong spinlock/unlock implementation and not relate to >> >>> this patch. >> >> >> >> No. The commit in question is evidence of the fact that the changes >> >> you are presenting here (as an optimization) were buggy/incorrect at >> >> the time in which that commit was worked out. >> >> >> >> >> >>> Actually, sc.w.aqrl is very strong and the same with: >> >>> fence rw, rw >> >>> sc.w >> >>> fence rw,rw >> >>> >> >>> So "which do not give full-ordering with .aqrl" is not writen in >> >>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. >> >>> >> >>>> >> >>>>>> describes the issue more specifically, that's when we added these >> >>>>>> fences. There have certainly been complains that these fences are too >> >>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have >> >>>>> Yeah, it would reduce the performance on D1 and our next-generation >> >>>>> processor has optimized fence performance a lot. >> >>>> >> >>>> Definately a bummer that the fences make the HW go slow, but I don't >> >>>> really see any other way to go about this. If you think these mappings >> >>>> are valid for LKMM and RVWMO then we should figure this out, but trying >> >>>> to drop fences to make HW go faster in ways that violate the memory >> >>>> model is going to lead to insanity. >> >>> Actually, this patch is okay with the ISA spec, and Dan also thought >> >>> it was valid. >> >>> >> >>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw >> >> >> >> "Thoughts" on this regard have _changed_. Please compare that quote >> >> with, e.g. >> >> >> >> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ >> >> >> >> So here's a suggestion: >> >> >> >> Reviewers of your patches have asked: How come that code we used to >> >> consider as buggy is now considered "an optimization" (correct)? >> >> >> >> Denying the evidence or going around it is not making their job (and >> >> this upstreaming) easier, so why don't you address it? Take time to >> >> review previous works and discussions in this area, understand them, >> >> and integrate such knowledge in future submissions. >> >> >> > >> > I agree with Andrea. >> > >> > And I actually took a look into this, and I think I find some >> > explanation. There are two versions of RISV memory model here: >> > >> > Model 2017: released at Dec 1, 2017 as a draft >> > >> > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ >> > >> > Model 2018: released at May 2, 2018 >> > >> > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ >> > >> > Noted that previous conversation about commit 5ce6c1f3535f happened at >> > March 2018. So the timeline is roughly: >> > >> > Model 2017 -> commit 5ce6c1f3535f -> Model 2018 >> > >> > And in the email thread of Model 2018, the commit related to model >> > changes also got mentioned: >> > >> > https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a >> > >> > in that commit, we can see the changes related to sc.aqrl are: >> > >> > to have occurred between the LR and a successful SC. 
The LR/SC >> > sequence can be given acquire semantics by setting the {\em aq} bit on >> > -the SC instruction. The LR/SC sequence can be given release semantics >> > -by setting the {\em rl} bit on the LR instruction. Setting both {\em >> > - aq} and {\em rl} bits on the LR instruction, and setting the {\em >> > - aq} bit on the SC instruction makes the LR/SC sequence sequentially >> > -consistent with respect to other sequentially consistent atomic >> > -operations. >> > +the LR instruction. The LR/SC sequence can be given release semantics >> > +by setting the {\em rl} bit on the SC instruction. Setting the {\em >> > + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em >> > + rl} bit on the SC instruction makes the LR/SC sequence sequentially >> > +consistent, meaning that it cannot be reordered with earlier or >> > +later memory operations from the same hart. >> > >> > note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered >> > against "earlier or later memory operations from the same hart", and >> > this statement was not in Model 2017. >> > >> > So my understanding of the story is that at some point between March and >> > May 2018, RISV memory model folks decided to add this rule, which does >> > look more consistent with other parts of the model and is useful. >> > >> > And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered >> > barrier ;-) >> > >> > Now if my understanding is correct, to move forward, it's better that 1) >> > this patch gets resend with the above information (better rewording a >> > bit), and 2) gets an Acked-by from Dan to confirm this is a correct >> > history ;-) >> >> I'm a bit lost as to why digging into RISC-V mailing list history is >> relevant here...what's relevant is what was ratified in the RVWMO >> chapter of the RISC-V spec, and whether the code you're proposing >> is the most optimized code that is correct wrt RVWMO. >> >> Is your claim that the code you're proposing to fix was based on a >> pre-RVWMO RISC-V memory model definition, and you're updating it to >> be more RVWMO-compliant? >> > > Well, first it's not my code ;-) > > The thing is that this patch proposed by Guo Ren kinda fixes/revertes a > previous commit 5ce6c1f3535f ("riscv/atomic: Strengthen implementations > with fences"). It looks to me that Guo Ren's patch fits the current > RISCV memory model and Linux kernel memory model, but the question is > that commit 5ce6c1f3535f was also a fix, so why do we change things > back and forth? If I understand correctly, this is also what Palmer and > Andrea asked for. That's essentially my confusion. I'm not really a formal memory model guy so I can easily get lost in these bits, but when I saw it I remembered having looked at a fix before. I dug that up, saw it was from someone who likley knows a lot more about formal memory models than I do, and thus figured I'd ask everyone to see what's up. IMO if that original fix was made to a pre-ratification version of WMO, this new version is legal WRT the ratified WMO then the code change is fine to take on for-next. That said, we should be explicit about why it's legal and why the reasoning in the previous patch is no loger connect, just to make sure everyone can follow along. > My understanding is that commit 5ce6c1f3535f was based on a draft RVWMO > that was different than the current one. I'd love to record this > difference in the commit log of Guo Ren's patch, so that later on we > know why we changed things back and forth. 
To do so, the confirmation > from RVWMO authors is helpful. Agreed. IMO that's always good hygiene, but it's extra important when we're dealing with external specifications in a complex field like formal models. > Hope that I make things more clear ;-) > > Regards, > Boqun > >> Dan >> >> > Regards, >> > Boqun >> > >> >> Andrea >> >> >> >> >> > [...]
On Tue, Jun 14, 2022 at 7:03 PM Andrea Parri <parri.andrea@gmail.com> wrote: > > Guo, > > On Mon, Jun 13, 2022 at 07:49:50PM +0800, Guo Ren wrote: > > On Thu, Jun 2, 2022 at 1:59 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > > > > On Sun, 22 May 2022 06:12:56 PDT (-0700), guoren@kernel.org wrote: > > > > On Sun, May 22, 2022 at 4:46 AM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > >> > > > >> On Wed, 04 May 2022 20:55:26 PDT (-0700), guoren@kernel.org wrote: > > > >> > From: Guo Ren <guoren@linux.alibaba.com> > > > >> > > > > >> > The current implementation is the same with 8e86f0b409a4 > > > >> > ("arm64: atomics: fix use of acquire + release for full barrier > > > >> > semantics"). RISC-V could combine acquire and release into the SC > > > >> > instructions and it could reduce a fence instruction to gain better > > > >> > performance. Here is related descriptio from RISC-V ISA 10.2 > > > >> > Load-Reserved/Store-Conditional Instructions: > > > >> > > > > >> > - .aq: The LR/SC sequence can be given acquire semantics by > > > >> > setting the aq bit on the LR instruction. > > > >> > - .rl: The LR/SC sequence can be given release semantics by > > > >> > setting the rl bit on the SC instruction. > > > >> > - .aqrl: Setting the aq bit on the LR instruction, and setting > > > >> > both the aq and the rl bit on the SC instruction makes > > > >> > the LR/SC sequence sequentially consistent, meaning that > > > >> > it cannot be reordered with earlier or later memory > > > >> > operations from the same hart. > > > >> > > > > >> > Software should not set the rl bit on an LR instruction unless > > > >> > the aq bit is also set, nor should software set the aq bit on an > > > >> > SC instruction unless the rl bit is also set. LR.rl and SC.aq > > > >> > instructions are not guaranteed to provide any stronger ordering > > > >> > than those with both bits clear, but may result in lower > > > >> > performance. > > > >> > > > > >> > The only difference is when sc.w/d.aqrl failed, it would cause .aq > > > >> > effect than before. But it's okay for sematic because overlap > > > >> > address LR couldn't beyond relating SC. > > > >> > > > >> IIUC that's not accurate, or at least wasn't in 2018. The ISA tends to > > > >> drift around a bit, so it's possible things have changed since then. > > > >> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > > > Thx for explaining, that why I suspected with the sentence in the comment: > > > > "which do not give full-ordering with .aqrl" > > > > > > Sorry, I'm not quite sure what you're saying here. My understanding is > > > that this change results in mappings that violate LKMM, based on the > > > rationale in that previous commit. IIUC that all still holds, but I'm > > > not really a memory model person so I frequently miss stuff around > > > there. > > 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > is about fixup wrong spinlock/unlock implementation and not relate to > > this patch. > > No. The commit in question is evidence of the fact that the changes > you are presenting here (as an optimization) were buggy/incorrect at > the time in which that commit was worked out. I think I've mixed it with 0123f4d76ca6 ("riscv/spinlock: Strengthen implementations with fences"). > > > > Actually, sc.w.aqrl is very strong and the same with: > > fence rw, rw > > sc.w > > fence rw,rw > > > > So "which do not give full-ordering with .aqrl" is not writen in > > RISC-V ISA and we could use sc.w/d.aqrl with LKMM. 
> > > > > > > > >> describes the issue more specifically, that's when we added these > > > >> fences. There have certainly been complains that these fences are too > > > >> heavyweight for the HW to go fast, but IIUC it's the best option we have > > > > Yeah, it would reduce the performance on D1 and our next-generation > > > > processor has optimized fence performance a lot. > > > > > > Definately a bummer that the fences make the HW go slow, but I don't > > > really see any other way to go about this. If you think these mappings > > > are valid for LKMM and RVWMO then we should figure this out, but trying > > > to drop fences to make HW go faster in ways that violate the memory > > > model is going to lead to insanity. > > Actually, this patch is okay with the ISA spec, and Dan also thought > > it was valid. > > > > ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > > "Thoughts" on this regard have _changed_. Please compare that quote > with, e.g. > > https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > > So here's a suggestion: > > Reviewers of your patches have asked: How come that code we used to > consider as buggy is now considered "an optimization" (correct)? > > Denying the evidence or going around it is not making their job (and > this upstreaming) easier, so why don't you address it? Take time to > review previous works and discussions in this area, understand them, > and integrate such knowledge in future submissions. > > Andrea > > > > ------ > > > 2. Using .aqrl to replace the fence rw, rw is okay to ISA manual, > > > right? And reducing a fence instruction to gain better performance: > > > "0: lr.w %[p], %[c]\n" > > > " sub %[rc], %[p], %[o]\n" > > > " bltz %[rc], 1f\n". > > > - " sc.w.rl %[rc], %[rc], %[c]\n" > > > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > > " bnez %[rc], 0b\n" > > > - " fence rw, rw\n" > > > > Yes, using .aqrl is valid. > > > > Dan > > ------ > > > > > > > > > > I can definately buy the argument that we have mappings of various > > > application memory models that are difficult to make fast given the ISA > > > encodings of RVWMO we have, but that's not really an issue we can fix in > > > the atomic mappings. > > > > > > >> given the current set of memory model primitives we implement in the > > > >> ISA (ie, there's more in RVWMO but no way to encode that). > > > >> > > > >> The others all look good, though, and as these are really all > > > >> independent cleanups I'm going to go ahead and put those three on > > > >> for-next. > > > >> > > > >> There's also a bunch of checkpatch errors. The ones about "*" seem > > > >> spurious, but the alignment ones aren't. Here's my fixups: > > > > Thx for the fixup. > > > > > > > >> > > > >> diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > > > >> index 34f757dfc8f2..0bde499fa8bc 100644 > > > >> --- a/arch/riscv/include/asm/atomic.h > > > >> +++ b/arch/riscv/include/asm/atomic.h > > > >> @@ -86,9 +86,9 @@ ATOMIC_OPS(xor, xor, i) > > > >> * versions, while the logical ops only have fetch versions. 
> > > >> */ > > > >> #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > > > >> - atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > > > >> + atomic##prefix##_t *v) \ > > > >> { \ > > > >> register c_type ret; \ > > > >> __asm__ __volatile__ ( \ > > > >> @@ -98,9 +98,9 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ > > > >> : "memory"); \ > > > >> return ret; \ > > > >> } \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > > > >> - atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > > > >> + atomic##prefix##_t *v) \ > > > >> { \ > > > >> register c_type ret; \ > > > >> __asm__ __volatile__ ( \ > > > >> @@ -110,9 +110,9 @@ c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i, \ > > > >> : "memory"); \ > > > >> return ret; \ > > > >> } \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > > > >> - atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > > > >> + atomic##prefix##_t *v) \ > > > >> { \ > > > >> register c_type ret; \ > > > >> __asm__ __volatile__ ( \ > > > >> @@ -122,8 +122,8 @@ c_type arch_atomic##prefix##_fetch_##op##_release(c_type i, \ > > > >> : "memory"); \ > > > >> return ret; \ > > > >> } \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > > > >> { \ > > > >> register c_type ret; \ > > > >> __asm__ __volatile__ ( \ > > > >> @@ -135,28 +135,28 @@ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ > > > >> } > > > >> > > > >> #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > > > >> - atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ > > > >> + atomic##prefix##_t *v) \ > > > >> { \ > > > >> - return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > > > >> + return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ > > > >> } \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > > > >> - atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_##op##_return_acquire(c_type i, \ > > > >> + atomic##prefix##_t *v) \ > > > >> { \ > > > >> - return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > > > >> + return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I; \ > > > >> } \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_##op##_return_release(c_type i, \ > > > >> - atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_##op##_return_release(c_type i, \ > > > >> + atomic##prefix##_t *v) \ > > > >> { \ > > > >> - return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > > > >> + return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I; \ > 
> > >> } \ > > > >> -static __always_inline \ > > > >> -c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > > > >> +static __always_inline c_type \ > > > >> +arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ > > > >> { \ > > > >> - return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > > > >> + return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ > > > >> } > > > >> > > > >> #ifdef CONFIG_GENERIC_ATOMIC64 > > > >> > > > >> > > > >> > > > > >> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > > > >> > Signed-off-by: Guo Ren <guoren@kernel.org> > > > >> > Cc: Palmer Dabbelt <palmer@dabbelt.com> > > > >> > Cc: Mark Rutland <mark.rutland@arm.com> > > > >> > Cc: Dan Lustig <dlustig@nvidia.com> > > > >> > Cc: Andrea Parri <parri.andrea@gmail.com> > > > >> > --- > > > >> > arch/riscv/include/asm/atomic.h | 24 ++++++++---------------- > > > >> > arch/riscv/include/asm/cmpxchg.h | 6 ++---- > > > >> > 2 files changed, 10 insertions(+), 20 deletions(-) > > > >> > > > > >> > diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h > > > >> > index 34f757dfc8f2..aef8aa9ac4f4 100644 > > > >> > --- a/arch/riscv/include/asm/atomic.h > > > >> > +++ b/arch/riscv/include/asm/atomic.h > > > >> > @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int > > > >> > "0: lr.w %[p], %[c]\n" > > > >> > " beq %[p], %[u], 1f\n" > > > >> > " add %[rc], %[p], %[a]\n" > > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : [a]"r" (a), [u]"r" (u) > > > >> > @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, > > > >> > "0: lr.d %[p], %[c]\n" > > > >> > " beq %[p], %[u], 1f\n" > > > >> > " add %[rc], %[p], %[a]\n" > > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : [a]"r" (a), [u]"r" (u) > > > >> > @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) > > > >> > "0: lr.w %[p], %[c]\n" > > > >> > " bltz %[p], 1f\n" > > > >> > " addi %[rc], %[p], 1\n" > > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : > > > >> > @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) > > > >> > "0: lr.w %[p], %[c]\n" > > > >> > " bgtz %[p], 1f\n" > > > >> > " addi %[rc], %[p], -1\n" > > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : > > > >> > @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) > > > >> > "0: lr.w %[p], %[c]\n" > > > >> > " addi %[rc], %[p], -1\n" > > > >> > " bltz %[rc], 1f\n" > > > >> > - " sc.w.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.w.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : 
[p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : > > > >> > @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) > > > >> > "0: lr.d %[p], %[c]\n" > > > >> > " bltz %[p], 1f\n" > > > >> > " addi %[rc], %[p], 1\n" > > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : > > > >> > @@ -465,9 +459,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) > > > >> > "0: lr.d %[p], %[c]\n" > > > >> > " bgtz %[p], 1f\n" > > > >> > " addi %[rc], %[p], -1\n" > > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : > > > >> > @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) > > > >> > "0: lr.d %[p], %[c]\n" > > > >> > " addi %[rc], %[p], -1\n" > > > >> > " bltz %[rc], 1f\n" > > > >> > - " sc.d.rl %[rc], %[rc], %[c]\n" > > > >> > + " sc.d.aqrl %[rc], %[rc], %[c]\n" > > > >> > " bnez %[rc], 0b\n" > > > >> > - " fence rw, rw\n" > > > >> > "1:\n" > > > >> > : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) > > > >> > : > > > >> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h > > > >> > index 1af8db92250b..9269fceb86e0 100644 > > > >> > --- a/arch/riscv/include/asm/cmpxchg.h > > > >> > +++ b/arch/riscv/include/asm/cmpxchg.h > > > >> > @@ -307,9 +307,8 @@ > > > >> > __asm__ __volatile__ ( \ > > > >> > "0: lr.w %0, %2\n" \ > > > >> > " bne %0, %z3, 1f\n" \ > > > >> > - " sc.w.rl %1, %z4, %2\n" \ > > > >> > + " sc.w.aqrl %1, %z4, %2\n" \ > > > >> > " bnez %1, 0b\n" \ > > > >> > - " fence rw, rw\n" \ > > > >> > "1:\n" \ > > > >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > > > >> > : "rJ" ((long)__old), "rJ" (__new) \ > > > >> > @@ -319,9 +318,8 @@ > > > >> > __asm__ __volatile__ ( \ > > > >> > "0: lr.d %0, %2\n" \ > > > >> > " bne %0, %z3, 1f\n" \ > > > >> > - " sc.d.rl %1, %z4, %2\n" \ > > > >> > + " sc.d.aqrl %1, %z4, %2\n" \ > > > >> > " bnez %1, 0b\n" \ > > > >> > - " fence rw, rw\n" \ > > > >> > "1:\n" \ > > > >> > : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ > > > >> > : "rJ" (__old), "rJ" (__new) \ > > > > > > > > -- > > Best Regards > > Guo Ren > > > > ML: https://lore.kernel.org/linux-csky/ -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
q On Fri, Jun 24, 2022 at 1:55 AM Boqun Feng <boqun.feng@gmail.com> wrote: > > On Thu, Jun 23, 2022 at 01:09:23PM -0400, Dan Lustig wrote: > > On 6/22/2022 11:31 PM, Boqun Feng wrote: > > > Hi, > > > > > > On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > > > [...] > > >>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > >>> is about fixup wrong spinlock/unlock implementation and not relate to > > >>> this patch. > > >> > > >> No. The commit in question is evidence of the fact that the changes > > >> you are presenting here (as an optimization) were buggy/incorrect at > > >> the time in which that commit was worked out. > > >> > > >> > > >>> Actually, sc.w.aqrl is very strong and the same with: > > >>> fence rw, rw > > >>> sc.w > > >>> fence rw,rw > > >>> > > >>> So "which do not give full-ordering with .aqrl" is not writen in > > >>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > > >>> > > >>>> > > >>>>>> describes the issue more specifically, that's when we added these > > >>>>>> fences. There have certainly been complains that these fences are too > > >>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have > > >>>>> Yeah, it would reduce the performance on D1 and our next-generation > > >>>>> processor has optimized fence performance a lot. > > >>>> > > >>>> Definately a bummer that the fences make the HW go slow, but I don't > > >>>> really see any other way to go about this. If you think these mappings > > >>>> are valid for LKMM and RVWMO then we should figure this out, but trying > > >>>> to drop fences to make HW go faster in ways that violate the memory > > >>>> model is going to lead to insanity. > > >>> Actually, this patch is okay with the ISA spec, and Dan also thought > > >>> it was valid. > > >>> > > >>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > > >> > > >> "Thoughts" on this regard have _changed_. Please compare that quote > > >> with, e.g. > > >> > > >> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > > >> > > >> So here's a suggestion: > > >> > > >> Reviewers of your patches have asked: How come that code we used to > > >> consider as buggy is now considered "an optimization" (correct)? > > >> > > >> Denying the evidence or going around it is not making their job (and > > >> this upstreaming) easier, so why don't you address it? Take time to > > >> review previous works and discussions in this area, understand them, > > >> and integrate such knowledge in future submissions. > > >> > > > > > > I agree with Andrea. > > > > > > And I actually took a look into this, and I think I find some > > > explanation. There are two versions of RISV memory model here: > > > > > > Model 2017: released at Dec 1, 2017 as a draft > > > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > > > > > > Model 2018: released at May 2, 2018 > > > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > > > > > > Noted that previous conversation about commit 5ce6c1f3535f happened at > > > March 2018. 
So the timeline is roughly: > > > > > > Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > > > > > > And in the email thread of Model 2018, the commit related to model > > > changes also got mentioned: > > > > > > https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > > > > > > in that commit, we can see the changes related to sc.aqrl are: > > > > > > to have occurred between the LR and a successful SC. The LR/SC > > > sequence can be given acquire semantics by setting the {\em aq} bit on > > > -the SC instruction. The LR/SC sequence can be given release semantics > > > -by setting the {\em rl} bit on the LR instruction. Setting both {\em > > > - aq} and {\em rl} bits on the LR instruction, and setting the {\em > > > - aq} bit on the SC instruction makes the LR/SC sequence sequentially > > > -consistent with respect to other sequentially consistent atomic > > > -operations. > > > +the LR instruction. The LR/SC sequence can be given release semantics > > > +by setting the {\em rl} bit on the SC instruction. Setting the {\em > > > + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > > > + rl} bit on the SC instruction makes the LR/SC sequence sequentially > > > +consistent, meaning that it cannot be reordered with earlier or > > > +later memory operations from the same hart. > > > > > > note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > > > against "earlier or later memory operations from the same hart", and > > > this statement was not in Model 2017. > > > > > > So my understanding of the story is that at some point between March and > > > May 2018, RISV memory model folks decided to add this rule, which does > > > look more consistent with other parts of the model and is useful. > > > > > > And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > > > barrier ;-) > > > > > > Now if my understanding is correct, to move forward, it's better that 1) > > > this patch gets resend with the above information (better rewording a > > > bit), and 2) gets an Acked-by from Dan to confirm this is a correct > > > history ;-) > > > > I'm a bit lost as to why digging into RISC-V mailing list history is > > relevant here...what's relevant is what was ratified in the RVWMO > > chapter of the RISC-V spec, and whether the code you're proposing > > is the most optimized code that is correct wrt RVWMO. > > > > Is your claim that the code you're proposing to fix was based on a > > pre-RVWMO RISC-V memory model definition, and you're updating it to > > be more RVWMO-compliant? > > > > Well, first it's not my code ;-) > > The thing is that this patch proposed by Guo Ren kinda fixes/revertes a > previous commit 5ce6c1f3535f ("riscv/atomic: Strengthen implementations > with fences"). It looks to me that Guo Ren's patch fits the current > RISCV memory model and Linux kernel memory model, but the question is > that commit 5ce6c1f3535f was also a fix, so why do we change things > back and forth? If I understand correctly, this is also what Palmer and > Andrea asked for. I think the patch is still a little different from 5ce6c1f3535f: before: lr.w.aqrl + sc.w.aqrl 5ce6c1f3535f: lr.w + sc.w.rl + fence this patch: lr.w + sc.w.aqrl > > My understanding is that commit 5ce6c1f3535f was based on a draft RVWMO > that was different than the current one. I'd love to record this > difference in the commit log of Guo Ren's patch, so that later on we > know why we changed things back and forth. 
To do so, the confirmation > from RVWMO authors is helpful. Thx for the effort on the patch and for all the related history information, which makes this much clearer to me. The motivation of this patch is to save one fence instruction and let .aqrl provide the RCsc synchronization point. We also have used .aqrl in ATOMIC_FETCH_OP: " amo" #asm_op "." #asm_type ".aqrl %1, %2, %0" \ I think using .aqrl would be better, and unifying atomic.h into one style would be clearer. (Some are .aqrl, some are .rl + fence.) > > Hope that I make things more clear ;-) > > Regards, > Boqun > > > Dan > > > > > Regards, > > > Boqun > > > > > >> Andrea > > >> > > >> > > > [...] -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
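The AMO form Guo Ren points at already encodes full ordering in a single instruction. A minimal sketch of what the quoted ATOMIC_FETCH_OP line expands to for a fully-ordered fetch_add (illustrative standalone function, not the kernel macro; operand constraints follow the quoted macro):

static inline int fetch_add_amo_aqrl(int *counter, int i)
{
	int ret;

	__asm__ __volatile__ (
		"	amoadd.w.aqrl %1, %2, %0"
		: "+A" (*counter), "=r" (ret)
		: "r" (i)
		: "memory");
	return ret;
}

So the inconsistency being pointed out is confined to the LR/SC loops, where some sequences use .aqrl and others use .rl plus a trailing fence.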
On Fri, Jun 24, 2022 at 1:09 AM Dan Lustig <dlustig@nvidia.com> wrote: > > On 6/22/2022 11:31 PM, Boqun Feng wrote: > > Hi, > > > > On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > > [...] > >>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > >>> is about fixup wrong spinlock/unlock implementation and not relate to > >>> this patch. > >> > >> No. The commit in question is evidence of the fact that the changes > >> you are presenting here (as an optimization) were buggy/incorrect at > >> the time in which that commit was worked out. > >> > >> > >>> Actually, sc.w.aqrl is very strong and the same with: > >>> fence rw, rw > >>> sc.w > >>> fence rw,rw > >>> > >>> So "which do not give full-ordering with .aqrl" is not writen in > >>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > >>> > >>>> > >>>>>> describes the issue more specifically, that's when we added these > >>>>>> fences. There have certainly been complains that these fences are too > >>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have > >>>>> Yeah, it would reduce the performance on D1 and our next-generation > >>>>> processor has optimized fence performance a lot. > >>>> > >>>> Definately a bummer that the fences make the HW go slow, but I don't > >>>> really see any other way to go about this. If you think these mappings > >>>> are valid for LKMM and RVWMO then we should figure this out, but trying > >>>> to drop fences to make HW go faster in ways that violate the memory > >>>> model is going to lead to insanity. > >>> Actually, this patch is okay with the ISA spec, and Dan also thought > >>> it was valid. > >>> > >>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > >> > >> "Thoughts" on this regard have _changed_. Please compare that quote > >> with, e.g. > >> > >> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > >> > >> So here's a suggestion: > >> > >> Reviewers of your patches have asked: How come that code we used to > >> consider as buggy is now considered "an optimization" (correct)? > >> > >> Denying the evidence or going around it is not making their job (and > >> this upstreaming) easier, so why don't you address it? Take time to > >> review previous works and discussions in this area, understand them, > >> and integrate such knowledge in future submissions. > >> > > > > I agree with Andrea. > > > > And I actually took a look into this, and I think I find some > > explanation. There are two versions of RISV memory model here: > > > > Model 2017: released at Dec 1, 2017 as a draft > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > > > > Model 2018: released at May 2, 2018 > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > > > > Noted that previous conversation about commit 5ce6c1f3535f happened at > > March 2018. So the timeline is roughly: > > > > Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > > > > And in the email thread of Model 2018, the commit related to model > > changes also got mentioned: > > > > https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > > > > in that commit, we can see the changes related to sc.aqrl are: > > > > to have occurred between the LR and a successful SC. The LR/SC > > sequence can be given acquire semantics by setting the {\em aq} bit on > > -the SC instruction. 
The LR/SC sequence can be given release semantics > > -by setting the {\em rl} bit on the LR instruction. Setting both {\em > > - aq} and {\em rl} bits on the LR instruction, and setting the {\em > > - aq} bit on the SC instruction makes the LR/SC sequence sequentially > > -consistent with respect to other sequentially consistent atomic > > -operations. > > +the LR instruction. The LR/SC sequence can be given release semantics > > +by setting the {\em rl} bit on the SC instruction. Setting the {\em > > + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > > + rl} bit on the SC instruction makes the LR/SC sequence sequentially > > +consistent, meaning that it cannot be reordered with earlier or > > +later memory operations from the same hart. > > > > note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > > against "earlier or later memory operations from the same hart", and > > this statement was not in Model 2017. > > > > So my understanding of the story is that at some point between March and > > May 2018, RISV memory model folks decided to add this rule, which does > > look more consistent with other parts of the model and is useful. > > > > And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > > barrier ;-) > > > > Now if my understanding is correct, to move forward, it's better that 1) > > this patch gets resend with the above information (better rewording a > > bit), and 2) gets an Acked-by from Dan to confirm this is a correct > > history ;-) > > I'm a bit lost as to why digging into RISC-V mailing list history is > relevant here...what's relevant is what was ratified in the RVWMO > chapter of the RISC-V spec, and whether the code you're proposing > is the most optimized code that is correct wrt RVWMO. > > Is your claim that the code you're proposing to fix was based on a > pre-RVWMO RISC-V memory model definition, and you're updating it to > be more RVWMO-compliant? Could "lr + beq + sc.aqrl" provides a conditional RCsc here with current spec? I only found "lr.aq + sc.aqrl" despcriton which is un-conditional RCsc. > > Dan > > > Regards, > > Boqun > > > >> Andrea > >> > >> > > [...] -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
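For reference, the "lr + beq + sc.aqrl" shape Guo Ren is asking about is the conditional form used by the arch_atomic_fetch_add_unless() hunk quoted earlier in the thread; a standalone sketch (hypothetical name, same operand constraints as the quoted kernel code):

static inline int fetch_add_unless_aqrl(int *counter, int a, int u)
{
	int prev, rc;

	__asm__ __volatile__ (
		"0:	lr.w      %[p], %[c]\n"
		"	beq       %[p], %[u], 1f\n"
		"	add       %[rc], %[p], %[a]\n"
		"	sc.w.aqrl %[rc], %[rc], %[c]\n"
		"	bnez      %[rc], 0b\n"
		"1:\n"
		: [p] "=&r" (prev), [rc] "=&r" (rc), [c] "+A" (*counter)
		: [a] "r" (a), [u] "r" (u)
		: "memory");
	return prev;
}

The open question is whether, when the branch is not taken and the SC succeeds, sc.w.aqrl alone gives RCsc ordering on both sides. Boqun's reply below argues it does: .rl orders earlier accesses before the SC, .aq orders the SC before later accesses, and RVWMO requires the global memory order to respect preserved program order.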
On Sat, Jun 25, 2022 at 01:29:50PM +0800, Guo Ren wrote: > On Fri, Jun 24, 2022 at 1:09 AM Dan Lustig <dlustig@nvidia.com> wrote: > > > > On 6/22/2022 11:31 PM, Boqun Feng wrote: > > > Hi, > > > > > > On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > > > [...] > > >>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > >>> is about fixup wrong spinlock/unlock implementation and not relate to > > >>> this patch. > > >> > > >> No. The commit in question is evidence of the fact that the changes > > >> you are presenting here (as an optimization) were buggy/incorrect at > > >> the time in which that commit was worked out. > > >> > > >> > > >>> Actually, sc.w.aqrl is very strong and the same with: > > >>> fence rw, rw > > >>> sc.w > > >>> fence rw,rw > > >>> > > >>> So "which do not give full-ordering with .aqrl" is not writen in > > >>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > > >>> > > >>>> > > >>>>>> describes the issue more specifically, that's when we added these > > >>>>>> fences. There have certainly been complains that these fences are too > > >>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have > > >>>>> Yeah, it would reduce the performance on D1 and our next-generation > > >>>>> processor has optimized fence performance a lot. > > >>>> > > >>>> Definately a bummer that the fences make the HW go slow, but I don't > > >>>> really see any other way to go about this. If you think these mappings > > >>>> are valid for LKMM and RVWMO then we should figure this out, but trying > > >>>> to drop fences to make HW go faster in ways that violate the memory > > >>>> model is going to lead to insanity. > > >>> Actually, this patch is okay with the ISA spec, and Dan also thought > > >>> it was valid. > > >>> > > >>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > > >> > > >> "Thoughts" on this regard have _changed_. Please compare that quote > > >> with, e.g. > > >> > > >> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > > >> > > >> So here's a suggestion: > > >> > > >> Reviewers of your patches have asked: How come that code we used to > > >> consider as buggy is now considered "an optimization" (correct)? > > >> > > >> Denying the evidence or going around it is not making their job (and > > >> this upstreaming) easier, so why don't you address it? Take time to > > >> review previous works and discussions in this area, understand them, > > >> and integrate such knowledge in future submissions. > > >> > > > > > > I agree with Andrea. > > > > > > And I actually took a look into this, and I think I find some > > > explanation. There are two versions of RISV memory model here: > > > > > > Model 2017: released at Dec 1, 2017 as a draft > > > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > > > > > > Model 2018: released at May 2, 2018 > > > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > > > > > > Noted that previous conversation about commit 5ce6c1f3535f happened at > > > March 2018. 
So the timeline is roughly: > > > > > > Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > > > > > > And in the email thread of Model 2018, the commit related to model > > > changes also got mentioned: > > > > > > https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > > > > > > in that commit, we can see the changes related to sc.aqrl are: > > > > > > to have occurred between the LR and a successful SC. The LR/SC > > > sequence can be given acquire semantics by setting the {\em aq} bit on > > > -the SC instruction. The LR/SC sequence can be given release semantics > > > -by setting the {\em rl} bit on the LR instruction. Setting both {\em > > > - aq} and {\em rl} bits on the LR instruction, and setting the {\em > > > - aq} bit on the SC instruction makes the LR/SC sequence sequentially > > > -consistent with respect to other sequentially consistent atomic > > > -operations. > > > +the LR instruction. The LR/SC sequence can be given release semantics > > > +by setting the {\em rl} bit on the SC instruction. Setting the {\em > > > + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > > > + rl} bit on the SC instruction makes the LR/SC sequence sequentially > > > +consistent, meaning that it cannot be reordered with earlier or > > > +later memory operations from the same hart. > > > > > > note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > > > against "earlier or later memory operations from the same hart", and > > > this statement was not in Model 2017. > > > > > > So my understanding of the story is that at some point between March and > > > May 2018, RISV memory model folks decided to add this rule, which does > > > look more consistent with other parts of the model and is useful. > > > > > > And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > > > barrier ;-) > > > > > > Now if my understanding is correct, to move forward, it's better that 1) > > > this patch gets resend with the above information (better rewording a > > > bit), and 2) gets an Acked-by from Dan to confirm this is a correct > > > history ;-) > > > > I'm a bit lost as to why digging into RISC-V mailing list history is > > relevant here...what's relevant is what was ratified in the RVWMO > > chapter of the RISC-V spec, and whether the code you're proposing > > is the most optimized code that is correct wrt RVWMO. > > > > Is your claim that the code you're proposing to fix was based on a > > pre-RVWMO RISC-V memory model definition, and you're updating it to > > be more RVWMO-compliant? > Could "lr + beq + sc.aqrl" provides a conditional RCsc here with > current spec? I only found "lr.aq + sc.aqrl" despcriton which is > un-conditional RCsc. > /me put the temporary RISCV memory model hat on and pretend to be a RISCV memory expert. I think the answer is yes, it's actually quite straightforwards given that RISCV treats PPO (Preserved Program Order) as part of GMO (Global Memory Order), considering the following (A and B are memory accesses): A .. sc.aqrl // M .. B , A has a ->ppo ordering to M since "sc.aqrl" is a RELEASE, and M has a ->ppo ordeing to B since "sc.aqrl" is an AQUIRE, so A ->ppo M ->ppo B And since RISCV describes that PPO is part of GMO: """ The subset of program order that must be respected by the global memory order is known as preserved program order. 
""" also in the herd model: (* Main model axiom *) acyclic co | rfe | fr | ppo as Model , therefore the ordering between A and B is GMO and GMO should be respected by all harts. Regards, Boqun > > > > Dan > > > > > Regards, > > > Boqun > > > > > >> Andrea > > >> > > >> > > > [...] > > > > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/
On 7/6/2022 8:03 PM, Boqun Feng wrote: > On Sat, Jun 25, 2022 at 01:29:50PM +0800, Guo Ren wrote: >> On Fri, Jun 24, 2022 at 1:09 AM Dan Lustig <dlustig@nvidia.com> wrote: >>> >>> On 6/22/2022 11:31 PM, Boqun Feng wrote: >>>> Hi, >>>> >>>> On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: >>>> [...] >>>>>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") >>>>>> is about fixup wrong spinlock/unlock implementation and not relate to >>>>>> this patch. >>>>> >>>>> No. The commit in question is evidence of the fact that the changes >>>>> you are presenting here (as an optimization) were buggy/incorrect at >>>>> the time in which that commit was worked out. >>>>> >>>>> >>>>>> Actually, sc.w.aqrl is very strong and the same with: >>>>>> fence rw, rw >>>>>> sc.w >>>>>> fence rw,rw >>>>>> >>>>>> So "which do not give full-ordering with .aqrl" is not writen in >>>>>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. >>>>>> >>>>>>> >>>>>>>>> describes the issue more specifically, that's when we added these >>>>>>>>> fences. There have certainly been complains that these fences are too >>>>>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have >>>>>>>> Yeah, it would reduce the performance on D1 and our next-generation >>>>>>>> processor has optimized fence performance a lot. >>>>>>> >>>>>>> Definately a bummer that the fences make the HW go slow, but I don't >>>>>>> really see any other way to go about this. If you think these mappings >>>>>>> are valid for LKMM and RVWMO then we should figure this out, but trying >>>>>>> to drop fences to make HW go faster in ways that violate the memory >>>>>>> model is going to lead to insanity. >>>>>> Actually, this patch is okay with the ISA spec, and Dan also thought >>>>>> it was valid. >>>>>> >>>>>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw >>>>> >>>>> "Thoughts" on this regard have _changed_. Please compare that quote >>>>> with, e.g. >>>>> >>>>> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ >>>>> >>>>> So here's a suggestion: >>>>> >>>>> Reviewers of your patches have asked: How come that code we used to >>>>> consider as buggy is now considered "an optimization" (correct)? >>>>> >>>>> Denying the evidence or going around it is not making their job (and >>>>> this upstreaming) easier, so why don't you address it? Take time to >>>>> review previous works and discussions in this area, understand them, >>>>> and integrate such knowledge in future submissions. >>>>> >>>> >>>> I agree with Andrea. >>>> >>>> And I actually took a look into this, and I think I find some >>>> explanation. There are two versions of RISV memory model here: >>>> >>>> Model 2017: released at Dec 1, 2017 as a draft >>>> >>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ >>>> >>>> Model 2018: released at May 2, 2018 >>>> >>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ >>>> >>>> Noted that previous conversation about commit 5ce6c1f3535f happened at >>>> March 2018. 
So the timeline is roughly: >>>> >>>> Model 2017 -> commit 5ce6c1f3535f -> Model 2018 >>>> >>>> And in the email thread of Model 2018, the commit related to model >>>> changes also got mentioned: >>>> >>>> https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a >>>> >>>> in that commit, we can see the changes related to sc.aqrl are: >>>> >>>> to have occurred between the LR and a successful SC. The LR/SC >>>> sequence can be given acquire semantics by setting the {\em aq} bit on >>>> -the SC instruction. The LR/SC sequence can be given release semantics >>>> -by setting the {\em rl} bit on the LR instruction. Setting both {\em >>>> - aq} and {\em rl} bits on the LR instruction, and setting the {\em >>>> - aq} bit on the SC instruction makes the LR/SC sequence sequentially >>>> -consistent with respect to other sequentially consistent atomic >>>> -operations. >>>> +the LR instruction. The LR/SC sequence can be given release semantics >>>> +by setting the {\em rl} bit on the SC instruction. Setting the {\em >>>> + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em >>>> + rl} bit on the SC instruction makes the LR/SC sequence sequentially >>>> +consistent, meaning that it cannot be reordered with earlier or >>>> +later memory operations from the same hart. >>>> >>>> note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered >>>> against "earlier or later memory operations from the same hart", and >>>> this statement was not in Model 2017. >>>> >>>> So my understanding of the story is that at some point between March and >>>> May 2018, RISV memory model folks decided to add this rule, which does >>>> look more consistent with other parts of the model and is useful. >>>> >>>> And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered >>>> barrier ;-) >>>> >>>> Now if my understanding is correct, to move forward, it's better that 1) >>>> this patch gets resend with the above information (better rewording a >>>> bit), and 2) gets an Acked-by from Dan to confirm this is a correct >>>> history ;-) >>> >>> I'm a bit lost as to why digging into RISC-V mailing list history is >>> relevant here...what's relevant is what was ratified in the RVWMO >>> chapter of the RISC-V spec, and whether the code you're proposing >>> is the most optimized code that is correct wrt RVWMO. >>> >>> Is your claim that the code you're proposing to fix was based on a >>> pre-RVWMO RISC-V memory model definition, and you're updating it to >>> be more RVWMO-compliant? >> Could "lr + beq + sc.aqrl" provides a conditional RCsc here with >> current spec? I only found "lr.aq + sc.aqrl" despcriton which is >> un-conditional RCsc. >> > > /me put the temporary RISCV memory model hat on and pretend to be a > RISCV memory expert. > > I think the answer is yes, it's actually quite straightforwards given > that RISCV treats PPO (Preserved Program Order) as part of GMO (Global > Memory Order), considering the following (A and B are memory accesses): > > A > .. > sc.aqrl // M > .. > B > > , A has a ->ppo ordering to M since "sc.aqrl" is a RELEASE, and M has > a ->ppo ordeing to B since "sc.aqrl" is an AQUIRE, so > > A ->ppo M ->ppo B > > And since RISCV describes that PPO is part of GMO: > > """ > The subset of program order that must be respected by the global memory > order is known as preserved program order. 
> """ > > also in the herd model: > > (* Main model axiom *) > acyclic co | rfe | fr | ppo as Model > > , therefore the ordering between A and B is GMO and GMO should be > respected by all harts. > > Regards, > Boqun I agree with Boqun's reasoning, at least for the case where there is no branch. But to confirm, was the original question about also having a branch, I assume to the instruction immediately after the sc? If so, then yes, that would make the .aqrl effect conditional. Dan > >>> >>> Dan >>> >>>> Regards, >>>> Boqun >>>> >>>>> Andrea >>>>> >>>>> >>>> [...] >> >> >> >> -- >> Best Regards >> Guo Ren >> >> ML: https://lore.kernel.org/linux-csky/
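To make the A ->ppo M ->ppo B chain discussed above concrete, here is a minimal freestanding sketch of the no-branch case Dan agrees with. It is not code from the patch: the function name and the surrounding accesses are illustrative, and it assumes an RV32/RV64 target with GCC-style inline assembly.

/*
 * Sketch only: the store to *a_store plays the role of A, the
 * sc.w.aqrl is M, and the load from *b_load is B.  The release half
 * of sc.w.aqrl keeps A ordered before M, the acquire half keeps M
 * ordered before B, so A ends up globally ordered before B.
 */
static inline int fully_ordered_add_sketch(int *counter, int inc,
					    volatile int *a_store,
					    volatile int *b_load)
{
	int prev, rc, b;

	*a_store = 1;					/* A */

	__asm__ __volatile__ (
		"0:	lr.w      %[p],  %[c]\n"
		"	add       %[rc], %[p], %[i]\n"
		"	sc.w.aqrl %[rc], %[rc], %[c]\n"	/* M */
		"	bnez      %[rc], 0b\n"
		: [p] "=&r" (prev), [rc] "=&r" (rc), [c] "+A" (*counter)
		: [i] "r" (inc)
		: "memory");

	b = *b_load;					/* B */

	return prev + b;
}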
On Wed, Jul 13, 2022 at 9:38 PM Dan Lustig <dlustig@nvidia.com> wrote: > > On 7/6/2022 8:03 PM, Boqun Feng wrote: > > On Sat, Jun 25, 2022 at 01:29:50PM +0800, Guo Ren wrote: > >> On Fri, Jun 24, 2022 at 1:09 AM Dan Lustig <dlustig@nvidia.com> wrote: > >>> > >>> On 6/22/2022 11:31 PM, Boqun Feng wrote: > >>>> Hi, > >>>> > >>>> On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > >>>> [...] > >>>>>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > >>>>>> is about fixup wrong spinlock/unlock implementation and not relate to > >>>>>> this patch. > >>>>> > >>>>> No. The commit in question is evidence of the fact that the changes > >>>>> you are presenting here (as an optimization) were buggy/incorrect at > >>>>> the time in which that commit was worked out. > >>>>> > >>>>> > >>>>>> Actually, sc.w.aqrl is very strong and the same with: > >>>>>> fence rw, rw > >>>>>> sc.w > >>>>>> fence rw,rw > >>>>>> > >>>>>> So "which do not give full-ordering with .aqrl" is not writen in > >>>>>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > >>>>>> > >>>>>>> > >>>>>>>>> describes the issue more specifically, that's when we added these > >>>>>>>>> fences. There have certainly been complains that these fences are too > >>>>>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have > >>>>>>>> Yeah, it would reduce the performance on D1 and our next-generation > >>>>>>>> processor has optimized fence performance a lot. > >>>>>>> > >>>>>>> Definately a bummer that the fences make the HW go slow, but I don't > >>>>>>> really see any other way to go about this. If you think these mappings > >>>>>>> are valid for LKMM and RVWMO then we should figure this out, but trying > >>>>>>> to drop fences to make HW go faster in ways that violate the memory > >>>>>>> model is going to lead to insanity. > >>>>>> Actually, this patch is okay with the ISA spec, and Dan also thought > >>>>>> it was valid. > >>>>>> > >>>>>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > >>>>> > >>>>> "Thoughts" on this regard have _changed_. Please compare that quote > >>>>> with, e.g. > >>>>> > >>>>> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > >>>>> > >>>>> So here's a suggestion: > >>>>> > >>>>> Reviewers of your patches have asked: How come that code we used to > >>>>> consider as buggy is now considered "an optimization" (correct)? > >>>>> > >>>>> Denying the evidence or going around it is not making their job (and > >>>>> this upstreaming) easier, so why don't you address it? Take time to > >>>>> review previous works and discussions in this area, understand them, > >>>>> and integrate such knowledge in future submissions. > >>>>> > >>>> > >>>> I agree with Andrea. > >>>> > >>>> And I actually took a look into this, and I think I find some > >>>> explanation. There are two versions of RISV memory model here: > >>>> > >>>> Model 2017: released at Dec 1, 2017 as a draft > >>>> > >>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > >>>> > >>>> Model 2018: released at May 2, 2018 > >>>> > >>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > >>>> > >>>> Noted that previous conversation about commit 5ce6c1f3535f happened at > >>>> March 2018. 
So the timeline is roughly: > >>>> > >>>> Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > >>>> > >>>> And in the email thread of Model 2018, the commit related to model > >>>> changes also got mentioned: > >>>> > >>>> https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > >>>> > >>>> in that commit, we can see the changes related to sc.aqrl are: > >>>> > >>>> to have occurred between the LR and a successful SC. The LR/SC > >>>> sequence can be given acquire semantics by setting the {\em aq} bit on > >>>> -the SC instruction. The LR/SC sequence can be given release semantics > >>>> -by setting the {\em rl} bit on the LR instruction. Setting both {\em > >>>> - aq} and {\em rl} bits on the LR instruction, and setting the {\em > >>>> - aq} bit on the SC instruction makes the LR/SC sequence sequentially > >>>> -consistent with respect to other sequentially consistent atomic > >>>> -operations. > >>>> +the LR instruction. The LR/SC sequence can be given release semantics > >>>> +by setting the {\em rl} bit on the SC instruction. Setting the {\em > >>>> + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > >>>> + rl} bit on the SC instruction makes the LR/SC sequence sequentially > >>>> +consistent, meaning that it cannot be reordered with earlier or > >>>> +later memory operations from the same hart. > >>>> > >>>> note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > >>>> against "earlier or later memory operations from the same hart", and > >>>> this statement was not in Model 2017. > >>>> > >>>> So my understanding of the story is that at some point between March and > >>>> May 2018, RISV memory model folks decided to add this rule, which does > >>>> look more consistent with other parts of the model and is useful. > >>>> > >>>> And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > >>>> barrier ;-) > >>>> > >>>> Now if my understanding is correct, to move forward, it's better that 1) > >>>> this patch gets resend with the above information (better rewording a > >>>> bit), and 2) gets an Acked-by from Dan to confirm this is a correct > >>>> history ;-) > >>> > >>> I'm a bit lost as to why digging into RISC-V mailing list history is > >>> relevant here...what's relevant is what was ratified in the RVWMO > >>> chapter of the RISC-V spec, and whether the code you're proposing > >>> is the most optimized code that is correct wrt RVWMO. > >>> > >>> Is your claim that the code you're proposing to fix was based on a > >>> pre-RVWMO RISC-V memory model definition, and you're updating it to > >>> be more RVWMO-compliant? > >> Could "lr + beq + sc.aqrl" provides a conditional RCsc here with > >> current spec? I only found "lr.aq + sc.aqrl" despcriton which is > >> un-conditional RCsc. > >> > > > > /me put the temporary RISCV memory model hat on and pretend to be a > > RISCV memory expert. > > > > I think the answer is yes, it's actually quite straightforwards given > > that RISCV treats PPO (Preserved Program Order) as part of GMO (Global > > Memory Order), considering the following (A and B are memory accesses): > > > > A > > .. > > sc.aqrl // M > > .. 
> > B > > > > , A has a ->ppo ordering to M since "sc.aqrl" is a RELEASE, and M has > > a ->ppo ordeing to B since "sc.aqrl" is an AQUIRE, so > > > > A ->ppo M ->ppo B > > > > And since RISCV describes that PPO is part of GMO: > > > > """ > > The subset of program order that must be respected by the global memory > > order is known as preserved program order. > > """ > > > > also in the herd model: > > > > (* Main model axiom *) > > acyclic co | rfe | fr | ppo as Model > > > > , therefore the ordering between A and B is GMO and GMO should be > > respected by all harts. > > > > Regards, > > Boqun > > I agree with Boqun's reasoning, at least for the case where there > is no branch. > > But to confirm, was the original question about also having a branch, > I assume to the instruction immediately after the sc? If so, then > yes, that would make the .aqrl effect conditional. >> Could "lr + beq + sc.aqrl" provides a conditional RCsc here with current spec? >> I only found "lr.aq + sc.aqrl" despcriton which is un-conditional RCsc. In ISA spec, I found: "Setting the aq bit on the LR instruction, and setting both the aq and the rl bit on the SC instruction makes the LR/SC sequence sequentially consistent, meaning that it cannot be reordered with earlier or later memory operations from the same hart." No "lr + bnez + sc.aqrl" or "lr.aq + bnez + sc.aqrl" example description. >> Could "lr + beq + sc.aqrl" provides a conditional RCsc here with current spec? So, the above is legal for the RVWMO & LKMM? > > Dan > > > > >>> > >>> Dan > >>> > >>>> Regards, > >>>> Boqun > >>>> > >>>>> Andrea > >>>>> > >>>>> > >>>> [...] > >> > >> > >> > >> -- > >> Best Regards > >> Guo Ren > >> > >> ML: https://lore.kernel.org/linux-csky/
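The sequence being asked about, written out as a freestanding sketch in the style of the cmpxchg macro from the diff at the end of the thread (the function name and constraints are illustrative, not the kernel's actual macro). Per Dan's reply above, the .aqrl ordering only applies on the path where the sc.w.aqrl actually executes and succeeds.

/*
 * Sketch of "lr + bne + sc.aqrl": when the bne is taken, control skips
 * the sc.w.aqrl entirely, so that early-exit path gets no release or
 * acquire ordering from it -- the RCsc effect is conditional.
 */
static inline int cmpxchg_sketch(int *ptr, int old, int newv)
{
	int prev, rc;

	__asm__ __volatile__ (
		"0:	lr.w      %[p],  %[c]\n"
		"	bne       %[p],  %[o], 1f\n"	/* taken: no SC executed */
		"	sc.w.aqrl %[rc], %[n], %[c]\n"	/* fully ordered on success */
		"	bnez      %[rc], 0b\n"
		"1:\n"
		: [p] "=&r" (prev), [rc] "=&r" (rc), [c] "+A" (*ptr)
		: [o] "r" (old), [n] "r" (newv)
		: "memory");

	return prev;
}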
On Thu, Jul 7, 2022 at 8:04 AM Boqun Feng <boqun.feng@gmail.com> wrote: > > On Sat, Jun 25, 2022 at 01:29:50PM +0800, Guo Ren wrote: > > On Fri, Jun 24, 2022 at 1:09 AM Dan Lustig <dlustig@nvidia.com> wrote: > > > > > > On 6/22/2022 11:31 PM, Boqun Feng wrote: > > > > Hi, > > > > > > > > On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > > > > [...] > > > >>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > > > >>> is about fixup wrong spinlock/unlock implementation and not relate to > > > >>> this patch. > > > >> > > > >> No. The commit in question is evidence of the fact that the changes > > > >> you are presenting here (as an optimization) were buggy/incorrect at > > > >> the time in which that commit was worked out. > > > >> > > > >> > > > >>> Actually, sc.w.aqrl is very strong and the same with: > > > >>> fence rw, rw > > > >>> sc.w > > > >>> fence rw,rw > > > >>> > > > >>> So "which do not give full-ordering with .aqrl" is not writen in > > > >>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > > > >>> > > > >>>> > > > >>>>>> describes the issue more specifically, that's when we added these > > > >>>>>> fences. There have certainly been complains that these fences are too > > > >>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have > > > >>>>> Yeah, it would reduce the performance on D1 and our next-generation > > > >>>>> processor has optimized fence performance a lot. > > > >>>> > > > >>>> Definately a bummer that the fences make the HW go slow, but I don't > > > >>>> really see any other way to go about this. If you think these mappings > > > >>>> are valid for LKMM and RVWMO then we should figure this out, but trying > > > >>>> to drop fences to make HW go faster in ways that violate the memory > > > >>>> model is going to lead to insanity. > > > >>> Actually, this patch is okay with the ISA spec, and Dan also thought > > > >>> it was valid. > > > >>> > > > >>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > > > >> > > > >> "Thoughts" on this regard have _changed_. Please compare that quote > > > >> with, e.g. > > > >> > > > >> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > > > >> > > > >> So here's a suggestion: > > > >> > > > >> Reviewers of your patches have asked: How come that code we used to > > > >> consider as buggy is now considered "an optimization" (correct)? > > > >> > > > >> Denying the evidence or going around it is not making their job (and > > > >> this upstreaming) easier, so why don't you address it? Take time to > > > >> review previous works and discussions in this area, understand them, > > > >> and integrate such knowledge in future submissions. > > > >> > > > > > > > > I agree with Andrea. > > > > > > > > And I actually took a look into this, and I think I find some > > > > explanation. There are two versions of RISV memory model here: > > > > > > > > Model 2017: released at Dec 1, 2017 as a draft > > > > > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > > > > > > > > Model 2018: released at May 2, 2018 > > > > > > > > https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > > > > > > > > Noted that previous conversation about commit 5ce6c1f3535f happened at > > > > March 2018. 
So the timeline is roughly: > > > > > > > > Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > > > > > > > > And in the email thread of Model 2018, the commit related to model > > > > changes also got mentioned: > > > > > > > > https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > > > > > > > > in that commit, we can see the changes related to sc.aqrl are: > > > > > > > > to have occurred between the LR and a successful SC. The LR/SC > > > > sequence can be given acquire semantics by setting the {\em aq} bit on > > > > -the SC instruction. The LR/SC sequence can be given release semantics > > > > -by setting the {\em rl} bit on the LR instruction. Setting both {\em > > > > - aq} and {\em rl} bits on the LR instruction, and setting the {\em > > > > - aq} bit on the SC instruction makes the LR/SC sequence sequentially > > > > -consistent with respect to other sequentially consistent atomic > > > > -operations. > > > > +the LR instruction. The LR/SC sequence can be given release semantics > > > > +by setting the {\em rl} bit on the SC instruction. Setting the {\em > > > > + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > > > > + rl} bit on the SC instruction makes the LR/SC sequence sequentially > > > > +consistent, meaning that it cannot be reordered with earlier or > > > > +later memory operations from the same hart. > > > > > > > > note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > > > > against "earlier or later memory operations from the same hart", and > > > > this statement was not in Model 2017. > > > > > > > > So my understanding of the story is that at some point between March and > > > > May 2018, RISV memory model folks decided to add this rule, which does > > > > look more consistent with other parts of the model and is useful. > > > > > > > > And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > > > > barrier ;-) > > > > > > > > Now if my understanding is correct, to move forward, it's better that 1) > > > > this patch gets resend with the above information (better rewording a > > > > bit), and 2) gets an Acked-by from Dan to confirm this is a correct > > > > history ;-) > > > > > > I'm a bit lost as to why digging into RISC-V mailing list history is > > > relevant here...what's relevant is what was ratified in the RVWMO > > > chapter of the RISC-V spec, and whether the code you're proposing > > > is the most optimized code that is correct wrt RVWMO. > > > > > > Is your claim that the code you're proposing to fix was based on a > > > pre-RVWMO RISC-V memory model definition, and you're updating it to > > > be more RVWMO-compliant? > > Could "lr + beq + sc.aqrl" provides a conditional RCsc here with > > current spec? I only found "lr.aq + sc.aqrl" despcriton which is > > un-conditional RCsc. > > > > /me put the temporary RISCV memory model hat on and pretend to be a > RISCV memory expert. > > I think the answer is yes, it's actually quite straightforwards given > that RISCV treats PPO (Preserved Program Order) as part of GMO (Global > Memory Order), considering the following (A and B are memory accesses): > > A > .. > sc.aqrl // M > .. > B > > , A has a ->ppo ordering to M since "sc.aqrl" is a RELEASE, and M has > a ->ppo ordeing to B since "sc.aqrl" is an AQUIRE, so > > A ->ppo M ->ppo B That also means M must fence.rl + sc + fence.aq. But in the release consistency model, "rl + aq" is not legal and has no guarantee at all. 
So sc.aqrl should be clarified in spec, but I only found "lr.aq + sc.aqrl" description, see the patch commit log. Could we treat sc.aqrl as a whole in ISA? Because in micro-arch, we must separate it into pieces for implementation. That is what the RVWMO should give out. > > And since RISCV describes that PPO is part of GMO: > > """ > The subset of program order that must be respected by the global memory > order is known as preserved program order. > """ > > also in the herd model: > > (* Main model axiom *) > acyclic co | rfe | fr | ppo as Model If the herd7 model has defined that, I think it should be legal. Good catch. > > , therefore the ordering between A and B is GMO and GMO should be > respected by all harts. > > Regards, > Boqun > > > > > > > Dan > > > > > > > Regards, > > > > Boqun > > > > > > > >> Andrea > > > >> > > > >> > > > > [...] > > > > > > > > -- > > Best Regards > > Guo Ren > > > > ML: https://lore.kernel.org/linux-csky/
On 7/13/2022 7:47 PM, Guo Ren wrote: > On Thu, Jul 7, 2022 at 8:04 AM Boqun Feng <boqun.feng@gmail.com> wrote: >> >> On Sat, Jun 25, 2022 at 01:29:50PM +0800, Guo Ren wrote: >>> On Fri, Jun 24, 2022 at 1:09 AM Dan Lustig <dlustig@nvidia.com> wrote: >>>> >>>> On 6/22/2022 11:31 PM, Boqun Feng wrote: >>>>> Hi, >>>>> >>>>> On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: >>>>> [...] >>>>>>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") >>>>>>> is about fixup wrong spinlock/unlock implementation and not relate to >>>>>>> this patch. >>>>>> >>>>>> No. The commit in question is evidence of the fact that the changes >>>>>> you are presenting here (as an optimization) were buggy/incorrect at >>>>>> the time in which that commit was worked out. >>>>>> >>>>>> >>>>>>> Actually, sc.w.aqrl is very strong and the same with: >>>>>>> fence rw, rw >>>>>>> sc.w >>>>>>> fence rw,rw >>>>>>> >>>>>>> So "which do not give full-ordering with .aqrl" is not writen in >>>>>>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. >>>>>>> >>>>>>>> >>>>>>>>>> describes the issue more specifically, that's when we added these >>>>>>>>>> fences. There have certainly been complains that these fences are too >>>>>>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have >>>>>>>>> Yeah, it would reduce the performance on D1 and our next-generation >>>>>>>>> processor has optimized fence performance a lot. >>>>>>>> >>>>>>>> Definately a bummer that the fences make the HW go slow, but I don't >>>>>>>> really see any other way to go about this. If you think these mappings >>>>>>>> are valid for LKMM and RVWMO then we should figure this out, but trying >>>>>>>> to drop fences to make HW go faster in ways that violate the memory >>>>>>>> model is going to lead to insanity. >>>>>>> Actually, this patch is okay with the ISA spec, and Dan also thought >>>>>>> it was valid. >>>>>>> >>>>>>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw >>>>>> >>>>>> "Thoughts" on this regard have _changed_. Please compare that quote >>>>>> with, e.g. >>>>>> >>>>>> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ >>>>>> >>>>>> So here's a suggestion: >>>>>> >>>>>> Reviewers of your patches have asked: How come that code we used to >>>>>> consider as buggy is now considered "an optimization" (correct)? >>>>>> >>>>>> Denying the evidence or going around it is not making their job (and >>>>>> this upstreaming) easier, so why don't you address it? Take time to >>>>>> review previous works and discussions in this area, understand them, >>>>>> and integrate such knowledge in future submissions. >>>>>> >>>>> >>>>> I agree with Andrea. >>>>> >>>>> And I actually took a look into this, and I think I find some >>>>> explanation. There are two versions of RISV memory model here: >>>>> >>>>> Model 2017: released at Dec 1, 2017 as a draft >>>>> >>>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ >>>>> >>>>> Model 2018: released at May 2, 2018 >>>>> >>>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ >>>>> >>>>> Noted that previous conversation about commit 5ce6c1f3535f happened at >>>>> March 2018. 
So the timeline is roughly: >>>>> >>>>> Model 2017 -> commit 5ce6c1f3535f -> Model 2018 >>>>> >>>>> And in the email thread of Model 2018, the commit related to model >>>>> changes also got mentioned: >>>>> >>>>> https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a >>>>> >>>>> in that commit, we can see the changes related to sc.aqrl are: >>>>> >>>>> to have occurred between the LR and a successful SC. The LR/SC >>>>> sequence can be given acquire semantics by setting the {\em aq} bit on >>>>> -the SC instruction. The LR/SC sequence can be given release semantics >>>>> -by setting the {\em rl} bit on the LR instruction. Setting both {\em >>>>> - aq} and {\em rl} bits on the LR instruction, and setting the {\em >>>>> - aq} bit on the SC instruction makes the LR/SC sequence sequentially >>>>> -consistent with respect to other sequentially consistent atomic >>>>> -operations. >>>>> +the LR instruction. The LR/SC sequence can be given release semantics >>>>> +by setting the {\em rl} bit on the SC instruction. Setting the {\em >>>>> + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em >>>>> + rl} bit on the SC instruction makes the LR/SC sequence sequentially >>>>> +consistent, meaning that it cannot be reordered with earlier or >>>>> +later memory operations from the same hart. >>>>> >>>>> note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered >>>>> against "earlier or later memory operations from the same hart", and >>>>> this statement was not in Model 2017. >>>>> >>>>> So my understanding of the story is that at some point between March and >>>>> May 2018, RISV memory model folks decided to add this rule, which does >>>>> look more consistent with other parts of the model and is useful. >>>>> >>>>> And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered >>>>> barrier ;-) >>>>> >>>>> Now if my understanding is correct, to move forward, it's better that 1) >>>>> this patch gets resend with the above information (better rewording a >>>>> bit), and 2) gets an Acked-by from Dan to confirm this is a correct >>>>> history ;-) >>>> >>>> I'm a bit lost as to why digging into RISC-V mailing list history is >>>> relevant here...what's relevant is what was ratified in the RVWMO >>>> chapter of the RISC-V spec, and whether the code you're proposing >>>> is the most optimized code that is correct wrt RVWMO. >>>> >>>> Is your claim that the code you're proposing to fix was based on a >>>> pre-RVWMO RISC-V memory model definition, and you're updating it to >>>> be more RVWMO-compliant? >>> Could "lr + beq + sc.aqrl" provides a conditional RCsc here with >>> current spec? I only found "lr.aq + sc.aqrl" despcriton which is >>> un-conditional RCsc. >>> >> >> /me put the temporary RISCV memory model hat on and pretend to be a >> RISCV memory expert. >> >> I think the answer is yes, it's actually quite straightforwards given >> that RISCV treats PPO (Preserved Program Order) as part of GMO (Global >> Memory Order), considering the following (A and B are memory accesses): >> >> A >> .. >> sc.aqrl // M >> .. >> B >> >> , A has a ->ppo ordering to M since "sc.aqrl" is a RELEASE, and M has >> a ->ppo ordeing to B since "sc.aqrl" is an AQUIRE, so >> >> A ->ppo M ->ppo B > That also means M must fence.rl + sc + fence.aq. But in the release > consistency model, "rl + aq" is not legal and has no guarantee at all. 
> > So sc.aqrl should be clarified in spec, but I only found "lr.aq + > sc.aqrl" description, see the patch commit log. The spec doesn't try to enumerate every possible synchronization instruction sequence. That's why the RVWMO rules are in place. > Could we treat sc.aqrl as a whole in ISA? Because in micro-arch, we > must separate it into pieces for implementation. > > That is what the RVWMO should give out. What exactly would you like the spec to say about this? RVWMO and the RISC-V spec in general describe the required architecturally observable behavior. They're not implementation guides. Generally speaking, I would expect splitting an sc.aqrl into a ".rl; sc; .aq" pattern to be OK. That wouldn't introduce new observable behaviors compared to treating the sc.aqrl as a single indivisible operation, would it? Dan >> And since RISCV describes that PPO is part of GMO: >> >> """ >> The subset of program order that must be respected by the global memory >> order is known as preserved program order. >> """ >> >> also in the herd model: >> >> (* Main model axiom *) >> acyclic co | rfe | fr | ppo as Model > If the herd7 model has defined that, I think it should be legal. Good catch. > > >> >> , therefore the ordering between A and B is GMO and GMO should be >> respected by all harts. >> >> Regards, >> Boqun >> >>>> >>>> Dan >>>> >>>>> Regards, >>>>> Boqun >>>>> >>>>>> Andrea >>>>>> >>>>>> >>>>> [...] >>> >>> >>> >>> -- >>> Best Regards >>> Guo Ren >>> >>> ML: https://lore.kernel.org/linux-csky/ > > >
On Thu, Jul 14, 2022 at 9:06 PM Dan Lustig <dlustig@nvidia.com> wrote: > > On 7/13/2022 7:47 PM, Guo Ren wrote: > > On Thu, Jul 7, 2022 at 8:04 AM Boqun Feng <boqun.feng@gmail.com> wrote: > >> > >> On Sat, Jun 25, 2022 at 01:29:50PM +0800, Guo Ren wrote: > >>> On Fri, Jun 24, 2022 at 1:09 AM Dan Lustig <dlustig@nvidia.com> wrote: > >>>> > >>>> On 6/22/2022 11:31 PM, Boqun Feng wrote: > >>>>> Hi, > >>>>> > >>>>> On Tue, Jun 14, 2022 at 01:03:47PM +0200, Andrea Parri wrote: > >>>>> [...] > >>>>>>> 5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") > >>>>>>> is about fixup wrong spinlock/unlock implementation and not relate to > >>>>>>> this patch. > >>>>>> > >>>>>> No. The commit in question is evidence of the fact that the changes > >>>>>> you are presenting here (as an optimization) were buggy/incorrect at > >>>>>> the time in which that commit was worked out. > >>>>>> > >>>>>> > >>>>>>> Actually, sc.w.aqrl is very strong and the same with: > >>>>>>> fence rw, rw > >>>>>>> sc.w > >>>>>>> fence rw,rw > >>>>>>> > >>>>>>> So "which do not give full-ordering with .aqrl" is not writen in > >>>>>>> RISC-V ISA and we could use sc.w/d.aqrl with LKMM. > >>>>>>> > >>>>>>>> > >>>>>>>>>> describes the issue more specifically, that's when we added these > >>>>>>>>>> fences. There have certainly been complains that these fences are too > >>>>>>>>>> heavyweight for the HW to go fast, but IIUC it's the best option we have > >>>>>>>>> Yeah, it would reduce the performance on D1 and our next-generation > >>>>>>>>> processor has optimized fence performance a lot. > >>>>>>>> > >>>>>>>> Definately a bummer that the fences make the HW go slow, but I don't > >>>>>>>> really see any other way to go about this. If you think these mappings > >>>>>>>> are valid for LKMM and RVWMO then we should figure this out, but trying > >>>>>>>> to drop fences to make HW go faster in ways that violate the memory > >>>>>>>> model is going to lead to insanity. > >>>>>>> Actually, this patch is okay with the ISA spec, and Dan also thought > >>>>>>> it was valid. > >>>>>>> > >>>>>>> ref: https://lore.kernel.org/lkml/41e01514-74ca-84f2-f5cc-2645c444fd8e@nvidia.com/raw > >>>>>> > >>>>>> "Thoughts" on this regard have _changed_. Please compare that quote > >>>>>> with, e.g. > >>>>>> > >>>>>> https://lore.kernel.org/linux-riscv/ddd5ca34-805b-60c4-bf2a-d6a9d95d89e7@nvidia.com/ > >>>>>> > >>>>>> So here's a suggestion: > >>>>>> > >>>>>> Reviewers of your patches have asked: How come that code we used to > >>>>>> consider as buggy is now considered "an optimization" (correct)? > >>>>>> > >>>>>> Denying the evidence or going around it is not making their job (and > >>>>>> this upstreaming) easier, so why don't you address it? Take time to > >>>>>> review previous works and discussions in this area, understand them, > >>>>>> and integrate such knowledge in future submissions. > >>>>>> > >>>>> > >>>>> I agree with Andrea. > >>>>> > >>>>> And I actually took a look into this, and I think I find some > >>>>> explanation. There are two versions of RISV memory model here: > >>>>> > >>>>> Model 2017: released at Dec 1, 2017 as a draft > >>>>> > >>>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/hKywNHBkAXM/m/QzUtxEWLBQAJ > >>>>> > >>>>> Model 2018: released at May 2, 2018 > >>>>> > >>>>> https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/xW03vmfmPuA/m/bMPk3UCWAgAJ > >>>>> > >>>>> Noted that previous conversation about commit 5ce6c1f3535f happened at > >>>>> March 2018. 
So the timeline is roughly: > >>>>> > >>>>> Model 2017 -> commit 5ce6c1f3535f -> Model 2018 > >>>>> > >>>>> And in the email thread of Model 2018, the commit related to model > >>>>> changes also got mentioned: > >>>>> > >>>>> https://github.com/riscv/riscv-isa-manual/commit/b875fe417948635ed68b9644ffdf718cb343a81a > >>>>> > >>>>> in that commit, we can see the changes related to sc.aqrl are: > >>>>> > >>>>> to have occurred between the LR and a successful SC. The LR/SC > >>>>> sequence can be given acquire semantics by setting the {\em aq} bit on > >>>>> -the SC instruction. The LR/SC sequence can be given release semantics > >>>>> -by setting the {\em rl} bit on the LR instruction. Setting both {\em > >>>>> - aq} and {\em rl} bits on the LR instruction, and setting the {\em > >>>>> - aq} bit on the SC instruction makes the LR/SC sequence sequentially > >>>>> -consistent with respect to other sequentially consistent atomic > >>>>> -operations. > >>>>> +the LR instruction. The LR/SC sequence can be given release semantics > >>>>> +by setting the {\em rl} bit on the SC instruction. Setting the {\em > >>>>> + aq} bit on the LR instruction, and setting both the {\em aq} and the {\em > >>>>> + rl} bit on the SC instruction makes the LR/SC sequence sequentially > >>>>> +consistent, meaning that it cannot be reordered with earlier or > >>>>> +later memory operations from the same hart. > >>>>> > >>>>> note that Model 2018 explicitly says that "ld.aq+sc.aqrl" is ordered > >>>>> against "earlier or later memory operations from the same hart", and > >>>>> this statement was not in Model 2017. > >>>>> > >>>>> So my understanding of the story is that at some point between March and > >>>>> May 2018, RISV memory model folks decided to add this rule, which does > >>>>> look more consistent with other parts of the model and is useful. > >>>>> > >>>>> And this is why (and when) "ld.aq+sc.aqrl" can be used as a fully-ordered > >>>>> barrier ;-) > >>>>> > >>>>> Now if my understanding is correct, to move forward, it's better that 1) > >>>>> this patch gets resend with the above information (better rewording a > >>>>> bit), and 2) gets an Acked-by from Dan to confirm this is a correct > >>>>> history ;-) > >>>> > >>>> I'm a bit lost as to why digging into RISC-V mailing list history is > >>>> relevant here...what's relevant is what was ratified in the RVWMO > >>>> chapter of the RISC-V spec, and whether the code you're proposing > >>>> is the most optimized code that is correct wrt RVWMO. > >>>> > >>>> Is your claim that the code you're proposing to fix was based on a > >>>> pre-RVWMO RISC-V memory model definition, and you're updating it to > >>>> be more RVWMO-compliant? > >>> Could "lr + beq + sc.aqrl" provides a conditional RCsc here with > >>> current spec? I only found "lr.aq + sc.aqrl" despcriton which is > >>> un-conditional RCsc. > >>> > >> > >> /me put the temporary RISCV memory model hat on and pretend to be a > >> RISCV memory expert. > >> > >> I think the answer is yes, it's actually quite straightforwards given > >> that RISCV treats PPO (Preserved Program Order) as part of GMO (Global > >> Memory Order), considering the following (A and B are memory accesses): > >> > >> A > >> .. > >> sc.aqrl // M > >> .. > >> B > >> > >> , A has a ->ppo ordering to M since "sc.aqrl" is a RELEASE, and M has > >> a ->ppo ordeing to B since "sc.aqrl" is an AQUIRE, so > >> > >> A ->ppo M ->ppo B > > That also means M must fence.rl + sc + fence.aq. 
But in the release > > consistency model, "rl + aq" is not legal and has no guarantee at all. > > > > So sc.aqrl should be clarified in spec, but I only found "lr.aq + > > sc.aqrl" description, see the patch commit log. > > The spec doesn't try to enumerate every possible synchronization > instruction sequence. That's why the RVWMO rules are in place. Okay, I just want to confirm "lr + sc.aqrl" is correct here. > > > Could we treat sc.aqrl as a whole in ISA? Because in micro-arch, we > > must separate it into pieces for implementation. > > > > That is what the RVWMO should give out. > > What exactly would you like the spec to say about this? RVWMO and the > RISC-V spec in general describe the required architecturally observable > behavior. They're not implementation guides. > > Generally speaking, I would expect splitting an sc.aqrl into a > ".rl; sc; .aq" pattern to be OK. That wouldn't introduce new observable > behaviors compared to treating the sc.aqrl as a single indivisible > operation, would it? Yes, I think the modification below is correct, and it could improve performance on the fast path. Adding the .aq annotation to the failed-SC iterations of the loop won't cause side effects, right? "0: lr.d %[p], %[c]\n" " beq %[p], %[u], 1f\n" " add %[rc], %[p], %[a]\n" - " sc.d.rl %[rc], %[rc], %[c]\n" + " sc.d.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" > > Dan > > >> And since RISCV describes that PPO is part of GMO: > >> > >> """ > >> The subset of program order that must be respected by the global memory > >> order is known as preserved program order. > >> """ > >> > >> also in the herd model: > >> > >> (* Main model axiom *) > >> acyclic co | rfe | fr | ppo as Model > > If the herd7 model has defined that, I think it should be legal. Good catch. > > > > > >> > >> , therefore the ordering between A and B is GMO and GMO should be > >> respected by all harts. > >> > >> Regards, > >> Boqun > >> > >>>> > >>>> Dan > >>>> > >>>>> Regards, > >>>>> Boqun > >>>>> > >>>>>> Andrea > >>>>>> > >>>>>> > >>>>> [...] > >>> > >>> > >>> > >>> -- > >>> Best Regards > >>> Guo Ren > >>> > >>> ML: https://lore.kernel.org/linux-csky/ > > > > > > -- Best Regards Guo Ren
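For reference, the snippet above in freestanding form, modeled on the arch_atomic64_fetch_add_unless hunk in the diff below; the standalone function shape and names are illustrative, and it assumes RV64 with GCC-style inline assembly. The success path now takes its full ordering from sc.d.aqrl, which is why the trailing "fence rw, rw" can be dropped.

/*
 * Sketch: add 'a' to *counter unless it equals 'u'; returns the old
 * value.  A successful sc.d.aqrl provides the full ordering; a failed
 * SC just retries, now with the extra .aq annotation discussed above.
 */
static inline long fetch_add_unless_sketch(long *counter, long a, long u)
{
	long prev, rc;

	__asm__ __volatile__ (
		"0:	lr.d      %[p],  %[c]\n"
		"	beq       %[p],  %[u], 1f\n"	/* *counter == u: skip */
		"	add       %[rc], %[p], %[a]\n"
		"	sc.d.aqrl %[rc], %[rc], %[c]\n"
		"	bnez      %[rc], 0b\n"
		"1:\n"
		: [p] "=&r" (prev), [rc] "=&r" (rc), [c] "+A" (*counter)
		: [a] "r" (a), [u] "r" (u)
		: "memory");

	return prev;
}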
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 34f757dfc8f2..aef8aa9ac4f4 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int "0: lr.w %[p], %[c]\n" " beq %[p], %[u], 1f\n" " add %[rc], %[p], %[a]\n" - " sc.w.rl %[rc], %[rc], %[c]\n" + " sc.w.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : [a]"r" (a), [u]"r" (u) @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, "0: lr.d %[p], %[c]\n" " beq %[p], %[u], 1f\n" " add %[rc], %[p], %[a]\n" - " sc.d.rl %[rc], %[rc], %[c]\n" + " sc.d.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : [a]"r" (a), [u]"r" (u) @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) "0: lr.w %[p], %[c]\n" " bltz %[p], 1f\n" " addi %[rc], %[p], 1\n" - " sc.w.rl %[rc], %[rc], %[c]\n" + " sc.w.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) "0: lr.w %[p], %[c]\n" " bgtz %[p], 1f\n" " addi %[rc], %[p], -1\n" - " sc.w.rl %[rc], %[rc], %[c]\n" + " sc.w.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) "0: lr.w %[p], %[c]\n" " addi %[rc], %[p], -1\n" " bltz %[rc], 1f\n" - " sc.w.rl %[rc], %[rc], %[c]\n" + " sc.w.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) "0: lr.d %[p], %[c]\n" " bltz %[p], 1f\n" " addi %[rc], %[p], 1\n" - " sc.d.rl %[rc], %[rc], %[c]\n" + " sc.d.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -465,9 +459,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) "0: lr.d %[p], %[c]\n" " bgtz %[p], 1f\n" " addi %[rc], %[p], -1\n" - " sc.d.rl %[rc], %[rc], %[c]\n" + " sc.d.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) "0: lr.d %[p], %[c]\n" " addi %[rc], %[p], -1\n" " bltz %[rc], 1f\n" - " sc.d.rl %[rc], %[rc], %[c]\n" + " sc.d.aqrl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 1af8db92250b..9269fceb86e0 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -307,9 +307,8 @@ __asm__ __volatile__ ( \ "0: lr.w %0, %2\n" \ " bne %0, %z3, 1f\n" \ - " sc.w.rl %1, %z4, %2\n" \ + " sc.w.aqrl %1, %z4, %2\n" \ " bnez %1, 0b\n" \ - " fence rw, rw\n" \ "1:\n" \ : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ : "rJ" ((long)__old), "rJ" (__new) \ @@ -319,9 +318,8 @@ __asm__ __volatile__ ( \ "0: lr.d %0, %2\n" \ " bne %0, %z3, 1f\n" \ - " sc.d.rl %1, %z4, %2\n" 
\ + " sc.d.aqrl %1, %z4, %2\n" \ " bnez %1, 0b\n" \ - " fence rw, rw\n" \ "1:\n" \ : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ : "rJ" (__old), "rJ" (__new) \