Message ID | 1616868399-82848-4-git-send-email-guoren@kernel.org (mailing list archive)
---|---
State | New, archived
Series | riscv: Add qspinlock/qrwlock
On 3/27/21 2:06 PM, guoren@kernel.org wrote: > From: Guo Ren <guoren@linux.alibaba.com> > > Some architectures don't have sub-word swap atomic instruction, > they only have the full word's one. > > The sub-word swap only improve the performance when: > NR_CPUS < 16K > * 0- 7: locked byte > * 8: pending > * 9-15: not used > * 16-17: tail index > * 18-31: tail cpu (+1) > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > Please let architecture select xchg16/xchg32 to implement > xchg_tail. > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > Cc: Peter Zijlstra <peterz@infradead.org> > Cc: Will Deacon <will@kernel.org> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Waiman Long <longman@redhat.com> > Cc: Arnd Bergmann <arnd@arndb.de> > Cc: Anup Patel <anup@brainfault.org> > --- > kernel/Kconfig.locks | 3 +++ > kernel/locking/qspinlock.c | 44 +++++++++++++++++++++----------------- > 2 files changed, 27 insertions(+), 20 deletions(-) > > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks > index 3de8fd11873b..d02f1261f73f 100644 > --- a/kernel/Kconfig.locks > +++ b/kernel/Kconfig.locks > @@ -239,6 +239,9 @@ config LOCK_SPIN_ON_OWNER > config ARCH_USE_QUEUED_SPINLOCKS > bool > > +config ARCH_USE_QUEUED_SPINLOCKS_XCHG32 > + bool > + > config QUEUED_SPINLOCKS > def_bool y if ARCH_USE_QUEUED_SPINLOCKS > depends on SMP > diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c > index cbff6ba53d56..54de0632c6a8 100644 > --- a/kernel/locking/qspinlock.c > +++ b/kernel/locking/qspinlock.c > @@ -163,26 +163,6 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) > WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); > } > > -/* > - * xchg_tail - Put in the new queue tail code word & retrieve previous one > - * @lock : Pointer to queued spinlock structure > - * @tail : The new queue tail code word > - * Return: The previous queue tail code word > - * > - * xchg(lock, tail), which heads an address dependency > - * > - * p,*,* -> n,*,* ; prev = xchg(lock, node) > - */ > -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > -{ > - /* > - * We can use relaxed semantics since the caller ensures that the > - * MCS node is properly initialized before updating the tail. > - */ > - return (u32)xchg_relaxed(&lock->tail, > - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; > -} > - > #else /* _Q_PENDING_BITS == 8 */ > > /** > @@ -206,6 +186,30 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) > { > atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); > } > +#endif > + > +#if _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32) > +/* > + * xchg_tail - Put in the new queue tail code word & retrieve previous one > + * @lock : Pointer to queued spinlock structure > + * @tail : The new queue tail code word > + * Return: The previous queue tail code word > + * > + * xchg(lock, tail), which heads an address dependency > + * > + * p,*,* -> n,*,* ; prev = xchg(lock, node) > + */ > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > +{ > + /* > + * We can use relaxed semantics since the caller ensures that the > + * MCS node is properly initialized before updating the tail. > + */ > + return (u32)xchg_relaxed(&lock->tail, > + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; > +} > + > +#else > > /** > * xchg_tail - Put in the new queue tail code word & retrieve previous one I don't have any problem adding a CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32 config option to control that. 
One minor nit: #endif /* _Q_PENDING_BITS == 8 */ You should probably remove the comment at the trailing end of the corresponding "#endif" as it is now wrong. Cheers, Longman
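Concretely, the conditionals after this patch would be laid out roughly as in the sketch below, with each trailing #endif comment matching its (new) guard, which is the fix being asked for here. Only the structure is shown; the function bodies are elided.

	#if _Q_PENDING_BITS == 8
	/* clear_pending(), clear_pending_set_locked(): byte/short store versions */
	#else /* _Q_PENDING_BITS == 8 */
	/* clear_pending(), clear_pending_set_locked(): atomic_*() versions */
	#endif /* _Q_PENDING_BITS == 8 */

	#if _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32)
	/* xchg_tail(): 16-bit xchg_relaxed() on lock->tail */
	#else
	/* xchg_tail(): 32-bit atomic_cmpxchg() loop on lock->val */
	#endif /* _Q_PENDING_BITS == 8 && !CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32 */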
On Sun, Mar 28, 2021 at 2:43 AM Waiman Long <longman@redhat.com> wrote: > > On 3/27/21 2:06 PM, guoren@kernel.org wrote: > > From: Guo Ren <guoren@linux.alibaba.com> > > > > Some architectures don't have sub-word swap atomic instruction, > > they only have the full word's one. > > > > The sub-word swap only improve the performance when: > > NR_CPUS < 16K > > * 0- 7: locked byte > > * 8: pending > > * 9-15: not used > > * 16-17: tail index > > * 18-31: tail cpu (+1) > > > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > > > Please let architecture select xchg16/xchg32 to implement > > xchg_tail. > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > > Cc: Peter Zijlstra <peterz@infradead.org> > > Cc: Will Deacon <will@kernel.org> > > Cc: Ingo Molnar <mingo@redhat.com> > > Cc: Waiman Long <longman@redhat.com> > > Cc: Arnd Bergmann <arnd@arndb.de> > > Cc: Anup Patel <anup@brainfault.org> > > --- > > kernel/Kconfig.locks | 3 +++ > > kernel/locking/qspinlock.c | 44 +++++++++++++++++++++----------------- > > 2 files changed, 27 insertions(+), 20 deletions(-) > > > > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks > > index 3de8fd11873b..d02f1261f73f 100644 > > --- a/kernel/Kconfig.locks > > +++ b/kernel/Kconfig.locks > > @@ -239,6 +239,9 @@ config LOCK_SPIN_ON_OWNER > > config ARCH_USE_QUEUED_SPINLOCKS > > bool > > > > +config ARCH_USE_QUEUED_SPINLOCKS_XCHG32 > > + bool > > + > > config QUEUED_SPINLOCKS > > def_bool y if ARCH_USE_QUEUED_SPINLOCKS > > depends on SMP > > diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c > > index cbff6ba53d56..54de0632c6a8 100644 > > --- a/kernel/locking/qspinlock.c > > +++ b/kernel/locking/qspinlock.c > > @@ -163,26 +163,6 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) > > WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); > > } > > > > -/* > > - * xchg_tail - Put in the new queue tail code word & retrieve previous one > > - * @lock : Pointer to queued spinlock structure > > - * @tail : The new queue tail code word > > - * Return: The previous queue tail code word > > - * > > - * xchg(lock, tail), which heads an address dependency > > - * > > - * p,*,* -> n,*,* ; prev = xchg(lock, node) > > - */ > > -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > > -{ > > - /* > > - * We can use relaxed semantics since the caller ensures that the > > - * MCS node is properly initialized before updating the tail. > > - */ > > - return (u32)xchg_relaxed(&lock->tail, > > - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; > > -} > > - > > #else /* _Q_PENDING_BITS == 8 */ > > > > /** > > @@ -206,6 +186,30 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) > > { > > atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); > > } > > +#endif > > + > > +#if _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32) > > +/* > > + * xchg_tail - Put in the new queue tail code word & retrieve previous one > > + * @lock : Pointer to queued spinlock structure > > + * @tail : The new queue tail code word > > + * Return: The previous queue tail code word > > + * > > + * xchg(lock, tail), which heads an address dependency > > + * > > + * p,*,* -> n,*,* ; prev = xchg(lock, node) > > + */ > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > > +{ > > + /* > > + * We can use relaxed semantics since the caller ensures that the > > + * MCS node is properly initialized before updating the tail. 
> > + */ > > + return (u32)xchg_relaxed(&lock->tail, > > + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; > > +} > > + > > +#else > > > > /** > > * xchg_tail - Put in the new queue tail code word & retrieve previous one > > I don't have any problem adding a > CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32 config option to control that. Thx > > One minor nit: > > #endif /* _Q_PENDING_BITS == 8 */ > > You should probably remove the comment at the trailing end of the > corresponding "#endif" as it is now wrong. I'll fix it in next patch
On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote: > From: Guo Ren <guoren@linux.alibaba.com> > > Some architectures don't have sub-word swap atomic instruction, > they only have the full word's one. > > The sub-word swap only improve the performance when: > NR_CPUS < 16K > * 0- 7: locked byte > * 8: pending > * 9-15: not used > * 16-17: tail index > * 18-31: tail cpu (+1) > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > Please let architecture select xchg16/xchg32 to implement > xchg_tail. So I really don't like this, this pushes complexity into the generic code for something that's really not needed. Lots of RISC already implement sub-word atomics using word ll/sc. Obviously they're not sharing code like they should be :/ See for example arch/mips/kernel/cmpxchg.c. Also, I really do think doing ticket locks first is a far more sensible step.
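For readers unfamiliar with the approach Peter points at, here is a hedged C sketch of the general idea (not the actual arch/mips/kernel/cmpxchg.c code): a 16-bit exchange emulated with a full-word cmpxchg() loop on the aligned 32-bit word that contains the halfword. The helper name is made up and a little-endian layout is assumed.

	#include <linux/atomic.h>
	#include <linux/compiler.h>
	#include <linux/types.h>

	static inline u16 generic_xchg16(volatile u16 *ptr, u16 newval)
	{
		u32 *word = (u32 *)((unsigned long)ptr & ~0x3UL);
		int shift = ((unsigned long)ptr & 0x2) * 8;	/* 0 or 16 on little-endian */
		u32 mask = 0xffffU << shift;
		u32 old, new;

		do {
			old = READ_ONCE(*word);
			new = (old & ~mask) | ((u32)newval << shift);
			/* on an LL/SC machine cmpxchg() is itself a loop, so this nests */
		} while (cmpxchg(word, old, new) != old);

		return (u16)((old & mask) >> shift);
	}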
On Mon, Mar 29, 2021 at 9:52 AM Peter Zijlstra <peterz@infradead.org> wrote: > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote: > > From: Guo Ren <guoren@linux.alibaba.com> > > > > Some architectures don't have sub-word swap atomic instruction, > > they only have the full word's one. > > > > The sub-word swap only improve the performance when: > > NR_CPUS < 16K > > * 0- 7: locked byte > > * 8: pending > > * 9-15: not used > > * 16-17: tail index > > * 18-31: tail cpu (+1) > > > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > > > Please let architecture select xchg16/xchg32 to implement > > xchg_tail. > > So I really don't like this, this pushes complexity into the generic > code for something that's really not needed. > > Lots of RISC already implement sub-word atomics using word ll/sc. > Obviously they're not sharing code like they should be :/ See for > example arch/mips/kernel/cmpxchg.c. That is what the previous version of the patch set did, right? I think this v4 is nicer because the code is already there in qspinlock.c and just gets moved around, and the implementation is likely more efficient this way. The mips version could be made more generic, but it is also less efficient than a simple xchg since it requires an indirect function call plus nesting a pair of loops instead in place of the single single ll/sc loop in the 32-bit xchg. I think the weakly typed xchg/cmpxchg implementation causes more problems than it solves, and we'd be better off using a stronger version in general, with the 8-bit and 16-bit exchanges using separate helpers in the same way that the fixed-length cmpxchg64 is separate already, there are only a couple of instances for each of these in the kernel. Unfortunately, there is roughly a 50:50 split between fixed 32-bit and long/pointer-sized xchg/cmpxchg users in the kernel, so making the interface completely fixed-type would add a ton of churn. I created an experimental patch for this, but it's probably not worth it. > Also, I really do think doing ticket locks first is a far more sensible > step. I think this is the important point to look at first. From what I can tell, most users of ticket spinlocks have moved to qspinlock over time, but I'm not sure about the exact tradeoff, in particular on large machines without a 16-bit xchg operation. FWIW, the implementations in the other SMP capable architectures are alpha simple arc simple+custom arm ticket arm64 qspinlock (formerly ticket) csky ticket/qrwlock (formerly simple+ticket/qrwlock) hexagon simple ia64 ticket mips qspinlock (formerly ticket) openrisc qspinlock parisc custom powerpc simple+qspinlock (formerly simple) riscv simple s390 custom-queue sh simple sparc qspinlock x86 qspinlock (formerly ticket+qspinlock) xtensa qspinlock 32-bit Arm is the only relevant user of ticket locks these days, but its hardware has a practical limit of 16 CPUs and four nodes, with most implementations having a single CPU cluster of at most four cores. We had the same discussion about xchg16() when Sebastian submitted the arm32 qspinlock implementation: https://lore.kernel.org/linux-arm-kernel/20191007214439.27891-1-sebastian@breakpoint.cc/ Arnd
On Mon, Mar 29, 2021 at 11:41:19AM +0200, Arnd Bergmann wrote: > On Mon, Mar 29, 2021 at 9:52 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote: > > > From: Guo Ren <guoren@linux.alibaba.com> > > > > > > Some architectures don't have sub-word swap atomic instruction, > > > they only have the full word's one. > > > > > > The sub-word swap only improve the performance when: > > > NR_CPUS < 16K > > > * 0- 7: locked byte > > > * 8: pending > > > * 9-15: not used > > > * 16-17: tail index > > > * 18-31: tail cpu (+1) > > > > > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > > > > > Please let architecture select xchg16/xchg32 to implement > > > xchg_tail. > > > > So I really don't like this, this pushes complexity into the generic > > code for something that's really not needed. > > > > Lots of RISC already implement sub-word atomics using word ll/sc. > > Obviously they're not sharing code like they should be :/ See for > > example arch/mips/kernel/cmpxchg.c. > > That is what the previous version of the patch set did, right? > > I think this v4 is nicer because the code is already there in > qspinlock.c and just gets moved around, and the implementation > is likely more efficient this way. The mips version could be made > more generic, but it is also less efficient than a simple xchg > since it requires an indirect function call plus nesting a pair of > loops instead in place of the single single ll/sc loop in the 32-bit > xchg. > > I think the weakly typed xchg/cmpxchg implementation causes > more problems than it solves, and we'd be better off using > a stronger version in general, with the 8-bit and 16-bit exchanges > using separate helpers in the same way that the fixed-length > cmpxchg64 is separate already, there are only a couple of instances > for each of these in the kernel. > > Unfortunately, there is roughly a 50:50 split between fixed 32-bit > and long/pointer-sized xchg/cmpxchg users in the kernel, so > making the interface completely fixed-type would add a ton of > churn. I created an experimental patch for this, but it's probably > not worth it. The mips code is pretty horrible. Using a cmpxchg loop on an ll/sc arch is jus daft. And that's exactly what the generic xchg_tail() thing does too. A single LL/SC loop that sets either the upper or lower 16 bits of the word is always better. Anyway, an additional 'funny' is that I suspect you cannot prove fwd progress of the entire primitive with any of this on. But who cares about details anyway.. :/ And the whole WFE optimization that was relevant for the ticket lock, is _still_ relevant for qspinlock, except that seems to have gone missing again. I just don't have much confidence here that people actually understand what they're doing or why.
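As a rough illustration of the single-loop shape Peter describes, below is a hedged RISC-V inline-assembly sketch of a 16-bit exchange done with one lr.w/sc.w loop that rewrites only one halfword of the aligned 32-bit word. It assumes the A extension and little-endian byte order, the function name is invented, and no .aq/.rl ordering is shown since xchg_tail() only needs relaxed semantics per the comment in the patch.

	static inline u16 xchg16_llsc(volatile u16 *ptr, u16 newval)
	{
		u32 *word = (u32 *)((unsigned long)ptr & ~0x3UL);
		int shift = ((unsigned long)ptr & 0x2) * 8;	/* 0 or 16 on little-endian */
		u32 keep = ~(0xffffU << shift);
		u32 put  = (u32)newval << shift;
		u32 old, tmp;

		__asm__ __volatile__ (
		"0:	lr.w	%0, %2\n"	/* load word, register reservation	*/
		"	and	%1, %0, %3\n"	/* keep the other halfword		*/
		"	or	%1, %1, %4\n"	/* merge in the new halfword		*/
		"	sc.w	%1, %1, %2\n"	/* store iff reservation still held	*/
		"	bnez	%1, 0b\n"	/* retry on SC failure			*/
		: "=&r" (old), "=&r" (tmp), "+A" (*word)
		: "r" (keep), "r" (put)
		: "memory");

		return (u16)(old >> shift);
	}

The point of contention in the rest of the thread is whether the hardware guarantees that this loop eventually succeeds while another hart keeps storing to the neighbouring halfword.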
On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote: > > From: Guo Ren <guoren@linux.alibaba.com> > > > > Some architectures don't have sub-word swap atomic instruction, > > they only have the full word's one. > > > > The sub-word swap only improve the performance when: > > NR_CPUS < 16K > > * 0- 7: locked byte > > * 8: pending > > * 9-15: not used > > * 16-17: tail index > > * 18-31: tail cpu (+1) > > > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > > > Please let architecture select xchg16/xchg32 to implement > > xchg_tail. > > So I really don't like this, this pushes complexity into the generic > code for something that's really not needed. > > Lots of RISC already implement sub-word atomics using word ll/sc. > Obviously they're not sharing code like they should be :/ See for > example arch/mips/kernel/cmpxchg.c. I see, we've done two versions of this: - Using cmpxchg codes from MIPS by Michael - Re-write with assembly codes by Guo But using the full-word atomic xchg instructions implement xchg16 has the semantic risk for atomic operations. I don't think export xchg16 in a none-sub-word atomic machine is correct. > > Also, I really do think doing ticket locks first is a far more sensible > step. NACK by Anup -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
On Mon, Mar 29, 2021 at 07:19:29PM +0800, Guo Ren wrote: > On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote: > > > From: Guo Ren <guoren@linux.alibaba.com> > > > > > > Some architectures don't have sub-word swap atomic instruction, > > > they only have the full word's one. > > > > > > The sub-word swap only improve the performance when: > > > NR_CPUS < 16K > > > * 0- 7: locked byte > > > * 8: pending > > > * 9-15: not used > > > * 16-17: tail index > > > * 18-31: tail cpu (+1) > > > > > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > > > > > Please let architecture select xchg16/xchg32 to implement > > > xchg_tail. > > > > So I really don't like this, this pushes complexity into the generic > > code for something that's really not needed. > > > > Lots of RISC already implement sub-word atomics using word ll/sc. > > Obviously they're not sharing code like they should be :/ See for > > example arch/mips/kernel/cmpxchg.c. > I see, we've done two versions of this: > - Using cmpxchg codes from MIPS by Michael > - Re-write with assembly codes by Guo > > But using the full-word atomic xchg instructions implement xchg16 has > the semantic risk for atomic operations. What? -ENOPARSE > > Also, I really do think doing ticket locks first is a far more sensible > > step. > NACK by Anup Who's he when he's not sending NAKs ?
On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote: > Anyway, an additional 'funny' is that I suspect you cannot prove fwd > progress of the entire primitive with any of this on. But who cares > about details anyway.. :/ What's the architectural guarantee on LL/SC progress for RISC-V ? And what if you double loop it like cmpxchg() ?
On Mon, Mar 29, 2021 at 7:26 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Mar 29, 2021 at 07:19:29PM +0800, Guo Ren wrote: > > On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote: > > > > From: Guo Ren <guoren@linux.alibaba.com> > > > > > > > > Some architectures don't have sub-word swap atomic instruction, > > > > they only have the full word's one. > > > > > > > > The sub-word swap only improve the performance when: > > > > NR_CPUS < 16K > > > > * 0- 7: locked byte > > > > * 8: pending > > > > * 9-15: not used > > > > * 16-17: tail index > > > > * 18-31: tail cpu (+1) > > > > > > > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > > > > > > > Please let architecture select xchg16/xchg32 to implement > > > > xchg_tail. > > > > > > So I really don't like this, this pushes complexity into the generic > > > code for something that's really not needed. > > > > > > Lots of RISC already implement sub-word atomics using word ll/sc. > > > Obviously they're not sharing code like they should be :/ See for > > > example arch/mips/kernel/cmpxchg.c. > > I see, we've done two versions of this: > > - Using cmpxchg codes from MIPS by Michael > > - Re-write with assembly codes by Guo > > > > But using the full-word atomic xchg instructions implement xchg16 has > > the semantic risk for atomic operations. > > What? -ENOPARSE u32 a = 0x55aa66bb; u16 *ptr = &a; CPU0 CPU1 ========= ========= xchg16(ptr, new) while(1) WRITE_ONCE(*(ptr + 1), x); When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > Also, I really do think doing ticket locks first is a far more sensible > > > step. > > NACK by Anup > > Who's he when he's not sending NAKs ? We've talked before: https://lore.kernel.org/linux-riscv/CAAhSdy1JHLUFwu7RuCaQ+RUWRBks2KsDva7EpRt8--4ZfofSUQ@mail.gmail.com/T/#t
> -----Original Message----- > From: Peter Zijlstra <peterz@infradead.org> > Sent: 29 March 2021 16:57 > To: Guo Ren <guoren@kernel.org> > Cc: linux-riscv <linux-riscv@lists.infradead.org>; Linux Kernel Mailing List > <linux-kernel@vger.kernel.org>; linux-csky@vger.kernel.org; linux-arch > <linux-arch@vger.kernel.org>; Guo Ren <guoren@linux.alibaba.com>; Will > Deacon <will@kernel.org>; Ingo Molnar <mingo@redhat.com>; Waiman > Long <longman@redhat.com>; Arnd Bergmann <arnd@arndb.de>; Anup > Patel <anup@brainfault.org> > Subject: Re: [PATCH v4 3/4] locking/qspinlock: Add > ARCH_USE_QUEUED_SPINLOCKS_XCHG32 > > On Mon, Mar 29, 2021 at 07:19:29PM +0800, Guo Ren wrote: > > On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org> > wrote: > > > > > > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote: > > > > From: Guo Ren <guoren@linux.alibaba.com> > > > > > > > > Some architectures don't have sub-word swap atomic instruction, > > > > they only have the full word's one. > > > > > > > > The sub-word swap only improve the performance when: > > > > NR_CPUS < 16K > > > > * 0- 7: locked byte > > > > * 8: pending > > > > * 9-15: not used > > > > * 16-17: tail index > > > > * 18-31: tail cpu (+1) > > > > > > > > The 9-15 bits are wasted to use xchg16 in xchg_tail. > > > > > > > > Please let architecture select xchg16/xchg32 to implement > > > > xchg_tail. > > > > > > So I really don't like this, this pushes complexity into the generic > > > code for something that's really not needed. > > > > > > Lots of RISC already implement sub-word atomics using word ll/sc. > > > Obviously they're not sharing code like they should be :/ See for > > > example arch/mips/kernel/cmpxchg.c. > > I see, we've done two versions of this: > > - Using cmpxchg codes from MIPS by Michael > > - Re-write with assembly codes by Guo > > > > But using the full-word atomic xchg instructions implement xchg16 has > > the semantic risk for atomic operations. > > What? -ENOPARSE > > > > Also, I really do think doing ticket locks first is a far more > > > sensible step. > > NACK by Anup > > Who's he when he's not sending NAKs ? We had discussions in the RISC-V platforms group about this. Over there, We had evaluated all spin lock approaches (ticket, qspinlock, etc) tried in Linux till now. It was concluded in those discussions that eventually we have to move to qspinlock (even if we moved to ticket spinlock temporarily) because qspinlock avoids cache line bouncing. Also, moving to qspinlock will be aligned with other major architectures supported in Linux (such as x86, ARM64) Some of the organizations working on high-end RISC-V systems (> 32 CPUs) are interested in having an optimized spinlock implementation (just like other major architectures x86 and ARM64). Based on above, Linux RISC-V should move to qspinlock. Regards, Anup
On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > u32 a = 0x55aa66bb; > u16 *ptr = &a; > > CPU0 CPU1 > ========= ========= > xchg16(ptr, new) while(1) > WRITE_ONCE(*(ptr + 1), x); > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. Then I think your LL/SC is broken. That also means you really don't want to build super complex locking primitives on top, because that live-lock will percolate through. Step 1 would be to get your architecute fixed such that it can provide fwd progress guarantees for LL/SC. Otherwise there's absolutely no point in building complex systems with it.
On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote: > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd > > progress of the entire primitive with any of this on. But who cares > > about details anyway.. :/ > > What's the architectural guarantee on LL/SC progress for RISC-V ? funct5 | aq | rl | rs2 | rs1 | funct3 | rd | opcode 5 1 1 5 5 3 5 7 LR.W/D ordering 0 addr width dest AMO SC.W/D ordering src addr width dest AMO LR.W loads a word from the address in rs1, places the sign-extended value in rd, and registers a reservation set—a set of bytes that subsumes the bytes in the addressed word. SC.W conditionally writes a word in rs2 to the address in rs1: the SC.W succeeds only if the reservation is still valid and the reservation set contains the bytes being written. If the SC.W succeeds, the instruction writes the word in rs2 to memory, and it writes zero to rd. If the SC.W fails, the instruction does not write to memory, and it writes a nonzero value to rd. Regardless of success or failure, executing an SC.W instruction *invalidates any reservation held by this hart*. More details, ref: https://github.com/riscv/riscv-isa-manual > And what if you double loop it like cmpxchg() ? Can you give a code snippet? -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
On Mon, Mar 29, 2021 at 12:13:10PM +0000, Anup Patel wrote: > We had discussions in the RISC-V platforms group about this. Over there, > We had evaluated all spin lock approaches (ticket, qspinlock, etc) tried > in Linux till now. It was concluded in those discussions that eventually we > have to move to qspinlock (even if we moved to ticket spinlock temporarily) > because qspinlock avoids cache line bouncing. Also, moving to qspinlock > will be aligned with other major architectures supported in Linux (such as > x86, ARM64) > > Some of the organizations working on high-end RISC-V systems (> 32 > CPUs) are interested in having an optimized spinlock implementation > (just like other major architectures x86 and ARM64). > > Based on above, Linux RISC-V should move to qspinlock. That's all well and good, but first you need architectural forward progress guarantees. Otherwise there's absolutely no point in building complex locking primitives. And unless you already have such big systems in silicon, where you can benchmark against simpler locks (like ticket) there really is no point in the complexity.
On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote: > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote: > > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd > > > progress of the entire primitive with any of this on. But who cares > > > about details anyway.. :/ > > > > What's the architectural guarantee on LL/SC progress for RISC-V ? > > funct5 | aq | rl | rs2 | rs1 | funct3 | rd | opcode > 5 1 1 5 5 3 5 7 > LR.W/D ordering 0 addr width dest AMO > SC.W/D ordering src addr width dest AMO > > LR.W loads a word from the address in rs1, places the sign-extended > value in rd, and registers a reservation set—a set of bytes that > subsumes the bytes in the addressed word. SC.W conditionally writes a > word in rs2 to the address in rs1: the SC.W succeeds only if the > reservation is still valid and the reservation set contains the bytes > being written. If the SC.W succeeds, the instruction writes the word > in rs2 to memory, and it writes zero to rd. If the SC.W fails, the > instruction does not write to memory, and it writes a nonzero value to > rd. Regardless of success or failure, executing an SC.W instruction > *invalidates any reservation held by this hart*. > > More details, ref: > https://github.com/riscv/riscv-isa-manual I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant link, as this defines memory areas that either do or do not have forward progress guarantees, including this part: "When LR/SC is used for memory locations marked RsrvNonEventual, software should provide alternative fall-back mechanisms used when lack of progress is detected." My reading of this is that if the example you tried stalls, then either the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this, or that the PMA is marked RsrvEventual but the implementation is buggy. It also seems that the current "amoswap" based implementation would be reliable independent of RsrvEventual/RsrvNonEventual. arm64 is already in the situation of having to choose between two cmpxchg() implementation at runtime to allow falling back to a slower but more general version, but it's best to avoid that if you can. Arnd [1] http://www.five-embeddev.com/riscv-isa-manual/latest/machine.html#atomicity-pmas
On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote: > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote: > > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote: > > > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd > > > > progress of the entire primitive with any of this on. But who cares > > > > about details anyway.. :/ > > > > > > What's the architectural guarantee on LL/SC progress for RISC-V ? > > > > funct5 | aq | rl | rs2 | rs1 | funct3 | rd | opcode > > 5 1 1 5 5 3 5 7 > > LR.W/D ordering 0 addr width dest AMO > > SC.W/D ordering src addr width dest AMO > > > > LR.W loads a word from the address in rs1, places the sign-extended > > value in rd, and registers a reservation set—a set of bytes that > > subsumes the bytes in the addressed word. SC.W conditionally writes a > > word in rs2 to the address in rs1: the SC.W succeeds only if the > > reservation is still valid and the reservation set contains the bytes > > being written. If the SC.W succeeds, the instruction writes the word > > in rs2 to memory, and it writes zero to rd. If the SC.W fails, the > > instruction does not write to memory, and it writes a nonzero value to > > rd. Regardless of success or failure, executing an SC.W instruction > > *invalidates any reservation held by this hart*. > > > > More details, ref: > > https://github.com/riscv/riscv-isa-manual > > I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant > link, as this defines memory areas that either do or do not have > forward progress guarantees, including this part: > > "When LR/SC is used for memory locations marked RsrvNonEventual, > software should provide alternative fall-back mechanisms used when > lack of progress is detected." > > My reading of this is that if the example you tried stalls, then either > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this, > or that the PMA is marked RsrvEventual but the implementation is > buggy. Yes, PMA just defines physical memory region attributes, But in our processor, when MMU is enabled (satp's value register > 2) in s-mode, it will look at our custom PTE's attributes BIT(63) ref [1]: PTE format: | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 SO C B SH SE RSW D A G U X W R V ^ ^ ^ ^ ^ BIT(63): SO - Strong Order BIT(62): C - Cacheable BIT(61): B - Bufferable BIT(60): SH - Shareable BIT(59): SE - Security So the memory also could be RsrvNone/RsrvEventual. [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88 > > It also seems that the current "amoswap" based implementation > would be reliable independent of RsrvEventual/RsrvNonEventual. Yes, the hardware implementation of AMO could be different from LR/SC. AMO could use ACE snoop holding to lock the bus in hw coherency design, but LR/SC uses an exclusive monitor without locking the bus. > arm64 is already in the situation of having to choose between > two cmpxchg() implementation at runtime to allow falling back to > a slower but more general version, but it's best to avoid that if you > can. Current RISC-V needn't multiple versions to select, and all AMO & LR/SC has been defined in the spec. RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't think LR/SC would be slower than CAS, and CAS is just good for code size. 
On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > u32 a = 0x55aa66bb; > > u16 *ptr = &a; > > > > CPU0 CPU1 > > ========= ========= > > xchg16(ptr, new) while(1) > > WRITE_ONCE(*(ptr + 1), x); > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > Then I think your LL/SC is broken. > > That also means you really don't want to build super complex locking > primitives on top, because that live-lock will percolate through. Do you mean the below implementation has live-lock risk? +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new, val = atomic_read(&lock->val); + + for (;;) { + new = (val & _Q_LOCKED_PENDING_MASK) | tail; + old = atomic_cmpxchg(&lock->val, val, new); + if (old == val) + break; + + val = old; + } + return old; +} > > Step 1 would be to get your architecute fixed such that it can provide > fwd progress guarantees for LL/SC. Otherwise there's absolutely no point > in building complex systems with it. Quote Waiman's comment [1] on xchg16 optimization: "This optimization is needed to make the qspinlock achieve performance parity with ticket spinlock at light load." [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/ So for a non-xhg16 machine: - ticket-lock for small numbers of CPUs - qspinlock for large numbers of CPUs Okay, I'll put all of them into the next patch :P
> -----Original Message----- > From: Guo Ren <guoren@kernel.org> > Sent: 30 March 2021 08:44 > To: Peter Zijlstra <peterz@infradead.org> > Cc: linux-riscv <linux-riscv@lists.infradead.org>; Linux Kernel Mailing List > <linux-kernel@vger.kernel.org>; linux-csky@vger.kernel.org; linux-arch > <linux-arch@vger.kernel.org>; Guo Ren <guoren@linux.alibaba.com>; Will > Deacon <will@kernel.org>; Ingo Molnar <mingo@redhat.com>; Waiman > Long <longman@redhat.com>; Arnd Bergmann <arnd@arndb.de>; Anup > Patel <anup@brainfault.org> > Subject: Re: [PATCH v4 3/4] locking/qspinlock: Add > ARCH_USE_QUEUED_SPINLOCKS_XCHG32 > > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> > wrote: > > > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > > u32 a = 0x55aa66bb; > > > u16 *ptr = &a; > > > > > > CPU0 CPU1 > > > ========= ========= > > > xchg16(ptr, new) while(1) > > > WRITE_ONCE(*(ptr + 1), x); > > > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > Then I think your LL/SC is broken. > > > > That also means you really don't want to build super complex locking > > primitives on top, because that live-lock will percolate through. > Do you mean the below implementation has live-lock risk? > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > +{ > + u32 old, new, val = atomic_read(&lock->val); > + > + for (;;) { > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > + old = atomic_cmpxchg(&lock->val, val, new); > + if (old == val) > + break; > + > + val = old; > + } > + return old; > +} > > > > > > Step 1 would be to get your architecute fixed such that it can provide > > fwd progress guarantees for LL/SC. Otherwise there's absolutely no > > point in building complex systems with it. > > Quote Waiman's comment [1] on xchg16 optimization: > > "This optimization is needed to make the qspinlock achieve performance > parity with ticket spinlock at light load." > > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email- > Waiman.Long@hp.com/ > > So for a non-xhg16 machine: > - ticket-lock for small numbers of CPUs > - qspinlock for large numbers of CPUs > > Okay, I'll put all of them into the next patch I would suggest to have separate Kconfig opitons for ticket spinlock in Linux RISC-V which will be disabled by default. This means Linux RISC-V will use qspinlock by default and use ticket spinlock only when ticket spinlock kconfig is enabled. Regards, Anup
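A hedged sketch of what Anup's suggestion could look like in arch/riscv/Kconfig; the symbol name and help text are invented here, not taken from any posted patch.

	config RISCV_TICKET_SPINLOCKS
		bool "Use ticket spinlocks instead of queued spinlocks"
		depends on SMP
		default n
		help
		  Replace the queued spinlock implementation with ticket
		  spinlocks. Queued spinlocks remain the default.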
On Tue, Mar 30, 2021 at 7:56 AM Guo Ren <guoren@kernel.org> wrote: > > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote: > > > > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote: > > > > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd > > > > > progress of the entire primitive with any of this on. But who cares > > > > > about details anyway.. :/ > > > > > > > > What's the architectural guarantee on LL/SC progress for RISC-V ? > > > > > > funct5 | aq | rl | rs2 | rs1 | funct3 | rd | opcode > > > 5 1 1 5 5 3 5 7 > > > LR.W/D ordering 0 addr width dest AMO > > > SC.W/D ordering src addr width dest AMO > > > > > > LR.W loads a word from the address in rs1, places the sign-extended > > > value in rd, and registers a reservation set—a set of bytes that > > > subsumes the bytes in the addressed word. SC.W conditionally writes a > > > word in rs2 to the address in rs1: the SC.W succeeds only if the > > > reservation is still valid and the reservation set contains the bytes > > > being written. If the SC.W succeeds, the instruction writes the word > > > in rs2 to memory, and it writes zero to rd. If the SC.W fails, the > > > instruction does not write to memory, and it writes a nonzero value to > > > rd. Regardless of success or failure, executing an SC.W instruction > > > *invalidates any reservation held by this hart*. > > > > > > More details, ref: > > > https://github.com/riscv/riscv-isa-manual > > > > I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant > > link, as this defines memory areas that either do or do not have > > forward progress guarantees, including this part: > > > > "When LR/SC is used for memory locations marked RsrvNonEventual, > > software should provide alternative fall-back mechanisms used when > > lack of progress is detected." > > > > My reading of this is that if the example you tried stalls, then either > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this, > > or that the PMA is marked RsrvEventual but the implementation is > > buggy. > Yes, PMA just defines physical memory region attributes, But in our > processor, when MMU is enabled (satp's value register > 2) in s-mode, > it will look at our custom PTE's attributes BIT(63) ref [1]: > > PTE format: > | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 > SO C B SH SE RSW D A G U X W R V > ^ ^ ^ ^ ^ > BIT(63): SO - Strong Order > BIT(62): C - Cacheable > BIT(61): B - Bufferable > BIT(60): SH - Shareable > BIT(59): SE - Security > > So the memory also could be RsrvNone/RsrvEventual. > > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88 Is this about your C-sky architecture or your RISC-V implementation. If these PTE bits are in your RISC-V implementation then clearly your RISC-V implementation is not compliant with the RISC-V privilege spec because these bits are not defined in RISC-V privilege spec. Regards, Anup > > > > > It also seems that the current "amoswap" based implementation > > would be reliable independent of RsrvEventual/RsrvNonEventual. > Yes, the hardware implementation of AMO could be different from LR/SC. > AMO could use ACE snoop holding to lock the bus in hw coherency > design, but LR/SC uses an exclusive monitor without locking the bus. 
> > > arm64 is already in the situation of having to choose between > > two cmpxchg() implementation at runtime to allow falling back to > > a slower but more general version, but it's best to avoid that if you > > can. > Current RISC-V needn't multiple versions to select, and all AMO & > LR/SC has been defined in the spec. > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't > think LR/SC would be slower than CAS, and CAS is just good for code > size. > > > > > Arnd > > > > [1] http://www.five-embeddev.com/riscv-isa-manual/latest/machine.html#atomicity-pmas > > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/
On Tue, Mar 30, 2021 at 1:51 PM Anup Patel <anup@brainfault.org> wrote: > > On Tue, Mar 30, 2021 at 7:56 AM Guo Ren <guoren@kernel.org> wrote: > > > > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > > > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote: > > > > > > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > > > > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote: > > > > > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd > > > > > > progress of the entire primitive with any of this on. But who cares > > > > > > about details anyway.. :/ > > > > > > > > > > What's the architectural guarantee on LL/SC progress for RISC-V ? > > > > > > > > funct5 | aq | rl | rs2 | rs1 | funct3 | rd | opcode > > > > 5 1 1 5 5 3 5 7 > > > > LR.W/D ordering 0 addr width dest AMO > > > > SC.W/D ordering src addr width dest AMO > > > > > > > > LR.W loads a word from the address in rs1, places the sign-extended > > > > value in rd, and registers a reservation set—a set of bytes that > > > > subsumes the bytes in the addressed word. SC.W conditionally writes a > > > > word in rs2 to the address in rs1: the SC.W succeeds only if the > > > > reservation is still valid and the reservation set contains the bytes > > > > being written. If the SC.W succeeds, the instruction writes the word > > > > in rs2 to memory, and it writes zero to rd. If the SC.W fails, the > > > > instruction does not write to memory, and it writes a nonzero value to > > > > rd. Regardless of success or failure, executing an SC.W instruction > > > > *invalidates any reservation held by this hart*. > > > > > > > > More details, ref: > > > > https://github.com/riscv/riscv-isa-manual > > > > > > I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant > > > link, as this defines memory areas that either do or do not have > > > forward progress guarantees, including this part: > > > > > > "When LR/SC is used for memory locations marked RsrvNonEventual, > > > software should provide alternative fall-back mechanisms used when > > > lack of progress is detected." > > > > > > My reading of this is that if the example you tried stalls, then either > > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this, > > > or that the PMA is marked RsrvEventual but the implementation is > > > buggy. > > Yes, PMA just defines physical memory region attributes, But in our > > processor, when MMU is enabled (satp's value register > 2) in s-mode, > > it will look at our custom PTE's attributes BIT(63) ref [1]: > > > > PTE format: > > | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 > > SO C B SH SE RSW D A G U X W R V > > ^ ^ ^ ^ ^ > > BIT(63): SO - Strong Order > > BIT(62): C - Cacheable > > BIT(61): B - Bufferable > > BIT(60): SH - Shareable > > BIT(59): SE - Security > > > > So the memory also could be RsrvNone/RsrvEventual. > > > > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88 > > Is this about your C-sky architecture or your RISC-V implementation. It's in RISC-V implementation. > > If these PTE bits are in your RISC-V implementation then clearly your > RISC-V implementation is not compliant with the RISC-V privilege spec > because these bits are not defined in RISC-V privilege spec. We could disable it if the vendor's SOC has a coherency interconnect bus, so C910 is compliant with standard privilege spec. 
ps: I remember someone has mentioned a similar design in 1.12-draft-VM-TASKGROUP: "Bit 63 indicates that the PTE uses a custom implementation-specific encoding. If bit 63 is set, the algorithm for virtual-to-physical address translation is implementation-defined. If bit 63 is not set, the algorithm for virtual-to-physical address translation is described in Section 4.4.2. Bit 62 indicates the use of naturally aligned power-of-two (NAPOT) address translation contiguity, as described in Section 4.4.2. Bits 61–xx indicate cacheability attributes associated with the virtual address in question, as de-scribed in Section 4.4.3. Bits xx–54 are reserved for future use." > > Regards, > Anup > > > > > > > > It also seems that the current "amoswap" based implementation > > > would be reliable independent of RsrvEventual/RsrvNonEventual. > > Yes, the hardware implementation of AMO could be different from LR/SC. > > AMO could use ACE snoop holding to lock the bus in hw coherency > > design, but LR/SC uses an exclusive monitor without locking the bus. > > > > > arm64 is already in the situation of having to choose between > > > two cmpxchg() implementation at runtime to allow falling back to > > > a slower but more general version, but it's best to avoid that if you > > > can. > > Current RISC-V needn't multiple versions to select, and all AMO & > > LR/SC has been defined in the spec. > > > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't > > think LR/SC would be slower than CAS, and CAS is just good for code > > size. > > > > > > > > Arnd > > > > > > [1] http://www.five-embeddev.com/riscv-isa-manual/latest/machine.html#atomicity-pmas > > > > -- > > Best Regards > > Guo Ren > > > > ML: https://lore.kernel.org/linux-csky/ -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
On Tue, Mar 30, 2021 at 12:54 PM Anup Patel <Anup.Patel@wdc.com> wrote: > > > > > -----Original Message----- > > From: Guo Ren <guoren@kernel.org> > > Sent: 30 March 2021 08:44 > > To: Peter Zijlstra <peterz@infradead.org> > > Cc: linux-riscv <linux-riscv@lists.infradead.org>; Linux Kernel Mailing List > > <linux-kernel@vger.kernel.org>; linux-csky@vger.kernel.org; linux-arch > > <linux-arch@vger.kernel.org>; Guo Ren <guoren@linux.alibaba.com>; Will > > Deacon <will@kernel.org>; Ingo Molnar <mingo@redhat.com>; Waiman > > Long <longman@redhat.com>; Arnd Bergmann <arnd@arndb.de>; Anup > > Patel <anup@brainfault.org> > > Subject: Re: [PATCH v4 3/4] locking/qspinlock: Add > > ARCH_USE_QUEUED_SPINLOCKS_XCHG32 > > > > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> > > wrote: > > > > > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > > > u32 a = 0x55aa66bb; > > > > u16 *ptr = &a; > > > > > > > > CPU0 CPU1 > > > > ========= ========= > > > > xchg16(ptr, new) while(1) > > > > WRITE_ONCE(*(ptr + 1), x); > > > > > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > > > Then I think your LL/SC is broken. > > > > > > That also means you really don't want to build super complex locking > > > primitives on top, because that live-lock will percolate through. > > Do you mean the below implementation has live-lock risk? > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > > +{ > > + u32 old, new, val = atomic_read(&lock->val); > > + > > + for (;;) { > > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > > + old = atomic_cmpxchg(&lock->val, val, new); > > + if (old == val) > > + break; > > + > > + val = old; > > + } > > + return old; > > +} > > > > > > > > > > Step 1 would be to get your architecute fixed such that it can provide > > > fwd progress guarantees for LL/SC. Otherwise there's absolutely no > > > point in building complex systems with it. > > > > Quote Waiman's comment [1] on xchg16 optimization: > > > > "This optimization is needed to make the qspinlock achieve performance > > parity with ticket spinlock at light load." > > > > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email- > > Waiman.Long@hp.com/ > > > > So for a non-xhg16 machine: > > - ticket-lock for small numbers of CPUs > > - qspinlock for large numbers of CPUs > > > > Okay, I'll put all of them into the next patch > > I would suggest to have separate Kconfig opitons for ticket spinlock > in Linux RISC-V which will be disabled by default. This means Linux > RISC-V will use qspinlock by default and use ticket spinlock only when > ticket spinlock kconfig is enabled. OK > > Regards, > Anup
On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote: > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote: > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote: > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > > > What's the architectural guarantee on LL/SC progress for RISC-V ? > > > > "When LR/SC is used for memory locations marked RsrvNonEventual, > > software should provide alternative fall-back mechanisms used when > > lack of progress is detected." > > > > My reading of this is that if the example you tried stalls, then either > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this, > > or that the PMA is marked RsrvEventual but the implementation is > > buggy. > > Yes, PMA just defines physical memory region attributes, But in our > processor, when MMU is enabled (satp's value register > 2) in s-mode, > it will look at our custom PTE's attributes BIT(63) ref [1]: > > PTE format: > | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 > SO C B SH SE RSW D A G U X W R V > ^ ^ ^ ^ ^ > BIT(63): SO - Strong Order > BIT(62): C - Cacheable > BIT(61): B - Bufferable > BIT(60): SH - Shareable > BIT(59): SE - Security > > So the memory also could be RsrvNone/RsrvEventual. I was not talking about RsrvNone, which would clearly mean that you cannot use lr/sc at all (trap would trap, right?), but "RsrvNonEventual", which would explain the behavior you described in an earlier reply: | u32 a = 0x55aa66bb; | u16 *ptr = &a; | | CPU0 CPU1 | ========= ========= | xchg16(ptr, new) while(1) | WRITE_ONCE(*(ptr + 1), x); | | When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. As I understand, this example must not cause a deadlock on a compliant hardware implementation when the underlying memory has RsrvEventual behavior, but could deadlock in case of RsrvNonEventual > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88 > > > > > It also seems that the current "amoswap" based implementation > > would be reliable independent of RsrvEventual/RsrvNonEventual. > > Yes, the hardware implementation of AMO could be different from LR/SC. > AMO could use ACE snoop holding to lock the bus in hw coherency > design, but LR/SC uses an exclusive monitor without locking the bus. > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't > think LR/SC would be slower than CAS, and CAS is just good for code > size. What I meant here is that the current spinlock uses a simple amoswap, which presumably does not suffer from the lack of forward process you described. Arnd
From: Guo Ren > Sent: 30 March 2021 04:14 ... > > Step 1 would be to get your architecute fixed such that it can provide > > fwd progress guarantees for LL/SC. Otherwise there's absolutely no point > > in building complex systems with it. > > Quote Waiman's comment [1] on xchg16 optimization: > > "This optimization is needed to make the qspinlock achieve performance > parity with ticket spinlock at light load." > > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/ > > So for a non-xhg16 machine: > - ticket-lock for small numbers of CPUs > - qspinlock for large numbers of CPUs > > Okay, I'll put all of them into the next patch :P Doesn't that also imply that you need ticket-locks for lightly contended locks even with a lot of CPUs? If you actually get a lot of CPUs waiting on the same spinlock you probably ought to change the code to reduce lock contention. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On 3/29/21 11:13 PM, Guo Ren wrote: > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: >> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: >>> u32 a = 0x55aa66bb; >>> u16 *ptr = &a; >>> >>> CPU0 CPU1 >>> ========= ========= >>> xchg16(ptr, new) while(1) >>> WRITE_ONCE(*(ptr + 1), x); >>> >>> When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. >> Then I think your LL/SC is broken. >> >> That also means you really don't want to build super complex locking >> primitives on top, because that live-lock will percolate through. > Do you mean the below implementation has live-lock risk? > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > +{ > + u32 old, new, val = atomic_read(&lock->val); > + > + for (;;) { > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > + old = atomic_cmpxchg(&lock->val, val, new); > + if (old == val) > + break; > + > + val = old; > + } > + return old; > +} If there is a continuous stream of incoming spinlock takers, it is possible that some cpus may have to wait a long time to set the tail right. However, this should only happen on artificial workload. I doubt it will happen with real workload or with limit number of cpus. > >> Step 1 would be to get your architecute fixed such that it can provide >> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point >> in building complex systems with it. > Quote Waiman's comment [1] on xchg16 optimization: > > "This optimization is needed to make the qspinlock achieve performance > parity with ticket spinlock at light load." > > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/ > > So for a non-xhg16 machine: > - ticket-lock for small numbers of CPUs > - qspinlock for large numbers of CPUs > > Okay, I'll put all of them into the next patch :P > It is true that qspinlock may not offer much advantage when the number of cpus is small. It shines for systems with many cpus. You may use NR_CPUS to determine if the default should be ticket or qspinlock with user override. To determine the right NR_CPUS threshold, you may need to run on real SMP RISCV systems to find out. Cheers, Longman
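One hedged way to express Waiman's NR_CPUS-based default in Kconfig, extending the option sketched earlier in the thread; the threshold of 16 and the symbol names are purely illustrative, and the real cut-off would have to come from benchmarks on real SMP RISC-V systems as he notes.

	choice
		prompt "RISC-V spinlock type"
		default RISCV_QUEUED_SPINLOCKS if NR_CPUS >= 16	# illustrative threshold
		default RISCV_TICKET_SPINLOCKS

	config RISCV_TICKET_SPINLOCKS
		bool "Ticket spinlocks"

	config RISCV_QUEUED_SPINLOCKS
		bool "Queued spinlocks"
		select ARCH_USE_QUEUED_SPINLOCKS
		select ARCH_USE_QUEUED_SPINLOCKS_XCHG32	# as this series proposes for riscv

	endchoice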
On Tue, Mar 30, 2021 at 11:13:55AM +0800, Guo Ren wrote: > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > > u32 a = 0x55aa66bb; > > > u16 *ptr = &a; > > > > > > CPU0 CPU1 > > > ========= ========= > > > xchg16(ptr, new) while(1) > > > WRITE_ONCE(*(ptr + 1), x); > > > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > Then I think your LL/SC is broken. > > > > That also means you really don't want to build super complex locking > > primitives on top, because that live-lock will percolate through. > Do you mean the below implementation has live-lock risk? > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > +{ > + u32 old, new, val = atomic_read(&lock->val); > + > + for (;;) { > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > + old = atomic_cmpxchg(&lock->val, val, new); > + if (old == val) > + break; > + > + val = old; > + } > + return old; > +} That entirely depends on the architecture (and cmpxchg() impementation). There are a number of cases: * architecture has cmpxchg() instruction (x86, s390, sparc, etc.). - architecture provides fwd progress (x86) - architecture requires backoff for progress (sparc) * architecture does not have cmpxchg, and implements it using LL/SC. and here things get *really* interesting, because while an architecture can have LL/SC fwd progress, that does not translate into cmpxchg() also having the same guarantees and all bets are off. The real bummer is that C can do cmpxchg(), but there is no way it can do LL/SC. And even if we'd teach C how to do LL/SC, it couldn't be generic because architectures lacking it can't emulate it using cmpxchg() (there's a fun class of bugs there). So while the above code might be the best we can do in generic code, it's really up to the architecture to make it work.
On Tue, Mar 30, 2021 at 06:08:40PM +0200, Peter Zijlstra wrote: > On Tue, Mar 30, 2021 at 11:13:55AM +0800, Guo Ren wrote: > > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > > > u32 a = 0x55aa66bb; > > > > u16 *ptr = &a; > > > > > > > > CPU0 CPU1 > > > > ========= ========= > > > > xchg16(ptr, new) while(1) > > > > WRITE_ONCE(*(ptr + 1), x); > > > > > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > > > Then I think your LL/SC is broken. > > > > > > That also means you really don't want to build super complex locking > > > primitives on top, because that live-lock will percolate through. > > > Do you mean the below implementation has live-lock risk? > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > > +{ > > + u32 old, new, val = atomic_read(&lock->val); > > + > > + for (;;) { > > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > > + old = atomic_cmpxchg(&lock->val, val, new); > > + if (old == val) > > + break; > > + > > + val = old; > > + } > > + return old; > > +} > > That entirely depends on the architecture (and cmpxchg() impementation). > > There are a number of cases: > > * architecture has cmpxchg() instruction (x86, s390, sparc, etc.). > > - architecture provides fwd progress (x86) > - architecture requires backoff for progress (sparc) > > * architecture does not have cmpxchg, and implements it using LL/SC. > > and here things get *really* interesting, because while an > architecture can have LL/SC fwd progress, that does not translate into > cmpxchg() also having the same guarantees and all bets are off. > > The real bummer is that C can do cmpxchg(), but there is no way it can > do LL/SC. And even if we'd teach C how to do LL/SC, it couldn't be > generic because architectures lacking it can't emulate it using > cmpxchg() (there's a fun class of bugs there). > > So while the above code might be the best we can do in generic code, > it's really up to the architecture to make it work. I just want to chime in here, there may be a better spot in the thread to mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code which I have on my todo list somwhere to replace the other generic implementations like that in mips. arch/openrisc/include/asm/cmpxchg.h The idea would be that architectures just implement these methods: long cmpxchg_u32(*ptr,old,new) long xchg_u32(*ptr,val) Then the rest of the generic header would implement cmpxchg. -Stafford
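To illustrate the shape of the generic header Stafford describes (a hedged sketch, not the actual arch/openrisc/include/asm/cmpxchg.h code): the architecture supplies only cmpxchg_u32(), and the 16-bit variant is synthesized on the containing aligned word. A little-endian layout is assumed and the prototype follows the interface given above.

	long cmpxchg_u32(volatile void *ptr, unsigned long old, unsigned long new);	/* arch-provided */

	static inline u16 generic_cmpxchg_u16(volatile u16 *ptr, u16 old, u16 new)
	{
		u32 *word = (u32 *)((unsigned long)ptr & ~0x3UL);
		int shift = ((unsigned long)ptr & 0x2) * 8;	/* 0 or 16 on little-endian */
		u32 mask = 0xffffU << shift;
		u32 prev, w_new;

		do {
			prev = READ_ONCE(*word);
			if (((prev & mask) >> shift) != old)
				return (u16)((prev & mask) >> shift);	/* compare failed */
			w_new = (prev & ~mask) | ((u32)new << shift);
		} while ((u32)cmpxchg_u32(word, prev, w_new) != prev);

		return old;
	}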
On Tue, Mar 30, 2021 at 3:12 PM Arnd Bergmann <arnd@arndb.de> wrote: > > On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote: > > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote: > > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > > > > > What's the architectural guarantee on LL/SC progress for RISC-V ? > > > > > > "When LR/SC is used for memory locations marked RsrvNonEventual, > > > software should provide alternative fall-back mechanisms used when > > > lack of progress is detected." > > > > > > My reading of this is that if the example you tried stalls, then either > > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this, > > > or that the PMA is marked RsrvEventual but the implementation is > > > buggy. > > > > Yes, PMA just defines physical memory region attributes, But in our > > processor, when MMU is enabled (satp's value register > 2) in s-mode, > > it will look at our custom PTE's attributes BIT(63) ref [1]: > > > > PTE format: > > | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 > > SO C B SH SE RSW D A G U X W R V > > ^ ^ ^ ^ ^ > > BIT(63): SO - Strong Order > > BIT(62): C - Cacheable > > BIT(61): B - Bufferable > > BIT(60): SH - Shareable > > BIT(59): SE - Security > > > > So the memory also could be RsrvNone/RsrvEventual. > > I was not talking about RsrvNone, which would clearly mean that > you cannot use lr/sc at all (trap would trap, right?), but "RsrvNonEventual", > which would explain the behavior you described in an earlier reply: > > | u32 a = 0x55aa66bb; > | u16 *ptr = &a; > | > | CPU0 CPU1 > | ========= ========= > | xchg16(ptr, new) while(1) > | WRITE_ONCE(*(ptr + 1), x); > | > | When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > As I understand, this example must not cause a deadlock on > a compliant hardware implementation when the underlying memory > has RsrvEventual behavior, but could deadlock in case of > RsrvNonEventual Thx for the nice explanation: - RsrvNonEventual - depends on software fall-back mechanisms, and just I'm worried about. - RsrvEventual - HW would provide the eventual success guarantee. > > > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88 > > > > > > > > It also seems that the current "amoswap" based implementation > > > would be reliable independent of RsrvEventual/RsrvNonEventual. > > > > Yes, the hardware implementation of AMO could be different from LR/SC. > > AMO could use ACE snoop holding to lock the bus in hw coherency > > design, but LR/SC uses an exclusive monitor without locking the bus. > > > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't > > think LR/SC would be slower than CAS, and CAS is just good for code > > size. > > What I meant here is that the current spinlock uses a simple amoswap, > which presumably does not suffer from the lack of forward process you > described. Does that mean we should prevent using LR/SC (if RsrvNonEventual)?
On Wednesday, 31 March 2021 5:18:56 PM NZDT Guo Ren wrote: > > > [1] > > > https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc > > > 52afd0fcbf88> > > > > > It also seems that the current "amoswap" based implementation > > > > would be reliable independent of RsrvEventual/RsrvNonEventual. > > > > > > Yes, the hardware implementation of AMO could be different from LR/SC. > > > AMO could use ACE snoop holding to lock the bus in hw coherency > > > design, but LR/SC uses an exclusive monitor without locking the bus. > > > > > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't > > > think LR/SC would be slower than CAS, and CAS is just good for code > > > size. > > > > What I meant here is that the current spinlock uses a simple amoswap, > > which presumably does not suffer from the lack of forward process you > > described. > > Does that mean we should prevent using LR/SC (if RsrvNonEventual)? Let me provide another data-point, I'm working on a high-end highly speculative implementation with many concurrent instructions in flight - from my point of view both sorts of AMO (LR/SC and swap/add/etc) require me to grab a cache line in an exclusive modifiable state (so no difference there). More importantly both sorts of AMO instructions (unlike most loads and stores) can't be speculated (not even LR because it changes hidden state, I found this out the hard way bringing up the kernel). This means that both LR AND SC individually can't be executed until all speculation is resolved (that means that they happen really late in the execute path and block the resolution of the speculation of subsequent instructions) - equally a single amoswap/add/etc instruction can't happen until late in the execute path - so both require the same cache line state, but one of these sorts of events is better than two of them. Which in short means that amoswap/add/etc is better for small things - small buzzy lock loops, while LR/SC is better for more complex things with actual processing between the LR and SC. ---- Another issue here is to consider is what happens when you hit one of these tight spinlocks when the branch target cache is empty and they fail (ie loop back and try again) - the default branch prediction, and resulting speculation, is (very) likely to be looping back, while hopefully most locks are not contended when you hit them and that speculation would be wrong - a spinlock like this may not be so good: li a0, 1 loop: amoswap a1, a0, (a2) beqz a1, loop ..... subsequent code In my world with no BTC info the pipe fills with dozens of amoswaps, rather than the 'subsequent code'. While (in my world) code like this: li a0, 1 loop: amoswap a1, a0, (a2) bnez a1, 1f .... subsequent code 1: j loop would actually be better (in my world unconditional jump instructions are folded early and never see execution so they're sort of free, though they mess with the issue/decode rate). Smart compilers could move the "j loop" out of the way, while the double branch on failure is not a big deal since either the lock is still held (and you don't care if it's slow) or it's been released in which case the cache line has been stolen and the refetch of that cache line is going to dominate the next time around the loop I need to stress here that this is how my architecture works, other's will of course be different though I expect that other heavily speculative architectures to have similar issues :-) Paul Campbell Moonbase Otago
Hi Arnd On Wed, Mar 31, 2021 at 12:18 PM Guo Ren <guoren@kernel.org> wrote: > > On Tue, Mar 30, 2021 at 3:12 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > > On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote: > > > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote: > > > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > > > > > > > What's the architectural guarantee on LL/SC progress for RISC-V ? > > > > > > > > "When LR/SC is used for memory locations marked RsrvNonEventual, > > > > software should provide alternative fall-back mechanisms used when > > > > lack of progress is detected." > > > > > > > > My reading of this is that if the example you tried stalls, then either > > > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this, > > > > or that the PMA is marked RsrvEventual but the implementation is > > > > buggy. > > > > > > Yes, PMA just defines physical memory region attributes, But in our > > > processor, when MMU is enabled (satp's value register > 2) in s-mode, > > > it will look at our custom PTE's attributes BIT(63) ref [1]: > > > > > > PTE format: > > > | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 > > > SO C B SH SE RSW D A G U X W R V > > > ^ ^ ^ ^ ^ > > > BIT(63): SO - Strong Order > > > BIT(62): C - Cacheable > > > BIT(61): B - Bufferable > > > BIT(60): SH - Shareable > > > BIT(59): SE - Security > > > > > > So the memory also could be RsrvNone/RsrvEventual. > > > > I was not talking about RsrvNone, which would clearly mean that > > you cannot use lr/sc at all (trap would trap, right?), but "RsrvNonEventual", > > which would explain the behavior you described in an earlier reply: > > > > | u32 a = 0x55aa66bb; > > | u16 *ptr = &a; > > | > > | CPU0 CPU1 > > | ========= ========= > > | xchg16(ptr, new) while(1) > > | WRITE_ONCE(*(ptr + 1), x); > > | > > | When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > As I understand, this example must not cause a deadlock on > > a compliant hardware implementation when the underlying memory > > has RsrvEventual behavior, but could deadlock in case of > > RsrvNonEventual > Thx for the nice explanation: > - RsrvNonEventual - depends on software fall-back mechanisms, and > just I'm worried about. > - RsrvEventual - HW would provide the eventual success guarantee. In riscv-spec 8.3 Eventual Success of Store-Conditional Instructions I found: "As a consequence of the eventuality guarantee, if some harts in an execution environment are executing constrained LR/SC loops, and no other harts or devices in the execution environment execute an unconditional store or AMO to that reservation set, then at least one hart will eventually exit its constrained LR/SC loop. *** By contrast, if other harts or devices continue to write to that reservation set, it ***is not guaranteed*** that any hart will exit its LR/SC loop.*** " Seems RsrvEventual couldn't solve the code's problem I've mentioned. > > > > > > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88 > > > > > > > > > > > It also seems that the current "amoswap" based implementation > > > > would be reliable independent of RsrvEventual/RsrvNonEventual. > > > > > > Yes, the hardware implementation of AMO could be different from LR/SC. 
> > > AMO could use ACE snoop holding to lock the bus in hw coherency > > > design, but LR/SC uses an exclusive monitor without locking the bus. > > > > > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't > > > think LR/SC would be slower than CAS, and CAS is just good for code > > > size. > > > > What I meant here is that the current spinlock uses a simple amoswap, > > which presumably does not suffer from the lack of forward process you > > described. > Does that mean we should prevent using LR/SC (if RsrvNonEventual)? > > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/
On Wed, Mar 31, 2021 at 8:44 AM Guo Ren <guoren@kernel.org> wrote: > On Wed, Mar 31, 2021 at 12:18 PM Guo Ren <guoren@kernel.org> wrote: > > On Tue, Mar 30, 2021 at 3:12 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote: > > > As I understand, this example must not cause a deadlock on > > > a compliant hardware implementation when the underlying memory > > > has RsrvEventual behavior, but could deadlock in case of > > > RsrvNonEventual > > Thx for the nice explanation: > > - RsrvNonEventual - depends on software fall-back mechanisms, and > > just I'm worried about. > > - RsrvEventual - HW would provide the eventual success guarantee. > In riscv-spec 8.3 Eventual Success of Store-Conditional Instructions > > I found: > "As a consequence of the eventuality guarantee, if some harts in an > execution environment are > executing constrained LR/SC loops, and no other harts or devices in > the execution environment > execute an unconditional store or AMO to that reservation set, then at > least one hart will > eventually exit its constrained LR/SC loop. *** By contrast, if other > harts or devices continue to > write to that reservation set, it ***is not guaranteed*** that any > hart will exit its LR/SC loop.*** " > > Seems RsrvEventual couldn't solve the code's problem I've mentioned. Ok, got it. Arnd
On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote: > > I just want to chime in here, there may be a better spot in the thread to > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code > which I have on my todo list somwhere to replace the other generic > implementations like that in mips. > > arch/openrisc/include/asm/cmpxchg.h > > The idea would be that architectures just implement these methods: > > long cmpxchg_u32(*ptr,old,new) > long xchg_u32(*ptr,val) > > Then the rest of the generic header would implement cmpxchg. I like the idea of generalizing it a little further. I'd suggest staying a little closer to the existing naming here though, as we already have cmpxchg() for the type-agnostic version, and cmpxchg64() for the fixed-length 64-bit version. I think a nice interface between architecture-specific and architecture independent code would be to have architectures provide arch_cmpxchg32()/arch_xchg32() as the most basic version, as well as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16() if they have instructions for those. The common code can then build cmpxchg16()/xchg16() on top of either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg() wrapper around those (or alternatively we can decide to have them only deal with fixed-32-bit and long/pointer sized atomics). Arnd
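A rough sketch of the layering Arnd describes (all of the names below are hypothetical at this point): the common header picks the native 16-bit primitive when the architecture advertises one, and otherwise falls back to emulation on top of the 32-bit operation, along the lines of the generic_xchg_u16() sketch earlier in the thread.

        /* In a common header, not in arch code: */
        #ifdef arch_xchg16
        #define xchg16(ptr, new)        arch_xchg16((ptr), (new))
        #else
        #define xchg16(ptr, new)        xchg16_emulated_on_arch_cmpxchg32((ptr), (new))
        #endif

        #ifdef arch_cmpxchg16
        #define cmpxchg16(ptr, old, new)        arch_cmpxchg16((ptr), (old), (new))
        #else
        #define cmpxchg16(ptr, old, new)        \
                cmpxchg16_emulated_on_arch_cmpxchg32((ptr), (old), (new))
        #endif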
On Wed, Mar 31, 2021 at 09:23:27AM +0200, Arnd Bergmann wrote: > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote: > > > > I just want to chime in here, there may be a better spot in the thread to > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code > > which I have on my todo list somwhere to replace the other generic > > implementations like that in mips. > > > > arch/openrisc/include/asm/cmpxchg.h > > > > The idea would be that architectures just implement these methods: > > > > long cmpxchg_u32(*ptr,old,new) > > long xchg_u32(*ptr,val) > > > > Then the rest of the generic header would implement cmpxchg. > > I like the idea of generalizing it a little further. I'd suggest staying a > little closer to the existing naming here though, as we already have > cmpxchg() for the type-agnostic version, and cmpxchg64() for the > fixed-length 64-bit version. OK. > I think a nice interface between architecture-specific and architecture > independent code would be to have architectures provide > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16() > if they have instructions for those. Thanks for the name suggestions, it makes it easier for me. > The common code can then build cmpxchg16()/xchg16() on top of > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg() > wrapper around those (or alternatively we can decide to have them > only deal with fixed-32-bit and long/pointer sized atomics). Yeah, that was the idea. -Stafford
On Tue, Mar 30, 2021 at 10:09 PM Waiman Long <longman@redhat.com> wrote: > > On 3/29/21 11:13 PM, Guo Ren wrote: > > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > >> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > >>> u32 a = 0x55aa66bb; > >>> u16 *ptr = &a; > >>> > >>> CPU0 CPU1 > >>> ========= ========= > >>> xchg16(ptr, new) while(1) > >>> WRITE_ONCE(*(ptr + 1), x); > >>> > >>> When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > >> Then I think your LL/SC is broken. > >> > >> That also means you really don't want to build super complex locking > >> primitives on top, because that live-lock will percolate through. > > Do you mean the below implementation has live-lock risk? > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > > +{ > > + u32 old, new, val = atomic_read(&lock->val); > > + > > + for (;;) { > > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > > + old = atomic_cmpxchg(&lock->val, val, new); > > + if (old == val) > > + break; > > + > > + val = old; > > + } > > + return old; > > +} > If there is a continuous stream of incoming spinlock takers, it is > possible that some cpus may have to wait a long time to set the tail > right. However, this should only happen on artificial workload. I doubt > it will happen with real workload or with limit number of cpus. Yes, I agree or it couldn't with NR_CPU > 16k. So the implementation above is suitable for non-sub-word-xchg architecture. > > > >> Step 1 would be to get your architecute fixed such that it can provide > >> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point > >> in building complex systems with it. > > Quote Waiman's comment [1] on xchg16 optimization: > > > > "This optimization is needed to make the qspinlock achieve performance > > parity with ticket spinlock at light load." > > > > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/ > > > > So for a non-xhg16 machine: > > - ticket-lock for small numbers of CPUs > > - qspinlock for large numbers of CPUs > > > > Okay, I'll put all of them into the next patch :P > > > It is true that qspinlock may not offer much advantage when the number > of cpus is small. It shines for systems with many cpus. You may use > NR_CPUS to determine if the default should be ticket or qspinlock with > user override. To determine the right NR_CPUS threshold, you may need to > run on real SMP RISCV systems to find out. We'd give the choice to the users, and they could select ticket-lock or qspinlock in riscv.
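Purely as an illustration of that kind of build-time selection (none of these symbols exist in the patches under discussion): the architecture could default the choice based on CONFIG_NR_CPUS with a user override, and the spinlock header would then pull in the matching implementation.

        /* Hypothetical asm/spinlock.h fragment: */
        #ifdef CONFIG_RISCV_QUEUED_SPINLOCKS    /* e.g. default y when NR_CPUS is large */
        #include <asm-generic/qspinlock.h>      /* queued spinlock for many-CPU systems */
        #else
        #include <asm/ticket_spinlock.h>        /* hypothetical simple ticket lock for small systems */
        #endif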
Hi Stafford, How do think add ARCH_USE_QUEUED_SPINLOCKS_XCHG32 in openrisc? https://lore.kernel.org/linux-riscv/1617201040-83905-7-git-send-email-guoren@kernel.org/T/#u On Wed, Mar 31, 2021 at 8:31 PM Stafford Horne <shorne@gmail.com> wrote: > > On Wed, Mar 31, 2021 at 09:23:27AM +0200, Arnd Bergmann wrote: > > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote: > > > > > > I just want to chime in here, there may be a better spot in the thread to > > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code > > > which I have on my todo list somwhere to replace the other generic > > > implementations like that in mips. > > > > > > arch/openrisc/include/asm/cmpxchg.h > > > > > > The idea would be that architectures just implement these methods: > > > > > > long cmpxchg_u32(*ptr,old,new) > > > long xchg_u32(*ptr,val) > > > > > > Then the rest of the generic header would implement cmpxchg. > > > > I like the idea of generalizing it a little further. I'd suggest staying a > > little closer to the existing naming here though, as we already have > > cmpxchg() for the type-agnostic version, and cmpxchg64() for the > > fixed-length 64-bit version. > > OK. > > > I think a nice interface between architecture-specific and architecture > > independent code would be to have architectures provide > > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well > > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16() > > if they have instructions for those. > > Thanks for the name suggestions, it makes it easier for me. > > > The common code can then build cmpxchg16()/xchg16() on top of > > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg() > > wrapper around those (or alternatively we can decide to have them > > only deal with fixed-32-bit and long/pointer sized atomics). > > Yeah, that was the idea. > > -Stafford
On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > u32 a = 0x55aa66bb; > > u16 *ptr = &a; > > > > CPU0 CPU1 > > ========= ========= > > xchg16(ptr, new) while(1) > > WRITE_ONCE(*(ptr + 1), x); > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > Then I think your LL/SC is broken. No, LR.W/SC.W is not broken. Quote <8.3 Eventual Success of Store-Conditional Instructions>: "As a consequence of the eventuality guarantee, if some harts in an execution environment are executing constrained LR/SC loops, and no other harts or devices in the execution environment execute an unconditional store or AMO to that reservation set, then at least one hart will eventually exit its constrained LR/SC loop. By contrast, if other harts or devices continue to write to that reservation set, it is not guaranteed that any hart will exit its LR/SC loop." So I think it's a feature of LR/SC. How would the above code (also using ll.w/sc.w to implement xchg16) behave on arm64? 1: ldxr eor cbnz ... 2f stxr cbnz ... 1b // I think it would deadlock for arm64. Could the "LL/SC fwd progress" you mentioned guarantee that stxr succeeds? How could hardware do that? > > That also means you really don't want to build super complex locking > primitives on top, because that live-lock will percolate through. > > Step 1 would be to get your architecute fixed such that it can provide > fwd progress guarantees for LL/SC. Otherwise there's absolutely no point > in building complex systems with it. -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
Hi Paul, Thx for the explanation, here is my comment. On Wed, Mar 31, 2021 at 1:33 PM Paul Campbell <taniwha@gmail.com> wrote: > > On Wednesday, 31 March 2021 5:18:56 PM NZDT Guo Ren wrote: > > > > [1] > > > > https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc > > > > 52afd0fcbf88> > > > > > > It also seems that the current "amoswap" based implementation > > > > > would be reliable independent of RsrvEventual/RsrvNonEventual. > > > > > > > > Yes, the hardware implementation of AMO could be different from LR/SC. > > > > AMO could use ACE snoop holding to lock the bus in hw coherency > > > > design, but LR/SC uses an exclusive monitor without locking the bus. > > > > > > > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't > > > > think LR/SC would be slower than CAS, and CAS is just good for code > > > > size. > > > > > > What I meant here is that the current spinlock uses a simple amoswap, > > > which presumably does not suffer from the lack of forward process you > > > described. > > > > Does that mean we should prevent using LR/SC (if RsrvNonEventual)? > > Let me provide another data-point, I'm working on a high-end highly > speculative implementation with many concurrent instructions in flight - from > my point of view both sorts of AMO (LR/SC and swap/add/etc) require me to > grab a cache line in an exclusive modifiable state (so no difference there). > > More importantly both sorts of AMO instructions (unlike most loads and > stores) can't be speculated (not even LR because it changes hidden state, I > found this out the hard way bringing up the kernel). > > This means that both LR AND SC individually can't be executed until all > speculation is resolved (that means that they happen really late in the > execute path and block the resolution of the speculation of subsequent > instructions) - equally a single amoswap/add/etc instruction can't happen > until late in the execute path - so both require the same cache line state, > but one of these sorts of events is better than two of them. > > Which in short means that amoswap/add/etc is better for small things - small > buzzy lock loops, while LR/SC is better for more complex things with actual > processing between the LR and SC. Seems your machine using the same way to implement LR/SC and AMO, but some machines would differ them. For AMO, I think it's would be like what you've described: - AMO would be separated into three parts: load & lock, ALU operation, store & unlock - load & lock, eg: we could using ACE protocol -SNOOP channel to holding the bus - Doing atomic AMO - store & unlock: Write the result back and releasing the ACE protocol -SNOOP channel I think the above is what you describe as how to "grab a cache line in an exclusive modifiable state". But for LR/SC, it's different. Because we have separated AMO into real three parts of instruction: - LR - Operation instructions - SC If we let LR holding ACE protocol -SNOOP channel and let SC release channel, that would break the ISA design (we couldn't let an instruction holding the snoop bus and made other harts hang up.) So LR/SC would use address monitors for every hart, to detect the target address has been written or not. That means LR/SC won't be implemented fwd progress guarantees. If you care about fwd progress guarantees, I think ISA should choose cmpxchg (eg: cas) instead of LR/SC. 
> > ---- > > Another issue here is to consider is what happens when you hit one of these > tight spinlocks when the branch target cache is empty and they fail (ie loop > back and try again) - the default branch prediction, and resulting > speculation, is (very) likely to be looping back, while hopefully most locks > are not contended when you hit them and that speculation would be wrong - a > spinlock like this may not be so good: > > li a0, 1 > loop: > amoswap a1, a0, (a2) > beqz a1, loop > ..... subsequent code > > In my world with no BTC info the pipe fills with dozens of amoswaps, rather > than the 'subsequent code'. While (in my world) code like this: > > li a0, 1 > loop: > amoswap a1, a0, (a2) > bnez a1, 1f > .... subsequent code > > 1: j loop > > would actually be better (in my world unconditional jump instructions are > folded early and never see execution so they're sort of free, though they mess > with the issue/decode rate). Smart compilers could move the "j loop" out of > the way, while the double branch on failure is not a big deal since either the > lock is still held (and you don't care if it's slow) or it's been released in > which case the cache line has been stolen and the refetch of that cache line > is going to dominate the next time around the loop Thx for sharing your view of the spinlock speculation path. But I think we should use smp_cond_load_acquire() rather than a tight retry loop. That means we could use wfe/cpu_relax to let other harts utilize the core's pipeline. So we needn't optimize the "subsequent code" speculation path on a multi-threaded core and can just let the hart relax. > > I need to stress here that this is how my architecture works, other's will of > course be different though I expect that other heavily speculative > architectures to have similar issues :-) > > Paul Campbell > Moonbase Otago > > >
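For readers unfamiliar with the helper Guo refers to, here is a minimal sketch of such a relaxed wait on the lock byte. The field layout assumed here is the _Q_PENDING_BITS == 8 one, and the wrapper function is only an illustration, not anything proposed in the patches.

        static inline void example_wait_unlocked(struct qspinlock *lock)
        {
                /*
                 * VAL names the value re-loaded from lock->locked on each
                 * iteration; the architecture may idle the hart between
                 * re-loads (e.g. wfe on arm64, or cpu_relax()), and the load
                 * that satisfies the condition carries acquire semantics.
                 */
                smp_cond_load_acquire(&lock->locked, !VAL);
        }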
On Wed, Mar 31, 2021 at 12:08 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Tue, Mar 30, 2021 at 11:13:55AM +0800, Guo Ren wrote: > > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > > > u32 a = 0x55aa66bb; > > > > u16 *ptr = &a; > > > > > > > > CPU0 CPU1 > > > > ========= ========= > > > > xchg16(ptr, new) while(1) > > > > WRITE_ONCE(*(ptr + 1), x); > > > > > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > > > Then I think your LL/SC is broken. > > > > > > That also means you really don't want to build super complex locking > > > primitives on top, because that live-lock will percolate through. > > > Do you mean the below implementation has live-lock risk? > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > > +{ > > + u32 old, new, val = atomic_read(&lock->val); > > + > > + for (;;) { > > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > > + old = atomic_cmpxchg(&lock->val, val, new); > > + if (old == val) > > + break; > > + > > + val = old; > > + } > > + return old; > > +} > > That entirely depends on the architecture (and cmpxchg() impementation). > > There are a number of cases: > > * architecture has cmpxchg() instruction (x86, s390, sparc, etc.). > > - architecture provides fwd progress (x86) > - architecture requires backoff for progress (sparc) > > * architecture does not have cmpxchg, and implements it using LL/SC. > > and here things get *really* interesting, because while an > architecture can have LL/SC fwd progress, that does not translate into > cmpxchg() also having the same guarantees and all bets are off. Seems riscv spec didn't mandatory LR/SC fwd guarantee, ref: In riscv-spec 8.3 Eventual Success of Store-Conditional Instructions "As a consequence of the eventuality guarantee, if some harts in an execution environment are executing constrained LR/SC loops, and no other harts or devices in the execution environment execute an unconditional store or AMO to that reservation set, then at least one hart will eventually exit its constrained LR/SC loop. *** By contrast, if other harts or devices continue to write to that reservation set, it ***is not guaranteed*** that any hart will exit its LR/SC loop.*** " > > The real bummer is that C can do cmpxchg(), but there is no way it can > do LL/SC. And even if we'd teach C how to do LL/SC, it couldn't be > generic because architectures lacking it can't emulate it using > cmpxchg() (there's a fun class of bugs there). > > So while the above code might be the best we can do in generic code, > it's really up to the architecture to make it work. -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
On Tue, Mar 30, 2021 at 10:09 PM Waiman Long <longman@redhat.com> wrote: > > On 3/29/21 11:13 PM, Guo Ren wrote: > > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > >> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > >>> u32 a = 0x55aa66bb; > >>> u16 *ptr = &a; > >>> > >>> CPU0 CPU1 > >>> ========= ========= > >>> xchg16(ptr, new) while(1) > >>> WRITE_ONCE(*(ptr + 1), x); > >>> > >>> When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > >> Then I think your LL/SC is broken. > >> > >> That also means you really don't want to build super complex locking > >> primitives on top, because that live-lock will percolate through. > > Do you mean the below implementation has live-lock risk? > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > > +{ > > + u32 old, new, val = atomic_read(&lock->val); > > + > > + for (;;) { > > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > > + old = atomic_cmpxchg(&lock->val, val, new); > > + if (old == val) > > + break; > > + > > + val = old; > > + } > > + return old; > > +} > If there is a continuous stream of incoming spinlock takers, it is > possible that some cpus may have to wait a long time to set the tail > right. However, this should only happen on artificial workload. I doubt > it will happen with real workload or with limit number of cpus. Yes, I think is ok for LR/SC in riscv, becasue CPU0 LR CPU1 LR CPU0 SC //success CPU1 SC //fail or CPU0 LR CPU1 LR CPU1 SC //success CPU0 SC //fail So always one store condition would success. I think it's OK. > > > >> Step 1 would be to get your architecute fixed such that it can provide > >> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point > >> in building complex systems with it. > > Quote Waiman's comment [1] on xchg16 optimization: > > > > "This optimization is needed to make the qspinlock achieve performance > > parity with ticket spinlock at light load." > > > > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/ > > > > So for a non-xhg16 machine: > > - ticket-lock for small numbers of CPUs > > - qspinlock for large numbers of CPUs > > > > Okay, I'll put all of them into the next patch :P > > > It is true that qspinlock may not offer much advantage when the number > of cpus is small. It shines for systems with many cpus. You may use > NR_CPUS to determine if the default should be ticket or qspinlock with > user override. To determine the right NR_CPUS threshold, you may need to > run on real SMP RISCV systems to find out. I Agree
On Wed, Mar 31, 2021 at 3:23 PM Arnd Bergmann <arnd@arndb.de> wrote: > > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote: > > > > I just want to chime in here, there may be a better spot in the thread to > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code > > which I have on my todo list somwhere to replace the other generic > > implementations like that in mips. > > > > arch/openrisc/include/asm/cmpxchg.h > > > > The idea would be that architectures just implement these methods: > > > > long cmpxchg_u32(*ptr,old,new) > > long xchg_u32(*ptr,val) > > > > Then the rest of the generic header would implement cmpxchg. > > I like the idea of generalizing it a little further. I'd suggest staying a > little closer to the existing naming here though, as we already have > cmpxchg() for the type-agnostic version, and cmpxchg64() for the > fixed-length 64-bit version. > > I think a nice interface between architecture-specific and architecture > independent code would be to have architectures provide > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16() > if they have instructions for those. > > The common code can then build cmpxchg16()/xchg16() on top of > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg() > wrapper around those (or alternatively we can decide to have them > only deal with fixed-32-bit and long/pointer sized atomics). I think these emulation codes are suitable for some architectures but not riscv. We shouldn't export xchg16/cmpxchg16(emulated by lr.w/sc.w) in riscv, We should forbid these sub-word atomic primitive and lets the programmers consider their atomic design.
On Wed, Mar 31, 2021 at 11:22:35PM +0800, Guo Ren wrote: > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > > u32 a = 0x55aa66bb; > > > u16 *ptr = &a; > > > > > > CPU0 CPU1 > > > ========= ========= > > > xchg16(ptr, new) while(1) > > > WRITE_ONCE(*(ptr + 1), x); > > > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > Then I think your LL/SC is broken. > No, it's not broken LR.W/SC.W. Quote <8.3 Eventual Success of > Store-Conditional Instructions>: > > "As a consequence of the eventuality guarantee, if some harts in an > execution environment are executing constrained LR/SC loops, and no > other harts or devices in the execution environment execute an > unconditional store or AMO to that reservation set, then at least one > hart will eventually exit its constrained LR/SC loop. By contrast, if > other harts or devices continue to write to that reservation set, it > is not guaranteed that any hart will exit its LR/SC loop." (there, reflowed it for you) That just means your arch spec is broken too :-) > So I think it's a feature of LR/SC. How does the above code (also use > ll.w/sc.w to implement xchg16) running on arm64? > > 1: ldxr > eor > cbnz ... 2f > stxr > cbnz ... 1b // I think it would deadlock for arm64. > > "LL/SC fwd progress" which you have mentioned could guarantee stxr > success? How hardware could do that? I'm not a hardware person; I've never actually build anything larger than a 4 bit adder with nand gates (IIRC, 25+ years ago). And I'll let Will answer the ARM64 part. That said, I think the idea is that if you lock the line (load-locked is a clue ofcourse) to the core until either: an exception (or anything else that is guaranteed to fail LL/SC), SC or N instructions, then a competing LL/SC will stall in the LL while the first core makes progress. This same principle is key to hardware progress for cmpxchg/cas loops, don't instantly yield the exclusive hold on the cacheline, keep it around for a while. Out-of-order CPUs can do even better I think, by virtue of them being able to see tight loops. Anyway, given you have such a crap architecture (and here I thought RISC-V was supposed to be a modern design *sigh*), you had better go look at the sparc64 atomic implementation which has a software backoff for failed CAS in order to make fwd progress.
On Wed, Mar 31, 2021 at 11:10:53PM +0800, Guo Ren wrote: > Hi Stafford, > > How do think add ARCH_USE_QUEUED_SPINLOCKS_XCHG32 in openrisc? > > https://lore.kernel.org/linux-riscv/1617201040-83905-7-git-send-email-guoren@kernel.org/T/#u Sorry I missed your mail here. It is true that OpenRISC doesn't have xchg16, so using the xchg32 code is better. Acked-by: Stafford Horne <shorne@gmail.com> > On Wed, Mar 31, 2021 at 8:31 PM Stafford Horne <shorne@gmail.com> wrote: > > > > On Wed, Mar 31, 2021 at 09:23:27AM +0200, Arnd Bergmann wrote: > > > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote: > > > > > > > > I just want to chime in here, there may be a better spot in the thread to > > > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code > > > > which I have on my todo list somwhere to replace the other generic > > > > implementations like that in mips. > > > > > > > > arch/openrisc/include/asm/cmpxchg.h > > > > > > > > The idea would be that architectures just implement these methods: > > > > > > > > long cmpxchg_u32(*ptr,old,new) > > > > long xchg_u32(*ptr,val) > > > > > > > > Then the rest of the generic header would implement cmpxchg. > > > > > > I like the idea of generalizing it a little further. I'd suggest staying a > > > little closer to the existing naming here though, as we already have > > > cmpxchg() for the type-agnostic version, and cmpxchg64() for the > > > fixed-length 64-bit version. > > > > OK. > > > > > I think a nice interface between architecture-specific and architecture > > > independent code would be to have architectures provide > > > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well > > > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16() > > > if they have instructions for those. > > > > Thanks for the name suggestions, it makes it easier for me. > > > > > The common code can then build cmpxchg16()/xchg16() on top of > > > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg() > > > wrapper around those (or alternatively we can decide to have them > > > only deal with fixed-32-bit and long/pointer sized atomics). > > > > Yeah, that was the idea. > > > > -Stafford > > > > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/
On Tue, Apr 06, 2021 at 11:50:38AM +0800, Guo Ren wrote: > On Wed, Mar 31, 2021 at 3:23 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote: > > > > > > I just want to chime in here, there may be a better spot in the thread to > > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code > > > which I have on my todo list somwhere to replace the other generic > > > implementations like that in mips. > > > > > > arch/openrisc/include/asm/cmpxchg.h > > > > > > The idea would be that architectures just implement these methods: > > > > > > long cmpxchg_u32(*ptr,old,new) > > > long xchg_u32(*ptr,val) > > > > > > Then the rest of the generic header would implement cmpxchg. > > > > I like the idea of generalizing it a little further. I'd suggest staying a > > little closer to the existing naming here though, as we already have > > cmpxchg() for the type-agnostic version, and cmpxchg64() for the > > fixed-length 64-bit version. > > > > I think a nice interface between architecture-specific and architecture > > independent code would be to have architectures provide > > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well > > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16() > > if they have instructions for those. > > > > The common code can then build cmpxchg16()/xchg16() on top of > > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg() > > wrapper around those (or alternatively we can decide to have them > > only deal with fixed-32-bit and long/pointer sized atomics). > I think these emulation codes are suitable for some architectures but not riscv. > > We shouldn't export xchg16/cmpxchg16(emulated by lr.w/sc.w) in riscv, > We should forbid these sub-word atomic primitive and lets the > programmers consider their atomic design. Fair enough, having the generic sub-word emulation would be something that an architecture can select to use/export. -Stafford
On Wed, Mar 31, 2021 at 11:22:35PM +0800, Guo Ren wrote: > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote: > > > u32 a = 0x55aa66bb; > > > u16 *ptr = &a; > > > > > > CPU0 CPU1 > > > ========= ========= > > > xchg16(ptr, new) while(1) > > > WRITE_ONCE(*(ptr + 1), x); > > > > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock. > > > > Then I think your LL/SC is broken. > No, it's not broken LR.W/SC.W. Quote <8.3 Eventual Success of > Store-Conditional Instructions>: > > "As a consequence of the eventuality guarantee, if some harts in an > execution environment are > executing constrained LR/SC loops, and no other harts or devices in > the execution environment > execute an unconditional store or AMO to that reservation set, then at > least one hart will > eventually exit its constrained LR/SC loop. By contrast, if other > harts or devices continue to > write to that reservation set, it is not guaranteed that any hart will > exit its LR/SC loop." > > So I think it's a feature of LR/SC. How does the above code (also use > ll.w/sc.w to implement xchg16) running on arm64? > > 1: ldxr > eor > cbnz ... 2f > stxr > cbnz ... 1b // I think it would deadlock for arm64. > > "LL/SC fwd progress" which you have mentioned could guarantee stxr > success? How hardware could do that? > Actually, "old" riscv standard does provide fwd progress ;-) In https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf Section "7.2 Load-Reserved/Store-Conditional Instructions": """ One advantage of CAS is that it guarantees that some hart eventually makes progress, whereas an LR/SC atomic sequence could livelock indefinitely on some systems. To avoid this concern, we added an architectural guarantee of forward progress to LR/SC atomic sequences. The restrictions on LR/SC sequence contents allows an implementation to **capture a cache line on the LR and complete the LR/SC sequence by holding off remote cache interventions for a bounded short time**. """ The guarantee is removed later due to "Earlier versions of this specification imposed a stronger starvation-freedom guarantee. However, the weaker livelock-freedom guarantee is sufficient to implement the C11 and C++11 languages, and is substantially easier to provide in some microarchitectural styles." But I take it as an example that hardware can guarantee this. Regards, Boqun > > > > That also means you really don't want to build super complex locking > > primitives on top, because that live-lock will percolate through. > > > > Step 1 would be to get your architecute fixed such that it can provide > > fwd progress guarantees for LL/SC. Otherwise there's absolutely no point > > in building complex systems with it. > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/
On Tue, Apr 6, 2021 at 10:56 AM Stafford Horne <shorne@gmail.com> wrote: > On Tue, Apr 06, 2021 at 11:50:38AM +0800, Guo Ren wrote: > > On Wed, Mar 31, 2021 at 3:23 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote: > > > > We shouldn't export xchg16/cmpxchg16(emulated by lr.w/sc.w) in riscv, > > We should forbid these sub-word atomic primitive and lets the > > programmers consider their atomic design. > > Fair enough, having the generic sub-word emulation would be something that > an architecture can select to use/export. I still have the feeling that we should generalize and unify the exact behavior across architectures as much as possible here, while possibly also trying to simplify the interface a little. Looking through the various xchg()/cmpxchg() implementations, I find eight distinct ways to do 8-bit and 16-bit atomics: Full support: ia64, m68k (Atari only), x86, arm32 (v6k+), arm64 gcc/clang __sync_{val,bool}_compare_and_swap: s390 Emulated through ll/sc: alpha, powerpc Emulated through cmpxchg loop: mips, openrisc, xtensa (xchg but not cmpxchg), sparc64 (cmpxchg_u8, xchg_u16 but not cmpxchg_u16 and xchg_u8!) Emulated through local_irq_save (non SMP only): h8300, m68k (most), microblaze, mips, nds32, nios2 Emulated through hashed spinlock: parisc (8-bit only added in 2020, 16-bit still missing) Forced compile-time error: arm32 (v4/v5/v6 non-SMP), arc, csky, riscv, parisc (16 bit), sparc32, sparc64, xtensa (cmpxchg) Silently broken: hexagon Since there are really only a handful of instances in the kernel that use the cmpxchg() or xchg() on u8/u16 variables, it would seem best to just disallow those completely and have a separate set of functions here, with only 64-bit architectures using any variable-type wrapper to handle both 32-bit and 64-bit arguments. Interestingly, the s390 version using __sync_val_compare_and_swap() seems to produce nice output on all architectures that have atomic instructions, with any supported compiler, to the point where I think we could just use that to replace most of the inline-asm versions except for arm64: #define cmpxchg(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __o = (o); \ __typeof__(*(ptr)) __n = (n); \ (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\ }) Not how gcc's acquire/release behavior of __sync_val_compare_and_swap() relates to what the kernel wants here. The gcc documentation also recommends using the standard __atomic_compare_exchange_n() builtin instead, which would allow constructing release/acquire/relaxed versions as well, but I could not get it to produce equally good output. (possibly I was using it wrong) Arnd
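For completeness, the __atomic builtin variant Arnd refers to would look roughly like the sketch below. This only illustrates the interface; whether the compiler turns it into a sensible LL/SC loop is exactly the open question here.

        #define cmpxchg_relaxed(ptr, o, n)                                      \
        ({                                                                      \
                __typeof__(*(ptr)) __old = (o);                                 \
                __atomic_compare_exchange_n((ptr), &__old, (n), 0,              \
                                            __ATOMIC_RELAXED, __ATOMIC_RELAXED);\
                __old;  /* updated to the observed value on failure */          \
        })

Acquire/release variants would pass __ATOMIC_ACQUIRE or __ATOMIC_RELEASE as the success memory order while keeping __ATOMIC_RELAXED for the failure order.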
On Wed, Apr 07, 2021 at 01:24:12AM +0800, Boqun Feng wrote: > Actually, "old" riscv standard does provide fwd progress ;-) In > > https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf > > Section "7.2 Load-Reserved/Store-Conditional Instructions": > > """ > One advantage of CAS is that it guarantees that some hart eventually > makes progress, whereas an LR/SC atomic sequence could livelock > indefinitely on some systems. To avoid this concern, we added an This is not inherent to CAS, there's plenty of CAS implementations that are prone to livelock (unfortunately). > architectural guarantee of forward progress to LR/SC atomic sequences. > The restrictions on LR/SC sequence contents allows an implementation to > **capture a cache line on the LR and complete the LR/SC sequence by > holding off remote cache interventions for a bounded short time**. > """ > > The guarantee is removed later due to "Earlier versions of this > specification imposed a stronger starvation-freedom guarantee. However, > the weaker livelock-freedom guarantee is sufficient to implement the C11 > and C++11 languages, and is substantially easier to provide in some > microarchitectural styles." Pff, that's just stupid. I suppose the best you can say of the C11 memory model is that it's nice that the C committee realized they needed to catch up to the every day reality of SMP systems (which people have been writing 'C' for ever since the 70s, since SMP is older than C itself). There's so much wrong with the C11 memory model and atomics, using it to define an architecture is just backwards. C11 cannot do RCU, C11 cannot do seqlocks, C11 cannot do deterministic locks, C11 cannot do real systems. C11 is garbage. Yes, architectures need to support C11, because sadly C11 exists and people will use it. But architectures also need to be able to do real systems and thus should not limit themselves to C11.
On Tue, Apr 06, 2021 at 09:15:50AM +0200, Peter Zijlstra wrote: > Anyway, given you have such a crap architecture (and here I thought > RISC-V was supposed to be a modern design *sigh*), you had better go > look at the sparc64 atomic implementation which has a software backoff > for failed CAS in order to make fwd progress. It wasn't supposed to be modern. It was supposed to use boring old ideas. Where it actually did that it is a great ISA, in parts where academics actually tried to come up with cool or state of the art ideas (interrupt handling, tlb shootdowns, the totally fucked up memory model) it turned into a trainwreck.
On Wed, Apr 07, 2021 at 10:42:50AM +0200, Arnd Bergmann wrote: > Since there are really only a handful of instances in the kernel > that use the cmpxchg() or xchg() on u8/u16 variables, it would seem > best to just disallow those completely Not going to happen. xchg16 is optimal for qspinlock and if we replace that with a cmpxchg loop on x86 we're regressing. > Interestingly, the s390 version using __sync_val_compare_and_swap() > seems to produce nice output on all architectures that have atomic > instructions, with any supported compiler, to the point where I think > we could just use that to replace most of the inline-asm versions except > for arm64: > > #define cmpxchg(ptr, o, n) \ > ({ \ > __typeof__(*(ptr)) __o = (o); \ > __typeof__(*(ptr)) __n = (n); \ > (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\ > }) It generates the LL/SC loop, but doesn't do sensible optimizations when it's again used in a loop itself. That is, it generates a loop of a loop, just like what you'd expect, which is sub-optimal for LL/SC. > Not how gcc's acquire/release behavior of __sync_val_compare_and_swap() > relates to what the kernel wants here. > > The gcc documentation also recommends using the standard > __atomic_compare_exchange_n() builtin instead, which would allow > constructing release/acquire/relaxed versions as well, but I could not > get it to produce equally good output. (possibly I was using it wrong) I'm scared to death of the C11 crap, the compiler will 'optimize' them when it feels like it and use the C11 memory model rules for it, which are not compatible with the kernel rules. But the same thing applies, it won't do the right thing for composites.
On Wed, Apr 7, 2021 at 1:36 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Wed, Apr 07, 2021 at 10:42:50AM +0200, Arnd Bergmann wrote: > > Since there are really only a handful of instances in the kernel > > that use the cmpxchg() or xchg() on u8/u16 variables, it would seem > > best to just disallow those completely > > Not going to happen. xchg16 is optimal for qspinlock and if we replace > that with a cmpxchg loop on x86 we're regressing. I did not mean to remove the possibility of doing a 16-bit compare-exchange operation altogether, just removing it from the regular macros. We already do this for the 64-bit xchg64()/cmpxchg64(), which only some of the 32-bit architectures provide, so I think having an explicit xchg8()/xchg16()/cmpxchg8()/cmpxchg16() interface while tightening the type checking would be more consistent here. On any 32-bit architecture, cmpxchg()/xchg() macros then only have to deal with word-sized operations. > > Interestingly, the s390 version using __sync_val_compare_and_swap() > > seems to produce nice output on all architectures that have atomic > > instructions, with any supported compiler, to the point where I think > > we could just use that to replace most of the inline-asm versions except > > for arm64: > > > > #define cmpxchg(ptr, o, n) \ > > ({ \ > > __typeof__(*(ptr)) __o = (o); \ > > __typeof__(*(ptr)) __n = (n); \ > > (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\ > > }) > > It generates the LL/SC loop, but doesn't do sensible optimizations when > it's again used in a loop itself. That is, it generates a loop of a > loop, just like what you'd expect, which is sub-optimal for LL/SC. One thing that it does though is to generate an ll/sc loop for xchg16(), while mips, openrisc, xtensa and sparc64 currently open-code the nested loop in their respective xchg16() wrappers. I have not seen any case in which it produces object code that is worse than the architecture specific code does today, except for those that rely on runtime patching (i486+smp, arm64+lse). > > Not how gcc's acquire/release behavior of __sync_val_compare_and_swap() > > relates to what the kernel wants here. > > > > The gcc documentation also recommends using the standard > > __atomic_compare_exchange_n() builtin instead, which would allow > > constructing release/acquire/relaxed versions as well, but I could not > > get it to produce equally good output. (possibly I was using it wrong) > > I'm scared to death of the C11 crap, the compiler will 'optimize' them > when it feels like it and use the C11 memory model rules for it, which > are not compatible with the kernel rules. > > But the same thing applies, it won't do the right thing for composites. Makes sense. As I said, I could not even get it to produce optimal code for the simple case. Arnd
On Wed, Apr 07, 2021 at 01:36:45PM +0200, Peter Zijlstra wrote: > On Wed, Apr 07, 2021 at 10:42:50AM +0200, Arnd Bergmann wrote: > > Since there are really only a handful of instances in the kernel > > that use the cmpxchg() or xchg() on u8/u16 variables, it would seem > > best to just disallow those completely > > Not going to happen. xchg16 is optimal for qspinlock and if we replace > that with a cmpxchg loop on x86 we're regressing. > > > Interestingly, the s390 version using __sync_val_compare_and_swap() > > seems to produce nice output on all architectures that have atomic > > instructions, with any supported compiler, to the point where I think > > we could just use that to replace most of the inline-asm versions except > > for arm64: > > > > #define cmpxchg(ptr, o, n) \ > > ({ \ > > __typeof__(*(ptr)) __o = (o); \ > > __typeof__(*(ptr)) __n = (n); \ > > (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\ > > }) > > It generates the LL/SC loop, but doesn't do sensible optimizations when > it's again used in a loop itself. That is, it generates a loop of a > loop, just like what you'd expect, which is sub-optimal for LL/SC. > > > Not how gcc's acquire/release behavior of __sync_val_compare_and_swap() > > relates to what the kernel wants here. > > > > The gcc documentation also recommends using the standard > > __atomic_compare_exchange_n() builtin instead, which would allow > > constructing release/acquire/relaxed versions as well, but I could not > > get it to produce equally good output. (possibly I was using it wrong) > > I'm scared to death of the C11 crap, the compiler will 'optimize' them > when it feels like it and use the C11 memory model rules for it, which > are not compatible with the kernel rules. > > But the same thing applies, it won't do the right thing for composites. See the mess it makes: https://godbolt.org/z/r7d13d4Kf That should've been something like: __xadd: mov r3, r0 dmb ish .L1: ldrex r0, [r3] adds r1, r0 strex ip, r0, [r3] cmp ip, #0 bne .L1 dmb ish Which I'll argue has stronger guarantees than the double loop. I'm pretty sure branches can release the lock on some archs (Alpha is known to do this).
On Wed, Apr 7, 2021 at 11:43 AM Christoph Hellwig <hch@infradead.org> wrote: > > On Tue, Apr 06, 2021 at 09:15:50AM +0200, Peter Zijlstra wrote: > > Anyway, given you have such a crap architecture (and here I thought > > RISC-V was supposed to be a modern design *sigh*), you had better go > > look at the sparc64 atomic implementation which has a software backoff > > for failed CAS in order to make fwd progress. > > It wasn't supposed to be modern. It was supposed to use boring old > ideas. Where it actually did that it is a great ISA, in parts where > academics actually tried to come up with cool or state of the art > ideas (interrupt handling, tlb shootdowns, the totally fucked up > memory model) it turned into a trainwreck. Gentlemen, please rethink your wording. RISC-V is neither "crap" nor a "trainwreck", regardless of whether you like it or not. The comparison with sparc64 is not applicable, as sparc64 does not have LL/SC instructions. Further, it is not the case that RISC-V has no guarantees at all. It just does not provide a forward progress guarantee for a synchronization implementation that writes in an endless loop to a memory location while trying to complete an LL/SC loop on the same memory location at the same time. If there's a reasonable algorithm that relies on forward progress in this case, then we should indeed think about that, but I haven't seen one so far. The whole MCS lock idea is to actually spin on different memory locations per CPU to improve scalability (reduce cacheline bouncing). That's a clear indicator that well-scaling synchronization algorithms need to avoid contended cache lines anyway. RISC-V defines LR/SC loops consisting of up to 16 instructions as constrained LR/SC loops. Such constrained LR/SC loops provide the required forward guarantees that are expected (similar to what other architectures, like AArch64, have). What RISC-V does not have is sub-word atomics, and if required, we would have to implement them as LL/SC sequences. And yes, using atomic instructions is preferred over using LL/SC, because atomics will tend to perform better (fewer instructions and fewer spilled registers). But that ultimately depends on the actual ISA implementation. Respectfully, Christoph
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> Gentlemen, please rethink your wording.
> RISC-V is neither "crap" nor a "trainwreck", regardless of whether you
> like it or not.

No, by all objective standards the RISC-V memory model and privileged
architecture are a trainwreck. Anyone who disagrees is way out there in
the sky.
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> Further, it is not the case that RISC-V has no guarantees at all.
> It just does not provide a forward progress guarantee for a
> synchronization implementation that writes in an endless loop to a
> memory location while trying to complete an LL/SC loop on the same
> memory location at the same time.

Userspace can DoS the kernel that way, see futex.
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> The comparison with sparc64 is not applicable, as sparc64 does not
> have LL/SC instructions.

Sparc64 has CAS, without a hardware forward-progress guarantee. It has
to do a software backoff after a failed CAS in order to make forward
progress in software.
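As a concrete illustration, a CAS-based atomic with software backoff looks roughly like the sketch below. This is a hedged sketch using the GCC __atomic builtins; the function name, the crude delay loop and the backoff cap are made up, and it is not the actual sparc64 implementation, which does the backoff in assembly.

#include <stdint.h>

/* Fetch-and-add built from CAS, with exponential backoff on failure. */
static inline uint32_t fetch_add_with_backoff(uint32_t *p, uint32_t inc)
{
    uint32_t old = __atomic_load_n(p, __ATOMIC_RELAXED);
    unsigned int backoff = 1;

    while (!__atomic_compare_exchange_n(p, &old, old + inc, 0,
                                        __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
        /*
         * Lost the race; 'old' now holds the current value.  Delay
         * before retrying so contending CPUs spread out and someone
         * eventually gets through.
         */
        for (volatile unsigned int i = 0; i < backoff; i++)
            ;
        if (backoff < 4096)
            backoff <<= 1;
    }
    return old;
}

The same idea applies to retrying a failed SC in an LL/SC loop.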
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> RISC-V defines LR/SC loops consisting of up to 16 instructions as
> constrained LR/SC loops. Such constrained LR/SC loops provide the
> required forward-progress guarantees that are expected (similar to what
> other architectures, like AArch64, have).

The text quoted by others didn't seem to say such a thing, but whatever.

> What RISC-V does not have is sub-word atomics, and if required, we
> would have to implement them as LL/SC sequences. And yes, using atomic
> instructions is preferred over using LL/SC,

(pseudo asm, can't be bothered to figure out the actual syntax)

	# setup r_and_mask, r_or_mask

.L1:
	LL	r, [word]
	AND	r, r, r_and_mask
	OR	r, r, r_or_mask
	SC	r, [word]
	JNE	.L1

is what you need for LL/SC based xchg16; that's fewer than 16
instructions. If RISC-V guarantees forward progress on that, good, write
it like that and let's end this thread.

The fact that this is apparently hard is not good.
On Wed, Apr 07, 2021 at 05:51:07PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> > Further, it is not the case that RISC-V has no guarantees at all.
> > It just does not provide a forward progress guarantee for a
> > synchronization implementation that writes in an endless loop to a
> > memory location while trying to complete an LL/SC loop on the same
> > memory location at the same time.
>
> Userspace can DoS the kernel that way, see futex.

The longer answer is that this means you cannot share locks (or any
atomic, really) across a trust boundary, which is of course exactly what
futex does.
On Wed, Apr 07, 2021 at 05:52:36PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> > The comparison with sparc64 is not applicable, as sparc64 does not
> > have LL/SC instructions.
>
> Sparc64 has CAS, without a hardware forward-progress guarantee. It has
> to do a software backoff after a failed CAS in order to make forward
> progress in software.

Again, the longer answer is that this applies identically to LL/SC, and
I'm sure we actually have (or had) an architecture in tree that did just
that. I just cannot remember which architecture it was.
On Wed, Apr 7, 2021 at 6:00 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> > RISC-V defines LR/SC loops consisting of up to 16 instructions as
> > constrained LR/SC loops. Such constrained LR/SC loops provide the
> > required forward-progress guarantees that are expected (similar to
> > what other architectures, like AArch64, have).
>
> The text quoted by others didn't seem to say such a thing, but whatever.

The RISC-V unprivileged spec is public and can be found here:
https://riscv.org/technical/specifications/

Version 20191213 discusses LR/SC loops in section 8.3 (page 51).
So in case you are interested in the exact wording, you can find it there.

> > What RISC-V does not have is sub-word atomics, and if required, we
> > would have to implement them as LL/SC sequences. And yes, using atomic
> > instructions is preferred over using LL/SC,
>
> (pseudo asm, can't be bothered to figure out the actual syntax)
>
> 	# setup r_and_mask, r_or_mask
>
> .L1:
> 	LL	r, [word]
> 	AND	r, r, r_and_mask
> 	OR	r, r, r_or_mask
> 	SC	r, [word]
> 	JNE	.L1

I fully agree with this.
I implemented a patch for that two weeks ago using the following helper:

+/*
+ * Mask and set given bits at a given address atomically.
+ * The masked old value will be returned.
+ */
+static inline u32 atomic_mask_and_set(u32 *p, u32 mask, u32 val)
+{
+	u32 ret, tmp;
+
+	__asm__ __volatile__ (
+		"0:	lr.w	%0, %2\n"
+		"	and	%0, %0, %3\n"
+		"	or	%1, %0, %4\n"
+		"	sc.w	%1, %1, %2\n"
+		"	bnez	%1, 0b\n"
+		: "+&r" (ret), "=&r" (tmp), "+A" (*p)
+		: "r" (mask), "rJ" (val)
+		: "memory");
+	return ret;
+}

However, Guo pushed out a new patchset in the meantime, and I decided not
to continue with my approach so as not to undermine his. I will sync up
with Guo to provide a common patchset.

Thanks,
Christoph

> is what you need for LL/SC based xchg16; that's fewer than 16
> instructions. If RISC-V guarantees forward progress on that, good,
> write it like that and let's end this thread.
>
> The fact that this is apparently hard is not good.
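A sub-word exchange can be built from the same constrained-LR/SC pattern, but note that atomic_mask_and_set() above hands back the masked old value, whereas an xchg16()-style helper needs the previous halfword and therefore wants the unmasked old word. A hedged sketch along those lines follows; the function name, the word/shift calling convention and the stdint types are illustrative assumptions, not code from this thread.

#include <stdint.h>

/*
 * Illustrative only: 16-bit exchange as a constrained LR/SC loop on the
 * naturally aligned 32-bit word containing the halfword; 'shift' is 0 or 16.
 */
static inline uint16_t xchg16_lr_sc(volatile uint32_t *word, unsigned int shift,
                                    uint16_t newval)
{
	uint32_t old, tmp;
	uint32_t keep = ~(0xffffu << shift);		/* bits to preserve */
	uint32_t set  = (uint32_t)newval << shift;	/* bits to install  */

	__asm__ __volatile__ (
		"0:	lr.w	%0, %2\n"	/* old = *word          */
		"	and	%1, %0, %3\n"	/* tmp = old & keep     */
		"	or	%1, %1, %4\n"	/* tmp |= new halfword  */
		"	sc.w	%1, %1, %2\n"	/* try to store tmp     */
		"	bnez	%1, 0b\n"	/* retry if SC failed   */
		: "=&r" (old), "=&r" (tmp), "+A" (*word)
		: "r" (keep), "rJ" (set)
		: "memory");

	return (uint16_t)((old >> shift) & 0xffffu);
}

The loop body is five instructions, comfortably inside the 16-instruction constrained-LR/SC window quoted above; acquire/release ordering (lr.w.aq/sc.w.rl or explicit fences) would still have to be chosen to match whatever semantics the caller needs.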