
[v4,3/4] locking/qspinlock: Add ARCH_USE_QUEUED_SPINLOCKS_XCHG32

Message ID 1616868399-82848-4-git-send-email-guoren@kernel.org (mailing list archive)
State New, archived
Series riscv: Add qspinlock/qrwlock

Commit Message

Guo Ren March 27, 2021, 6:06 p.m. UTC
From: Guo Ren <guoren@linux.alibaba.com>

Some architectures don't have a sub-word swap atomic instruction;
they only have a full-word one.

The sub-word swap only improves performance when NR_CPUS < 16K,
where the lock word layout is:
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)

Bits 9-15 are wasted in order to use xchg16 in xchg_tail.

Let the architecture select xchg16 or xchg32 to implement
xchg_tail.
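
For reference, an architecture that selects ARCH_USE_QUEUED_SPINLOCKS_XCHG32
falls back to the existing cmpxchg-based xchg_tail() in qspinlock.c, which is
roughly:

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 old, new, val = atomic_read(&lock->val);

	for (;;) {
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}
	return old;
}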

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Anup Patel <anup@brainfault.org>
---
 kernel/Kconfig.locks       |  3 +++
 kernel/locking/qspinlock.c | 44 +++++++++++++++++++++-----------------
 2 files changed, 27 insertions(+), 20 deletions(-)

Comments

Waiman Long March 27, 2021, 6:43 p.m. UTC | #1
On 3/27/21 2:06 PM, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
>
> Some architectures don't have sub-word swap atomic instruction,
> they only have the full word's one.
>
> The sub-word swap only improve the performance when:
> NR_CPUS < 16K
>   *  0- 7: locked byte
>   *     8: pending
>   *  9-15: not used
>   * 16-17: tail index
>   * 18-31: tail cpu (+1)
>
> The 9-15 bits are wasted to use xchg16 in xchg_tail.
>
> Please let architecture select xchg16/xchg32 to implement
> xchg_tail.
>
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Will Deacon <will@kernel.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Anup Patel <anup@brainfault.org>
> ---
>   kernel/Kconfig.locks       |  3 +++
>   kernel/locking/qspinlock.c | 44 +++++++++++++++++++++-----------------
>   2 files changed, 27 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 3de8fd11873b..d02f1261f73f 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -239,6 +239,9 @@ config LOCK_SPIN_ON_OWNER
>   config ARCH_USE_QUEUED_SPINLOCKS
>   	bool
>   
> +config ARCH_USE_QUEUED_SPINLOCKS_XCHG32
> +	bool
> +
>   config QUEUED_SPINLOCKS
>   	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
>   	depends on SMP
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index cbff6ba53d56..54de0632c6a8 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -163,26 +163,6 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
>   	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
>   }
>   
> -/*
> - * xchg_tail - Put in the new queue tail code word & retrieve previous one
> - * @lock : Pointer to queued spinlock structure
> - * @tail : The new queue tail code word
> - * Return: The previous queue tail code word
> - *
> - * xchg(lock, tail), which heads an address dependency
> - *
> - * p,*,* -> n,*,* ; prev = xchg(lock, node)
> - */
> -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> -{
> -	/*
> -	 * We can use relaxed semantics since the caller ensures that the
> -	 * MCS node is properly initialized before updating the tail.
> -	 */
> -	return (u32)xchg_relaxed(&lock->tail,
> -				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> -}
> -
>   #else /* _Q_PENDING_BITS == 8 */
>   
>   /**
> @@ -206,6 +186,30 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
>   {
>   	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
>   }
> +#endif
> +
> +#if _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32)
> +/*
> + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> + * @lock : Pointer to queued spinlock structure
> + * @tail : The new queue tail code word
> + * Return: The previous queue tail code word
> + *
> + * xchg(lock, tail), which heads an address dependency
> + *
> + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> + */
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> +	/*
> +	 * We can use relaxed semantics since the caller ensures that the
> +	 * MCS node is properly initialized before updating the tail.
> +	 */
> +	return (u32)xchg_relaxed(&lock->tail,
> +				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> +}
> +
> +#else
>   
>   /**
>    * xchg_tail - Put in the new queue tail code word & retrieve previous one

I don't have any problem adding a 
CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32 config option to control that.

One minor nit:

#endif /* _Q_PENDING_BITS == 8 */

You should probably remove the comment at the trailing end of the 
corresponding "#endif" as it is now wrong.
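
For example, the trailing comment would either be dropped, leaving a bare
"#endif", or be updated to match the new condition:

	#endif /* _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32) */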

Cheers,
Longman
Guo Ren March 28, 2021, 1:48 a.m. UTC | #2
On Sun, Mar 28, 2021 at 2:43 AM Waiman Long <longman@redhat.com> wrote:
>
> On 3/27/21 2:06 PM, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > Some architectures don't have sub-word swap atomic instruction,
> > they only have the full word's one.
> >
> > The sub-word swap only improve the performance when:
> > NR_CPUS < 16K
> >   *  0- 7: locked byte
> >   *     8: pending
> >   *  9-15: not used
> >   * 16-17: tail index
> >   * 18-31: tail cpu (+1)
> >
> > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> >
> > Please let architecture select xchg16/xchg32 to implement
> > xchg_tail.
> >
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Waiman Long <longman@redhat.com>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > Cc: Anup Patel <anup@brainfault.org>
> > ---
> >   kernel/Kconfig.locks       |  3 +++
> >   kernel/locking/qspinlock.c | 44 +++++++++++++++++++++-----------------
> >   2 files changed, 27 insertions(+), 20 deletions(-)
> >
> > diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> > index 3de8fd11873b..d02f1261f73f 100644
> > --- a/kernel/Kconfig.locks
> > +++ b/kernel/Kconfig.locks
> > @@ -239,6 +239,9 @@ config LOCK_SPIN_ON_OWNER
> >   config ARCH_USE_QUEUED_SPINLOCKS
> >       bool
> >
> > +config ARCH_USE_QUEUED_SPINLOCKS_XCHG32
> > +     bool
> > +
> >   config QUEUED_SPINLOCKS
> >       def_bool y if ARCH_USE_QUEUED_SPINLOCKS
> >       depends on SMP
> > diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> > index cbff6ba53d56..54de0632c6a8 100644
> > --- a/kernel/locking/qspinlock.c
> > +++ b/kernel/locking/qspinlock.c
> > @@ -163,26 +163,6 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
> >       WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
> >   }
> >
> > -/*
> > - * xchg_tail - Put in the new queue tail code word & retrieve previous one
> > - * @lock : Pointer to queued spinlock structure
> > - * @tail : The new queue tail code word
> > - * Return: The previous queue tail code word
> > - *
> > - * xchg(lock, tail), which heads an address dependency
> > - *
> > - * p,*,* -> n,*,* ; prev = xchg(lock, node)
> > - */
> > -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > -{
> > -     /*
> > -      * We can use relaxed semantics since the caller ensures that the
> > -      * MCS node is properly initialized before updating the tail.
> > -      */
> > -     return (u32)xchg_relaxed(&lock->tail,
> > -                              tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> > -}
> > -
> >   #else /* _Q_PENDING_BITS == 8 */
> >
> >   /**
> > @@ -206,6 +186,30 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
> >   {
> >       atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
> >   }
> > +#endif
> > +
> > +#if _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32)
> > +/*
> > + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> > + * @lock : Pointer to queued spinlock structure
> > + * @tail : The new queue tail code word
> > + * Return: The previous queue tail code word
> > + *
> > + * xchg(lock, tail), which heads an address dependency
> > + *
> > + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> > + */
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +     /*
> > +      * We can use relaxed semantics since the caller ensures that the
> > +      * MCS node is properly initialized before updating the tail.
> > +      */
> > +     return (u32)xchg_relaxed(&lock->tail,
> > +                              tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> > +}
> > +
> > +#else
> >
> >   /**
> >    * xchg_tail - Put in the new queue tail code word & retrieve previous one
>
> I don't have any problem adding a
> CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32 config option to control that.
Thx

>
> One minor nit:
>
> #endif /* _Q_PENDING_BITS == 8 */
>
> You should probably remove the comment at the trailing end of the
> corresponding "#endif" as it is now wrong.
I'll fix it in the next patch.
Peter Zijlstra March 29, 2021, 7:50 a.m. UTC | #3
On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> Some architectures don't have sub-word swap atomic instruction,
> they only have the full word's one.
> 
> The sub-word swap only improve the performance when:
> NR_CPUS < 16K
>  *  0- 7: locked byte
>  *     8: pending
>  *  9-15: not used
>  * 16-17: tail index
>  * 18-31: tail cpu (+1)
> 
> The 9-15 bits are wasted to use xchg16 in xchg_tail.
> 
> Please let architecture select xchg16/xchg32 to implement
> xchg_tail.

So I really don't like this, this pushes complexity into the generic
code for something that's really not needed.

Lots of RISC already implement sub-word atomics using word ll/sc.
Obviously they're not sharing code like they should be :/ See for
example arch/mips/kernel/cmpxchg.c.
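
For illustration only, the MIPS-style approach builds a sub-word exchange on
top of a full-word cmpxchg() along these lines (a simplified sketch with a
made-up helper name, not the actual arch/mips/kernel/cmpxchg.c code; it
assumes a little-endian layout):

/* Sketch: emulate a 16-bit xchg with a full-word cmpxchg() loop. */
static inline u16 xchg16_emulated(u16 *ptr, u16 new)
{
	u32 *ptr32 = (u32 *)((unsigned long)ptr & ~0x3UL);
	unsigned int shift = ((unsigned long)ptr & 0x2) * 8;	/* little-endian */
	u32 mask = 0xffffU << shift;
	u32 old, tmp;

	do {
		old = READ_ONCE(*ptr32);
		tmp = (old & ~mask) | ((u32)new << shift);
	} while (cmpxchg(ptr32, old, tmp) != old);

	return (old & mask) >> shift;
}

On an LL/SC architecture whose cmpxchg() is itself an LR/SC loop, this is the
nested pair of loops being criticised here.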

Also, I really do think doing ticket locks first is a far more sensible
step.
Arnd Bergmann March 29, 2021, 9:41 a.m. UTC | #4
On Mon, Mar 29, 2021 at 9:52 AM Peter Zijlstra <peterz@infradead.org> wrote:
> On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > Some architectures don't have sub-word swap atomic instruction,
> > they only have the full word's one.
> >
> > The sub-word swap only improve the performance when:
> > NR_CPUS < 16K
> >  *  0- 7: locked byte
> >  *     8: pending
> >  *  9-15: not used
> >  * 16-17: tail index
> >  * 18-31: tail cpu (+1)
> >
> > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> >
> > Please let architecture select xchg16/xchg32 to implement
> > xchg_tail.
>
> So I really don't like this, this pushes complexity into the generic
> code for something that's really not needed.
>
> Lots of RISC already implement sub-word atomics using word ll/sc.
> Obviously they're not sharing code like they should be :/ See for
> example arch/mips/kernel/cmpxchg.c.

That is what the previous version of the patch set did, right?

I think this v4 is nicer because the code is already there in
qspinlock.c and just gets moved around, and the implementation
is likely more efficient this way. The mips version could be made
more generic, but it is also less efficient than a simple xchg
since it requires an indirect function call plus nesting a pair of
loops in place of the single ll/sc loop in the 32-bit xchg.

I think the weakly typed xchg/cmpxchg implementation causes
more problems than it solves, and we'd be better off using
a stronger version in general, with the 8-bit and 16-bit exchanges
using separate helpers in the same way that the fixed-length
cmpxchg64 is separate already; there are only a couple of instances
of each of these in the kernel.

Unfortunately, there is roughly a 50:50 split between fixed 32-bit
and long/pointer-sized xchg/cmpxchg users in the kernel, so
making the interface completely fixed-type would add a ton of
churn. I created an experimental patch for this, but it's probably
not worth it.

> Also, I really do think doing ticket locks first is a far more sensible
> step.

I think this is the important point to look at first. From what I can
tell, most users of ticket spinlocks have moved to qspinlock over
time, but I'm not sure about the exact tradeoff, in particular on
large machines without a 16-bit xchg operation.

FWIW, the implementations in the other SMP capable
architectures are

alpha    simple
arc      simple+custom
arm      ticket
arm64    qspinlock (formerly ticket)
csky     ticket/qrwlock (formerly simple+ticket/qrwlock)
hexagon  simple
ia64     ticket
mips     qspinlock (formerly ticket)
openrisc qspinlock
parisc   custom
powerpc  simple+qspinlock (formerly simple)
riscv    simple
s390     custom-queue
sh       simple
sparc    qspinlock
x86      qspinlock (formerly ticket+qspinlock)
xtensa   qspinlock

32-bit Arm is the only relevant user of ticket locks these days, but its
hardware has a practical limit of 16 CPUs and four nodes, with most
implementations having a single CPU cluster of at most four cores.

We had the same discussion about xchg16() when Sebastian submitted
the arm32 qspinlock implementation:
https://lore.kernel.org/linux-arm-kernel/20191007214439.27891-1-sebastian@breakpoint.cc/

        Arnd
Peter Zijlstra March 29, 2021, 11:16 a.m. UTC | #5
On Mon, Mar 29, 2021 at 11:41:19AM +0200, Arnd Bergmann wrote:
> On Mon, Mar 29, 2021 at 9:52 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote:
> > > From: Guo Ren <guoren@linux.alibaba.com>
> > >
> > > Some architectures don't have sub-word swap atomic instruction,
> > > they only have the full word's one.
> > >
> > > The sub-word swap only improve the performance when:
> > > NR_CPUS < 16K
> > >  *  0- 7: locked byte
> > >  *     8: pending
> > >  *  9-15: not used
> > >  * 16-17: tail index
> > >  * 18-31: tail cpu (+1)
> > >
> > > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> > >
> > > Please let architecture select xchg16/xchg32 to implement
> > > xchg_tail.
> >
> > So I really don't like this, this pushes complexity into the generic
> > code for something that's really not needed.
> >
> > Lots of RISC already implement sub-word atomics using word ll/sc.
> > Obviously they're not sharing code like they should be :/ See for
> > example arch/mips/kernel/cmpxchg.c.
> 
> That is what the previous version of the patch set did, right?
> 
> I think this v4 is nicer because the code is already there in
> qspinlock.c and just gets moved around, and the implementation
> is likely more efficient this way. The mips version could be made
> more generic, but it is also less efficient than a simple xchg
> since it requires an indirect function call plus nesting a pair of
> loops instead in place of the single single ll/sc loop in the 32-bit
> xchg.
> 
> I think the weakly typed xchg/cmpxchg implementation causes
> more problems than it solves, and we'd be better off using
> a stronger version in general, with the 8-bit and 16-bit exchanges
> using separate helpers in the same way that the fixed-length
> cmpxchg64 is separate already, there are only a couple of instances
> for each of these in the kernel.
> 
> Unfortunately, there is roughly a 50:50 split between fixed 32-bit
> and long/pointer-sized xchg/cmpxchg users in the kernel, so
> making the interface completely fixed-type would add a ton of
> churn. I created an experimental patch for this, but it's probably
> not worth it.

The mips code is pretty horrible. Using a cmpxchg loop on an ll/sc arch
is just daft. And that's exactly what the generic xchg_tail() thing does
too.

A single LL/SC loop that sets either the upper or lower 16 bits of the
word is always better.
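
For illustration, such a single-loop xchg16 might look roughly like this on
RISC-V (a sketch only, with a made-up helper name, assuming a little-endian
layout; whether it can make forward progress against a stream of stores to
the other half of the word is exactly what is debated below):

static inline u16 xchg16_llsc(u16 *ptr, u16 new)
{
	u32 *p = (u32 *)((unsigned long)ptr & ~0x3UL);
	unsigned int shift = ((unsigned long)ptr & 0x2) * 8;
	u32 mask = 0xffffU << shift;
	u32 newv = (u32)new << shift;
	u32 old, tmp;

	__asm__ __volatile__ (
	"0:	lr.w	%0, %2\n"	/* load-reserve the whole word */
	"	and	%1, %0, %3\n"	/* clear the 16-bit slot */
	"	or	%1, %1, %4\n"	/* insert the new halfword */
	"	sc.w	%1, %1, %2\n"	/* store-conditional, 0 on success */
	"	bnez	%1, 0b\n"
	: "=&r" (old), "=&r" (tmp), "+A" (*p)
	: "r" (~mask), "r" (newv)
	: "memory");

	return (old & mask) >> shift;
}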

Anyway, an additional 'funny' is that I suspect you cannot prove fwd
progress of the entire primitive with any of this on. But who cares
about details anyway.. :/

And the whole WFE optimization that was relevant for the ticket lock, is
_still_ relevant for qspinlock, except that seems to have gone missing
again.

I just don't have much confidence here that people actually understand
what they're doing or why.
Guo Ren March 29, 2021, 11:19 a.m. UTC | #6
On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > Some architectures don't have sub-word swap atomic instruction,
> > they only have the full word's one.
> >
> > The sub-word swap only improve the performance when:
> > NR_CPUS < 16K
> >  *  0- 7: locked byte
> >  *     8: pending
> >  *  9-15: not used
> >  * 16-17: tail index
> >  * 18-31: tail cpu (+1)
> >
> > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> >
> > Please let architecture select xchg16/xchg32 to implement
> > xchg_tail.
>
> So I really don't like this, this pushes complexity into the generic
> code for something that's really not needed.
>
> Lots of RISC already implement sub-word atomics using word ll/sc.
> Obviously they're not sharing code like they should be :/ See for
> example arch/mips/kernel/cmpxchg.c.
I see, we've done two versions of this:
 - using cmpxchg code from MIPS, by Michael
 - a rewrite with assembly code, by Guo

But implementing xchg16 with full-word atomic xchg instructions carries a
semantic risk for atomic operations.

I don't think exporting xchg16 on a machine without sub-word atomics is correct.

>
> Also, I really do think doing ticket locks first is a far more sensible
> step.
NACK by Anup

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Peter Zijlstra March 29, 2021, 11:26 a.m. UTC | #7
On Mon, Mar 29, 2021 at 07:19:29PM +0800, Guo Ren wrote:
> On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote:
> > > From: Guo Ren <guoren@linux.alibaba.com>
> > >
> > > Some architectures don't have sub-word swap atomic instruction,
> > > they only have the full word's one.
> > >
> > > The sub-word swap only improve the performance when:
> > > NR_CPUS < 16K
> > >  *  0- 7: locked byte
> > >  *     8: pending
> > >  *  9-15: not used
> > >  * 16-17: tail index
> > >  * 18-31: tail cpu (+1)
> > >
> > > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> > >
> > > Please let architecture select xchg16/xchg32 to implement
> > > xchg_tail.
> >
> > So I really don't like this, this pushes complexity into the generic
> > code for something that's really not needed.
> >
> > Lots of RISC already implement sub-word atomics using word ll/sc.
> > Obviously they're not sharing code like they should be :/ See for
> > example arch/mips/kernel/cmpxchg.c.
> I see, we've done two versions of this:
>  - Using cmpxchg codes from MIPS by Michael
>  - Re-write with assembly codes by Guo
> 
> But using the full-word atomic xchg instructions implement xchg16 has
> the semantic risk for atomic operations.

What? -ENOPARSE

> > Also, I really do think doing ticket locks first is a far more sensible
> > step.
> NACK by Anup

Who's he when he's not sending NAKs ?
Peter Zijlstra March 29, 2021, 11:29 a.m. UTC | #8
On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote:
> Anyway, an additional 'funny' is that I suspect you cannot prove fwd
> progress of the entire primitive with any of this on. But who cares
> about details anyway.. :/

What's the architectural guarantee on LL/SC progress for RISC-V ? And
what if you double loop it like cmpxchg() ?
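
For illustration, "double loop" here means nesting the caller's retry loop
around a cmpxchg() that is itself an LR/SC retry loop; a sketch with made-up
helper names:

/* Inner loop: cmpxchg built from LR/SC, in the style of the RISC-V port. */
static inline u32 my_cmpxchg32(u32 *p, u32 old, u32 new)
{
	u32 ret, rc;

	__asm__ __volatile__ (
	"0:	lr.w	%0, %2\n"
	"	bne	%0, %3, 1f\n"
	"	sc.w	%1, %4, %2\n"
	"	bnez	%1, 0b\n"	/* inner retry on lost reservation */
	"1:\n"
	: "=&r" (ret), "=&r" (rc), "+A" (*p)
	: "r" (old), "r" (new)
	: "memory");

	return ret;
}

/* Outer loop: the caller retries the whole cmpxchg until it wins. */
static inline u32 set_bits_loop(u32 *p, u32 keep_mask, u32 set)
{
	u32 old, new, val = READ_ONCE(*p);

	for (;;) {			/* outer retry */
		new = (val & keep_mask) | set;
		old = my_cmpxchg32(p, val, new);
		if (old == val)
			break;
		val = old;
	}
	return old;
}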
Guo Ren March 29, 2021, 12:01 p.m. UTC | #9
On Mon, Mar 29, 2021 at 7:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Mar 29, 2021 at 07:19:29PM +0800, Guo Ren wrote:
> > On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote:
> > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > >
> > > > Some architectures don't have sub-word swap atomic instruction,
> > > > they only have the full word's one.
> > > >
> > > > The sub-word swap only improve the performance when:
> > > > NR_CPUS < 16K
> > > >  *  0- 7: locked byte
> > > >  *     8: pending
> > > >  *  9-15: not used
> > > >  * 16-17: tail index
> > > >  * 18-31: tail cpu (+1)
> > > >
> > > > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> > > >
> > > > Please let architecture select xchg16/xchg32 to implement
> > > > xchg_tail.
> > >
> > > So I really don't like this, this pushes complexity into the generic
> > > code for something that's really not needed.
> > >
> > > Lots of RISC already implement sub-word atomics using word ll/sc.
> > > Obviously they're not sharing code like they should be :/ See for
> > > example arch/mips/kernel/cmpxchg.c.
> > I see, we've done two versions of this:
> >  - Using cmpxchg codes from MIPS by Michael
> >  - Re-write with assembly codes by Guo
> >
> > But using the full-word atomic xchg instructions implement xchg16 has
> > the semantic risk for atomic operations.
>
> What? -ENOPARSE

u32 a = 0x55aa66bb;
u16 *ptr = (u16 *)&a;

CPU0                     CPU1
=========                =========
xchg16(ptr, new)         while (1)
                             WRITE_ONCE(*(ptr + 1), x);

When we use lr.w/sc.w to implement xchg16, it'll cause CPU0 to deadlock.

>
> > > Also, I really do think doing ticket locks first is a far more sensible
> > > step.
> > NACK by Anup
>
> Who's he when he's not sending NAKs ?
We've talked before:
https://lore.kernel.org/linux-riscv/CAAhSdy1JHLUFwu7RuCaQ+RUWRBks2KsDva7EpRt8--4ZfofSUQ@mail.gmail.com/T/#t
Anup Patel March 29, 2021, 12:13 p.m. UTC | #10
> -----Original Message-----
> From: Peter Zijlstra <peterz@infradead.org>
> Sent: 29 March 2021 16:57
> To: Guo Ren <guoren@kernel.org>
> Cc: linux-riscv <linux-riscv@lists.infradead.org>; Linux Kernel Mailing List
> <linux-kernel@vger.kernel.org>; linux-csky@vger.kernel.org; linux-arch
> <linux-arch@vger.kernel.org>; Guo Ren <guoren@linux.alibaba.com>; Will
> Deacon <will@kernel.org>; Ingo Molnar <mingo@redhat.com>; Waiman
> Long <longman@redhat.com>; Arnd Bergmann <arnd@arndb.de>; Anup
> Patel <anup@brainfault.org>
> Subject: Re: [PATCH v4 3/4] locking/qspinlock: Add
> ARCH_USE_QUEUED_SPINLOCKS_XCHG32
> 
> On Mon, Mar 29, 2021 at 07:19:29PM +0800, Guo Ren wrote:
> > On Mon, Mar 29, 2021 at 3:50 PM Peter Zijlstra <peterz@infradead.org>
> wrote:
> > >
> > > On Sat, Mar 27, 2021 at 06:06:38PM +0000, guoren@kernel.org wrote:
> > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > >
> > > > Some architectures don't have sub-word swap atomic instruction,
> > > > they only have the full word's one.
> > > >
> > > > The sub-word swap only improve the performance when:
> > > > NR_CPUS < 16K
> > > >  *  0- 7: locked byte
> > > >  *     8: pending
> > > >  *  9-15: not used
> > > >  * 16-17: tail index
> > > >  * 18-31: tail cpu (+1)
> > > >
> > > > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> > > >
> > > > Please let architecture select xchg16/xchg32 to implement
> > > > xchg_tail.
> > >
> > > So I really don't like this, this pushes complexity into the generic
> > > code for something that's really not needed.
> > >
> > > Lots of RISC already implement sub-word atomics using word ll/sc.
> > > Obviously they're not sharing code like they should be :/ See for
> > > example arch/mips/kernel/cmpxchg.c.
> > I see, we've done two versions of this:
> >  - Using cmpxchg codes from MIPS by Michael
> >  - Re-write with assembly codes by Guo
> >
> > But using the full-word atomic xchg instructions implement xchg16 has
> > the semantic risk for atomic operations.
> 
> What? -ENOPARSE
> 
> > > Also, I really do think doing ticket locks first is a far more
> > > sensible step.
> > NACK by Anup
> 
> Who's he when he's not sending NAKs ?

We had discussions in the RISC-V platforms group about this. Over there,
we evaluated all the spinlock approaches (ticket, qspinlock, etc.) tried
in Linux so far. It was concluded in those discussions that eventually we
have to move to qspinlock (even if we move to ticket spinlocks temporarily)
because qspinlock avoids cache line bouncing. Also, moving to qspinlock
will align us with other major architectures supported in Linux (such as
x86 and ARM64).

Some of the organizations working on high-end RISC-V systems (> 32
CPUs) are interested in having an optimized spinlock implementation
(just like other major architectures x86 and ARM64).

Based on above, Linux RISC-V should move to qspinlock.

Regards,
Anup
Peter Zijlstra March 29, 2021, 12:49 p.m. UTC | #11
On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> u32 a = 0x55aa66bb;
> u16 *ptr = &a;
> 
> CPU0                       CPU1
> =========             =========
> xchg16(ptr, new)     while(1)
>                                     WRITE_ONCE(*(ptr + 1), x);
> 
> When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.

Then I think your LL/SC is broken.

That also means you really don't want to build super complex locking
primitives on top, because that live-lock will percolate through.

Step 1 would be to get your architecture fixed such that it can provide
fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
in building complex systems with it.
Guo Ren March 29, 2021, 12:52 p.m. UTC | #12
On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote:
> > Anyway, an additional 'funny' is that I suspect you cannot prove fwd
> > progress of the entire primitive with any of this on. But who cares
> > about details anyway.. :/
>
> What's the architectural guarantee on LL/SC progress for RISC-V ?

funct5   | aq | rl  | rs2  | rs1  | funct3 | rd   | opcode
  5      |  1 |  1  |  5   |  5   |   3    |  5   |   7
LR.W/D   | ordering  |  0  | addr | width  | dest | AMO
SC.W/D   | ordering  | src | addr | width  | dest | AMO

LR.W loads a word from the address in rs1, places the sign-extended
value in rd, and registers a reservation set—a set of bytes that
subsumes the bytes in the addressed word. SC.W conditionally writes a
word in rs2 to the address in rs1: the SC.W succeeds only if the
reservation is still valid and the reservation set contains the bytes
being written. If the SC.W succeeds, the instruction writes the word
in rs2 to memory, and it writes zero to rd. If the SC.W fails, the
instruction does not write to memory, and it writes a nonzero value to
rd. Regardless of success or failure, executing an SC.W instruction
*invalidates any reservation held by this hart*.

More details, ref:
https://github.com/riscv/riscv-isa-manual

> And what if you double loop it like cmpxchg() ?
Can you give a code snippet?


--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Peter Zijlstra March 29, 2021, 12:54 p.m. UTC | #13
On Mon, Mar 29, 2021 at 12:13:10PM +0000, Anup Patel wrote:

> We had discussions in the RISC-V platforms group about this. Over there,
> We had evaluated all spin lock approaches (ticket, qspinlock, etc) tried
> in Linux till now. It was concluded in those discussions that eventually we
> have to move to qspinlock (even if we moved to ticket spinlock temporarily)
> because qspinlock avoids cache line bouncing. Also, moving to qspinlock
> will be aligned with other major architectures supported in Linux (such as
> x86, ARM64)
> 
> Some of the organizations working on high-end RISC-V systems (> 32
> CPUs) are interested in having an optimized spinlock implementation
> (just like other major architectures x86 and ARM64).
> 
> Based on above, Linux RISC-V should move to qspinlock.

That's all well and good, but first you need architectural forward
progress guarantees. Otherwise there's absolutely no point in building
complex locking primitives.

And unless you already have such big systems in silicon, where you
can benchmark against simpler locks (like ticket) there really is no
point in the complexity.
Arnd Bergmann March 29, 2021, 1:56 p.m. UTC | #14
On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote:
>
> On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote:
> > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd
> > > progress of the entire primitive with any of this on. But who cares
> > > about details anyway.. :/
> >
> > What's the architectural guarantee on LL/SC progress for RISC-V ?
>
> funct5    | aq | rl   | rs2 |  rs1  | funct3 | rd | opcode
>      5          1    1      5       5         3        5          7
> LR.W/D  ordering  0     addr    width   dest    AMO
> SC.W/D  ordering  src  addr    width   dest    AMO
>
> LR.W loads a word from the address in rs1, places the sign-extended
> value in rd, and registers a reservation set—a set of bytes that
> subsumes the bytes in the addressed word. SC.W conditionally writes a
> word in rs2 to the address in rs1: the SC.W succeeds only if the
> reservation is still valid and the reservation set contains the bytes
> being written. If the SC.W succeeds, the instruction writes the word
> in rs2 to memory, and it writes zero to rd. If the SC.W fails, the
> instruction does not write to memory, and it writes a nonzero value to
> rd. Regardless of success or failure, executing an SC.W instruction
> *invalidates any reservation held by this hart*.
>
> More details, ref:
> https://github.com/riscv/riscv-isa-manual

I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant
link, as this defines memory areas that either do or do not have
forward progress guarantees, including this part:

   "When LR/SC is used for memory locations marked RsrvNonEventual,
     software should provide alternative fall-back mechanisms used when
     lack of progress is detected."

My reading of this is that if the example you tried stalls, then either
the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this,
or that the PMA is marked RsrvEventual but the implementation is
buggy.

It also seems that the current "amoswap" based implementation
would be reliable independent of RsrvEventual/RsrvNonEventual.
arm64 is already in the situation of having to choose between
two cmpxchg() implementation at runtime to allow falling back to
a slower but more general version, but it's best to avoid that if you
can.

         Arnd

[1] http://www.five-embeddev.com/riscv-isa-manual/latest/machine.html#atomicity-pmas
Guo Ren March 30, 2021, 2:26 a.m. UTC | #15
On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote:
> >
> > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote:
> > > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd
> > > > progress of the entire primitive with any of this on. But who cares
> > > > about details anyway.. :/
> > >
> > > What's the architectural guarantee on LL/SC progress for RISC-V ?
> >
> > funct5    | aq | rl   | rs2 |  rs1  | funct3 | rd | opcode
> >      5          1    1      5       5         3        5          7
> > LR.W/D  ordering  0     addr    width   dest    AMO
> > SC.W/D  ordering  src  addr    width   dest    AMO
> >
> > LR.W loads a word from the address in rs1, places the sign-extended
> > value in rd, and registers a reservation set—a set of bytes that
> > subsumes the bytes in the addressed word. SC.W conditionally writes a
> > word in rs2 to the address in rs1: the SC.W succeeds only if the
> > reservation is still valid and the reservation set contains the bytes
> > being written. If the SC.W succeeds, the instruction writes the word
> > in rs2 to memory, and it writes zero to rd. If the SC.W fails, the
> > instruction does not write to memory, and it writes a nonzero value to
> > rd. Regardless of success or failure, executing an SC.W instruction
> > *invalidates any reservation held by this hart*.
> >
> > More details, ref:
> > https://github.com/riscv/riscv-isa-manual
>
> I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant
> link, as this defines memory areas that either do or do not have
> forward progress guarantees, including this part:
>
>    "When LR/SC is used for memory locations marked RsrvNonEventual,
>      software should provide alternative fall-back mechanisms used when
>      lack of progress is detected."
>
> My reading of this is that if the example you tried stalls, then either
> the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this,
> or that the PMA is marked RsrvEventual but the implementation is
> buggy.
Yes, the PMA just defines physical memory region attributes. But in our
processor, when the MMU is enabled in S-mode (satp register value > 2),
it will look at our custom PTE attributes, BIT(63) etc., ref [1]:

   PTE format:
   | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
     SO   C    B    SH   SE    RSW   D   A   G   U   X   W   R   V
     ^    ^    ^    ^    ^
   BIT(63): SO - Strong Order
   BIT(62): C  - Cacheable
   BIT(61): B  - Bufferable
   BIT(60): SH - Shareable
   BIT(59): SE - Security

So the memory also could be RsrvNone/RsrvEventual.

[1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88

>
> It also seems that the current "amoswap" based implementation
> would be reliable independent of RsrvEventual/RsrvNonEventual.
Yes, the hardware implementation of AMO could be different from LR/SC.
AMO could use ACE snoop holding to lock the bus in a hw coherency
design, but LR/SC uses an exclusive monitor without locking the bus.

> arm64 is already in the situation of having to choose between
> two cmpxchg() implementation at runtime to allow falling back to
> a slower but more general version, but it's best to avoid that if you
> can.
Current RISC-V doesn't need multiple versions to select from, and all of
AMO & LR/SC is defined in the spec.

RISC-V has no CAS instructions; it uses LR/SC for cmpxchg. I don't
think LR/SC would be slower than CAS, and CAS is just good for code
size.

>
>          Arnd
>
> [1] http://www.five-embeddev.com/riscv-isa-manual/latest/machine.html#atomicity-pmas

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Guo Ren March 30, 2021, 3:13 a.m. UTC | #16
On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > u32 a = 0x55aa66bb;
> > u16 *ptr = &a;
> >
> > CPU0                       CPU1
> > =========             =========
> > xchg16(ptr, new)     while(1)
> >                                     WRITE_ONCE(*(ptr + 1), x);
> >
> > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
>
> Then I think your LL/SC is broken.
>
> That also means you really don't want to build super complex locking
> primitives on top, because that live-lock will percolate through.
Do you mean the implementation below has a live-lock risk?
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+       u32 old, new, val = atomic_read(&lock->val);
+
+       for (;;) {
+               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+               old = atomic_cmpxchg(&lock->val, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+       return old;
+}


>
> Step 1 would be to get your architecute fixed such that it can provide
> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
> in building complex systems with it.

Quoting Waiman's comment [1] on the xchg16 optimization:

"This optimization is needed to make the qspinlock achieve performance
parity with ticket spinlock at light load."

[1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/

So for a non-xchg16 machine:
 - ticket-lock for small numbers of CPUs
 - qspinlock for large numbers of CPUs

Okay, I'll put all of them into the next patch :P
Anup Patel March 30, 2021, 4:54 a.m. UTC | #17
> -----Original Message-----
> From: Guo Ren <guoren@kernel.org>
> Sent: 30 March 2021 08:44
> To: Peter Zijlstra <peterz@infradead.org>
> Cc: linux-riscv <linux-riscv@lists.infradead.org>; Linux Kernel Mailing List
> <linux-kernel@vger.kernel.org>; linux-csky@vger.kernel.org; linux-arch
> <linux-arch@vger.kernel.org>; Guo Ren <guoren@linux.alibaba.com>; Will
> Deacon <will@kernel.org>; Ingo Molnar <mingo@redhat.com>; Waiman
> Long <longman@redhat.com>; Arnd Bergmann <arnd@arndb.de>; Anup
> Patel <anup@brainfault.org>
> Subject: Re: [PATCH v4 3/4] locking/qspinlock: Add
> ARCH_USE_QUEUED_SPINLOCKS_XCHG32
> 
> On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org>
> wrote:
> >
> > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > > u32 a = 0x55aa66bb;
> > > u16 *ptr = &a;
> > >
> > > CPU0                       CPU1
> > > =========             =========
> > > xchg16(ptr, new)     while(1)
> > >                                     WRITE_ONCE(*(ptr + 1), x);
> > >
> > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> >
> > Then I think your LL/SC is broken.
> >
> > That also means you really don't want to build super complex locking
> > primitives on top, because that live-lock will percolate through.
> Do you mean the below implementation has live-lock risk?
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> +       u32 old, new, val = atomic_read(&lock->val);
> +
> +       for (;;) {
> +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> +               old = atomic_cmpxchg(&lock->val, val, new);
> +               if (old == val)
> +                       break;
> +
> +               val = old;
> +       }
> +       return old;
> +}
> 
> 
> >
> > Step 1 would be to get your architecute fixed such that it can provide
> > fwd progress guarantees for LL/SC. Otherwise there's absolutely no
> > point in building complex systems with it.
> 
> Quote Waiman's comment [1] on xchg16 optimization:
> 
> "This optimization is needed to make the qspinlock achieve performance
> parity with ticket spinlock at light load."
> 
> [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-
> Waiman.Long@hp.com/
> 
> So for a non-xhg16 machine:
>  - ticket-lock for small numbers of CPUs
>  - qspinlock for large numbers of CPUs
> 
> Okay, I'll put all of them into the next patch 

I would suggest having a separate Kconfig option for ticket spinlocks
in Linux RISC-V which is disabled by default. This means Linux
RISC-V will use qspinlock by default and use ticket spinlocks only when
the ticket spinlock Kconfig option is enabled.

Regards,
Anup
Anup Patel March 30, 2021, 5:51 a.m. UTC | #18
On Tue, Mar 30, 2021 at 7:56 AM Guo Ren <guoren@kernel.org> wrote:
>
> On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote:
> > > > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd
> > > > > progress of the entire primitive with any of this on. But who cares
> > > > > about details anyway.. :/
> > > >
> > > > What's the architectural guarantee on LL/SC progress for RISC-V ?
> > >
> > > funct5    | aq | rl   | rs2 |  rs1  | funct3 | rd | opcode
> > >      5          1    1      5       5         3        5          7
> > > LR.W/D  ordering  0     addr    width   dest    AMO
> > > SC.W/D  ordering  src  addr    width   dest    AMO
> > >
> > > LR.W loads a word from the address in rs1, places the sign-extended
> > > value in rd, and registers a reservation set—a set of bytes that
> > > subsumes the bytes in the addressed word. SC.W conditionally writes a
> > > word in rs2 to the address in rs1: the SC.W succeeds only if the
> > > reservation is still valid and the reservation set contains the bytes
> > > being written. If the SC.W succeeds, the instruction writes the word
> > > in rs2 to memory, and it writes zero to rd. If the SC.W fails, the
> > > instruction does not write to memory, and it writes a nonzero value to
> > > rd. Regardless of success or failure, executing an SC.W instruction
> > > *invalidates any reservation held by this hart*.
> > >
> > > More details, ref:
> > > https://github.com/riscv/riscv-isa-manual
> >
> > I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant
> > link, as this defines memory areas that either do or do not have
> > forward progress guarantees, including this part:
> >
> >    "When LR/SC is used for memory locations marked RsrvNonEventual,
> >      software should provide alternative fall-back mechanisms used when
> >      lack of progress is detected."
> >
> > My reading of this is that if the example you tried stalls, then either
> > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this,
> > or that the PMA is marked RsrvEventual but the implementation is
> > buggy.
> Yes, PMA just defines physical memory region attributes, But in our
> processor, when MMU is enabled (satp's value register > 2) in s-mode,
> it will look at our custom PTE's attributes BIT(63) ref [1]:
>
>    PTE format:
>    | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
>      SO   C    B    SH   SE    RSW   D   A   G   U   X   W   R   V
>      ^    ^    ^    ^    ^
>    BIT(63): SO - Strong Order
>    BIT(62): C  - Cacheable
>    BIT(61): B  - Bufferable
>    BIT(60): SH - Shareable
>    BIT(59): SE - Security
>
> So the memory also could be RsrvNone/RsrvEventual.
>
> [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88

Is this about your C-sky architecture or your RISC-V implementation?

If these PTE bits are in your RISC-V implementation then clearly your
RISC-V implementation is not compliant with the RISC-V privilege spec
because these bits are not defined in RISC-V privilege spec.

Regards,
Anup
>
> >
> > It also seems that the current "amoswap" based implementation
> > would be reliable independent of RsrvEventual/RsrvNonEventual.
> Yes, the hardware implementation of AMO could be different from LR/SC.
> AMO could use ACE snoop holding to lock the bus in hw coherency
> design, but LR/SC uses an exclusive monitor without locking the bus.
>
> > arm64 is already in the situation of having to choose between
> > two cmpxchg() implementation at runtime to allow falling back to
> > a slower but more general version, but it's best to avoid that if you
> > can.
> Current RISC-V needn't multiple versions to select, and all AMO &
> LR/SC has been defined in the spec.
>
> RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't
> think LR/SC would be slower than CAS, and CAS is just good for code
> size.
>
> >
> >          Arnd
> >
> > [1] http://www.five-embeddev.com/riscv-isa-manual/latest/machine.html#atomicity-pmas
>
> --
> Best Regards
>  Guo Ren
>
> ML: https://lore.kernel.org/linux-csky/
Guo Ren March 30, 2021, 6:26 a.m. UTC | #19
On Tue, Mar 30, 2021 at 1:51 PM Anup Patel <anup@brainfault.org> wrote:
>
> On Tue, Mar 30, 2021 at 7:56 AM Guo Ren <guoren@kernel.org> wrote:
> >
> > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > >
> > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote:
> > > >
> > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Mon, Mar 29, 2021 at 01:16:53PM +0200, Peter Zijlstra wrote:
> > > > > > Anyway, an additional 'funny' is that I suspect you cannot prove fwd
> > > > > > progress of the entire primitive with any of this on. But who cares
> > > > > > about details anyway.. :/
> > > > >
> > > > > What's the architectural guarantee on LL/SC progress for RISC-V ?
> > > >
> > > > funct5    | aq | rl   | rs2 |  rs1  | funct3 | rd | opcode
> > > >      5          1    1      5       5         3        5          7
> > > > LR.W/D  ordering  0     addr    width   dest    AMO
> > > > SC.W/D  ordering  src  addr    width   dest    AMO
> > > >
> > > > LR.W loads a word from the address in rs1, places the sign-extended
> > > > value in rd, and registers a reservation set—a set of bytes that
> > > > subsumes the bytes in the addressed word. SC.W conditionally writes a
> > > > word in rs2 to the address in rs1: the SC.W succeeds only if the
> > > > reservation is still valid and the reservation set contains the bytes
> > > > being written. If the SC.W succeeds, the instruction writes the word
> > > > in rs2 to memory, and it writes zero to rd. If the SC.W fails, the
> > > > instruction does not write to memory, and it writes a nonzero value to
> > > > rd. Regardless of success or failure, executing an SC.W instruction
> > > > *invalidates any reservation held by this hart*.
> > > >
> > > > More details, ref:
> > > > https://github.com/riscv/riscv-isa-manual
> > >
> > > I think section "3.5.3.2 Reservability PMA" [1] would be a more relevant
> > > link, as this defines memory areas that either do or do not have
> > > forward progress guarantees, including this part:
> > >
> > >    "When LR/SC is used for memory locations marked RsrvNonEventual,
> > >      software should provide alternative fall-back mechanisms used when
> > >      lack of progress is detected."
> > >
> > > My reading of this is that if the example you tried stalls, then either
> > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this,
> > > or that the PMA is marked RsrvEventual but the implementation is
> > > buggy.
> > Yes, PMA just defines physical memory region attributes, But in our
> > processor, when MMU is enabled (satp's value register > 2) in s-mode,
> > it will look at our custom PTE's attributes BIT(63) ref [1]:
> >
> >    PTE format:
> >    | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
> >      SO   C    B    SH   SE    RSW   D   A   G   U   X   W   R   V
> >      ^    ^    ^    ^    ^
> >    BIT(63): SO - Strong Order
> >    BIT(62): C  - Cacheable
> >    BIT(61): B  - Bufferable
> >    BIT(60): SH - Shareable
> >    BIT(59): SE - Security
> >
> > So the memory also could be RsrvNone/RsrvEventual.
> >
> > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88
>
> Is this about your C-sky architecture or your RISC-V implementation.
It's in our RISC-V implementation.

>
> If these PTE bits are in your RISC-V implementation then clearly your
> RISC-V implementation is not compliant with the RISC-V privilege spec
> because these bits are not defined in RISC-V privilege spec.
We could disable it if the vendor's SoC has a coherency interconnect
bus, so the C910 is compliant with the standard privilege spec.

PS: I remember someone mentioned a similar design in 1.12-draft-VM-TASKGROUP:

"Bit 63 indicates that the PTE uses a custom implementation-specific
encoding. If bit 63 is set, the algorithm for virtual-to-physical
address translation is implementation-defined. If bit 63 is not set,
the algorithm for virtual-to-physical address translation is described
in Section 4.4.2.

Bit 62 indicates the use of naturally aligned power-of-two (NAPOT)
address translation contiguity, as described in Section 4.4.2.

Bits 61–xx indicate cacheability attributes associated with the
virtual address in question, as de-scribed in Section 4.4.3.

Bits xx–54 are reserved for future use."


>
> Regards,
> Anup
> >
> > >
> > > It also seems that the current "amoswap" based implementation
> > > would be reliable independent of RsrvEventual/RsrvNonEventual.
> > Yes, the hardware implementation of AMO could be different from LR/SC.
> > AMO could use ACE snoop holding to lock the bus in hw coherency
> > design, but LR/SC uses an exclusive monitor without locking the bus.
> >
> > > arm64 is already in the situation of having to choose between
> > > two cmpxchg() implementation at runtime to allow falling back to
> > > a slower but more general version, but it's best to avoid that if you
> > > can.
> > Current RISC-V needn't multiple versions to select, and all AMO &
> > LR/SC has been defined in the spec.
> >
> > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't
> > think LR/SC would be slower than CAS, and CAS is just good for code
> > size.
> >
> > >
> > >          Arnd
> > >
> > > [1] http://www.five-embeddev.com/riscv-isa-manual/latest/machine.html#atomicity-pmas
> >
> > --
> > Best Regards
> >  Guo Ren
> >
> > ML: https://lore.kernel.org/linux-csky/



--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Guo Ren March 30, 2021, 6:27 a.m. UTC | #20
On Tue, Mar 30, 2021 at 12:54 PM Anup Patel <Anup.Patel@wdc.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Guo Ren <guoren@kernel.org>
> > Sent: 30 March 2021 08:44
> > To: Peter Zijlstra <peterz@infradead.org>
> > Cc: linux-riscv <linux-riscv@lists.infradead.org>; Linux Kernel Mailing List
> > <linux-kernel@vger.kernel.org>; linux-csky@vger.kernel.org; linux-arch
> > <linux-arch@vger.kernel.org>; Guo Ren <guoren@linux.alibaba.com>; Will
> > Deacon <will@kernel.org>; Ingo Molnar <mingo@redhat.com>; Waiman
> > Long <longman@redhat.com>; Arnd Bergmann <arnd@arndb.de>; Anup
> > Patel <anup@brainfault.org>
> > Subject: Re: [PATCH v4 3/4] locking/qspinlock: Add
> > ARCH_USE_QUEUED_SPINLOCKS_XCHG32
> >
> > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org>
> > wrote:
> > >
> > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > > > u32 a = 0x55aa66bb;
> > > > u16 *ptr = &a;
> > > >
> > > > CPU0                       CPU1
> > > > =========             =========
> > > > xchg16(ptr, new)     while(1)
> > > >                                     WRITE_ONCE(*(ptr + 1), x);
> > > >
> > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> > >
> > > Then I think your LL/SC is broken.
> > >
> > > That also means you really don't want to build super complex locking
> > > primitives on top, because that live-lock will percolate through.
> > Do you mean the below implementation has live-lock risk?
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +       u32 old, new, val = atomic_read(&lock->val);
> > +
> > +       for (;;) {
> > +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> > +               old = atomic_cmpxchg(&lock->val, val, new);
> > +               if (old == val)
> > +                       break;
> > +
> > +               val = old;
> > +       }
> > +       return old;
> > +}
> >
> >
> > >
> > > Step 1 would be to get your architecute fixed such that it can provide
> > > fwd progress guarantees for LL/SC. Otherwise there's absolutely no
> > > point in building complex systems with it.
> >
> > Quote Waiman's comment [1] on xchg16 optimization:
> >
> > "This optimization is needed to make the qspinlock achieve performance
> > parity with ticket spinlock at light load."
> >
> > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-
> > Waiman.Long@hp.com/
> >
> > So for a non-xhg16 machine:
> >  - ticket-lock for small numbers of CPUs
> >  - qspinlock for large numbers of CPUs
> >
> > Okay, I'll put all of them into the next patch
>
> I would suggest to have separate Kconfig opitons for ticket spinlock
> in Linux RISC-V which will be disabled by default. This means Linux
> RISC-V will use qspinlock by default and use ticket spinlock only when
> ticket spinlock kconfig is enabled.
OK

>
> Regards,
> Anup
Arnd Bergmann March 30, 2021, 7:11 a.m. UTC | #21
On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote:
> On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote:
> > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > What's the architectural guarantee on LL/SC progress for RISC-V ?
> >
> >    "When LR/SC is used for memory locations marked RsrvNonEventual,
> >      software should provide alternative fall-back mechanisms used when
> >      lack of progress is detected."
> >
> > My reading of this is that if the example you tried stalls, then either
> > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this,
> > or that the PMA is marked RsrvEventual but the implementation is
> > buggy.
>
> Yes, PMA just defines physical memory region attributes, But in our
> processor, when MMU is enabled (satp's value register > 2) in s-mode,
> it will look at our custom PTE's attributes BIT(63) ref [1]:
>
>    PTE format:
>    | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
>      SO   C    B    SH   SE    RSW   D   A   G   U   X   W   R   V
>      ^    ^    ^    ^    ^
>    BIT(63): SO - Strong Order
>    BIT(62): C  - Cacheable
>    BIT(61): B  - Bufferable
>    BIT(60): SH - Shareable
>    BIT(59): SE - Security
>
> So the memory also could be RsrvNone/RsrvEventual.

I was not talking about RsrvNone, which would clearly mean that
you cannot use lr/sc at all (it would trap, right?), but "RsrvNonEventual",
which would explain the behavior you described in an earlier reply:

| u32 a = 0x55aa66bb;
| u16 *ptr = &a;
|
| CPU0                       CPU1
| =========             =========
| xchg16(ptr, new)     while(1)
|                                     WRITE_ONCE(*(ptr + 1), x);
|
| When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.

As I understand, this example must not cause a deadlock on
a compliant hardware implementation when the underlying memory
has RsrvEventual behavior, but could deadlock in case of
RsrvNonEventual.

> [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88
>
> >
> > It also seems that the current "amoswap" based implementation
> > would be reliable independent of RsrvEventual/RsrvNonEventual.
>
> Yes, the hardware implementation of AMO could be different from LR/SC.
> AMO could use ACE snoop holding to lock the bus in hw coherency
> design, but LR/SC uses an exclusive monitor without locking the bus.
>
> RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't
> think LR/SC would be slower than CAS, and CAS is just good for code
> size.

What I meant here is that the current spinlock uses a simple amoswap,
which presumably does not suffer from the lack of forward progress you
described.

        Arnd
David Laight March 30, 2021, 8:31 a.m. UTC | #22
From: Guo Ren
> Sent: 30 March 2021 04:14
...
> > Step 1 would be to get your architecute fixed such that it can provide
> > fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
> > in building complex systems with it.
> 
> Quote Waiman's comment [1] on xchg16 optimization:
> 
> "This optimization is needed to make the qspinlock achieve performance
> parity with ticket spinlock at light load."
> 
> [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/
> 
> So for a non-xhg16 machine:
>  - ticket-lock for small numbers of CPUs
>  - qspinlock for large numbers of CPUs
> 
> Okay, I'll put all of them into the next patch :P

Doesn't that also imply that you need ticket-locks for
lightly contended locks even with a lot of CPUs?

If you actually get a lot of CPUs waiting on the same spinlock
you probably ought to change the code to reduce lock contention.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Waiman Long March 30, 2021, 2:09 p.m. UTC | #23
On 3/29/21 11:13 PM, Guo Ren wrote:
> On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
>> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
>>> u32 a = 0x55aa66bb;
>>> u16 *ptr = &a;
>>>
>>> CPU0                       CPU1
>>> =========             =========
>>> xchg16(ptr, new)     while(1)
>>>                                      WRITE_ONCE(*(ptr + 1), x);
>>>
>>> When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
>> Then I think your LL/SC is broken.
>>
>> That also means you really don't want to build super complex locking
>> primitives on top, because that live-lock will percolate through.
> Do you mean the below implementation has live-lock risk?
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> +       u32 old, new, val = atomic_read(&lock->val);
> +
> +       for (;;) {
> +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> +               old = atomic_cmpxchg(&lock->val, val, new);
> +               if (old == val)
> +                       break;
> +
> +               val = old;
> +       }
> +       return old;
> +}
If there is a continuous stream of incoming spinlock takers, it is
possible that some cpus may have to wait a long time to set the tail
right. However, this should only happen with an artificial workload. I doubt
it will happen with a real workload or with a limited number of cpus.
>
>> Step 1 would be to get your architecute fixed such that it can provide
>> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
>> in building complex systems with it.
> Quote Waiman's comment [1] on xchg16 optimization:
>
> "This optimization is needed to make the qspinlock achieve performance
> parity with ticket spinlock at light load."
>
> [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/
>
> So for a non-xhg16 machine:
>   - ticket-lock for small numbers of CPUs
>   - qspinlock for large numbers of CPUs
>
> Okay, I'll put all of them into the next patch :P
>
It is true that qspinlock may not offer much advantage when the number 
of cpus is small. It shines for systems with many cpus. You may use 
NR_CPUS to determine if the default should be ticket or qspinlock with 
user override. To determine the right NR_CPUS threshold, you may need to 
run on real SMP RISCV systems to find out.

Cheers,
Longman
Peter Zijlstra March 30, 2021, 4:08 p.m. UTC | #24
On Tue, Mar 30, 2021 at 11:13:55AM +0800, Guo Ren wrote:
> On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > > u32 a = 0x55aa66bb;
> > > u16 *ptr = &a;
> > >
> > > CPU0                       CPU1
> > > =========             =========
> > > xchg16(ptr, new)     while(1)
> > >                                     WRITE_ONCE(*(ptr + 1), x);
> > >
> > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> >
> > Then I think your LL/SC is broken.
> >
> > That also means you really don't want to build super complex locking
> > primitives on top, because that live-lock will percolate through.

> Do you mean the below implementation has live-lock risk?
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> +       u32 old, new, val = atomic_read(&lock->val);
> +
> +       for (;;) {
> +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> +               old = atomic_cmpxchg(&lock->val, val, new);
> +               if (old == val)
> +                       break;
> +
> +               val = old;
> +       }
> +       return old;
> +}

That entirely depends on the architecture (and cmpxchg() implementation).

There are a number of cases:

 * architecture has cmpxchg() instruction (x86, s390, sparc, etc.).

  - architecture provides fwd progress (x86)
  - architecture requires backoff for progress (sparc)

 * architecture does not have cmpxchg, and implements it using LL/SC.

  and here things get *really* interesting, because while an
  architecture can have LL/SC fwd progress, that does not translate into
  cmpxchg() also having the same guarantees and all bets are off.

The real bummer is that C can do cmpxchg(), but there is no way it can
do LL/SC. And even if we'd teach C how to do LL/SC, it couldn't be
generic because architectures lacking it can't emulate it using
cmpxchg() (there's a fun class of bugs there).

So while the above code might be the best we can do in generic code,
it's really up to the architecture to make it work.
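
For readers following along, a cmpxchg() built on LR/SC is typically
shaped like the sketch below (a simplified RISC-V-flavoured illustration;
memory-ordering annotations omitted, function name invented). Note how
the failure path branches out after the lr.w, so an outer retry loop
re-issues a fresh reservation every iteration - which is exactly why
LL/SC forward-progress rules do not automatically extend to a cmpxchg()
loop:

static inline u32 cmpxchg_u32_sketch(volatile u32 *ptr, u32 old, u32 new)
{
        u32 ret, rc;

        __asm__ __volatile__ (
                "0:     lr.w    %0, %2\n"       /* load-reserve current value */
                "       bne     %0, %3, 1f\n"   /* mismatch: bail out, no store */
                "       sc.w    %1, %4, %2\n"   /* try to store the new value */
                "       bnez    %1, 0b\n"       /* reservation lost: retry */
                "1:\n"
                : "=&r" (ret), "=&r" (rc), "+A" (*ptr)
                : "r" (old), "r" (new)
                : "memory");

        return ret;     /* previous value; equal to "old" on success */
}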
Stafford Horne March 30, 2021, 10:35 p.m. UTC | #25
On Tue, Mar 30, 2021 at 06:08:40PM +0200, Peter Zijlstra wrote:
> On Tue, Mar 30, 2021 at 11:13:55AM +0800, Guo Ren wrote:
> > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > > > u32 a = 0x55aa66bb;
> > > > u16 *ptr = &a;
> > > >
> > > > CPU0                       CPU1
> > > > =========             =========
> > > > xchg16(ptr, new)     while(1)
> > > >                                     WRITE_ONCE(*(ptr + 1), x);
> > > >
> > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> > >
> > > Then I think your LL/SC is broken.
> > >
> > > That also means you really don't want to build super complex locking
> > > primitives on top, because that live-lock will percolate through.
> 
> > Do you mean the below implementation has live-lock risk?
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +       u32 old, new, val = atomic_read(&lock->val);
> > +
> > +       for (;;) {
> > +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> > +               old = atomic_cmpxchg(&lock->val, val, new);
> > +               if (old == val)
> > +                       break;
> > +
> > +               val = old;
> > +       }
> > +       return old;
> > +}
> 
> That entirely depends on the architecture (and cmpxchg() impementation).
> 
> There are a number of cases:
> 
>  * architecture has cmpxchg() instruction (x86, s390, sparc, etc.).
> 
>   - architecture provides fwd progress (x86)
>   - architecture requires backoff for progress (sparc)
> 
>  * architecture does not have cmpxchg, and implements it using LL/SC.
> 
>   and here things get *really* interesting, because while an
>   architecture can have LL/SC fwd progress, that does not translate into
>   cmpxchg() also having the same guarantees and all bets are off.
> 
> The real bummer is that C can do cmpxchg(), but there is no way it can
> do LL/SC. And even if we'd teach C how to do LL/SC, it couldn't be
> generic because architectures lacking it can't emulate it using
> cmpxchg() (there's a fun class of bugs there).
> 
> So while the above code might be the best we can do in generic code,
> it's really up to the architecture to make it work.

I just want to chime in here (there may be a better spot in the thread to
mention this), but for OpenRISC I did implement some generic 8/16-bit xchg code,
which I have on my todo list somewhere, to replace the other generic
implementations like the one in mips.

  arch/openrisc/include/asm/cmpxchg.h

The idea would be that architectures just implement these methods:

  long cmpxchg_u32(*ptr,old,new)
  long xchg_u32(*ptr,val)

Then the rest of the generic header would implement cmpxchg.
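
For illustration, one way the generic header could synthesize a 16-bit
cmpxchg from that 32-bit hook is sketched below (little-endian only,
kernel-style helpers assumed; cmpxchg_u32() is the per-arch hook named
above, everything else is invented for the example):

static inline u16 generic_cmpxchg_u16(volatile u16 *ptr, u16 old, u16 new)
{
        u32 *p32 = (u32 *)((unsigned long)ptr & ~0x3UL);  /* containing word */
        int shift = ((unsigned long)ptr & 0x2) * 8;       /* 0 or 16 (LE) */
        u32 mask = 0xffffU << shift;
        u32 cur32 = READ_ONCE(*p32);
        u32 new32, prev;

        for (;;) {
                u16 cur = (cur32 & mask) >> shift;

                if (cur != old)
                        return cur;     /* comparison failed, report current value */

                new32 = (cur32 & ~mask) | ((u32)new << shift);
                prev = cmpxchg_u32(p32, cur32, new32);
                if (prev == cur32)
                        return old;     /* whole word unchanged: 16-bit swap done */

                cur32 = prev;           /* either half changed under us: retry */
        }
}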

-Stafford
Guo Ren March 31, 2021, 4:18 a.m. UTC | #26
On Tue, Mar 30, 2021 at 3:12 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote:
> > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote:
> > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > What's the architectural guarantee on LL/SC progress for RISC-V ?
> > >
> > >    "When LR/SC is used for memory locations marked RsrvNonEventual,
> > >      software should provide alternative fall-back mechanisms used when
> > >      lack of progress is detected."
> > >
> > > My reading of this is that if the example you tried stalls, then either
> > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this,
> > > or that the PMA is marked RsrvEventual but the implementation is
> > > buggy.
> >
> > Yes, PMA just defines physical memory region attributes, But in our
> > processor, when MMU is enabled (satp's value register > 2) in s-mode,
> > it will look at our custom PTE's attributes BIT(63) ref [1]:
> >
> >    PTE format:
> >    | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
> >      SO   C    B    SH   SE    RSW   D   A   G   U   X   W   R   V
> >      ^    ^    ^    ^    ^
> >    BIT(63): SO - Strong Order
> >    BIT(62): C  - Cacheable
> >    BIT(61): B  - Bufferable
> >    BIT(60): SH - Shareable
> >    BIT(59): SE - Security
> >
> > So the memory also could be RsrvNone/RsrvEventual.
>
> I was not talking about RsrvNone, which would clearly mean that
> you cannot use lr/sc at all (trap would trap, right?), but "RsrvNonEventual",
> which would explain the behavior you described in an earlier reply:
>
> | u32 a = 0x55aa66bb;
> | u16 *ptr = &a;
> |
> | CPU0                       CPU1
> | =========             =========
> | xchg16(ptr, new)     while(1)
> |                                     WRITE_ONCE(*(ptr + 1), x);
> |
> | When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
>
> As I understand, this example must not cause a deadlock on
> a compliant hardware implementation when the underlying memory
> has RsrvEventual behavior, but could deadlock in case of
> RsrvNonEventual
Thx for the nice explanation:
 - RsrvNonEventual - depends on software fall-back mechanisms, which is
exactly what I'm worried about.
 - RsrvEventual - HW would provide the eventual success guarantee.

>
> > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88
> >
> > >
> > > It also seems that the current "amoswap" based implementation
> > > would be reliable independent of RsrvEventual/RsrvNonEventual.
> >
> > Yes, the hardware implementation of AMO could be different from LR/SC.
> > AMO could use ACE snoop holding to lock the bus in hw coherency
> > design, but LR/SC uses an exclusive monitor without locking the bus.
> >
> > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't
> > think LR/SC would be slower than CAS, and CAS is just good for code
> > size.
>
> What I meant here is that the current spinlock uses a simple amoswap,
> which presumably does not suffer from the lack of forward process you
> described.
Does that mean we should prevent using LR/SC (if RsrvNonEventual)?
Paul Campbell March 31, 2021, 5:33 a.m. UTC | #27
On Wednesday, 31 March 2021 5:18:56 PM NZDT Guo Ren wrote:
> > > [1]
> > > https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc
> > > 52afd0fcbf88> > 
> > > > It also seems that the current "amoswap" based implementation
> > > > would be reliable independent of RsrvEventual/RsrvNonEventual.
> > > 
> > > Yes, the hardware implementation of AMO could be different from LR/SC.
> > > AMO could use ACE snoop holding to lock the bus in hw coherency
> > > design, but LR/SC uses an exclusive monitor without locking the bus.
> > > 
> > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't
> > > think LR/SC would be slower than CAS, and CAS is just good for code
> > > size.
> > 
> > What I meant here is that the current spinlock uses a simple amoswap,
> > which presumably does not suffer from the lack of forward process you
> > described.
> 
> Does that mean we should prevent using LR/SC (if RsrvNonEventual)?

Let me provide another data point: I'm working on a high-end, highly
speculative implementation with many concurrent instructions in flight - from
my point of view both sorts of AMO (LR/SC and swap/add/etc) require me to
grab a cache line in an exclusive modifiable state (so no difference there).

More importantly both sorts of AMO instructions  (unlike most loads and 
stores) can't be speculated (not even LR because it changes hidden state, I 
found this out the hard way bringing up the kernel).

This means that both LR AND SC individually can't be executed until all 
speculation is resolved (that means that they happen really late in the 
execute path and block the resolution of the speculation of subsequent 
instructions) - equally a single amoswap/add/etc instruction can't happen 
until late in the execute path - so both require the same cache line state, 
but one of these sorts of events is better than two of them.

Which in short means that amoswap/add/etc is better for small things - small 
buzzy lock loops, while LR/SC is better for more complex things with actual 
processing between the LR and SC.

----

Another issue to consider here is what happens when you hit one of these
tight spinlocks when the branch target cache is empty and they fail (ie loop
back and try again) - the default branch prediction, and resulting
speculation, is (very) likely to be looping back, while hopefully most locks
are not contended when you hit them, so that speculation would be wrong - a
spinlock like this may not be so good:

	li a0, 1
loop: 
	amoswap	a1, a0, (a2)
	beqz	a1, loop
	..... subsequent code

In my world with no BTC info the pipe fills with dozens of amoswaps, rather 
than  the 'subsequent code'. While (in my world) code like this:

	li a0, 1
loop: 
	amoswap	a1, a0, (a2)
	bnez	a1, 1f
	.... subsequent code

1:	j loop

would actually be better (in my world unconditional jump instructions are 
folded early and never see execution so they're sort of free, though they mess 
with the issue/decode rate). Smart compilers could move the "j loop" out of 
the way, while the double branch on failure is not a big deal since either the 
lock is still held (and you don't care if it's slow) or it's been released in 
which case the cache line has been stolen and the refetch of that cache line 
is going to dominate the next time around the loop

I need to stress here that this is how my architecture works, others' will of
course be different, though I expect other heavily speculative
architectures to have similar issues :-)

	Paul Campbell
	Moonbase Otago
Guo Ren March 31, 2021, 6:44 a.m. UTC | #28
Hi Arnd

On Wed, Mar 31, 2021 at 12:18 PM Guo Ren <guoren@kernel.org> wrote:
>
> On Tue, Mar 30, 2021 at 3:12 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote:
> > > On Mon, Mar 29, 2021 at 9:56 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > > On Mon, Mar 29, 2021 at 2:52 PM Guo Ren <guoren@kernel.org> wrote:
> > > > > On Mon, Mar 29, 2021 at 7:31 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > >
> > > > > > What's the architectural guarantee on LL/SC progress for RISC-V ?
> > > >
> > > >    "When LR/SC is used for memory locations marked RsrvNonEventual,
> > > >      software should provide alternative fall-back mechanisms used when
> > > >      lack of progress is detected."
> > > >
> > > > My reading of this is that if the example you tried stalls, then either
> > > > the PMA is not RsrvEventual, and it is wrong to rely on ll/sc on this,
> > > > or that the PMA is marked RsrvEventual but the implementation is
> > > > buggy.
> > >
> > > Yes, PMA just defines physical memory region attributes, But in our
> > > processor, when MMU is enabled (satp's value register > 2) in s-mode,
> > > it will look at our custom PTE's attributes BIT(63) ref [1]:
> > >
> > >    PTE format:
> > >    | 63 | 62 | 61 | 60 | 59 | 58-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
> > >      SO   C    B    SH   SE    RSW   D   A   G   U   X   W   R   V
> > >      ^    ^    ^    ^    ^
> > >    BIT(63): SO - Strong Order
> > >    BIT(62): C  - Cacheable
> > >    BIT(61): B  - Bufferable
> > >    BIT(60): SH - Shareable
> > >    BIT(59): SE - Security
> > >
> > > So the memory also could be RsrvNone/RsrvEventual.
> >
> > I was not talking about RsrvNone, which would clearly mean that
> > you cannot use lr/sc at all (trap would trap, right?), but "RsrvNonEventual",
> > which would explain the behavior you described in an earlier reply:
> >
> > | u32 a = 0x55aa66bb;
> > | u16 *ptr = &a;
> > |
> > | CPU0                       CPU1
> > | =========             =========
> > | xchg16(ptr, new)     while(1)
> > |                                     WRITE_ONCE(*(ptr + 1), x);
> > |
> > | When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> >
> > As I understand, this example must not cause a deadlock on
> > a compliant hardware implementation when the underlying memory
> > has RsrvEventual behavior, but could deadlock in case of
> > RsrvNonEventual
> Thx for the nice explanation:
>  - RsrvNonEventual - depends on software fall-back mechanisms, and
> just I'm worried about.
>  - RsrvEventual - HW would provide the eventual success guarantee.
In riscv-spec 8.3 Eventual Success of Store-Conditional Instructions

I found:
"As a consequence of the eventuality guarantee, if some harts in an
execution environment are
executing constrained LR/SC loops, and no other harts or devices in
the execution environment
execute an unconditional store or AMO to that reservation set, then at
least one hart will
eventually exit its constrained LR/SC loop. *** By contrast, if other
harts or devices continue to
write to that reservation set, it ***is not guaranteed*** that any
hart will exit its LR/SC loop.*** "

It seems RsrvEventual couldn't solve the problem in the code I mentioned.

>
> >
> > > [1] https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc52afd0fcbf88
> > >
> > > >
> > > > It also seems that the current "amoswap" based implementation
> > > > would be reliable independent of RsrvEventual/RsrvNonEventual.
> > >
> > > Yes, the hardware implementation of AMO could be different from LR/SC.
> > > AMO could use ACE snoop holding to lock the bus in hw coherency
> > > design, but LR/SC uses an exclusive monitor without locking the bus.
> > >
> > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't
> > > think LR/SC would be slower than CAS, and CAS is just good for code
> > > size.
> >
> > What I meant here is that the current spinlock uses a simple amoswap,
> > which presumably does not suffer from the lack of forward process you
> > described.
> Does that mean we should prevent using LR/SC (if RsrvNonEventual)?
>
> --
> Best Regards
>  Guo Ren
>
> ML: https://lore.kernel.org/linux-csky/
Arnd Bergmann March 31, 2021, 7:12 a.m. UTC | #29
On Wed, Mar 31, 2021 at 8:44 AM Guo Ren <guoren@kernel.org> wrote:
> On Wed, Mar 31, 2021 at 12:18 PM Guo Ren <guoren@kernel.org> wrote:
> > On Tue, Mar 30, 2021 at 3:12 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > On Tue, Mar 30, 2021 at 4:26 AM Guo Ren <guoren@kernel.org> wrote:

> > > As I understand, this example must not cause a deadlock on
> > > a compliant hardware implementation when the underlying memory
> > > has RsrvEventual behavior, but could deadlock in case of
> > > RsrvNonEventual
> > Thx for the nice explanation:
> >  - RsrvNonEventual - depends on software fall-back mechanisms, and
> > just I'm worried about.
> >  - RsrvEventual - HW would provide the eventual success guarantee.
> In riscv-spec 8.3 Eventual Success of Store-Conditional Instructions
>
> I found:
> "As a consequence of the eventuality guarantee, if some harts in an
> execution environment are
> executing constrained LR/SC loops, and no other harts or devices in
> the execution environment
> execute an unconditional store or AMO to that reservation set, then at
> least one hart will
> eventually exit its constrained LR/SC loop. *** By contrast, if other
> harts or devices continue to
> write to that reservation set, it ***is not guaranteed*** that any
> hart will exit its LR/SC loop.*** "
>
> Seems RsrvEventual couldn't solve the code's problem I've mentioned.

Ok, got it.

        Arnd
Arnd Bergmann March 31, 2021, 7:23 a.m. UTC | #30
On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote:
>
> I just want to chime in here, there may be a better spot in the thread to
> mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code
> which I have on my todo list somwhere to replace the other generic
> implementations like that in mips.
>
>   arch/openrisc/include/asm/cmpxchg.h
>
> The idea would be that architectures just implement these methods:
>
>   long cmpxchg_u32(*ptr,old,new)
>   long xchg_u32(*ptr,val)
>
> Then the rest of the generic header would implement cmpxchg.

I like the idea of generalizing it a little further. I'd suggest staying a
little closer to the existing naming here though, as we already have
cmpxchg() for the type-agnostic version, and cmpxchg64() for the
fixed-length 64-bit version.

I think a nice interface between architecture-specific and architecture
independent code would be to have architectures provide
arch_cmpxchg32()/arch_xchg32() as the most basic version, as well
as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16()
if they have instructions for those.

The common code can then build cmpxchg16()/xchg16() on top of
either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg()
wrapper around those (or alternatively we can decide to have them
only deal with fixed-32-bit and long/pointer sized atomics).

      Arnd
Stafford Horne March 31, 2021, 12:31 p.m. UTC | #31
On Wed, Mar 31, 2021 at 09:23:27AM +0200, Arnd Bergmann wrote:
> On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote:
> >
> > I just want to chime in here, there may be a better spot in the thread to
> > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code
> > which I have on my todo list somwhere to replace the other generic
> > implementations like that in mips.
> >
> >   arch/openrisc/include/asm/cmpxchg.h
> >
> > The idea would be that architectures just implement these methods:
> >
> >   long cmpxchg_u32(*ptr,old,new)
> >   long xchg_u32(*ptr,val)
> >
> > Then the rest of the generic header would implement cmpxchg.
> 
> I like the idea of generalizing it a little further. I'd suggest staying a
> little closer to the existing naming here though, as we already have
> cmpxchg() for the type-agnostic version, and cmpxchg64() for the
> fixed-length 64-bit version.

OK.

> I think a nice interface between architecture-specific and architecture
> independent code would be to have architectures provide
> arch_cmpxchg32()/arch_xchg32() as the most basic version, as well
> as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16()
> if they have instructions for those.

Thanks for the name suggestions, it makes it easier for me.

> The common code can then build cmpxchg16()/xchg16() on top of
> either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg()
> wrapper around those (or alternatively we can decide to have them
> only deal with fixed-32-bit and long/pointer sized atomics).

Yeah, that was the idea.

-Stafford
Guo Ren March 31, 2021, 2:47 p.m. UTC | #32
On Tue, Mar 30, 2021 at 10:09 PM Waiman Long <longman@redhat.com> wrote:
>
> On 3/29/21 11:13 PM, Guo Ren wrote:
> > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> >>> u32 a = 0x55aa66bb;
> >>> u16 *ptr = &a;
> >>>
> >>> CPU0                       CPU1
> >>> =========             =========
> >>> xchg16(ptr, new)     while(1)
> >>>                                      WRITE_ONCE(*(ptr + 1), x);
> >>>
> >>> When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> >> Then I think your LL/SC is broken.
> >>
> >> That also means you really don't want to build super complex locking
> >> primitives on top, because that live-lock will percolate through.
> > Do you mean the below implementation has live-lock risk?
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +       u32 old, new, val = atomic_read(&lock->val);
> > +
> > +       for (;;) {
> > +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> > +               old = atomic_cmpxchg(&lock->val, val, new);
> > +               if (old == val)
> > +                       break;
> > +
> > +               val = old;
> > +       }
> > +       return old;
> > +}
> If there is a continuous stream of incoming spinlock takers, it is
> possible that some cpus may have to wait a long time to set the tail
> right. However, this should only happen on artificial workload. I doubt
> it will happen with real workload or with limit number of cpus.
Yes, I agree, or it couldn't with NR_CPUS > 16k.

So the implementation above is suitable for non-sub-word-xchg architectures.

> >
> >> Step 1 would be to get your architecute fixed such that it can provide
> >> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
> >> in building complex systems with it.
> > Quote Waiman's comment [1] on xchg16 optimization:
> >
> > "This optimization is needed to make the qspinlock achieve performance
> > parity with ticket spinlock at light load."
> >
> > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/
> >
> > So for a non-xhg16 machine:
> >   - ticket-lock for small numbers of CPUs
> >   - qspinlock for large numbers of CPUs
> >
> > Okay, I'll put all of them into the next patch :P
> >
> It is true that qspinlock may not offer much advantage when the number
> of cpus is small. It shines for systems with many cpus. You may use
> NR_CPUS to determine if the default should be ticket or qspinlock with
> user override. To determine the right NR_CPUS threshold, you may need to
> run on real SMP RISCV systems to find out.
We'd give the choice to the users, and they could select ticket-lock
or qspinlock in riscv.
Guo Ren March 31, 2021, 3:10 p.m. UTC | #33
Hi Stafford,

What do you think about adding ARCH_USE_QUEUED_SPINLOCKS_XCHG32 in openrisc?

https://lore.kernel.org/linux-riscv/1617201040-83905-7-git-send-email-guoren@kernel.org/T/#u

On Wed, Mar 31, 2021 at 8:31 PM Stafford Horne <shorne@gmail.com> wrote:
>
> On Wed, Mar 31, 2021 at 09:23:27AM +0200, Arnd Bergmann wrote:
> > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote:
> > >
> > > I just want to chime in here, there may be a better spot in the thread to
> > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code
> > > which I have on my todo list somwhere to replace the other generic
> > > implementations like that in mips.
> > >
> > >   arch/openrisc/include/asm/cmpxchg.h
> > >
> > > The idea would be that architectures just implement these methods:
> > >
> > >   long cmpxchg_u32(*ptr,old,new)
> > >   long xchg_u32(*ptr,val)
> > >
> > > Then the rest of the generic header would implement cmpxchg.
> >
> > I like the idea of generalizing it a little further. I'd suggest staying a
> > little closer to the existing naming here though, as we already have
> > cmpxchg() for the type-agnostic version, and cmpxchg64() for the
> > fixed-length 64-bit version.
>
> OK.
>
> > I think a nice interface between architecture-specific and architecture
> > independent code would be to have architectures provide
> > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well
> > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16()
> > if they have instructions for those.
>
> Thanks for the name suggestions, it makes it easier for me.
>
> > The common code can then build cmpxchg16()/xchg16() on top of
> > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg()
> > wrapper around those (or alternatively we can decide to have them
> > only deal with fixed-32-bit and long/pointer sized atomics).
>
> Yeah, that was the idea.
>
> -Stafford
Guo Ren March 31, 2021, 3:22 p.m. UTC | #34
On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > u32 a = 0x55aa66bb;
> > u16 *ptr = &a;
> >
> > CPU0                       CPU1
> > =========             =========
> > xchg16(ptr, new)     while(1)
> >                                     WRITE_ONCE(*(ptr + 1), x);
> >
> > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
>
> Then I think your LL/SC is broken.
No, LR.W/SC.W is not broken. Quoting <8.3 Eventual Success of
Store-Conditional Instructions>:

"As a consequence of the eventuality guarantee, if some harts in an
execution environment are
executing constrained LR/SC loops, and no other harts or devices in
the execution environment
execute an unconditional store or AMO to that reservation set, then at
least one hart will
eventually exit its constrained LR/SC loop. By contrast, if other
harts or devices continue to
write to that reservation set, it is not guaranteed that any hart will
exit its LR/SC loop."

So I think it's a feature of LR/SC. How does the above code (also using
ll.w/sc.w to implement xchg16) run on arm64?

1: ldxr
    eor
    cbnz ... 2f
    stxr
    cbnz ... 1b   // I think it would deadlock for arm64.

"LL/SC fwd progress" which you have mentioned could guarantee stxr
success? How hardware could do that?

>
> That also means you really don't want to build super complex locking
> primitives on top, because that live-lock will percolate through.
>
> Step 1 would be to get your architecute fixed such that it can provide
> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
> in building complex systems with it.
--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Guo Ren April 5, 2021, 4:12 p.m. UTC | #35
Hi Paul,

Thx for the explanation, here is my comment.

On Wed, Mar 31, 2021 at 1:33 PM Paul Campbell <taniwha@gmail.com> wrote:
>
> On Wednesday, 31 March 2021 5:18:56 PM NZDT Guo Ren wrote:
> > > > [1]
> > > > https://github.com/c-sky/csky-linux/commit/e837aad23148542771794d8a2fcc
> > > > 52afd0fcbf88> >
> > > > > It also seems that the current "amoswap" based implementation
> > > > > would be reliable independent of RsrvEventual/RsrvNonEventual.
> > > >
> > > > Yes, the hardware implementation of AMO could be different from LR/SC.
> > > > AMO could use ACE snoop holding to lock the bus in hw coherency
> > > > design, but LR/SC uses an exclusive monitor without locking the bus.
> > > >
> > > > RISC-V hasn't CAS instructions, and it uses LR/SC for cmpxchg. I don't
> > > > think LR/SC would be slower than CAS, and CAS is just good for code
> > > > size.
> > >
> > > What I meant here is that the current spinlock uses a simple amoswap,
> > > which presumably does not suffer from the lack of forward process you
> > > described.
> >
> > Does that mean we should prevent using LR/SC (if RsrvNonEventual)?
>
> Let me provide another data-point, I'm working on a high-end highly
> speculative implementation with many concurrent instructions in flight - from
> my point of view  both sorts of AMO (LR/SC and swap/add/etc) require me to
> grab a cache line in an exclusive modifiable state (so no difference there).
>
> More importantly both sorts of AMO instructions  (unlike most loads and
> stores) can't be speculated (not even LR because it changes hidden state, I
> found this out the hard way bringing up the kernel).
>
> This means that both LR AND SC individually can't be executed until all
> speculation is resolved (that means that they happen really late in the
> execute path and block the resolution of the speculation of subsequent
> instructions) - equally a single amoswap/add/etc instruction can't happen
> until late in the execute path - so both require the same cache line state,
> but one of these sorts of events is better than two of them.
>
> Which in short means that amoswap/add/etc is better for small things - small
> buzzy lock loops, while LR/SC is better for more complex things with actual
> processing between the LR and SC.
It seems your machine uses the same mechanism to implement LR/SC and AMO, but
some machines would implement them differently.

For AMO, I think it would be like what you've described:
 - AMO would be separated into three parts: load & lock, ALU
operation, store & unlock
 - load & lock, eg: we could use the ACE protocol SNOOP channel to
hold the bus
 - Do the atomic operation
 - store & unlock: write the result back and release the ACE
protocol SNOOP channel
I think the above is what you describe as how to "grab a cache line in
an exclusive modifiable state".

But for LR/SC, it's different, because we have separated the AMO into
three real instructions:
 - LR
 - Operation instructions
 - SC
If we let LR hold the ACE protocol SNOOP channel and let SC release the
channel, that would break the ISA design (we couldn't let a single
instruction hold the snoop bus and make other harts hang).

So LR/SC would use an address monitor for every hart, to detect
whether the target address has been written or not.
That means LR/SC won't be implemented with fwd progress guarantees. If you
care about fwd progress guarantees, I think the ISA should choose cmpxchg
(eg: cas) instead of LR/SC.


>
> ----
>
> Another issue here is to consider is what happens when you hit one of these
> tight spinlocks when the branch target cache is empty and they fail (ie loop
> back and try again) - the default branch prediction, and resulting
> speculation, is (very) likely to be looping back, while hopefully most locks
> are not contended when you hit them and that speculation would be wrong - a
> spinlock like this may not be so good:
>
>         li a0, 1
> loop:
>         amoswap a1, a0, (a2)
>         beqz    a1, loop
>         ..... subsequent code
>
> In my world with no BTC info the pipe fills with dozens of amoswaps, rather
> than  the 'subsequent code'. While (in my world) code like this:
>
>         li a0, 1
> loop:
>         amoswap a1, a0, (a2)
>         bnez    a1, 1f
>         .... subsequent code
>
> 1:      j loop
>
> would actually be better (in my world unconditional jump instructions are
> folded early and never see execution so they're sort of free, though they mess
> with the issue/decode rate). Smart compilers could move the "j loop" out of
> the way, while the double branch on failure is not a big deal since either the
> lock is still held (and you don't care if it's slow) or it's been released in
> which case the cache line has been stolen and the refetch of that cache line
> is going to dominate the next time around the loop
Thx for sharing the view of the spinlock speculative path. But I think
we should use smp_cond_load_acquire() instead of looping on the AMO.
That means we could use wfe/cpu_relax to let other harts utilize the
core's pipeline. So we needn't optimize the "subsequent code"
speculative path in a multi-threaded processing core and can just let the
hart relax.
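
A sketch of the "spin on a plain load, then do one swap" shape meant
here, using the kernel's generic helpers (illustrative only, not from
any posted patch; the function name is invented):

static inline void relax_then_swap_lock(atomic_t *lock)
{
        for (;;) {
                /* wait (WFE/pause friendly) until the lock looks free */
                smp_cond_load_acquire(&lock->counter, !VAL);

                /* then make exactly one unconditional swap attempt */
                if (atomic_xchg_acquire(lock, 1) == 0)
                        return;
        }
}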


>
> I need to stress here that this is how my architecture works, other's will of
> course be different though I expect that other heavily speculative
> architectures to have similar issues :-)
>
>         Paul Campbell
>         Moonbase Otago
>
>
>
Guo Ren April 5, 2021, 4:40 p.m. UTC | #36
On Wed, Mar 31, 2021 at 12:08 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Mar 30, 2021 at 11:13:55AM +0800, Guo Ren wrote:
> > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > > > u32 a = 0x55aa66bb;
> > > > u16 *ptr = &a;
> > > >
> > > > CPU0                       CPU1
> > > > =========             =========
> > > > xchg16(ptr, new)     while(1)
> > > >                                     WRITE_ONCE(*(ptr + 1), x);
> > > >
> > > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> > >
> > > Then I think your LL/SC is broken.
> > >
> > > That also means you really don't want to build super complex locking
> > > primitives on top, because that live-lock will percolate through.
>
> > Do you mean the below implementation has live-lock risk?
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +       u32 old, new, val = atomic_read(&lock->val);
> > +
> > +       for (;;) {
> > +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> > +               old = atomic_cmpxchg(&lock->val, val, new);
> > +               if (old == val)
> > +                       break;
> > +
> > +               val = old;
> > +       }
> > +       return old;
> > +}
>
> That entirely depends on the architecture (and cmpxchg() impementation).
>
> There are a number of cases:
>
>  * architecture has cmpxchg() instruction (x86, s390, sparc, etc.).
>
>   - architecture provides fwd progress (x86)
>   - architecture requires backoff for progress (sparc)
>
>  * architecture does not have cmpxchg, and implements it using LL/SC.
>
>   and here things get *really* interesting, because while an
>   architecture can have LL/SC fwd progress, that does not translate into
>   cmpxchg() also having the same guarantees and all bets are off.
It seems the riscv spec doesn't mandate the LR/SC fwd progress guarantee, ref:

In riscv-spec 8.3 Eventual Success of Store-Conditional Instructions

"As a consequence of the eventuality guarantee, if some harts in an
execution environment are executing constrained LR/SC loops, and no
other harts or devices in the execution environment execute an
unconditional store or AMO to that reservation set, then at least one
hart will eventually exit its constrained LR/SC loop. *** By contrast,
if other harts or devices continue to write to that reservation set,
it ***is not guaranteed*** that any hart will exit its LR/SC loop.***
"

>
> The real bummer is that C can do cmpxchg(), but there is no way it can
> do LL/SC. And even if we'd teach C how to do LL/SC, it couldn't be
> generic because architectures lacking it can't emulate it using
> cmpxchg() (there's a fun class of bugs there).
>
> So while the above code might be the best we can do in generic code,
> it's really up to the architecture to make it work.

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Guo Ren April 5, 2021, 4:45 p.m. UTC | #37
On Tue, Mar 30, 2021 at 10:09 PM Waiman Long <longman@redhat.com> wrote:
>
> On 3/29/21 11:13 PM, Guo Ren wrote:
> > On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >> On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> >>> u32 a = 0x55aa66bb;
> >>> u16 *ptr = &a;
> >>>
> >>> CPU0                       CPU1
> >>> =========             =========
> >>> xchg16(ptr, new)     while(1)
> >>>                                      WRITE_ONCE(*(ptr + 1), x);
> >>>
> >>> When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> >> Then I think your LL/SC is broken.
> >>
> >> That also means you really don't want to build super complex locking
> >> primitives on top, because that live-lock will percolate through.
> > Do you mean the below implementation has live-lock risk?
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +       u32 old, new, val = atomic_read(&lock->val);
> > +
> > +       for (;;) {
> > +               new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> > +               old = atomic_cmpxchg(&lock->val, val, new);
> > +               if (old == val)
> > +                       break;
> > +
> > +               val = old;
> > +       }
> > +       return old;
> > +}
> If there is a continuous stream of incoming spinlock takers, it is
> possible that some cpus may have to wait a long time to set the tail
> right. However, this should only happen on artificial workload. I doubt
> it will happen with real workload or with limit number of cpus.
Yes, I think it is ok for LR/SC in riscv, because

CPU0 LR
CPU1 LR
CPU0 SC //success
CPU1 SC //fail

or

CPU0 LR
CPU1 LR
CPU1 SC //success
CPU0 SC //fail

So one store-conditional would always succeed. I think it's OK.

> >
> >> Step 1 would be to get your architecute fixed such that it can provide
> >> fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
> >> in building complex systems with it.
> > Quote Waiman's comment [1] on xchg16 optimization:
> >
> > "This optimization is needed to make the qspinlock achieve performance
> > parity with ticket spinlock at light load."
> >
> > [1] https://lore.kernel.org/kvm/1429901803-29771-6-git-send-email-Waiman.Long@hp.com/
> >
> > So for a non-xhg16 machine:
> >   - ticket-lock for small numbers of CPUs
> >   - qspinlock for large numbers of CPUs
> >
> > Okay, I'll put all of them into the next patch :P
> >
> It is true that qspinlock may not offer much advantage when the number
> of cpus is small. It shines for systems with many cpus. You may use
> NR_CPUS to determine if the default should be ticket or qspinlock with
> user override. To determine the right NR_CPUS threshold, you may need to
> run on real SMP RISCV systems to find out.
I Agree
Guo Ren April 6, 2021, 3:50 a.m. UTC | #38
On Wed, Mar 31, 2021 at 3:23 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote:
> >
> > I just want to chime in here, there may be a better spot in the thread to
> > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code
> > which I have on my todo list somwhere to replace the other generic
> > implementations like that in mips.
> >
> >   arch/openrisc/include/asm/cmpxchg.h
> >
> > The idea would be that architectures just implement these methods:
> >
> >   long cmpxchg_u32(*ptr,old,new)
> >   long xchg_u32(*ptr,val)
> >
> > Then the rest of the generic header would implement cmpxchg.
>
> I like the idea of generalizing it a little further. I'd suggest staying a
> little closer to the existing naming here though, as we already have
> cmpxchg() for the type-agnostic version, and cmpxchg64() for the
> fixed-length 64-bit version.
>
> I think a nice interface between architecture-specific and architecture
> independent code would be to have architectures provide
> arch_cmpxchg32()/arch_xchg32() as the most basic version, as well
> as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16()
> if they have instructions for those.
>
> The common code can then build cmpxchg16()/xchg16() on top of
> either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg()
> wrapper around those (or alternatively we can decide to have them
> only deal with fixed-32-bit and long/pointer sized atomics).
I think these emulation codes are suitable for some architectures, but not riscv.

We shouldn't export xchg16/cmpxchg16 (emulated by lr.w/sc.w) in riscv.
We should forbid these sub-word atomic primitives and let the
programmers consider their atomic design.
Peter Zijlstra April 6, 2021, 7:15 a.m. UTC | #39
On Wed, Mar 31, 2021 at 11:22:35PM +0800, Guo Ren wrote:
> On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > > u32 a = 0x55aa66bb;
> > > u16 *ptr = &a;
> > >
> > > CPU0                       CPU1
> > > =========             =========
> > > xchg16(ptr, new)     while(1)
> > >                                     WRITE_ONCE(*(ptr + 1), x);
> > >
> > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> >
> > Then I think your LL/SC is broken.
> No, it's not broken LR.W/SC.W. Quote <8.3 Eventual Success of
> Store-Conditional Instructions>:
> 
> "As a consequence of the eventuality guarantee, if some harts in an
> execution environment are executing constrained LR/SC loops, and no
> other harts or devices in the execution environment execute an
> unconditional store or AMO to that reservation set, then at least one
> hart will eventually exit its constrained LR/SC loop. By contrast, if
> other harts or devices continue to write to that reservation set, it
> is not guaranteed that any hart will exit its LR/SC loop."

(there, reflowed it for you)

That just means your arch spec is broken too :-)

> So I think it's a feature of LR/SC. How does the above code (also use
> ll.w/sc.w to implement xchg16) running on arm64?
> 
> 1: ldxr
>     eor
>     cbnz ... 2f
>     stxr
>     cbnz ... 1b   // I think it would deadlock for arm64.
> 
> "LL/SC fwd progress" which you have mentioned could guarantee stxr
> success? How hardware could do that?

I'm not a hardware person; I've never actually build anything larger
than a 4 bit adder with nand gates (IIRC, 25+ years ago). And I'll let
Will answer the ARM64 part.

That said, I think the idea is that if you lock the line (load-locked is
a clue of course) to the core until either an exception (or anything
else that is guaranteed to fail LL/SC), the SC, or N instructions, then a
competing LL/SC will stall in the LL while the first core makes
progress.

This same principle is key to hardware progress for cmpxchg/cas loops,
don't instantly yield the exclusive hold on the cacheline, keep it
around for a while.

Out-of-order CPUs can do even better I think, by virtue of them being
able to see tight loops.


Anyway, given you have such a crap architecture (and here I thought
RISC-V was supposed to be a modern design *sigh*), you had better go
look at the sparc64 atomic implementation which has a software backoff
for failed CAS in order to make fwd progress.
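
To make the sparc64-style suggestion concrete, a backoff variant of the
cmpxchg-based xchg_tail() quoted earlier in the thread could look roughly
like this (the backoff constants and policy are purely illustrative, not
taken from sparc64 or from any posted patch):

static __always_inline u32 xchg_tail_backoff(struct qspinlock *lock, u32 tail)
{
        u32 old, new, val = atomic_read(&lock->val);
        unsigned int i, delay = 1;

        for (;;) {
                new = (val & _Q_LOCKED_PENDING_MASK) | tail;
                old = atomic_cmpxchg(&lock->val, val, new);
                if (old == val)
                        break;

                /* lost the race: wait a bit before retrying so that
                 * some other CPU gets a chance to make progress */
                for (i = 0; i < delay; i++)
                        cpu_relax();
                if (delay < 1024)
                        delay <<= 1;

                val = atomic_read(&lock->val);
        }
        return old;
}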
Stafford Horne April 6, 2021, 8:51 a.m. UTC | #40
On Wed, Mar 31, 2021 at 11:10:53PM +0800, Guo Ren wrote:
> Hi Stafford,
> 
> How do think add ARCH_USE_QUEUED_SPINLOCKS_XCHG32 in openrisc?
> 
> https://lore.kernel.org/linux-riscv/1617201040-83905-7-git-send-email-guoren@kernel.org/T/#u

Sorry I missed your mail here.

It is true that OpenRISC doesn't have xchg16, so using the xchg32 code is better.

Acked-by: Stafford Horne <shorne@gmail.com>


> On Wed, Mar 31, 2021 at 8:31 PM Stafford Horne <shorne@gmail.com> wrote:
> >
> > On Wed, Mar 31, 2021 at 09:23:27AM +0200, Arnd Bergmann wrote:
> > > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote:
> > > >
> > > > I just want to chime in here, there may be a better spot in the thread to
> > > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code
> > > > which I have on my todo list somwhere to replace the other generic
> > > > implementations like that in mips.
> > > >
> > > >   arch/openrisc/include/asm/cmpxchg.h
> > > >
> > > > The idea would be that architectures just implement these methods:
> > > >
> > > >   long cmpxchg_u32(*ptr,old,new)
> > > >   long xchg_u32(*ptr,val)
> > > >
> > > > Then the rest of the generic header would implement cmpxchg.
> > >
> > > I like the idea of generalizing it a little further. I'd suggest staying a
> > > little closer to the existing naming here though, as we already have
> > > cmpxchg() for the type-agnostic version, and cmpxchg64() for the
> > > fixed-length 64-bit version.
> >
> > OK.
> >
> > > I think a nice interface between architecture-specific and architecture
> > > independent code would be to have architectures provide
> > > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well
> > > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16()
> > > if they have instructions for those.
> >
> > Thanks for the name suggestions, it makes it easier for me.
> >
> > > The common code can then build cmpxchg16()/xchg16() on top of
> > > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg()
> > > wrapper around those (or alternatively we can decide to have them
> > > only deal with fixed-32-bit and long/pointer sized atomics).
> >
> > Yeah, that was the idea.
> >
> > -Stafford
> 
> 
> 
> -- 
> Best Regards
>  Guo Ren
> 
> ML: https://lore.kernel.org/linux-csky/
Stafford Horne April 6, 2021, 8:56 a.m. UTC | #41
On Tue, Apr 06, 2021 at 11:50:38AM +0800, Guo Ren wrote:
> On Wed, Mar 31, 2021 at 3:23 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote:
> > >
> > > I just want to chime in here, there may be a better spot in the thread to
> > > mention this but, for OpenRISC I did implement some generic 8/16-bit xchg code
> > > which I have on my todo list somwhere to replace the other generic
> > > implementations like that in mips.
> > >
> > >   arch/openrisc/include/asm/cmpxchg.h
> > >
> > > The idea would be that architectures just implement these methods:
> > >
> > >   long cmpxchg_u32(*ptr,old,new)
> > >   long xchg_u32(*ptr,val)
> > >
> > > Then the rest of the generic header would implement cmpxchg.
> >
> > I like the idea of generalizing it a little further. I'd suggest staying a
> > little closer to the existing naming here though, as we already have
> > cmpxchg() for the type-agnostic version, and cmpxchg64() for the
> > fixed-length 64-bit version.
> >
> > I think a nice interface between architecture-specific and architecture
> > independent code would be to have architectures provide
> > arch_cmpxchg32()/arch_xchg32() as the most basic version, as well
> > as arch_cmpxchg8()/arch_cmpxchg16()/arch_xchg8()/arch_xchg16()
> > if they have instructions for those.
> >
> > The common code can then build cmpxchg16()/xchg16() on top of
> > either the 16-bit or the 32-bit primitives, and build the cmpxchg()/xchg()
> > wrapper around those (or alternatively we can decide to have them
> > only deal with fixed-32-bit and long/pointer sized atomics).
> I think these emulation codes are suitable for some architectures but not riscv.
> 
> We shouldn't export xchg16/cmpxchg16(emulated by lr.w/sc.w) in riscv,
> We should forbid these sub-word atomic primitive and lets the
> programmers consider their atomic design.

Fair enough, having the generic sub-word emulation would be something that
an architecture can select to use/export.

-Stafford
Boqun Feng April 6, 2021, 5:24 p.m. UTC | #42
On Wed, Mar 31, 2021 at 11:22:35PM +0800, Guo Ren wrote:
> On Mon, Mar 29, 2021 at 8:50 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Mar 29, 2021 at 08:01:41PM +0800, Guo Ren wrote:
> > > u32 a = 0x55aa66bb;
> > > u16 *ptr = &a;
> > >
> > > CPU0                       CPU1
> > > =========             =========
> > > xchg16(ptr, new)     while(1)
> > >                                     WRITE_ONCE(*(ptr + 1), x);
> > >
> > > When we use lr.w/sc.w implement xchg16, it'll cause CPU0 deadlock.
> >
> > Then I think your LL/SC is broken.
> No, it's not broken LR.W/SC.W. Quote <8.3 Eventual Success of
> Store-Conditional Instructions>:
> 
> "As a consequence of the eventuality guarantee, if some harts in an
> execution environment are
> executing constrained LR/SC loops, and no other harts or devices in
> the execution environment
> execute an unconditional store or AMO to that reservation set, then at
> least one hart will
> eventually exit its constrained LR/SC loop. By contrast, if other
> harts or devices continue to
> write to that reservation set, it is not guaranteed that any hart will
> exit its LR/SC loop."
> 
> So I think it's a feature of LR/SC. How does the above code (also use
> ll.w/sc.w to implement xchg16) running on arm64?
> 
> 1: ldxr
>     eor
>     cbnz ... 2f
>     stxr
>     cbnz ... 1b   // I think it would deadlock for arm64.
> 
> "LL/SC fwd progress" which you have mentioned could guarantee stxr
> success? How hardware could do that?
> 

Actually, "old" riscv standard does provide fwd progress ;-) In

	https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

Section "7.2 Load-Reserved/Store-Conditional Instructions":

"""
One advantage of CAS is that it guarantees that some hart eventually
makes progress, whereas an LR/SC atomic sequence could livelock
indefinitely on some systems. To avoid this concern, we added an
architectural guarantee of forward progress to LR/SC atomic sequences.
The restrictions on LR/SC sequence contents allows an implementation to
**capture a cache line on the LR and complete the LR/SC sequence by
holding off remote cache interventions for a bounded short time**.
"""

The guarantee was removed later due to "Earlier versions of this
specification imposed a stronger starvation-freedom guarantee. However,
the weaker livelock-freedom guarantee is sufficient to implement the C11
and C++11 languages, and is substantially easier to provide in some
microarchitectural styles."

But I take it as an example that hardware can guarantee this.

Regards,
Boqun

> >
> > That also means you really don't want to build super complex locking
> > primitives on top, because that live-lock will percolate through.
> >
> > Step 1 would be to get your architecute fixed such that it can provide
> > fwd progress guarantees for LL/SC. Otherwise there's absolutely no point
> > in building complex systems with it.
> --
> Best Regards
>  Guo Ren
> 
> ML: https://lore.kernel.org/linux-csky/
Arnd Bergmann April 7, 2021, 8:42 a.m. UTC | #43
On Tue, Apr 6, 2021 at 10:56 AM Stafford Horne <shorne@gmail.com> wrote:
> On Tue, Apr 06, 2021 at 11:50:38AM +0800, Guo Ren wrote:
> > On Wed, Mar 31, 2021 at 3:23 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > On Wed, Mar 31, 2021 at 12:35 AM Stafford Horne <shorne@gmail.com> wrote:
> >
> > We shouldn't export xchg16/cmpxchg16(emulated by lr.w/sc.w) in riscv,
> > We should forbid these sub-word atomic primitive and lets the
> > programmers consider their atomic design.
>
> Fair enough, having the generic sub-word emulation would be something that
> an architecture can select to use/export.

I still have the feeling that we should generalize and unify the exact behavior
across architectures as much as possible here, while possibly also trying to
simplify the interface a little.

Looking through the various xchg()/cmpxchg() implementations, I find eight
distinct ways to do 8-bit and 16-bit atomics:

Full support:
      ia64, m68k (Atari only), x86, arm32 (v6k+), arm64

gcc/clang __sync_{val,bool}_compare_and_swap:
     s390

Emulated through ll/sc:
      alpha, powerpc

Emulated through cmpxchg loop:
      mips, openrisc, xtensa (xchg but not cmpxchg), sparc64 (cmpxchg_u8,
      xchg_u16 but not cmpxchg_u16 and xchg_u8!)

Emulated through local_irq_save (non SMP only):
        h8300, m68k (most), microblaze, mips, nds32, nios2

Emulated through hashed spinlock:
        parisc (8-bit only added in 2020, 16-bit still missing)

Forced compile-time error:
       arm32 (v4/v5/v6 non-SMP), arc, csky, riscv, parisc (16 bit), sparc32,
       sparc64, xtensa (cmpxchg)

Silently broken:
        hexagon

Since there are really only a handful of instances in the kernel
that use the cmpxchg() or xchg() on u8/u16 variables, it would seem
best to just disallow those completely and have a separate set of
functions here, with only 64-bit architectures using any variable-type
wrapper to handle both 32-bit and 64-bit arguments.
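
For illustration, the kind of variable-type wrapper meant here could be
shaped like the sketch below, where xchg32()/xchg64() stand for the
explicit fixed-size primitives being proposed (the macro name and details
are invented) and anything smaller than 32 bits simply fails to build:

#define xchg_sized(ptr, v)                                              \
({                                                                      \
        __typeof__(*(ptr)) __ret;                                       \
        switch (sizeof(*(ptr))) {                                       \
        case 4:                                                         \
                __ret = (__typeof__(*(ptr)))xchg32((u32 *)(ptr), (u32)(v)); \
                break;                                                  \
        case 8:                                                         \
                __ret = (__typeof__(*(ptr)))xchg64((u64 *)(ptr), (u64)(v)); \
                break;                                                  \
        default:                                                        \
                BUILD_BUG();    /* u8/u16 users must call explicit helpers */ \
        }                                                               \
        __ret;                                                          \
})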

Interestingly, the s390 version using __sync_val_compare_and_swap()
seems to produce nice output on all architectures that have atomic
instructions, with any supported compiler, to the point where I think
we could just use that to replace most of the inline-asm versions except
for arm64:

#define cmpxchg(ptr, o, n)                                              \
({                                                                      \
        __typeof__(*(ptr)) __o = (o);                                   \
        __typeof__(*(ptr)) __n = (n);                                   \
        (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\
})

Not sure how gcc's acquire/release behavior of __sync_val_compare_and_swap()
relates to what the kernel wants here.

The gcc documentation also recommends using the standard
__atomic_compare_exchange_n() builtin instead, which would allow
constructing release/acquire/relaxed versions as well, but I could not
get it to produce equally good output. (possibly I was using it wrong)
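
For reference, one plausible shape for that (untested, so very much a
sketch; whether it produces code as good as the __sync_*() version is
exactly the open question):

#define cmpxchg_c11(ptr, o, n)                                          \
({                                                                      \
        __typeof__(*(ptr)) __o = (o);                                   \
        /* returns bool; on failure __o is updated to the value seen */ \
        __atomic_compare_exchange_n((ptr), &__o, (n), 0,                \
                                    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);\
        __o;    /* old value either way, like cmpxchg() */              \
})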

       Arnd
Peter Zijlstra April 7, 2021, 9:26 a.m. UTC | #44
On Wed, Apr 07, 2021 at 01:24:12AM +0800, Boqun Feng wrote:

> Actually, "old" riscv standard does provide fwd progress ;-) In
> 
> 	https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
> 
> Section "7.2 Load-Reserved/Store-Conditional Instructions":
> 
> """
> One advantage of CAS is that it guarantees that some hart eventually
> makes progress, whereas an LR/SC atomic sequence could livelock
> indefinitely on some systems. To avoid this concern, we added an

This is not inherent to CAS, there's plenty of CAS implementations that
are prone to livelock (unfortunately).

> architectural guarantee of forward progress to LR/SC atomic sequences.
> The restrictions on LR/SC sequence contents allows an implementation to
> **capture a cache line on the LR and complete the LR/SC sequence by
> holding off remote cache interventions for a bounded short time**.
> """
> 
> The guarantee is removed later due to "Earlier versions of this
> specification imposed a stronger starvation-freedom guarantee. However,
> the weaker livelock-freedom guarantee is sufficient to implement the C11
> and C++11 languages, and is substantially easier to provide in some
> microarchitectural styles."

Pff, that's just stupid. I suppose the best you can say of the C11
memory model is that it's nice that the C committee realized they needed
to catch up to the everyday reality of SMP systems (which people have
been writing 'C' for ever since the 70s, since SMP is older than C
itself).

There's so much wrong with the C11 memory model and atomics, using it to
define an architecture is just backwards.

C11 cannot do RCU, C11 cannot do seqlocks, C11 cannot do deterministic
locks, C11 cannot do real systems.

C11 is garbage.

Yes, architectures need to support C11, because sadly C11 exists and
people will use it. But architectures also need to be able to do real
systems and thus should not limit themselves to C11.
Christoph Hellwig April 7, 2021, 9:42 a.m. UTC | #45
On Tue, Apr 06, 2021 at 09:15:50AM +0200, Peter Zijlstra wrote:
> Anyway, given you have such a crap architecture (and here I thought
> RISC-V was supposed to be a modern design *sigh*), you had better go
> look at the sparc64 atomic implementation which has a software backoff
> for failed CAS in order to make fwd progress.

It wasn't supposed to be modern.  It was supposed to use boring old
ideas.  Where it actually did that it is a great ISA, in parts where
academics actually tried to come up with cool or state of the art
ideas (interrupt handling, tlb shootdowns, the totally fucked up
memory model) it turned into a trainwreck.
Peter Zijlstra April 7, 2021, 11:36 a.m. UTC | #46
On Wed, Apr 07, 2021 at 10:42:50AM +0200, Arnd Bergmann wrote:
> Since there are really only a handful of instances in the kernel
> that use the cmpxchg() or xchg() on u8/u16 variables, it would seem
> best to just disallow those completely 

Not going to happen. xchg16 is optimal for qspinlock and if we replace
that with a cmpxchg loop on x86 we're regressing.

> Interestingly, the s390 version using __sync_val_compare_and_swap()
> seems to produce nice output on all architectures that have atomic
> instructions, with any supported compiler, to the point where I think
> we could just use that to replace most of the inline-asm versions except
> for arm64:
> 
> #define cmpxchg(ptr, o, n)                                              \
> ({                                                                      \
>         __typeof__(*(ptr)) __o = (o);                                   \
>         __typeof__(*(ptr)) __n = (n);                                   \
>         (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\
> })

It generates the LL/SC loop, but doesn't do sensible optimizations when
it's again used in a loop itself. That is, it generates a loop of a
loop, just like what you'd expect, which is sub-optimal for LL/SC.
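
For concreteness, a generic sub-word exchange built on a word-sized
compare-and-swap looks roughly like the stand-alone sketch below
(illustrative only, little-endian layout assumed; this is not the
kernel's actual fallback). On an LL/SC machine the __sync builtin
expands to its own inner retry loop, giving exactly that
loop-of-a-loop shape:

#include <stdint.h>

/* Illustrative only: 16-bit xchg emulated with a 32-bit compare-and-swap. */
static uint16_t xchg16_fallback(volatile uint16_t *p, uint16_t newval)
{
        /* The naturally aligned 32-bit word containing *p. */
        volatile uint32_t *word = (volatile uint32_t *)((uintptr_t)p & ~3UL);
        int shift = ((uintptr_t)p & 2) * 8;     /* 0 or 16 (little-endian) */
        uint32_t mask = 0xffffU << shift;
        uint32_t old, repl;

        do {                                    /* outer CAS retry loop */
                old = *word;
                repl = (old & ~mask) | ((uint32_t)newval << shift);
        } while (__sync_val_compare_and_swap(word, old, repl) != old);

        return (uint16_t)((old & mask) >> shift);
}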

> Not sure how gcc's acquire/release behavior of __sync_val_compare_and_swap()
> relates to what the kernel wants here.
> 
> The gcc documentation also recommends using the standard
> __atomic_compare_exchange_n() builtin instead, which would allow
> constructing release/acquire/relaxed versions as well, but I could not
> get it to produce equally good output. (possibly I was using it wrong)

I'm scared to death of the C11 crap, the compiler will 'optimize' them
when it feels like it and use the C11 memory model rules for it, which
are not compatible with the kernel rules.

But the same thing applies, it won't do the right thing for composites.
Arnd Bergmann April 7, 2021, 11:57 a.m. UTC | #47
On Wed, Apr 7, 2021 at 1:36 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Apr 07, 2021 at 10:42:50AM +0200, Arnd Bergmann wrote:
> > Since there are really only a handful of instances in the kernel
> > that use the cmpxchg() or xchg() on u8/u16 variables, it would seem
> > best to just disallow those completely
>
> Not going to happen. xchg16 is optimal for qspinlock and if we replace
> that with a cmpxchg loop on x86 we're regressing.

I did not mean to remove the possibility of doing a 16-bit compare-exchange
operation altogether, just removing it from the regular macros.

We already do this for the 64-bit xchg64()/cmpxchg64(), which only some
of the 32-bit architectures provide, so I think having an explicit
xchg8()/xchg16()/cmpxchg8()/cmpxchg16() interface while tightening the
type checking would be more consistent here.

On any 32-bit architecture, cmpxchg()/xchg() macros then only have
to deal with word-sized operations.
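
Something along the lines of the stand-alone sketch below, presumably
(the names and bodies are purely illustrative, reusing the compiler
builtin discussed earlier so the snippet compiles on its own; no such
header exists today):

#include <stdint.h>

typedef uint8_t  u8;
typedef uint16_t u16;

/*
 * Hypothetical fixed-size interface, mirroring the existing
 * xchg64()/cmpxchg64() convention; nothing like this is merged.
 */
static inline u8 cmpxchg8(u8 *ptr, u8 old, u8 newval)
{
        return __sync_val_compare_and_swap(ptr, old, newval);
}

static inline u16 cmpxchg16(u16 *ptr, u16 old, u16 newval)
{
        return __sync_val_compare_and_swap(ptr, old, newval);
}

/* xchg8()/xchg16() would be added the same way, and the generic
 * cmpxchg()/xchg() macros would then reject sub-word operands at
 * compile time. */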

> > Interestingly, the s390 version using __sync_val_compare_and_swap()
> > seems to produce nice output on all architectures that have atomic
> > instructions, with any supported compiler, to the point where I think
> > we could just use that to replace most of the inline-asm versions except
> > for arm64:
> >
> > #define cmpxchg(ptr, o, n)                                              \
> > ({                                                                      \
> >         __typeof__(*(ptr)) __o = (o);                                   \
> >         __typeof__(*(ptr)) __n = (n);                                   \
> >         (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\
> > })
>
> It generates the LL/SC loop, but doesn't do sensible optimizations when
> it's again used in a loop itself. That is, it generates a loop of a
> loop, just like what you'd expect, which is sub-optimal for LL/SC.

One thing that it does though is to generate an ll/sc loop for xchg16(),
while mips, openrisc, xtensa and sparc64 currently open-code the
nested loop in their respective xchg16() wrappers. I have not seen
any case in which it produces object code that is worse than the
architecture specific code does today, except for those that rely on
runtime patching (i486+smp, arm64+lse).

> > Not sure how gcc's acquire/release behavior of __sync_val_compare_and_swap()
> > relates to what the kernel wants here.
> >
> > The gcc documentation also recommends using the standard
> > __atomic_compare_exchange_n() builtin instead, which would allow
> > constructing release/acquire/relaxed versions as well, but I could not
> > get it to produce equally good output. (possibly I was using it wrong)
>
> I'm scared to death of the C11 crap, the compiler will 'optimize' them
> when it feels like it and use the C11 memory model rules for it, which
> are not compatible with the kernel rules.
>
> But the same thing applies, it won't do the right thing for composites.

Makes sense. As I said, I could not even get it to produce optimal code
for the simple case.

         Arnd
Peter Zijlstra April 7, 2021, 12:02 p.m. UTC | #48
On Wed, Apr 07, 2021 at 01:36:45PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 10:42:50AM +0200, Arnd Bergmann wrote:
> > Since there are really only a handful of instances in the kernel
> > that use the cmpxchg() or xchg() on u8/u16 variables, it would seem
> > best to just disallow those completely 
> 
> Not going to happen. xchg16 is optimal for qspinlock and if we replace
> that with a cmpxchg loop on x86 we're regressing.
> 
> > Interestingly, the s390 version using __sync_val_compare_and_swap()
> > seems to produce nice output on all architectures that have atomic
> > instructions, with any supported compiler, to the point where I think
> > we could just use that to replace most of the inline-asm versions except
> > for arm64:
> > 
> > #define cmpxchg(ptr, o, n)                                              \
> > ({                                                                      \
> >         __typeof__(*(ptr)) __o = (o);                                   \
> >         __typeof__(*(ptr)) __n = (n);                                   \
> >         (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\
> > })
> 
> It generates the LL/SC loop, but doesn't do sensible optimizations when
> it's again used in a loop itself. That is, it generates a loop of a
> loop, just like what you'd expect, which is sub-optimal for LL/SC.
> 
> > Not sure how gcc's acquire/release behavior of __sync_val_compare_and_swap()
> > relates to what the kernel wants here.
> > 
> > The gcc documentation also recommends using the standard
> > __atomic_compare_exchange_n() builtin instead, which would allow
> > constructing release/acquire/relaxed versions as well, but I could not
> > get it to produce equally good output. (possibly I was using it wrong)
> 
> I'm scared to death of the C11 crap, the compiler will 'optimize' them
> when it feels like it and use the C11 memory model rules for it, which
> are not compatible with the kernel rules.
> 
> But the same thing applies, it won't do the right thing for composites.

See the mess it makes:

https://godbolt.org/z/r7d13d4Kf

That should've been something like:


__xadd:
	mov r3, r0
	dmb ish
.L1:
	ldrex r0, [r3]
	adds r1, r0
	strex ip, r1, [r3]
	cmp ip, #0
	bne .L1
	dmb ish


Which I'll argue has stronger guarantees than the double loop. I'm
pretty sure branches can release the lock on some archs (Alpha is known
to do this).
Christoph Muellner April 7, 2021, 2:29 p.m. UTC | #49
On Wed, Apr 7, 2021 at 11:43 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, Apr 06, 2021 at 09:15:50AM +0200, Peter Zijlstra wrote:
> > Anyway, given you have such a crap architecture (and here I thought
> > RISC-V was supposed to be a modern design *sigh*), you had better go
> > look at the sparc64 atomic implementation which has a software backoff
> > for failed CAS in order to make fwd progress.
>
> It wasn't supposed to be modern.  It was supposed to use boring old
> ideas.  Where it actually did that it is a great ISA, in parts where
> academics actually tried to come up with cool or state of the art
> ideas (interrupt handling, tlb shootdowns, the totally fucked up
> memory model) it turned into a trainwreck.

Gentlemen, please rethink your wording.
RISC-V is neither "crap" nor a "trainwreck", regardless of whether you like it or not.

The comparison with sparc64 is not applicable, as sparc64 does not
have LL/SC instructions.

Further, it is not the case that RISC-V has no guarantees at all.
It just does not provide a forward-progress guarantee for a
synchronization implementation that writes to a memory location in an
endless loop while trying to complete an LL/SC loop on the same memory
location at the same time.
If there's a reasonable algorithm that relies on forward progress in
this case, then we should indeed think about that, but I haven't seen
one so far.
The whole MCS lock idea is to actually spin on a different memory
location per CPU to improve scalability (reduce cacheline bouncing).
That's a clear indicator that well-scaling synchronization algorithms
need to avoid contended cache lines anyway.
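
For readers unfamiliar with the pattern, a stripped-down user-space
sketch of an MCS-style queue lock is below (illustrative C11 code, not
the kernel's qspinlock; names are made up): each waiter spins only on a
flag in its own node, so the hand-off touches one private cache line
instead of a globally contended one.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* Minimal MCS-style queue lock: each waiter spins on its own node. */
struct mcs_node {
        struct mcs_node *_Atomic next;
        atomic_bool locked;             /* this waiter's private spin flag */
};

struct mcs_lock {
        struct mcs_node *_Atomic tail;
};

static void mcs_acquire(struct mcs_lock *lock, struct mcs_node *self)
{
        struct mcs_node *prev;

        atomic_store(&self->next, NULL);
        atomic_store(&self->locked, true);

        /* Join the queue; the previous tail (if any) hands over the lock. */
        prev = atomic_exchange(&lock->tail, self);
        if (prev) {
                atomic_store(&prev->next, self);
                while (atomic_load(&self->locked))      /* spin on our own line */
                        ;
        }
}

static void mcs_release(struct mcs_lock *lock, struct mcs_node *self)
{
        struct mcs_node *next = atomic_load(&self->next);

        if (!next) {
                /* No known successor: try to mark the queue empty. */
                struct mcs_node *expected = self;
                if (atomic_compare_exchange_strong(&lock->tail, &expected, NULL))
                        return;
                /* A successor is enqueueing; wait until it has linked itself. */
                while (!(next = atomic_load(&self->next)))
                        ;
        }
        atomic_store(&next->locked, false);             /* wake exactly one waiter */
}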

RISC-V defines LR/SC loops consisting of up to 16 instructions as
constrained LR/SC loops.
Such constrained LR/SC loops provide the expected forward-progress
guarantees (similar to what other architectures, like AArch64, have).

What RISC-V does not have is sub-word atomics, and if required, we
would have to implement them as LL/SC sequences. And yes, using atomic
instructions is preferred over using LL/SC, because atomics will tend
to perform better (fewer instructions and fewer spilled registers).
But that actually depends on the actual ISA implementation.

Respectfully,
Christoph
Christoph Hellwig April 7, 2021, 2:34 p.m. UTC | #50
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> Gentlemen, please rethink your wording.
> RISC-V is neither "crap" nor a "trainwreck", regardless if you like it or not.

No, by all objective standards the RISC-V memory model and privileged
architecture is a trainwreck.  Anyone who disagrees is way out there in
the sky.
Peter Zijlstra April 7, 2021, 3:51 p.m. UTC | #51
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> Further, it is not the case that RISC-V has no guarantees at all.
> It just does not provide a forward progress guarantee for a
> synchronization implementation,
> that writes in an endless loop to a memory location while trying to
> complete an LL/SC
> loop on the same memory location at the same time.

Userspace can DoS the kernel that way, see futex.
Peter Zijlstra April 7, 2021, 3:52 p.m. UTC | #52
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> The comparison with sparc64 is not applicable, as sparc64 does not
> have LL/SC instructions.

Sparc64 has CAS, without hardware fwd progress. It has to do software
backoff for failed CAS in order to do software fwd progress.
Peter Zijlstra April 7, 2021, 4 p.m. UTC | #53
On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> RISC-V defines LR/SC loops consisting of up to 16 instructions as
> constrained LR/SC loops.  Such constrained LR/SC loops provide the
> required forward guarantees, that are expected (similar to what other
> architectures, like AArch64, have).

The text quoted by others didn't seem to say such a thing, but whatever.

> What RISC-V does not have is sub-word atomics and if required, we
> would have to implement them as LL/SC sequences. And yes, using atomic
> instructions is preferred over using LL/SC,

(pseudo asm, can't be bothered to figure out the actual syntax)

	# setup r_and_mask, r_or_mask

.L1
	LL r, [word]
	AND r, r, r_and_mask
	OR r, r, r_or_mask
	SC r, [word]
	JNE .L1

is what you need for an LL/SC-based xchg16; that's less than 16
instructions. If RISC-V guarantees fwd progress on that, good, write it
like that and let's end this thread.

The fact that this is apparently hard, is not good.
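
Spelled out as RISC-V inline assembly, such a constrained loop could
look roughly like the sketch below (illustrative only, not the patch
that was eventually posted; kernel-style u16/u32 types, relaxed
ordering and little-endian halfword placement assumed). The loop body
is five instructions, well within the 16-instruction limit for a
constrained LR/SC sequence:

/* Illustrative sketch: 16-bit exchange as a constrained LR/SC loop. */
static inline u16 xchg16_lrsc(volatile u32 *word, u16 newval, int shift)
{
        u32 keep_mask = ~((u32)0xffff << shift);        /* bits to preserve */
        u32 insert    = (u32)newval << shift;           /* bits to set      */
        u32 old, tmp;

        __asm__ __volatile__ (
                "0:     lr.w    %0, %2\n"
                "       and     %1, %0, %3\n"   /* clear the target halfword */
                "       or      %1, %1, %4\n"   /* insert the new halfword   */
                "       sc.w    %1, %1, %2\n"
                "       bnez    %1, 0b\n"       /* retry if the SC failed    */
                : "=&r" (old), "=&r" (tmp), "+A" (*word)
                : "r" (keep_mask), "r" (insert)
                : "memory");

        return (u16)(old >> shift);
}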
Peter Zijlstra April 7, 2021, 4:44 p.m. UTC | #54
On Wed, Apr 07, 2021 at 05:51:07PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> > Further, it is not the case that RISC-V has no guarantees at all.
> > It just does not provide a forward progress guarantee for a
> > synchronization implementation,
> > that writes in an endless loop to a memory location while trying to
> > complete an LL/SC
> > loop on the same memory location at the same time.
> 
> Userspace can DoS the kernel that way, see futex.

The longer answer is that this means you cannot share locks (or any
atomic really) across a trust boundary, which is of course exactly what
futex does.
Peter Zijlstra April 7, 2021, 4:54 p.m. UTC | #55
On Wed, Apr 07, 2021 at 05:52:36PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> > The comparison with sparc64 is not applicable, as sparc64 does not
> > have LL/SC instructions.
> 
> Sparc64 has CAS, without hardware fwd progress. It has to do software
> backoff for failed CAS in order to do software fwd progress.

Again, the longer answer is that this applies identically to LL/SC and
I'm sure we actually have (or had) an architecture in tree that did just
that. I just cannot remember which architecture that was.
Christoph Muellner April 7, 2021, 7:50 p.m. UTC | #56
On Wed, Apr 7, 2021 at 6:00 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Apr 07, 2021 at 04:29:12PM +0200, Christoph Müllner wrote:
> > RISC-V defines LR/SC loops consisting of up to 16 instructions as
> > constrained LR/SC loops.  Such constrained LR/SC loops provide the
> > required forward guarantees, that are expected (similar to what other
> > architectures, like AArch64, have).
>
> The text quoted by others didn't seem to say such a thing, but whatever.

The RISC-V unpriv spec is public and can be found here:
https://riscv.org/technical/specifications/
Version 20191213 discusses LR/SC-loops in section 8.3 (page 51).
So in case you are interested in the exact wording, you can find it there.

> > What RISC-V does not have is sub-word atomics and if required, we
> > would have to implement them as LL/SC sequences. And yes, using atomic
> > instructions is preferred over using LL/SC,
>
> (pseudo asm, can't be bothered to figure out the actual syntax)
>
>         # setup r_and_mask, r_or_mask
>
> .L1
>         LL r, [word]
>         AND r, r, r_and_mask
>         OR r, r, r_or_mask
>         SC r, [word]
>         JNE .L1

I fully agree with this.
I implemented a patch for that two weeks ago using the following helper:

+/*
+ * Mask and set given bits at a given address atomically.
+ * The masked old value will be returned.
+ */
+static inline u32 atomic_mask_and_set(u32* p, u32 mask, u32 val)
+{
+       u32 ret, tmp;
+       __asm__ __volatile__ (
+               "0:     lr.w %0, %2\n"
+               "       and  %0, %0, %3\n"
+               "       or   %1, %0, %4\n"
+               "       sc.w %1, %1, %2\n"
+               "       bnez %1, 0b\n"
+               : "+&r"(ret), "=&r" (tmp), "+A"(*p)
+               : "r" (mask), "rJ"(val)
+               : "memory");
+       return ret;
+}

However, Guo pushed out a new patchset in the meantime, and I decided
not to continue with my approach so as not to undermine his.

I will sync up with Guo to provide a common patchset.

Thanks,
Christoph

> is what you need for an LL/SC-based xchg16; that's less than 16
> instructions. If RISC-V guarantees fwd progress on that, good, write it
> like that and let's end this thread.
>
> The fact that this is apparently hard, is not good.
diff mbox series

Patch

diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 3de8fd11873b..d02f1261f73f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -239,6 +239,9 @@  config LOCK_SPIN_ON_OWNER
 config ARCH_USE_QUEUED_SPINLOCKS
 	bool
 
+config ARCH_USE_QUEUED_SPINLOCKS_XCHG32
+	bool
+
 config QUEUED_SPINLOCKS
 	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
 	depends on SMP
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..54de0632c6a8 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -163,26 +163,6 @@  static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
 }
 
-/*
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail), which heads an address dependency
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
-	/*
-	 * We can use relaxed semantics since the caller ensures that the
-	 * MCS node is properly initialized before updating the tail.
-	 */
-	return (u32)xchg_relaxed(&lock->tail,
-				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
-}
-
 #else /* _Q_PENDING_BITS == 8 */
 
 /**
@@ -206,6 +186,30 @@  static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 {
 	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
 }
+#endif
+
+#if _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32)
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	/*
+	 * We can use relaxed semantics since the caller ensures that the
+	 * MCS node is properly initialized before updating the tail.
+	 */
+	return (u32)xchg_relaxed(&lock->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else
 
 /**
  * xchg_tail - Put in the new queue tail code word & retrieve previous one