diff mbox series

riscv: locks: introduce ticket-based spinlock implementation

Message ID 1616580892-80815-1-git-send-email-guoren@kernel.org (mailing list archive)
State New, archived
Headers show
Series riscv: locks: introduce ticket-based spinlock implementation | expand

Commit Message

Guo Ren March 24, 2021, 10:14 a.m. UTC
From: Guo Ren <guoren@linux.alibaba.com>

This patch introduces a ticket lock implementation for riscv, along the
same lines as the implementation for arch/arm & arch/csky.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Palmer Dabbelt <palmerdabbelt@google.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Arnd Bergmann <arnd@arndb.de>
---
 arch/riscv/Kconfig                      |   1 +
 arch/riscv/include/asm/Kbuild           |   1 +
 arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
 arch/riscv/include/asm/spinlock_types.h |  19 ++--
 4 files changed, 74 insertions(+), 105 deletions(-)

Comments

Peter Zijlstra March 24, 2021, 11:09 a.m. UTC | #1
On Wed, Mar 24, 2021 at 10:14:52AM +0000, guoren@kernel.org wrote:
> +static inline void arch_spin_lock(arch_spinlock_t *lock)
> +{
> +	arch_spinlock_t lockval;
> +	u32 tmp;
> +
> +	asm volatile (
> +		"1:	lr.w	%0, %2		\n"
> +		"	mv	%1, %0		\n"
> +		"	addw	%0, %0, %3	\n"
> +		"	sc.w	%0, %0, %2	\n"
> +		"	bnez	%0, 1b		\n"
> +		: "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
> +		: "r" (1 << TICKET_NEXT)
> +		: "memory");
>  
> +	while (lockval.tickets.next != lockval.tickets.owner) {
> +		/*
> +		 * FIXME - we need wfi/wfe here to prevent:
> +		 *  - cache line bouncing
> +		 *  - saving cpu pipeline in multi-harts-per-core
> +		 *    processor
> +		 */
> +		lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
> +	}
>  
> +	__atomic_acquire_fence();
>  }

> +static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
> +	smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
> +	/* FIXME - we need ipi/sev here to notify above */
>  }

Urgh, are you saying your WFE requires an explicit SEV like on ARM? The
ARM64 model is far superior IMO, and then you can use
smp_cond_load_acquire() in arch_spin_lock() and call it a day.
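For reference, a minimal sketch of what that could look like on top of the
posted ticket layout (illustration only, not part of the patch; it keeps the
lr.w/sc.w ticket-take loop and only replaces the busy-wait):

static inline void arch_spin_lock(arch_spinlock_t *lock)
{
	arch_spinlock_t lockval;
	u32 tmp;

	/* atomically take the next ticket, as in the posted patch */
	asm volatile (
		"1:	lr.w	%0, %2		\n"
		"	mv	%1, %0		\n"
		"	addw	%0, %0, %3	\n"
		"	sc.w	%0, %0, %2	\n"
		"	bnez	%0, 1b		\n"
		: "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
		: "r" (1 << TICKET_NEXT)
		: "memory");

	if (lockval.tickets.next == lockval.tickets.owner) {
		/* uncontended: we already own the lock */
		__atomic_acquire_fence();
		return;
	}

	/*
	 * Wait until the owner field reaches our ticket.  The acquire
	 * ordering comes from smp_cond_load_acquire(); an architecture
	 * with a WFE-like instruction can hook into the wait the way
	 * arm64 does via __cmpwait_relaxed().
	 */
	smp_cond_load_acquire(&lock->tickets.owner,
			      VAL == lockval.tickets.next);
}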
Guo Ren March 24, 2021, 12:10 p.m. UTC | #2
Thx Peter,

On Wed, Mar 24, 2021 at 7:09 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 24, 2021 at 10:14:52AM +0000, guoren@kernel.org wrote:
> > +static inline void arch_spin_lock(arch_spinlock_t *lock)
> > +{
> > +     arch_spinlock_t lockval;
> > +     u32 tmp;
> > +
> > +     asm volatile (
> > +             "1:     lr.w    %0, %2          \n"
> > +             "       mv      %1, %0          \n"
> > +             "       addw    %0, %0, %3      \n"
> > +             "       sc.w    %0, %0, %2      \n"
> > +             "       bnez    %0, 1b          \n"
> > +             : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
> > +             : "r" (1 << TICKET_NEXT)
> > +             : "memory");
> >
> > +     while (lockval.tickets.next != lockval.tickets.owner) {
> > +             /*
> > +              * FIXME - we need wfi/wfe here to prevent:
> > +              *  - cache line bouncing
> > +              *  - saving cpu pipeline in multi-harts-per-core
> > +              *    processor
> > +              */
> > +             lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
> > +     }
> >
> > +     __atomic_acquire_fence();
> >  }
>
> > +static inline void arch_spin_unlock(arch_spinlock_t *lock)
> >  {
> > +     smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
> > +     /* FIXME - we need ipi/sev here to notify above */
> >  }
>
> Urgh, are you saying your WFE requires an explicit SEV like on ARM ? The
Yes, I'm considering that kind of code.

> ARM64 model is far superious IMO, and then you can use
> smp_cond_load_acquire() in arch_spin_lock() and call it a day.
Great tip, thx. I'll follow that.
Peter Zijlstra March 24, 2021, 12:23 p.m. UTC | #3
On Wed, Mar 24, 2021 at 12:15:47PM +0100, Vitaly Wool wrote:
> On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote:
> 
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > This patch introduces a ticket lock implementation for riscv, along the
> > same lines as the implementation for arch/arm & arch/csky.
> >
> 
> Could you please provide a rationale for this? Like, what is wrong with the
> current implementation.

test-and-set spinlocks have terrible worst case behaviour.
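To spell that out: with a test-and-set lock all waiters race on one shared
flag, so a given CPU can lose that race indefinitely and its acquisition time
is unbounded under contention.  A generic sketch for comparison (plain C with
compiler builtins, illustration only):

static void tas_lock(int *lock)
{
	/*
	 * Every waiter hammers the same flag; whoever wins the race gets
	 * the lock, with no ordering at all among the waiters.
	 */
	while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE))
		;
}

static void tas_unlock(int *lock)
{
	__atomic_store_n(lock, 0, __ATOMIC_RELEASE);
}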
Guo Ren March 24, 2021, 12:24 p.m. UTC | #4
On Wed, Mar 24, 2021 at 7:16 PM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
>
>
>
> On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote:
>>
>> From: Guo Ren <guoren@linux.alibaba.com>
>>
>> This patch introduces a ticket lock implementation for riscv, along the
>> same lines as the implementation for arch/arm & arch/csky.
>
>
> Could you please provide a rationale for this? Like, what is wrong with the current implementation.
The principle behind ticket-based spinlocks is described here:
https://lwn.net/Articles/267968/

The current implementation causes cache line bouncing when many harts
are acquiring the same spinlock. I'm still looking for a solution,
though it may not fit the current RISC-V base ISA.

I'll add more comments in the next version of the patch.
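For readers who don't want to follow the link, the principle can be sketched
generically like this (plain C with compiler builtins, not the patch itself):

struct ticket_lock {
	unsigned short next;	/* next ticket to hand out */
	unsigned short owner;	/* ticket currently being served */
};

static void ticket_lock(struct ticket_lock *lock)
{
	/* atomically draw a ticket number */
	unsigned short me = __atomic_fetch_add(&lock->next, 1,
					       __ATOMIC_RELAXED);

	/* spin until our number is being served; FIFO order is implicit */
	while (__atomic_load_n(&lock->owner, __ATOMIC_ACQUIRE) != me)
		;
}

static void ticket_unlock(struct ticket_lock *lock)
{
	/* hand the lock to the next waiter in line */
	__atomic_store_n(&lock->owner, lock->owner + 1, __ATOMIC_RELEASE);
}

Every waiter still reads lock->owner, so the cache line bouncing mentioned
above remains; the gain is fairness and a bounded wait.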

>
> Thanks in advance,
>
> Best regards,
>    Vitaly
>>
>>
>> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will.deacon@arm.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Palmer Dabbelt <palmerdabbelt@google.com>
>> Cc: Anup Patel <anup@brainfault.org>
>> Cc: Arnd Bergmann <arnd@arndb.de>
>> ---
>>  arch/riscv/Kconfig                      |   1 +
>>  arch/riscv/include/asm/Kbuild           |   1 +
>>  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
>>  arch/riscv/include/asm/spinlock_types.h |  19 ++--
>>  4 files changed, 74 insertions(+), 105 deletions(-)
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 87d7b52..7c56a20 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -30,6 +30,7 @@ config RISCV
>>         select ARCH_HAS_STRICT_KERNEL_RWX if MMU
>>         select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
>>         select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
>> +       select ARCH_USE_QUEUED_RWLOCKS
>>         select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>>         select ARCH_WANT_FRAME_POINTERS
>>         select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>> diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
>> index 445ccc9..e57ef80 100644
>> --- a/arch/riscv/include/asm/Kbuild
>> +++ b/arch/riscv/include/asm/Kbuild
>> @@ -3,5 +3,6 @@ generic-y += early_ioremap.h
>>  generic-y += extable.h
>>  generic-y += flat.h
>>  generic-y += kvm_para.h
>> +generic-y += qrwlock.h
>>  generic-y += user.h
>>  generic-y += vmlinux.lds.h
>> diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
>> index f4f7fa1..2c81764 100644
>> --- a/arch/riscv/include/asm/spinlock.h
>> +++ b/arch/riscv/include/asm/spinlock.h
>> @@ -7,129 +7,91 @@
>>  #ifndef _ASM_RISCV_SPINLOCK_H
>>  #define _ASM_RISCV_SPINLOCK_H
>>
>> -#include <linux/kernel.h>
>> -#include <asm/current.h>
>> -#include <asm/fence.h>
>> -
>>  /*
>> - * Simple spin lock operations.  These provide no fairness guarantees.
>> + * Ticket-based spin-locking.
>>   */
>> +static inline void arch_spin_lock(arch_spinlock_t *lock)
>> +{
>> +       arch_spinlock_t lockval;
>> +       u32 tmp;
>> +
>> +       asm volatile (
>> +               "1:     lr.w    %0, %2          \n"
>> +               "       mv      %1, %0          \n"
>> +               "       addw    %0, %0, %3      \n"
>> +               "       sc.w    %0, %0, %2      \n"
>> +               "       bnez    %0, 1b          \n"
>> +               : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
>> +               : "r" (1 << TICKET_NEXT)
>> +               : "memory");
>>
>> -/* FIXME: Replace this with a ticket lock, like MIPS. */
>> -
>> -#define arch_spin_is_locked(x) (READ_ONCE((x)->lock) != 0)
>> +       while (lockval.tickets.next != lockval.tickets.owner) {
>> +               /*
>> +                * FIXME - we need wfi/wfe here to prevent:
>> +                *  - cache line bouncing
>> +                *  - saving cpu pipeline in multi-harts-per-core
>> +                *    processor
>> +                */
>> +               lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
>> +       }
>>
>> -static inline void arch_spin_unlock(arch_spinlock_t *lock)
>> -{
>> -       smp_store_release(&lock->lock, 0);
>> +       __atomic_acquire_fence();
>>  }
>>
>>  static inline int arch_spin_trylock(arch_spinlock_t *lock)
>>  {
>> -       int tmp = 1, busy;
>> -
>> -       __asm__ __volatile__ (
>> -               "       amoswap.w %0, %2, %1\n"
>> -               RISCV_ACQUIRE_BARRIER
>> -               : "=r" (busy), "+A" (lock->lock)
>> -               : "r" (tmp)
>> +       u32 tmp, contended, res;
>> +
>> +       do {
>> +               asm volatile (
>> +               "       lr.w    %0, %3          \n"
>> +               "       srliw   %1, %0, %5      \n"
>> +               "       slliw   %2, %0, %5      \n"
>> +               "       or      %1, %2, %1      \n"
>> +               "       li      %2, 0           \n"
>> +               "       sub     %1, %1, %0      \n"
>> +               "       bnez    %1, 1f          \n"
>> +               "       addw    %0, %0, %4      \n"
>> +               "       sc.w    %2, %0, %3      \n"
>> +               "1:                             \n"
>> +               : "=&r" (tmp), "=&r" (contended), "=&r" (res),
>> +                 "+A" (lock->lock)
>> +               : "r" (1 << TICKET_NEXT), "I" (TICKET_NEXT)
>>                 : "memory");
>> +       } while (res);
>>
>> -       return !busy;
>> -}
>> -
>> -static inline void arch_spin_lock(arch_spinlock_t *lock)
>> -{
>> -       while (1) {
>> -               if (arch_spin_is_locked(lock))
>> -                       continue;
>> -
>> -               if (arch_spin_trylock(lock))
>> -                       break;
>> +       if (!contended) {
>> +               __atomic_acquire_fence();
>> +               return 1;
>> +       } else {
>> +               return 0;
>>         }
>>  }
>>
>> -/***********************************************************/
>> -
>> -static inline void arch_read_lock(arch_rwlock_t *lock)
>> +static inline void arch_spin_unlock(arch_spinlock_t *lock)
>>  {
>> -       int tmp;
>> -
>> -       __asm__ __volatile__(
>> -               "1:     lr.w    %1, %0\n"
>> -               "       bltz    %1, 1b\n"
>> -               "       addi    %1, %1, 1\n"
>> -               "       sc.w    %1, %1, %0\n"
>> -               "       bnez    %1, 1b\n"
>> -               RISCV_ACQUIRE_BARRIER
>> -               : "+A" (lock->lock), "=&r" (tmp)
>> -               :: "memory");
>> +       smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
>> +       /* FIXME - we need ipi/sev here to notify above */
>>  }
>>
>> -static inline void arch_write_lock(arch_rwlock_t *lock)
>> +static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
>>  {
>> -       int tmp;
>> -
>> -       __asm__ __volatile__(
>> -               "1:     lr.w    %1, %0\n"
>> -               "       bnez    %1, 1b\n"
>> -               "       li      %1, -1\n"
>> -               "       sc.w    %1, %1, %0\n"
>> -               "       bnez    %1, 1b\n"
>> -               RISCV_ACQUIRE_BARRIER
>> -               : "+A" (lock->lock), "=&r" (tmp)
>> -               :: "memory");
>> +       return lock.tickets.owner == lock.tickets.next;
>>  }
>>
>> -static inline int arch_read_trylock(arch_rwlock_t *lock)
>> +static inline int arch_spin_is_locked(arch_spinlock_t *lock)
>>  {
>> -       int busy;
>> -
>> -       __asm__ __volatile__(
>> -               "1:     lr.w    %1, %0\n"
>> -               "       bltz    %1, 1f\n"
>> -               "       addi    %1, %1, 1\n"
>> -               "       sc.w    %1, %1, %0\n"
>> -               "       bnez    %1, 1b\n"
>> -               RISCV_ACQUIRE_BARRIER
>> -               "1:\n"
>> -               : "+A" (lock->lock), "=&r" (busy)
>> -               :: "memory");
>> -
>> -       return !busy;
>> +       return !arch_spin_value_unlocked(READ_ONCE(*lock));
>>  }
>>
>> -static inline int arch_write_trylock(arch_rwlock_t *lock)
>> +static inline int arch_spin_is_contended(arch_spinlock_t *lock)
>>  {
>> -       int busy;
>> -
>> -       __asm__ __volatile__(
>> -               "1:     lr.w    %1, %0\n"
>> -               "       bnez    %1, 1f\n"
>> -               "       li      %1, -1\n"
>> -               "       sc.w    %1, %1, %0\n"
>> -               "       bnez    %1, 1b\n"
>> -               RISCV_ACQUIRE_BARRIER
>> -               "1:\n"
>> -               : "+A" (lock->lock), "=&r" (busy)
>> -               :: "memory");
>> +       struct __raw_tickets tickets = READ_ONCE(lock->tickets);
>>
>> -       return !busy;
>> +       return (tickets.next - tickets.owner) > 1;
>>  }
>> +#define arch_spin_is_contended arch_spin_is_contended
>>
>> -static inline void arch_read_unlock(arch_rwlock_t *lock)
>> -{
>> -       __asm__ __volatile__(
>> -               RISCV_RELEASE_BARRIER
>> -               "       amoadd.w x0, %1, %0\n"
>> -               : "+A" (lock->lock)
>> -               : "r" (-1)
>> -               : "memory");
>> -}
>> -
>> -static inline void arch_write_unlock(arch_rwlock_t *lock)
>> -{
>> -       smp_store_release(&lock->lock, 0);
>> -}
>> +#include <asm/qrwlock.h>
>>
>>  #endif /* _ASM_RISCV_SPINLOCK_H */
>> diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h
>> index f398e76..d7b38bf 100644
>> --- a/arch/riscv/include/asm/spinlock_types.h
>> +++ b/arch/riscv/include/asm/spinlock_types.h
>> @@ -10,16 +10,21 @@
>>  # error "please don't include this file directly"
>>  #endif
>>
>> +#define TICKET_NEXT    16
>> +
>>  typedef struct {
>> -       volatile unsigned int lock;
>> +       union {
>> +               u32 lock;
>> +               struct __raw_tickets {
>> +                       /* little endian */
>> +                       u16 owner;
>> +                       u16 next;
>> +               } tickets;
>> +       };
>>  } arch_spinlock_t;
>>
>> -#define __ARCH_SPIN_LOCK_UNLOCKED      { 0 }
>> -
>> -typedef struct {
>> -       volatile unsigned int lock;
>> -} arch_rwlock_t;
>> +#define __ARCH_SPIN_LOCK_UNLOCKED      { { 0 } }
>>
>> -#define __ARCH_RW_LOCK_UNLOCKED                { 0 }
>> +#include <asm-generic/qrwlock_types.h>
>>
>>  #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */
>> --
>> 2.7.4
>>
>>
Anup Patel March 24, 2021, 12:28 p.m. UTC | #5
On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
>
> From: Guo Ren <guoren@linux.alibaba.com>
>
> This patch introduces a ticket lock implementation for riscv, along the
> same lines as the implementation for arch/arm & arch/csky.
>
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> Cc: Anup Patel <anup@brainfault.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> ---
>  arch/riscv/Kconfig                      |   1 +
>  arch/riscv/include/asm/Kbuild           |   1 +
>  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
>  arch/riscv/include/asm/spinlock_types.h |  19 ++--

NACK from my side.

Linux ARM64 has moved away from ticket spinlock to qspinlock.

We should directly go for qspinlock.

Regards,
Anup

>  4 files changed, 74 insertions(+), 105 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 87d7b52..7c56a20 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -30,6 +30,7 @@ config RISCV
>         select ARCH_HAS_STRICT_KERNEL_RWX if MMU
>         select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
>         select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
> +       select ARCH_USE_QUEUED_RWLOCKS
>         select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>         select ARCH_WANT_FRAME_POINTERS
>         select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
> diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> index 445ccc9..e57ef80 100644
> --- a/arch/riscv/include/asm/Kbuild
> +++ b/arch/riscv/include/asm/Kbuild
> @@ -3,5 +3,6 @@ generic-y += early_ioremap.h
>  generic-y += extable.h
>  generic-y += flat.h
>  generic-y += kvm_para.h
> +generic-y += qrwlock.h
>  generic-y += user.h
>  generic-y += vmlinux.lds.h
> diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> index f4f7fa1..2c81764 100644
> --- a/arch/riscv/include/asm/spinlock.h
> +++ b/arch/riscv/include/asm/spinlock.h
> @@ -7,129 +7,91 @@
>  #ifndef _ASM_RISCV_SPINLOCK_H
>  #define _ASM_RISCV_SPINLOCK_H
>
> -#include <linux/kernel.h>
> -#include <asm/current.h>
> -#include <asm/fence.h>
> -
>  /*
> - * Simple spin lock operations.  These provide no fairness guarantees.
> + * Ticket-based spin-locking.
>   */
> +static inline void arch_spin_lock(arch_spinlock_t *lock)
> +{
> +       arch_spinlock_t lockval;
> +       u32 tmp;
> +
> +       asm volatile (
> +               "1:     lr.w    %0, %2          \n"
> +               "       mv      %1, %0          \n"
> +               "       addw    %0, %0, %3      \n"
> +               "       sc.w    %0, %0, %2      \n"
> +               "       bnez    %0, 1b          \n"
> +               : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
> +               : "r" (1 << TICKET_NEXT)
> +               : "memory");
>
> -/* FIXME: Replace this with a ticket lock, like MIPS. */
> -
> -#define arch_spin_is_locked(x) (READ_ONCE((x)->lock) != 0)
> +       while (lockval.tickets.next != lockval.tickets.owner) {
> +               /*
> +                * FIXME - we need wfi/wfe here to prevent:
> +                *  - cache line bouncing
> +                *  - saving cpu pipeline in multi-harts-per-core
> +                *    processor
> +                */
> +               lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
> +       }
>
> -static inline void arch_spin_unlock(arch_spinlock_t *lock)
> -{
> -       smp_store_release(&lock->lock, 0);
> +       __atomic_acquire_fence();
>  }
>
>  static inline int arch_spin_trylock(arch_spinlock_t *lock)
>  {
> -       int tmp = 1, busy;
> -
> -       __asm__ __volatile__ (
> -               "       amoswap.w %0, %2, %1\n"
> -               RISCV_ACQUIRE_BARRIER
> -               : "=r" (busy), "+A" (lock->lock)
> -               : "r" (tmp)
> +       u32 tmp, contended, res;
> +
> +       do {
> +               asm volatile (
> +               "       lr.w    %0, %3          \n"
> +               "       srliw   %1, %0, %5      \n"
> +               "       slliw   %2, %0, %5      \n"
> +               "       or      %1, %2, %1      \n"
> +               "       li      %2, 0           \n"
> +               "       sub     %1, %1, %0      \n"
> +               "       bnez    %1, 1f          \n"
> +               "       addw    %0, %0, %4      \n"
> +               "       sc.w    %2, %0, %3      \n"
> +               "1:                             \n"
> +               : "=&r" (tmp), "=&r" (contended), "=&r" (res),
> +                 "+A" (lock->lock)
> +               : "r" (1 << TICKET_NEXT), "I" (TICKET_NEXT)
>                 : "memory");
> +       } while (res);
>
> -       return !busy;
> -}
> -
> -static inline void arch_spin_lock(arch_spinlock_t *lock)
> -{
> -       while (1) {
> -               if (arch_spin_is_locked(lock))
> -                       continue;
> -
> -               if (arch_spin_trylock(lock))
> -                       break;
> +       if (!contended) {
> +               __atomic_acquire_fence();
> +               return 1;
> +       } else {
> +               return 0;
>         }
>  }
>
> -/***********************************************************/
> -
> -static inline void arch_read_lock(arch_rwlock_t *lock)
> +static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
> -       int tmp;
> -
> -       __asm__ __volatile__(
> -               "1:     lr.w    %1, %0\n"
> -               "       bltz    %1, 1b\n"
> -               "       addi    %1, %1, 1\n"
> -               "       sc.w    %1, %1, %0\n"
> -               "       bnez    %1, 1b\n"
> -               RISCV_ACQUIRE_BARRIER
> -               : "+A" (lock->lock), "=&r" (tmp)
> -               :: "memory");
> +       smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
> +       /* FIXME - we need ipi/sev here to notify above */
>  }
>
> -static inline void arch_write_lock(arch_rwlock_t *lock)
> +static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
>  {
> -       int tmp;
> -
> -       __asm__ __volatile__(
> -               "1:     lr.w    %1, %0\n"
> -               "       bnez    %1, 1b\n"
> -               "       li      %1, -1\n"
> -               "       sc.w    %1, %1, %0\n"
> -               "       bnez    %1, 1b\n"
> -               RISCV_ACQUIRE_BARRIER
> -               : "+A" (lock->lock), "=&r" (tmp)
> -               :: "memory");
> +       return lock.tickets.owner == lock.tickets.next;
>  }
>
> -static inline int arch_read_trylock(arch_rwlock_t *lock)
> +static inline int arch_spin_is_locked(arch_spinlock_t *lock)
>  {
> -       int busy;
> -
> -       __asm__ __volatile__(
> -               "1:     lr.w    %1, %0\n"
> -               "       bltz    %1, 1f\n"
> -               "       addi    %1, %1, 1\n"
> -               "       sc.w    %1, %1, %0\n"
> -               "       bnez    %1, 1b\n"
> -               RISCV_ACQUIRE_BARRIER
> -               "1:\n"
> -               : "+A" (lock->lock), "=&r" (busy)
> -               :: "memory");
> -
> -       return !busy;
> +       return !arch_spin_value_unlocked(READ_ONCE(*lock));
>  }
>
> -static inline int arch_write_trylock(arch_rwlock_t *lock)
> +static inline int arch_spin_is_contended(arch_spinlock_t *lock)
>  {
> -       int busy;
> -
> -       __asm__ __volatile__(
> -               "1:     lr.w    %1, %0\n"
> -               "       bnez    %1, 1f\n"
> -               "       li      %1, -1\n"
> -               "       sc.w    %1, %1, %0\n"
> -               "       bnez    %1, 1b\n"
> -               RISCV_ACQUIRE_BARRIER
> -               "1:\n"
> -               : "+A" (lock->lock), "=&r" (busy)
> -               :: "memory");
> +       struct __raw_tickets tickets = READ_ONCE(lock->tickets);
>
> -       return !busy;
> +       return (tickets.next - tickets.owner) > 1;
>  }
> +#define arch_spin_is_contended arch_spin_is_contended
>
> -static inline void arch_read_unlock(arch_rwlock_t *lock)
> -{
> -       __asm__ __volatile__(
> -               RISCV_RELEASE_BARRIER
> -               "       amoadd.w x0, %1, %0\n"
> -               : "+A" (lock->lock)
> -               : "r" (-1)
> -               : "memory");
> -}
> -
> -static inline void arch_write_unlock(arch_rwlock_t *lock)
> -{
> -       smp_store_release(&lock->lock, 0);
> -}
> +#include <asm/qrwlock.h>
>
>  #endif /* _ASM_RISCV_SPINLOCK_H */
> diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h
> index f398e76..d7b38bf 100644
> --- a/arch/riscv/include/asm/spinlock_types.h
> +++ b/arch/riscv/include/asm/spinlock_types.h
> @@ -10,16 +10,21 @@
>  # error "please don't include this file directly"
>  #endif
>
> +#define TICKET_NEXT    16
> +
>  typedef struct {
> -       volatile unsigned int lock;
> +       union {
> +               u32 lock;
> +               struct __raw_tickets {
> +                       /* little endian */
> +                       u16 owner;
> +                       u16 next;
> +               } tickets;
> +       };
>  } arch_spinlock_t;
>
> -#define __ARCH_SPIN_LOCK_UNLOCKED      { 0 }
> -
> -typedef struct {
> -       volatile unsigned int lock;
> -} arch_rwlock_t;
> +#define __ARCH_SPIN_LOCK_UNLOCKED      { { 0 } }
>
> -#define __ARCH_RW_LOCK_UNLOCKED                { 0 }
> +#include <asm-generic/qrwlock_types.h>
>
>  #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */
> --
> 2.7.4
>
Peter Zijlstra March 24, 2021, 12:31 p.m. UTC | #6
On Wed, Mar 24, 2021 at 08:24:34PM +0800, Guo Ren wrote:
> On Wed, Mar 24, 2021 at 7:16 PM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
> >
> >
> >
> > On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote:
> >>
> >> From: Guo Ren <guoren@linux.alibaba.com>
> >>
> >> This patch introduces a ticket lock implementation for riscv, along the
> >> same lines as the implementation for arch/arm & arch/csky.
> >
> >
> > Could you please provide a rationale for this? Like, what is wrong with the current implementation.
> Ticket based spinlock's principle is here:
> https://lwn.net/Articles/267968/
> 
> Current implementation will cause cache line bouncing when many harts
> are acquiring the same spinlock.
> I'm seeking a solution, maybe not fitting the current RISC-V base ISA.

Ticket locks as such don't solve the cacheline bouncing part, since
they're all still spinning on the same line. The big improvement ticket
locks bring is that the lock acquisition time becomes a function of the
longest hold time, instead of being unbounded.

However, combine it with the WFE (preferably the ARM64 variant) and you
can avoid the worst of the bouncing.

If you really want to get rid of the bouncing, go with qspinlock, which
will spin on a CPU-local line. That said, qspinlock is quite gnarly
code, and only really wins over ticket when you have NUMA.
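To illustrate the point about spinning on a CPU-local line: an MCS-style lock
(the queueing scheme qspinlock is derived from) has each waiter spin on its
own node rather than on the shared lock word.  A minimal generic sketch, not
the kernel's qspinlock (which packs the queue tail and lock state into a
single 32-bit word):

struct mcs_node {
	struct mcs_node *next;
	int locked;			/* each waiter spins on its own node */
};

static void mcs_lock(struct mcs_node **tail, struct mcs_node *me)
{
	struct mcs_node *prev;

	me->next = NULL;
	me->locked = 0;

	/* enqueue ourselves at the tail of the waiter list */
	prev = __atomic_exchange_n(tail, me, __ATOMIC_ACQ_REL);
	if (!prev)
		return;			/* queue was empty: lock acquired */

	__atomic_store_n(&prev->next, me, __ATOMIC_RELEASE);

	/* spin on our own cache line, not on the shared lock word */
	while (!__atomic_load_n(&me->locked, __ATOMIC_ACQUIRE))
		;
}

static void mcs_unlock(struct mcs_node **tail, struct mcs_node *me)
{
	struct mcs_node *next = __atomic_load_n(&me->next, __ATOMIC_ACQUIRE);

	if (!next) {
		/* no visible successor: try to mark the lock free */
		struct mcs_node *expected = me;

		if (__atomic_compare_exchange_n(tail, &expected, NULL, 0,
						__ATOMIC_ACQ_REL,
						__ATOMIC_ACQUIRE))
			return;

		/* a successor is enqueueing; wait until it links itself */
		while (!(next = __atomic_load_n(&me->next, __ATOMIC_ACQUIRE)))
			;
	}
	__atomic_store_n(&next->locked, 1, __ATOMIC_RELEASE);	/* hand over */
}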
Peter Zijlstra March 24, 2021, 12:37 p.m. UTC | #7
On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> >
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > This patch introduces a ticket lock implementation for riscv, along the
> > same lines as the implementation for arch/arm & arch/csky.
> >
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will.deacon@arm.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> > Cc: Anup Patel <anup@brainfault.org>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > ---
> >  arch/riscv/Kconfig                      |   1 +
> >  arch/riscv/include/asm/Kbuild           |   1 +
> >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> 
> NACK from myside.
> 
> Linux ARM64 has moved away from ticket spinlock to qspinlock.
> 
> We should directly go for qspinlock.

I think it is a sensible intermediate step, even if you want to go
qspinlock. Ticket locks are more or less trivial and get you fairness
and all that goodness without the mind-bending complexity of qspinlock.

Once you have the ticket lock implementation solid (and qrwlock) and
everything, *then* start to carefully look at qspinlock.

Now, arguably arm64 did the heavy lifting of making qspinlock good on
weak architectures, but if you want to do it right, you still have to
analyze the whole thing for your own architecture.
Anup Patel March 24, 2021, 12:53 p.m. UTC | #8
On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> > >
> > > From: Guo Ren <guoren@linux.alibaba.com>
> > >
> > > This patch introduces a ticket lock implementation for riscv, along the
> > > same lines as the implementation for arch/arm & arch/csky.
> > >
> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > Cc: Will Deacon <will.deacon@arm.com>
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> > > Cc: Anup Patel <anup@brainfault.org>
> > > Cc: Arnd Bergmann <arnd@arndb.de>
> > > ---
> > >  arch/riscv/Kconfig                      |   1 +
> > >  arch/riscv/include/asm/Kbuild           |   1 +
> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> >
> > NACK from myside.
> >
> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
> >
> > We should directly go for qspinlock.
>
> I think it is a sensible intermediate step, even if you want to go
> qspinlock. Ticket locks are more or less trivial and get you fairness
> and all that goodness without the mind bending complexity of qspinlock.
>
> Once you have the ticket lock implementation solid (and qrwlock) and
> everything, *then* start to carefully look at qspinlock.

I do understand qspinlock is relatively complex, but the best thing
about qspinlock is that it tries to ensure each CPU spins on its own location.

Instead of adding ticket spinlocks now and replacing them with qspinlock later,
it is better to explore qspinlock straight away, hence my NACK.

>
> Now, arguably arm64 did the heavy lifting of making qspinlock good on
> weak architectures, but if you want to do it right, you still have to
> analyze the whole thing for your own architecture.

Most RISC-V implementations use weak memory ordering, so it makes
more sense to explore qspinlock first.

Regards,
Anup
Palmer Dabbelt April 11, 2021, 9:11 p.m. UTC | #9
On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
> On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
>> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
>> > >
>> > > From: Guo Ren <guoren@linux.alibaba.com>
>> > >
>> > > This patch introduces a ticket lock implementation for riscv, along the
>> > > same lines as the implementation for arch/arm & arch/csky.
>> > >
>> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
>> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
>> > > Cc: Will Deacon <will.deacon@arm.com>
>> > > Cc: Peter Zijlstra <peterz@infradead.org>
>> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
>> > > Cc: Anup Patel <anup@brainfault.org>
>> > > Cc: Arnd Bergmann <arnd@arndb.de>
>> > > ---
>> > >  arch/riscv/Kconfig                      |   1 +
>> > >  arch/riscv/include/asm/Kbuild           |   1 +
>> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
>> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
>> >
>> > NACK from myside.
>> >
>> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
>> >
>> > We should directly go for qspinlock.
>>
>> I think it is a sensible intermediate step, even if you want to go
>> qspinlock. Ticket locks are more or less trivial and get you fairness
>> and all that goodness without the mind bending complexity of qspinlock.
>>
>> Once you have the ticket lock implementation solid (and qrwlock) and
>> everything, *then* start to carefully look at qspinlock.
>
> I do understand qspinlock are relatively complex but the best thing
> about qspinlock is it tries to ensure each CPU spins on it's own location.
>
> Instead of adding ticket spinlock now and later replacing it with qspinlock,
> it is better to straight away explore qspinlock hence my NACK.
>
>>
>> Now, arguably arm64 did the heavy lifting of making qspinlock good on
>> weak architectures, but if you want to do it right, you still have to
>> analyze the whole thing for your own architecture.
>
> Most of the RISC-V implementations are weak memory ordering so it
> makes more sense to explore qspinlock first.

I know I'm somewhat late to the party here.  I talked with Will (and 
to a lesser extent Peter) about this a week or two ago and it seems the 
best way to go here is to start with ticket locks.  They're simpler, and 
in Arm land they performed better until we got to the larger systems.  
Given that we don't have any high performance implementations of the 
RISC-V memory model (and likely won't any time soon) it's hard to reason 
about the performance of anything like this, but at a bare minimum 
having fair locks is a pretty big positive and ticket locks should have 
very little overhead while providing fairness.

IMO the decision between ticket and queueing locks is really more of a 
property of the hardware/workload than the ISA, though there are of 
course some pretty deep ISA dependencies that can make one saner than 
the other.  It seems best to me to just allow users to pick their own 
flavor of locks, and at least PPC is already doing that.  I threw 
together a quick asm-generic ticket lock that can be selected at compile 
time, but I want to spend some more time playing with the other 
architectures before sending anything out.
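Such a generic ticket lock can indeed be written almost entirely on top of
the existing atomic API.  A rough sketch of the idea (not the actual
asm-generic patch mentioned above; the names are made up here and ordering
subtleties such as RCsc vs RCpc acquire are glossed over):

/* 32-bit lock word: "next" ticket in the high half, "owner" in the low half */
static inline void ticket_spin_lock(atomic_t *lock)
{
	u32 val = atomic_fetch_add(1 << 16, lock);	/* draw a ticket */
	u16 ticket = val >> 16;

	if (ticket == (u16)val)
		return;		/* uncontended; fetch_add is fully ordered */

	/* wait until the owner half reaches our ticket */
	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
}

static inline void ticket_spin_unlock(atomic_t *lock)
{
	u16 *owner = (u16 *)lock;	/* little-endian layout assumed */

	smp_store_release(owner, (u16)(*owner + 1));
}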
Christoph Muellner April 12, 2021, 1:32 p.m. UTC | #10
On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >>
> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> >> > >
> >> > > From: Guo Ren <guoren@linux.alibaba.com>
> >> > >
> >> > > This patch introduces a ticket lock implementation for riscv, along the
> >> > > same lines as the implementation for arch/arm & arch/csky.
> >> > >
> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> >> > > Cc: Will Deacon <will.deacon@arm.com>
> >> > > Cc: Peter Zijlstra <peterz@infradead.org>
> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> >> > > Cc: Anup Patel <anup@brainfault.org>
> >> > > Cc: Arnd Bergmann <arnd@arndb.de>
> >> > > ---
> >> > >  arch/riscv/Kconfig                      |   1 +
> >> > >  arch/riscv/include/asm/Kbuild           |   1 +
> >> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> >> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> >> >
> >> > NACK from myside.
> >> >
> >> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
> >> >
> >> > We should directly go for qspinlock.
> >>
> >> I think it is a sensible intermediate step, even if you want to go
> >> qspinlock. Ticket locks are more or less trivial and get you fairness
> >> and all that goodness without the mind bending complexity of qspinlock.
> >>
> >> Once you have the ticket lock implementation solid (and qrwlock) and
> >> everything, *then* start to carefully look at qspinlock.
> >
> > I do understand qspinlock are relatively complex but the best thing
> > about qspinlock is it tries to ensure each CPU spins on it's own location.
> >
> > Instead of adding ticket spinlock now and later replacing it with qspinlock,
> > it is better to straight away explore qspinlock hence my NACK.
> >
> >>
> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on
> >> weak architectures, but if you want to do it right, you still have to
> >> analyze the whole thing for your own architecture.
> >
> > Most of the RISC-V implementations are weak memory ordering so it
> > makes more sense to explore qspinlock first.
>
> I know I'm somewhat late to the party here.  I talked with Will (and
> to a lesser extent Peter) about this a week or two ago and it seems the
> best way to go here is to start with ticket locks.  They're simpler, and
> in Arm land they performed better until we got to the larger systems.
> Given that we don't have any high performance implementations of the
> RISC-V memory model (and likely won't any time soon) it's hard to reason
> about the performance of anything like this, but at a bare minimum
> having fair locks is a pretty big positive and ticket locks should have
> very little overhead while providing fairness.
>
> IMO the decision between ticket and queueing locks is really more of a
> property of the hardware/workload than the ISA, though there are of
> course some pretty deep ISA dependencies than can make one saner than
> the other.  It seems best to me to just allow users to pick their own
> flavor of locks, and at least PPC is already doing that.  I threw
> together a quick asm-generic ticket lock that can be selected at compile
> time, but I want to spend some more time playing with the other
> architectures before sending anything out.

This discussion came up again a few weeks ago because I've been stumbling
over the test-and-set implementation and was wondering whether nobody had
cared to improve it yet.
Then I saw that there have been a few attempts so far, but they did not land.
So I brought this up in RVI's platform group meeting, and the attendees
showed strong interest in getting at least fairness. I assume Guo sent out
his new patchset as a reaction to this call (1 or 2 days later).

We have the same situation in OpenSBI, where we've agreed (with Anup)
to go for a ticket lock implementation.
A series for that can be found here (the implementation was tested in
the kernel):
http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html

In the mentioned RVI call, I asked whether ticket locks or MCF locks
are preferred, and the feedback was to go for qspinlock/qrwlock.
One good argument for doing so is not having to maintain an RV-specific
implementation, but to reuse a well-tested in-kernel mechanism.

The feedback in the call is also aligned with the previous attempts to
enable MCF locks on RV. However, the kernel's implementation requires
sub-word atomics, and this is where we are.
The discussion last week was about changing the generic kernel code to
loosen its requirements (not accepted because of performance regressions
on e.g. x86) and whether RV can actually provide sub-word atomics in the
form of LL/SC loops (yes, that's possible).
Providing a sub-word xchg() can be done within a couple of hours (some
solutions are already on the list), but that has not been enough ever
since the initial patchset from Michael (e.g. Christoph Hellwig asked
back then for moving the arch-independent parts into lib, which is a
good idea given that other archs do the same).
So I expect this might take some more time until there is a solution
that's accepted by a broad range of maintainers.

I've been working on a new MCF-lock series last week.
It is working, but I did not publish it yet, because I wanted to go
through the 130+ emails on the riscv-linux list, check for overlooked
review comments, and validate the patch authors.
You can find the current state here:
https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
So, if you insist on ticket locks, then let's coordinate who will do
that and how it will be tested (RV32/RV64, QEMU vs. real hw).
Peter Zijlstra April 12, 2021, 2:51 p.m. UTC | #11
Please fix your mailer to properly flow text. Reflowed it for you.

On Mon, Apr 12, 2021 at 03:32:27PM +0200, Christoph Müllner wrote:

> This discussion came up again a few weeks ago because I've been
> stumbling over the test-and-set implementation and was wondering if
> nobody cared to improve that yet.

> Then I saw, that there have been a few attempts so far, but they did
> not land.  So I brought this up in RVI's platform group meeting and
> the attendees showed big interest to get at least fairness. I assume
> Guo sent out his new patchset as a reaction to this call (1 or 2 days
> later).
> 
> We have the same situation on OpenSBI, where we've agreed (with Anup)
> to go for a ticket lock implementation.  A series for that can be
> found here (the implementation was tested in the kernel):
> http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
> 
> In the mentioned RVI call, I've asked the question if ticket locks or
> MCF locks are preferred.  And the feedback was to go for
> qspinlock/qrwlock. One good argument to do so would be, to not have to
> maintain an RV-specific implementation, but to use a well-tested
> in-kernel mechanism.

qrwlock does not depend on qspinlock; any fair spinlock implementation
works, including ticket.

> The feedback in the call is also aligned with the previous attempts to
> enable MCF-locks on RV.  However, the kernel's implementation requires
> sub-word atomics. And here is, where we are.  The discussion last week
> was about changing the generic kernel code to loosen its requirements
> (not accepted because of performance regressions on e.g. x86) and if
> RV actually can provide sub-word atomics in form of LL/SC loops (yes,
> that's possible).

So qspinlock is a complex and fickle beast. Yes it works on x86 and
arm64 (Will and Catalin put a _lot_ of effort into that), but IMO using
such a complex thing needs to be provably better than the trivial and
simple thing (tickets, test-and-set).

Part of that is fwd progress, if you don't have that, stay with
test-and-set. Will fixed a number of issues there, and -RT actually hit
at least one.

Debugging non-deterministic lock behaviour is not on the fun list.
Having something simple (ticket) to fall back to to prove it's your lock
going 'funneh' is very useful.

> Providing sub-word xchg() can be done within a couple of hours (some
> solutions are already on the list), but that was not enough from the
> initial patchset from Michael on (e.g. Christoph Hellwig asked back
> then for moving arch-independent parts into lib, which is a good idea
> given other archs do the same).  So I expect this might require some
> more time until there is a solution, that's accepted by a broad range
> of maintainers.

Using a lib/ cmpxchg()-based xchg16 is daft, per the very same arguments I
made elsewhere in the thread. cmpxchg()-based loops have very difficult
fwd progress guarantees, especially so on LL/SC architectures.

What I would do is create a common inline helper to compute that {addr,
mask, val} setup with a comment on how to use it.

(As is, we have at least 2 different ways of dealing with ENDIAN-ness)
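Something along those lines (the names here are invented for illustration;
this is not an existing kernel helper) could look like:

struct subword_loc {
	u32 *word;	/* aligned 32-bit word containing the sub-word */
	int shift;	/* bit offset of the sub-word inside the word  */
	u32 mask;	/* mask selecting the sub-word                 */
};

static inline struct subword_loc subword_locate(void *ptr, unsigned int size)
{
	unsigned long addr = (unsigned long)ptr;
	struct subword_loc loc;

	loc.word  = (u32 *)(addr & ~0x3UL);
	loc.shift = (addr & 0x3) * BITS_PER_BYTE;
#ifdef __BIG_ENDIAN
	loc.shift = BITS_PER_TYPE(u32) - size * BITS_PER_BYTE - loc.shift;
#endif
	loc.mask  = ((1U << (size * BITS_PER_BYTE)) - 1) << loc.shift;
	return loc;
}

An LL/SC-based xchg16() can then be written as a single lr.w/sc.w loop on
loc.word using loc.mask and loc.shift.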

> I've been working on a new MCF-lock series last week.  It is working,
> but I did not publish it yet, because I wanted to go through the 130+
> emails on the riscv-linux list and check for overseen review comments
> and validate the patch authors.

> You can find the current state here:
> https://github.com/cmuellner/linux/pull/new/riscv-spinlocks 

That's not public. But if that's not qspinlock, how are you justifying a
complex spinlock implementation? Does it perform better than ticket?

> So, if you insist on ticket locks, then let's coordinate who will do
> that and how it will be tested (RV32/RV64, qemu vs real hw).

Real hardware is all that counts :-)
Palmer Dabbelt April 12, 2021, 5:33 p.m. UTC | #12
On Mon, 12 Apr 2021 06:32:27 PDT (-0700), christophm30@gmail.com wrote:
> On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>
>> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
>> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
>> >>
>> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
>> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
>> >> > >
>> >> > > From: Guo Ren <guoren@linux.alibaba.com>
>> >> > >
>> >> > > This patch introduces a ticket lock implementation for riscv, along the
>> >> > > same lines as the implementation for arch/arm & arch/csky.
>> >> > >
>> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
>> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
>> >> > > Cc: Will Deacon <will.deacon@arm.com>
>> >> > > Cc: Peter Zijlstra <peterz@infradead.org>
>> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
>> >> > > Cc: Anup Patel <anup@brainfault.org>
>> >> > > Cc: Arnd Bergmann <arnd@arndb.de>
>> >> > > ---
>> >> > >  arch/riscv/Kconfig                      |   1 +
>> >> > >  arch/riscv/include/asm/Kbuild           |   1 +
>> >> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
>> >> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
>> >> >
>> >> > NACK from myside.
>> >> >
>> >> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
>> >> >
>> >> > We should directly go for qspinlock.
>> >>
>> >> I think it is a sensible intermediate step, even if you want to go
>> >> qspinlock. Ticket locks are more or less trivial and get you fairness
>> >> and all that goodness without the mind bending complexity of qspinlock.
>> >>
>> >> Once you have the ticket lock implementation solid (and qrwlock) and
>> >> everything, *then* start to carefully look at qspinlock.
>> >
>> > I do understand qspinlock are relatively complex but the best thing
>> > about qspinlock is it tries to ensure each CPU spins on it's own location.
>> >
>> > Instead of adding ticket spinlock now and later replacing it with qspinlock,
>> > it is better to straight away explore qspinlock hence my NACK.
>> >
>> >>
>> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on
>> >> weak architectures, but if you want to do it right, you still have to
>> >> analyze the whole thing for your own architecture.
>> >
>> > Most of the RISC-V implementations are weak memory ordering so it
>> > makes more sense to explore qspinlock first.
>>
>> I know I'm somewhat late to the party here.  I talked with Will (and
>> to a lesser extent Peter) about this a week or two ago and it seems the
>> best way to go here is to start with ticket locks.  They're simpler, and
>> in Arm land they performed better until we got to the larger systems.
>> Given that we don't have any high performance implementations of the
>> RISC-V memory model (and likely won't any time soon) it's hard to reason
>> about the performance of anything like this, but at a bare minimum
>> having fair locks is a pretty big positive and ticket locks should have
>> very little overhead while providing fairness.
>>
>> IMO the decision between ticket and queueing locks is really more of a
>> property of the hardware/workload than the ISA, though there are of
>> course some pretty deep ISA dependencies than can make one saner than
>> the other.  It seems best to me to just allow users to pick their own
>> flavor of locks, and at least PPC is already doing that.  I threw
>> together a quick asm-generic ticket lock that can be selected at compile
>> time, but I want to spend some more time playing with the other
>> architectures before sending anything out.
>
> This discussion came up again a few weeks ago because I've been stumbling over
> the test-and-set implementation and was wondering if nobody cared to
> improve that yet.
> Then I saw, that there have been a few attempts so far, but they did not land.
> So I brought this up in RVI's platform group meeting and the attendees
> showed big
> interest to get at least fairness. I assume Guo sent out his new
> patchset as a reaction
> to this call (1 or 2 days later).
>
> We have the same situation on OpenSBI, where we've agreed (with Anup)
> to go for a ticket lock implementation.
> A series for that can be found here (the implementation was tested in
> the kernel):
> http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
>
> In the mentioned RVI call, I've asked the question if ticket locks or
> MCF locks are preferred.
> And the feedback was to go for qspinlock/qrwlock. One good argument to
> do so would be,
> to not have to maintain an RV-specific implementation, but to use a
> well-tested in-kernel mechanism.
>
> The feedback in the call is also aligned with the previous attempts to
> enable MCF-locks on RV.
> However, the kernel's implementation requires sub-word atomics. And
> here is, where we are.
> The discussion last week was about changing the generic kernel code to
> loosen its requirements
> (not accepted because of performance regressions on e.g. x86) and if
> RV actually can provide
> sub-word atomics in form of LL/SC loops (yes, that's possible).
> Providing sub-word xchg() can be done within a couple of hours (some
> solutions are already on the list),
> but that was not enough from the initial patchset from Michael on
> (e.g. Christoph Hellwig asked back then
> for moving arch-independent parts into lib, which is a good idea given
> other archs do the same).
> So I expect this might require some more time until there is a
> solution, that's accepted by
> a broad range of maintainers.
>
> I've been working on a new MCF-lock series last week.
> It is working, but I did not publish it yet, because I wanted to go
> through the 130+ emails
> on the riscv-linux list and check for overseen review comments and
> validate the patch authors.
> You can find the current state here:
> https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
> So, if you insist on ticket locks, then let's coordinate who will do
> that and how it will be tested
> (RV32/RV64, qemu vs real hw).

My plan is to add a generic ticket-based lock, which can be selected at 
compile time.  It'll have no architecture dependencies (though it'll 
likely have some hooks for architectures that can make this go faster).  
Users can then just pick which spinlock flavor they want, with the idea 
being that smaller systems will perform better with ticket locks and 
larger systems will perform better with queued locks.  The main goal 
here is to give the less widely used architectures an easy way to have 
fair locks, as right now we've got a lot of code duplication because any 
architecture that wants ticket locks has to do it themselves.

I'm not really convinced we have the ability to discuss the performance 
of locks on RISC-V right now, at least in any meaningful capacity.  The 
set of systems we have right now is just so far off from having a 
competitive memory system that optimizing for them doesn't seem to be 
worth the time, and as there really aren't any users we don't know what 
workloads people care about.  I'm mostly interested in just keeping the 
implementation simple, and ticket locks are the simplest spinlock flavor 
I know of that's fair (I think we can all agree that unfair locks are 
going to be a disaster).

There are certainly classes of systems for which ticket locks will be 
the wrong choice, and for those it makes sense to use the generic 
qspinlock implementation.  We'll likely need some changes to make that 
go fast on RISC-V, but we won't know what those are until we have 
hardware.  For now just having something that works (while staying away 
from anything that's obviously horribly inefficient) is as good as we 
can do, so I'm perfectly happy to take whatever we need to enable 
qspinlock on RISC-V.

I'll likely default to the ticket locks on RISC-V as it's simpler, but 
my main goal is to just get rid of the code duplication.  IMO the 
correct lock flavor is really something that's tightly coupled to both 
the microarchitecture and workload, but given how poorly the current 
crop of implementations performs on anything parallel I'm more swayed by 
simplicity.  When we end up with real hardware I'll be happy to 
re-evaluate that choice, but I don't think it's all that important today 
because we're going to need a whole new generation of implementations 
(and likely at least some ISA clarifications, as whether anything based 
on cmpxchg can be fair right now is still an open question) before we 
have anything competitive.

If you guys want to go throw a "thou shalt make queued spinlocks better 
than ticket spinlocks" (or the other way around, I don't really 
personally care all that much about spinlock flavors) in the platform 
spec that's fine.  Performance knobs are always a headache so getting 
rid of one would be great, but ultimately this is a microarchitecture 
thing so we'll have to see what gets implemented.
Christoph Muellner April 12, 2021, 9:21 p.m. UTC | #13
On Mon, Apr 12, 2021 at 4:52 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
>
> Please fix your mailer to properly flow text. Reflowed it for you.
>
> On Mon, Apr 12, 2021 at 03:32:27PM +0200, Christoph Müllner wrote:
>
> > This discussion came up again a few weeks ago because I've been
> > stumbling over the test-and-set implementation and was wondering if
> > nobody cared to improve that yet.
>
> > Then I saw, that there have been a few attempts so far, but they did
> > not land.  So I brought this up in RVI's platform group meeting and
> > the attendees showed big interest to get at least fairness. I assume
> > Guo sent out his new patchset as a reaction to this call (1 or 2 days
> > later).
> >
> > We have the same situation on OpenSBI, where we've agreed (with Anup)
> > to go for a ticket lock implementation.  A series for that can be
> > found here (the implementation was tested in the kernel):
> > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
> >
> > In the mentioned RVI call, I've asked the question if ticket locks or
> > MCF locks are preferred.  And the feedback was to go for
> > qspinlock/qrwlock. One good argument to do so would be, to not have to
> > maintain an RV-specific implementation, but to use a well-tested
> > in-kernel mechanism.
>
> qrwlock does not depend on qspinlock; any fair spinlock implementation
> works, including ticket.
>
> > The feedback in the call is also aligned with the previous attempts to
> > enable MCF-locks on RV.  However, the kernel's implementation requires
> > sub-word atomics. And here is, where we are.  The discussion last week
> > was about changing the generic kernel code to loosen its requirements
> > (not accepted because of performance regressions on e.g. x86) and if
> > RV actually can provide sub-word atomics in form of LL/SC loops (yes,
> > that's possible).
>
> So qspinlock is a complex and fickle beast. Yes it works on x86 and
> arm64 (Will and Catalin put a _lot_ of effort into that), but IMO using
> such a complex thing needs to be provably better than the trivial and
> simple thing (tickets, test-and-set).
>
> Part of that is fwd progress, if you don't have that, stay with
> test-and-set. Will fixed a number of issues there, and -RT actually hit
> at least one.
>
> Debugging non-deterministic lock behaviour is not on the fun list.
> Having something simple (ticket) to fall back to to prove it's your lock
> going 'funneh' is very useful.
>
> > Providing sub-word xchg() can be done within a couple of hours (some
> > solutions are already on the list), but that was not enough from the
> > initial patchset from Michael on (e.g. Christoph Hellwig asked back
> > then for moving arch-independent parts into lib, which is a good idea
> > given other archs do the same).  So I expect this might require some
> > more time until there is a solution, that's accepted by a broad range
> > of maintainers.
>
> Using a lib/ cmpxchg based xchg16 is daft. Per the very same arguments I
> made elsewhere in the thread. cmpxchg() based loops have very difficult
> fwd progress guarantees, esp. so on LL/SC architectures.
>
> What I would do is create a common inline helper to compute that {addr,
> mask, val} setup with a comment on how to use it.
>
> (As is, we have at least 2 different ways of dealing with ENDIAN-ness)

Well, that's what I have here:
https://github.com/cmuellner/linux/commit/9d2f6449dd848b5723326ae8be34a3d2d41dcff1

> > I've been working on a new MCF-lock series last week.  It is working,
> > but I did not publish it yet, because I wanted to go through the 130+
> > emails on the riscv-linux list and check for overseen review comments
> > and validate the patch authors.
>
> > You can find the current state here:
> > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
>
> That's not public. But if that's not qspinlock, how are you justifying a
> complex spinlock implementation? Does it perform better than ticket?

I pasted the wrong link. Here is a working one:
https://github.com/cmuellner/linux/tree/riscv-spinlocks
It is basically Guo's v4 with mask-and-set within an LL/SC loop,
the commits split up, the non-riscv commits dropped, and the commit
messages rewritten.

I fully understand your reservations against using MCF locks.
I also agree that debugging locking issues is no fun.

FWIW:
I've been debugging quite a few embedded Linux boards over the last years,
where essential basic building blocks showed unreliable/erratic behavior
(caused e.g. by an unstable voltage supply) and every attempt to monitor
the bug caused it to disappear.

Ticket locks are also fine for me. Still, it would be nice to get the
16-bit xchg() merged, so advanced users can try qspinlocks without much
effort. Otherwise, we'll just suspend the discussion now and restart it
from the beginning in a year (as is happening right now).

> > So, if you insist on ticket locks, then let's coordinate who will do
> > that and how it will be tested (RV32/RV64, qemu vs real hw).
>
> Real hardware is all that counts :-)

Fully agree, that is why I wrote that.
Christoph Muellner April 12, 2021, 9:54 p.m. UTC | #14
On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Mon, 12 Apr 2021 06:32:27 PDT (-0700), christophm30@gmail.com wrote:
> > On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> >>
> >> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
> >> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >> >>
> >> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> >> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> >> >> > >
> >> >> > > From: Guo Ren <guoren@linux.alibaba.com>
> >> >> > >
> >> >> > > This patch introduces a ticket lock implementation for riscv, along the
> >> >> > > same lines as the implementation for arch/arm & arch/csky.
> >> >> > >
> >> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> >> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> >> >> > > Cc: Will Deacon <will.deacon@arm.com>
> >> >> > > Cc: Peter Zijlstra <peterz@infradead.org>
> >> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> >> >> > > Cc: Anup Patel <anup@brainfault.org>
> >> >> > > Cc: Arnd Bergmann <arnd@arndb.de>
> >> >> > > ---
> >> >> > >  arch/riscv/Kconfig                      |   1 +
> >> >> > >  arch/riscv/include/asm/Kbuild           |   1 +
> >> >> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> >> >> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> >> >> >
> >> >> > NACK from myside.
> >> >> >
> >> >> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
> >> >> >
> >> >> > We should directly go for qspinlock.
> >> >>
> >> >> I think it is a sensible intermediate step, even if you want to go
> >> >> qspinlock. Ticket locks are more or less trivial and get you fairness
> >> >> and all that goodness without the mind bending complexity of qspinlock.
> >> >>
> >> >> Once you have the ticket lock implementation solid (and qrwlock) and
> >> >> everything, *then* start to carefully look at qspinlock.
> >> >
> >> > I do understand qspinlock are relatively complex but the best thing
> >> > about qspinlock is it tries to ensure each CPU spins on it's own location.
> >> >
> >> > Instead of adding ticket spinlock now and later replacing it with qspinlock,
> >> > it is better to straight away explore qspinlock hence my NACK.
> >> >
> >> >>
> >> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on
> >> >> weak architectures, but if you want to do it right, you still have to
> >> >> analyze the whole thing for your own architecture.
> >> >
> >> > Most of the RISC-V implementations are weak memory ordering so it
> >> > makes more sense to explore qspinlock first.
> >>
> >> I know I'm somewhat late to the party here.  I talked with Will (and
> >> to a lesser extent Peter) about this a week or two ago and it seems the
> >> best way to go here is to start with ticket locks.  They're simpler, and
> >> in Arm land they performed better until we got to the larger systems.
> >> Given that we don't have any high performance implementations of the
> >> RISC-V memory model (and likely won't any time soon) it's hard to reason
> >> about the performance of anything like this, but at a bare minimum
> >> having fair locks is a pretty big positive and ticket locks should have
> >> very little overhead while providing fairness.
> >>
> >> IMO the decision between ticket and queueing locks is really more of a
> >> property of the hardware/workload than the ISA, though there are of
> >> course some pretty deep ISA dependencies than can make one saner than
> >> the other.  It seems best to me to just allow users to pick their own
> >> flavor of locks, and at least PPC is already doing that.  I threw
> >> together a quick asm-generic ticket lock that can be selected at compile
> >> time, but I want to spend some more time playing with the other
> >> architectures before sending anything out.
> >
> > This discussion came up again a few weeks ago because I've been stumbling over
> > the test-and-set implementation and was wondering if nobody cared to
> > improve that yet.
> > Then I saw, that there have been a few attempts so far, but they did not land.
> > So I brought this up in RVI's platform group meeting and the attendees
> > showed big
> > interest to get at least fairness. I assume Guo sent out his new
> > patchset as a reaction
> > to this call (1 or 2 days later).
> >
> > We have the same situation on OpenSBI, where we've agreed (with Anup)
> > to go for a ticket lock implementation.
> > A series for that can be found here (the implementation was tested in
> > the kernel):
> > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
> >
> > In the mentioned RVI call, I've asked the question if ticket locks or
> > MCF locks are preferred.
> > And the feedback was to go for qspinlock/qrwlock. One good argument to
> > do so would be,
> > to not have to maintain an RV-specific implementation, but to use a
> > well-tested in-kernel mechanism.
> >
> > The feedback in the call is also aligned with the previous attempts to
> > enable MCF-locks on RV.
> > However, the kernel's implementation requires sub-word atomics. And
> > here is, where we are.
> > The discussion last week was about changing the generic kernel code to
> > loosen its requirements
> > (not accepted because of performance regressions on e.g. x86) and if
> > RV actually can provide
> > sub-word atomics in form of LL/SC loops (yes, that's possible).
> > Providing sub-word xchg() can be done within a couple of hours (some
> > solutions are already on the list),
> > but that was not enough from the initial patchset from Michael on
> > (e.g. Christoph Hellwig asked back then
> > for moving arch-independent parts into lib, which is a good idea given
> > other archs do the same).
> > So I expect this might require some more time until there is a
> > solution, that's accepted by
> > a broad range of maintainers.
> >
> > I've been working on a new MCF-lock series last week.
> > It is working, but I did not publish it yet, because I wanted to go
> > through the 130+ emails
> > on the riscv-linux list and check for overseen review comments and
> > validate the patch authors.
> > You can find the current state here:
> > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
> > So, if you insist on ticket locks, then let's coordinate who will do
> > that and how it will be tested
> > (RV32/RV64, qemu vs real hw).
>
> My plan is to add a generic ticket-based lock, which can be selected at
> compile time.  It'll have no architecture dependencies (though it'll
> likely have some hooks for architectures that can make this go faster).
> Users can then just pick which spinlock flavor they want, with the idea
> being that smaller systems will perform better with ticket locks and
> larger systems will perform better with queued locks.  The main goal
> here is to give the less widely used architectures an easy way to have
> fair locks, as right now we've got a lot of code duplication because any
> architecture that wants ticket locks has to do it themselves.

In the case of LL/SC sequences, we have a maximum of 16 instructions
on RISC-V. My concern with a pure-C implementation would be that
we cannot guarantee this (e.g. somebody wants to compile with -O0)
and I don't know of a way to abort the build in case this limit is exceeded.
Therefore I have preferred inline assembly for OpenSBI (my initial idea
was to use closure-like LL/SC macros, where you can write the loop
in the form of C code).

In case you care, here is the ticket lock code from the OpenSBI patchset
ported over to Linux:
https://github.com/cmuellner/linux/commit/40a41d561df71fbe247016b303a1ef91bf9702f3

> I'm not really convinced we have the ability to discuss the performance
> of locks on RISC-V right now, at least in any meaningful capacity.  The
> set of systems we have right now are just so far off from having a
> competitive memory system that optimizing for them doesn't seem to be
> worth the time, and as there really aren't any users we don't know what
> workloads people care about.  I'm mostly interested in just keeping the
> implementation simple, and ticket locks are the simplest spinlock flavor
> I know of that's fair (I think we can all agree that unfair locks are
> going to be a disaster).
>
> There are certainly classes of systems for which ticket locks will be
> the wrong choice, and for those it makes sense to use the generic
> qspinlock implementation.  We'll likely need some changes to make that
> go fast on RISC-V, but we won't know what those are until we have
> hardware.  For now just having something that works (while staying away
> from anything that's obviously horribly inefficient) is as good as we
> can do, so I'm perfectly happy to take whatever we need to enable
> qspinlock on RISC-V.
>
> I'll likely default to the ticket locks on RISC-V as it's simpler, but
> my main goal is to just get rid of the code duplication.  IMO the
> correct lock flavor is really something that's tightly coupled to both
> the microarchitecture and workload, but given how poorly the current
> crop of implementations performs on anything parallel I'm more swayed by
> simplicity.  When we end up with real hardware I'll be happy to
> re-evaluate that choice, but I don't think it's all that important today
> because we're going to need a whole new generation of implementations
> (and likely at least some ISA clarifications, as whether anything based
> on cmpxchg can be fair right now is still an open question) before we
> have anything competitive.
>
> If you guys want to go throw a "thou shalt make queued spinlocks better
> than ticket spinlocks" (or the other way around, I don't really
> personally care all that much about spinlock flavors) in the platform
> spec that's fine.  Performance knobs are always a headache so getting
> rid of one would be great, but ultimately this is a microarchitecture
> thing so we'll have to see what gets implemented.

No, the implementation details of the spinlock algorithm are not part of
any platform specification. It was discussed in this forum because I did
not know a better place to ask this question. We all expect the RISC-V
kernel maintainer(s) to make a proper choice in the end. And as you said,
this can be re-evaluated in the future (based on results on real HW).
Peter Zijlstra April 13, 2021, 8:03 a.m. UTC | #15
On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:

> > My plan is to add a generic ticket-based lock, which can be selected at
> > compile time.  It'll have no architecture dependencies (though it'll
> > likely have some hooks for architectures that can make this go faster).
> > Users can then just pick which spinlock flavor they want, with the idea
> > being that smaller systems will perform better with ticket locks and
> > larger systems will perform better with queued locks.  The main goal
> > here is to give the less widely used architectures an easy way to have
> > fair locks, as right now we've got a lot of code duplication because any
> > architecture that wants ticket locks has to do it themselves.
> 
> In the case of LL/SC sequences, we have a maximum of 16 instructions
> on RISC-V. My concern with a pure-C implementation would be that
> we cannot guarantee this (e.g. somebody wants to compile with -O0)
> and I don't know of a way to abort the build in case this limit exceeds.
> Therefore I have preferred inline assembly for OpenSBI (my initial idea
> was to use closure-like LL/SC macros, where you can write the loop
> in form of C code).

For ticket locks you really only needs atomic_fetch_add() and
smp_store_release() and an architectural guarantees that the
atomic_fetch_add() has fwd progress under contention and that a sub-word
store (through smp_store_release()) will fail the SC.

Then you can do something like:

void lock(atomic_t *lock)
{
	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
	u16 ticket = val >> 16;

	for (;;) {
		if (ticket == (u16)val)
			break;
		cpu_relax();
		val = atomic_read_acquire(lock);
	}
}

void unlock(atomic_t *lock)
{
	u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
	u32 val = atomic_read(lock);

	smp_store_release(ptr, (u16)val + 1);
}

That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
on x86 for not being allowed to use a memop on unlock, since it's being
forced into a load-store because of all the volatile, but whatever.
Peter Zijlstra April 13, 2021, 8:17 a.m. UTC | #16
On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:

> For ticket locks you really only needs atomic_fetch_add() and
> smp_store_release() and an architectural guarantees that the
> atomic_fetch_add() has fwd progress under contention and that a sub-word
> store (through smp_store_release()) will fail the SC.
> 
> Then you can do something like:
> 
> void lock(atomic_t *lock)
> {
> 	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> 	u16 ticket = val >> 16;
> 
> 	for (;;) {
> 		if (ticket == (u16)val)
> 			break;
> 		cpu_relax();
> 		val = atomic_read_acquire(lock);
> 	}

A possibly better might be:

	if (ticket == (u16)val)
		return;

	atomic_cond_read_acquire(lock, ticket == (u16)VAL);

Since that allows architectures to use WFE like constructs.

> }
> 
> void unlock(atomic_t *lock)
> {
> 	u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> 	u32 val = atomic_read(lock);
> 
> 	smp_store_release(ptr, (u16)val + 1);
> }
> 
> That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> on x86 for not being allowed to use a memop on unlock, since its being
> forced into a load-store because of all the volatile, but whatever.
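
Put together, the lock side from the two snippets above would roughly be
(a sketch; VAL is the variable that atomic_cond_read_acquire() /
smp_cond_load_acquire() expose to the condition expression):

void lock(atomic_t *lock)
{
	u32 val = atomic_fetch_add(1<<16, lock);	/* fully ordered, RCsc */
	u16 ticket = val >> 16;

	if (ticket == (u16)val)
		return;					/* uncontended fast path */

	/* allows architectures to wait with WFE-like constructs */
	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
}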
Christoph Muellner April 13, 2021, 9:22 a.m. UTC | #17
On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> > > My plan is to add a generic ticket-based lock, which can be selected at
> > > compile time.  It'll have no architecture dependencies (though it'll
> > > likely have some hooks for architectures that can make this go faster).
> > > Users can then just pick which spinlock flavor they want, with the idea
> > > being that smaller systems will perform better with ticket locks and
> > > larger systems will perform better with queued locks.  The main goal
> > > here is to give the less widely used architectures an easy way to have
> > > fair locks, as right now we've got a lot of code duplication because any
> > > architecture that wants ticket locks has to do it themselves.
> >
> > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > on RISC-V. My concern with a pure-C implementation would be that
> > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > and I don't know of a way to abort the build in case this limit exceeds.
> > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > was to use closure-like LL/SC macros, where you can write the loop
> > in form of C code).
>
> For ticket locks you really only needs atomic_fetch_add() and
> smp_store_release() and an architectural guarantees that the
> atomic_fetch_add() has fwd progress under contention and that a sub-word
> store (through smp_store_release()) will fail the SC.
>
> Then you can do something like:
>
> void lock(atomic_t *lock)
> {
>         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
>         u16 ticket = val >> 16;
>
>         for (;;) {
>                 if (ticket == (u16)val)
>                         break;
>                 cpu_relax();
>                 val = atomic_read_acquire(lock);
>         }
> }
>
> void unlock(atomic_t *lock)
> {
>         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
>         u32 val = atomic_read(lock);
>
>         smp_store_release(ptr, (u16)val + 1);
> }
>
> That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> on x86 for not being allowed to use a memop on unlock, since its being
> forced into a load-store because of all the volatile, but whatever.

What about trylock()?
I.e. one could implement trylock() without a loop, by letting
trylock() fail if the SC fails.
That looks safe at first glance, but nobody does this right now.
Catalin Marinas April 13, 2021, 9:30 a.m. UTC | #18
On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > likely have some hooks for architectures that can make this go faster).
> > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > being that smaller systems will perform better with ticket locks and
> > > > larger systems will perform better with queued locks.  The main goal
> > > > here is to give the less widely used architectures an easy way to have
> > > > fair locks, as right now we've got a lot of code duplication because any
> > > > architecture that wants ticket locks has to do it themselves.
> > >
> > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > on RISC-V. My concern with a pure-C implementation would be that
> > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > and I don't know of a way to abort the build in case this limit exceeds.
> > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > was to use closure-like LL/SC macros, where you can write the loop
> > > in form of C code).
> >
> > For ticket locks you really only needs atomic_fetch_add() and
> > smp_store_release() and an architectural guarantees that the
> > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > store (through smp_store_release()) will fail the SC.
> >
> > Then you can do something like:
> >
> > void lock(atomic_t *lock)
> > {
> >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> >         u16 ticket = val >> 16;
> >
> >         for (;;) {
> >                 if (ticket == (u16)val)
> >                         break;
> >                 cpu_relax();
> >                 val = atomic_read_acquire(lock);
> >         }
> > }
> >
> > void unlock(atomic_t *lock)
> > {
> >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> >         u32 val = atomic_read(lock);
> >
> >         smp_store_release(ptr, (u16)val + 1);
> > }
> >
> > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > on x86 for not being allowed to use a memop on unlock, since its being
> > forced into a load-store because of all the volatile, but whatever.
> 
> What about trylock()?
> I.e. one could implement trylock() without a loop, by letting
> trylock() fail if the SC fails.
> That looks safe on first view, but nobody does this right now.

Not familiar with RISC-V but I'd recommend that a trylock only fails if
the lock is locked (after LR). A SC may fail for other reasons
(cacheline eviction; depending on the microarchitecture) even if the
lock is unlocked. At least on arm64 we had this issue with an
implementation having a tendency to always fail the first STXR.
Peter Zijlstra April 13, 2021, 9:35 a.m. UTC | #19
On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:

> > For ticket locks you really only needs atomic_fetch_add() and
> > smp_store_release() and an architectural guarantees that the
> > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > store (through smp_store_release()) will fail the SC.
> >
> > Then you can do something like:
> >
> > void lock(atomic_t *lock)
> > {
> >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> >         u16 ticket = val >> 16;
> >
> >         for (;;) {
> >                 if (ticket == (u16)val)
> >                         break;
> >                 cpu_relax();
> >                 val = atomic_read_acquire(lock);
> >         }
> > }
> >
> > void unlock(atomic_t *lock)
> > {
> >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> >         u32 val = atomic_read(lock);
> >
> >         smp_store_release(ptr, (u16)val + 1);
> > }
> >
> > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > on x86 for not being allowed to use a memop on unlock, since its being
> > forced into a load-store because of all the volatile, but whatever.
> 
> What about trylock()?
> I.e. one could implement trylock() without a loop, by letting
> trylock() fail if the SC fails.
> That looks safe on first view, but nobody does this right now.

Generic code has to use cmpxchg(), and then you get something like this:

bool trylock(atomic_t *lock)
{
	u32 old = atomic_read(lock);

	if ((old >> 16) != (old & 0xffff))
		return false;

	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
}

That will try and do the full LL/SC loop, because it wants to complete
the cmpxchg, but in generic code we have no other option.

(Is this what C11's weak cmpxchg is for?)
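
(Side note: C11's weak compare-exchange is indeed allowed to fail
spuriously, so it can map onto a single LR/SC attempt instead of a nested
loop. A user-space-style sketch of the same trylock, not kernel code:)

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* same ticket layout as above: next in the high half, owner in the low half */
static bool trylock_weak(_Atomic uint32_t *lock)
{
	uint32_t old = atomic_load_explicit(lock, memory_order_relaxed);

	if ((old >> 16) != (old & 0xffff))
		return false;				/* already taken */

	/* weak CAS: allowed to fail spuriously, i.e. one LR/SC attempt */
	return atomic_compare_exchange_weak_explicit(lock, &old,
			old + (1u << 16),
			memory_order_acquire, memory_order_relaxed);
}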
Christoph Muellner April 13, 2021, 9:55 a.m. UTC | #20
On Tue, Apr 13, 2021 at 11:31 AM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > > likely have some hooks for architectures that can make this go faster).
> > > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > > being that smaller systems will perform better with ticket locks and
> > > > > larger systems will perform better with queued locks.  The main goal
> > > > > here is to give the less widely used architectures an easy way to have
> > > > > fair locks, as right now we've got a lot of code duplication because any
> > > > > architecture that wants ticket locks has to do it themselves.
> > > >
> > > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > > on RISC-V. My concern with a pure-C implementation would be that
> > > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > > and I don't know of a way to abort the build in case this limit exceeds.
> > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > > was to use closure-like LL/SC macros, where you can write the loop
> > > > in form of C code).
> > >
> > > For ticket locks you really only needs atomic_fetch_add() and
> > > smp_store_release() and an architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >         u16 ticket = val >> 16;
> > >
> > >         for (;;) {
> > >                 if (ticket == (u16)val)
> > >                         break;
> > >                 cpu_relax();
> > >                 val = atomic_read_acquire(lock);
> > >         }
> > > }
> > >
> > > void unlock(atomic_t *lock)
> > > {
> > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > >         u32 val = atomic_read(lock);
> > >
> > >         smp_store_release(ptr, (u16)val + 1);
> > > }
> > >
> > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > on x86 for not being allowed to use a memop on unlock, since its being
> > > forced into a load-store because of all the volatile, but whatever.
> >
> > What about trylock()?
> > I.e. one could implement trylock() without a loop, by letting
> > trylock() fail if the SC fails.
> > That looks safe on first view, but nobody does this right now.
>
> Not familiar with RISC-V but I'd recommend that a trylock only fails if
> the lock is locked (after LR). A SC may fail for other reasons
> (cacheline eviction; depending on the microarchitecture) even if the
> lock is unlocked. At least on arm64 we had this issue with an
> implementation having a tendency to always fail the first STXR.

Interesting data point.
Thanks!
Christoph Muellner April 13, 2021, 10:25 a.m. UTC | #21
On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
>
> > > For ticket locks you really only needs atomic_fetch_add() and
> > > smp_store_release() and an architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >         u16 ticket = val >> 16;
> > >
> > >         for (;;) {
> > >                 if (ticket == (u16)val)
> > >                         break;
> > >                 cpu_relax();
> > >                 val = atomic_read_acquire(lock);
> > >         }
> > > }
> > >
> > > void unlock(atomic_t *lock)
> > > {
> > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > >         u32 val = atomic_read(lock);
> > >
> > >         smp_store_release(ptr, (u16)val + 1);
> > > }
> > >
> > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > on x86 for not being allowed to use a memop on unlock, since its being
> > > forced into a load-store because of all the volatile, but whatever.
> >
> > What about trylock()?
> > I.e. one could implement trylock() without a loop, by letting
> > trylock() fail if the SC fails.
> > That looks safe on first view, but nobody does this right now.
>
> Generic code has to use cmpxchg(), and then you get something like this:
>
> bool trylock(atomic_t *lock)
> {
>         u32 old = atomic_read(lock);
>
>         if ((old >> 16) != (old & 0xffff))
>                 return false;
>
>         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> }

This approach requires two loads (atomic_read() and cmpxchg()), which
is more than necessary.
It is quite unlikely that a compiler would detect this pattern and optimize it.

A somewhat less generic solution would be to wrap the LL/SC instructions
(which would be mandatory in this case) and do something like this:

uint32_t __smp_load_acquire_reserved(void*);
int __smp_store_release_conditional(void*, uint32_t);

typedef union {
    uint32_t v32;
    struct {
        uint16_t owner;
        uint16_t next;
    };
} arch_spinlock_t;

int trylock(arch_spinlock_t *lock)
{
    arch_spinlock_t l;
    int success;
    do {
        l.v32 = __smp_load_acquire_reserved(lock);
        if (l.owner != l.next)
            return 0;
        l.next++;
        success = __smp_store_release_conditional(lock, l.v32);
    } while (!success);
    return success;
}

But here we can't control how the compiler generates the code between LL and SC...

>
> That will try and do the full LL/SC loop, because it wants to complete
> the cmpxchg, but in generic code we have no other option.
>
> (Is this what C11's weak cmpxchg is for?)
Catalin Marinas April 13, 2021, 10:45 a.m. UTC | #22
On Tue, Apr 13, 2021 at 12:25:00PM +0200, Christoph Müllner wrote:
> On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > > What about trylock()?
> > > I.e. one could implement trylock() without a loop, by letting
> > > trylock() fail if the SC fails.
> > > That looks safe on first view, but nobody does this right now.
> >
> > Generic code has to use cmpxchg(), and then you get something like this:
> >
> > bool trylock(atomic_t *lock)
> > {
> >         u32 old = atomic_read(lock);
> >
> >         if ((old >> 16) != (old & 0xffff))
> >                 return false;
> >
> >         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> > }
> 
> This approach requires two loads (atomic_read() and cmpxchg()), which
> is not required.
> Detecting this pattern and optimizing it in a compiler is quite unlikely.
> 
> A bit less generic solution would be to wrap the LL/SC (would be
> mandatory in this case)
> instructions and do something like this:
> 
> uint32_t __smp_load_acquire_reserved(void*);
> int __smp_store_release_conditional(void*, uint32_t);
> 
> typedef union {
>     uint32_t v32;
>     struct {
>         uint16_t owner;
>         uint16_t next;
>     };
> } arch_spinlock_t;
> 
> int trylock(arch_spinlock_t *lock)
> {
>     arch_spinlock_t l;
>     int success;
>     do {
>         l.v32 = __smp_load_acquire_reserved(lock);
>         if (l.owner != l.next)
>             return 0;
>         l.next++;
>         success = __smp_store_release_conditional(lock, l.v32);
>     } while (!success);
>     return success;
> }
> 
> But here we can't tell the compiler to optimize the code between LL and SC...

This indeed needs some care. IIUC RISC-V has similar restrictions as arm
here, no load/store instructions are allowed between LR and SC. You
can't guarantee that the compiler won't spill some variable onto the
stack.

BTW, I think the SC doesn't need release semantics above, only the LR
needs acquire semantics.
David Laight April 13, 2021, 10:54 a.m. UTC | #23
From: Catalin Marinas
> Sent: 13 April 2021 11:45
...
> This indeed needs some care. IIUC RISC-V has similar restrictions as arm
> here, no load/store instructions are allowed between LR and SC. You
> can't guarantee that the compiler won't spill some variable onto the
> stack.

You can probably never guarantee the compiler won't spill to stack.
Especially if someone compiles with -O0.

Which probably means that anything using LR/SC must be written in
asm and the C wrappers disabled.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Christoph Muellner April 13, 2021, 11:04 a.m. UTC | #24
On Tue, Apr 13, 2021 at 12:45 PM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Tue, Apr 13, 2021 at 12:25:00PM +0200, Christoph Müllner wrote:
> > On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > > > What about trylock()?
> > > > I.e. one could implement trylock() without a loop, by letting
> > > > trylock() fail if the SC fails.
> > > > That looks safe on first view, but nobody does this right now.
> > >
> > > Generic code has to use cmpxchg(), and then you get something like this:
> > >
> > > bool trylock(atomic_t *lock)
> > > {
> > >         u32 old = atomic_read(lock);
> > >
> > >         if ((old >> 16) != (old & 0xffff))
> > >                 return false;
> > >
> > >         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> > > }
> >
> > This approach requires two loads (atomic_read() and cmpxchg()), which
> > is not required.
> > Detecting this pattern and optimizing it in a compiler is quite unlikely.
> >
> > A bit less generic solution would be to wrap the LL/SC (would be
> > mandatory in this case)
> > instructions and do something like this:
> >
> > uint32_t __smp_load_acquire_reserved(void*);
> > int __smp_store_release_conditional(void*, uint32_t);
> >
> > typedef union {
> >     uint32_t v32;
> >     struct {
> >         uint16_t owner;
> >         uint16_t next;
> >     };
> > } arch_spinlock_t;
> >
> > int trylock(arch_spinlock_t *lock)
> > {
> >     arch_spinlock_t l;
> >     int success;
> >     do {
> >         l.v32 = __smp_load_acquire_reserved(lock);
> >         if (l.owner != l.next)
> >             return 0;
> >         l.next++;
> >         success = __smp_store_release_conditional(lock, l.v32);
> >     } while (!success);
> >     return success;
> > }
> >
> > But here we can't tell the compiler to optimize the code between LL and SC...
>
> This indeed needs some care. IIUC RISC-V has similar restrictions as arm
> here, no load/store instructions are allowed between LR and SC. You
> can't guarantee that the compiler won't spill some variable onto the
> stack.

RISC-V behaves similarly, but the specification is a bit more precise:
To guarantee forward progress, the ("constrained") LL/SC sequence has to
consist of <=16 instructions. Further, the "dynamic code executed between
the LR and SC instructions can only contain instructions from the base “I”
instruction set, excluding loads, stores, backward jumps, taken backward
branches, JALR, FENCE, and SYSTEM instructions".

And GCC generates a backward jump in-between LL and SC.
So we have more than enough reasons not to do it this way.

>
> BTW, I think the SC doesn't need release semantics above, only the LR
> needs acquire semantics.
>
> --
> Catalin
Guo Ren April 13, 2021, 1:19 p.m. UTC | #25
On Tue, Apr 13, 2021 at 6:25 PM Christoph Müllner
<christophm30@gmail.com> wrote:
>
> On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> >
> > > > For ticket locks you really only needs atomic_fetch_add() and
> > > > smp_store_release() and an architectural guarantees that the
> > > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > > store (through smp_store_release()) will fail the SC.
> > > >
> > > > Then you can do something like:
> > > >
> > > > void lock(atomic_t *lock)
> > > > {
> > > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > > >         u16 ticket = val >> 16;
> > > >
> > > >         for (;;) {
> > > >                 if (ticket == (u16)val)
> > > >                         break;
> > > >                 cpu_relax();
> > > >                 val = atomic_read_acquire(lock);
> > > >         }
> > > > }
> > > >
> > > > void unlock(atomic_t *lock)
> > > > {
> > > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > > >         u32 val = atomic_read(lock);
> > > >
> > > >         smp_store_release(ptr, (u16)val + 1);
> > > > }
> > > >
> > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > > on x86 for not being allowed to use a memop on unlock, since its being
> > > > forced into a load-store because of all the volatile, but whatever.
> > >
> > > What about trylock()?
> > > I.e. one could implement trylock() without a loop, by letting
> > > trylock() fail if the SC fails.
> > > That looks safe on first view, but nobody does this right now.
> >
> > Generic code has to use cmpxchg(), and then you get something like this:
> >
> > bool trylock(atomic_t *lock)
> > {
> >         u32 old = atomic_read(lock);
> >
> >         if ((old >> 16) != (old & 0xffff))
> >                 return false;
> >
> >         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> > }
>
> This approach requires two loads (atomic_read() and cmpxchg()), which
> is not required.
> Detecting this pattern and optimizing it in a compiler is quite unlikely.
>
> A bit less generic solution would be to wrap the LL/SC (would be
> mandatory in this case)
> instructions and do something like this:
>
> uint32_t __smp_load_acquire_reserved(void*);
> int __smp_store_release_conditional(void*, uint32_t);
>
> typedef union {
>     uint32_t v32;
>     struct {
>         uint16_t owner;
>         uint16_t next;
>     };
> } arch_spinlock_t;
>
> int trylock(arch_spinlock_t *lock)
> {
>     arch_spinlock_t l;
>     int success;
>     do {
>         l.v32 = __smp_load_acquire_reserved(lock);
>         if (l.owner != l.next)
>             return 0;
>         l.next++;
>         success = __smp_store_release_conditional(lock, l.v32);
That is a new semantic vs. cmpxchg, and cmpxchg comes from the CAS
instruction, which was designed to solve some complex scenarios.

The cmpxchg primitive has been widely used in Linux, so I don't
think introducing a new semantic (reserved_conditional) is a good
idea.

Also, from this point of view the CAS instruction seems more suitable for
software compatibility, although the riscv spec chose LR/SC
and lists some downsides of CAS.

>     } while (!success);
>     return success;
> }
>
> But here we can't tell the compiler to optimize the code between LL and SC...
>
> >
> > That will try and do the full LL/SC loop, because it wants to complete
> > the cmpxchg, but in generic code we have no other option.
> >
> > (Is this what C11's weak cmpxchg is for?)
Guo Ren April 14, 2021, 12:23 a.m. UTC | #26
On Tue, Apr 13, 2021 at 5:31 PM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > > likely have some hooks for architectures that can make this go faster).
> > > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > > being that smaller systems will perform better with ticket locks and
> > > > > larger systems will perform better with queued locks.  The main goal
> > > > > here is to give the less widely used architectures an easy way to have
> > > > > fair locks, as right now we've got a lot of code duplication because any
> > > > > architecture that wants ticket locks has to do it themselves.
> > > >
> > > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > > on RISC-V. My concern with a pure-C implementation would be that
> > > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > > and I don't know of a way to abort the build in case this limit exceeds.
> > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > > was to use closure-like LL/SC macros, where you can write the loop
> > > > in form of C code).
> > >
> > > For ticket locks you really only needs atomic_fetch_add() and
> > > smp_store_release() and an architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >         u16 ticket = val >> 16;
> > >
> > >         for (;;) {
> > >                 if (ticket == (u16)val)
> > >                         break;
> > >                 cpu_relax();
> > >                 val = atomic_read_acquire(lock);
> > >         }
> > > }
> > >
> > > void unlock(atomic_t *lock)
> > > {
> > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > >         u32 val = atomic_read(lock);
> > >
> > >         smp_store_release(ptr, (u16)val + 1);
> > > }
> > >
> > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > on x86 for not being allowed to use a memop on unlock, since its being
> > > forced into a load-store because of all the volatile, but whatever.
> >
> > What about trylock()?
> > I.e. one could implement trylock() without a loop, by letting
> > trylock() fail if the SC fails.
> > That looks safe on first view, but nobody does this right now.
I think it's safe for riscv LR/SC, because spec section A 8.3 says:
"As a consequence of the eventuality guarantee, if some harts in an
execution environment are executing constrained LR/SC loops, and no
other harts or devices in the execution environment execute an
unconditional store or AMO to that reservation set, then at least one
hart will eventually exit its constrained LR/SC loop."

So it guarantees the LR/SC pair:

CPU0                        CPU1
=======                     =======
LR addr1
                            LR addr1
                            SC addr1 // guaranteed to succeed
SC addr1

But it does not guarantee progress against an unconditional store from
another hart (which I mentioned before):
u32 a = 0x55aa66bb;
u16 *ptr = (u16 *)&a;

CPU0                        CPU1
=========                   =========
xchg16(ptr, new)            while (1)
                                WRITE_ONCE(*(ptr + 1), x);



>
> Not familiar with RISC-V but I'd recommend that a trylock only fails if
> the lock is locked (after LR). A SC may fail for other reasons
> (cacheline eviction; depending on the microarchitecture) even if the
> lock is unlocked. At least on arm64 we had this issue with an
> implementation having a tendency to always fail the first STXR.

I think that would be a broken implementation for riscv. SC couldn't fail
due to cache line bouncing and could only fail due to another real write.
That means the HW implementation should use a per-hart address monitor,
not just grab the cache line into the exclusive state without locking down
the SNOOP channel.
I think the LR/SC implementation you mentioned is a gamble
and breaks the riscv spec.

Would Will's patch fix up the problem you mentioned?
----
commit 9bb17be062de6f5a9c9643258951aa0935652ec3
Author: Will Deacon <will.deacon@arm.com>
Date:   Tue Jul 2 14:54:33 2013 +0100

    ARM: locks: prefetch the destination word for write prior to strex

    The cost of changing a cacheline from shared to exclusive state can be
    significant, especially when this is triggered by an exclusive store,
    since it may result in having to retry the transaction.

    This patch prefixes our {spin,read,write}_[try]lock implementations with
    pldw instructions (on CPUs which support them) to try and grab the line
    in exclusive state from the start. arch_rwlock_t is changed to avoid
    using a volatile member, since this generates compiler warnings when
    falling back on the __builtin_prefetch intrinsic which expects a const
    void * argument.

    Acked-by: Nicolas Pitre <nico@linaro.org>
    Signed-off-by: Will Deacon <will.deacon@arm.com>
----

In the end, I want to conclude with my suggestions here:
 - Use ticket locks as the default
 - Use ARCH_USE_QUEUED_SPINLOCKS_XCHG32 for riscv qspinlock
 - Disable xchg16/cmpxchg16 and any sub-word atomic primitive in riscv

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Guo Ren April 14, 2021, 2:26 a.m. UTC | #27
Thx Peter,

On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:
>
> > For ticket locks you really only needs atomic_fetch_add() and
> > smp_store_release() and an architectural guarantees that the
> > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > store (through smp_store_release()) will fail the SC.
> >
> > Then you can do something like:
> >
> > void lock(atomic_t *lock)
> > {
> >       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> >       u16 ticket = val >> 16;
> >
> >       for (;;) {
> >               if (ticket == (u16)val)
> >                       break;
> >               cpu_relax();
> >               val = atomic_read_acquire(lock);
> >       }
Should it be?
       for (;;) {
               if (ticket == (u16)val) {
                       __atomic_acquire_fence();
                       break;
               }

>
> A possibly better might be:
>
>         if (ticket == (u16)val)
>                 return;
Should it be?
         if (ticket == (u16)val) {
                 __atomic_acquire_fence();
                 return;
         }

>
>         atomic_cond_read_acquire(lock, ticket == (u16)VAL);
>
> Since that allows architectures to use WFE like constructs.
>
> > }
> >
> > void unlock(atomic_t *lock)
> > {
> >       u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> >       u32 val = atomic_read(lock);
> >
> >       smp_store_release(ptr, (u16)val + 1);
> > }
> >
> > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > on x86 for not being allowed to use a memop on unlock, since its being
> > forced into a load-store because of all the volatile, but whatever.
Guo Ren April 14, 2021, 5:54 a.m. UTC | #28
On Tue, Apr 13, 2021 at 6:54 PM David Laight <David.Laight@aculab.com> wrote:
>
> From: Catalin Marinas
> > Sent: 13 April 2021 11:45
> ...
> > This indeed needs some care. IIUC RISC-V has similar restrictions as arm
> > here, no load/store instructions are allowed between LR and SC. You
> > can't guarantee that the compiler won't spill some variable onto the
> > stack.
>
> You can probably never guarantee the compiler won't spill to stack.
> Especially if someone compiles with -O0.
>
> Which probably means that anything using LR/SC must be written in
> asm and the C wrappers disabled.
Agree, and cmpxchg has been widely used in Linux. I think it's the
last requirement for a complex atomic API, although cmpxchg has the ABA
problem:

CPU0                                         CPU1
=======                                      =======
do {
  old32 = load32;
                                             *ptr32 = new32_tmp;
                                             *ptr32 = old32;
  load32 = cmpxchg(ptr32, old32, new32);     // still succeeds
} while (load32 != old32);

That means cmpxchg only cares about the final result and not what happens
in between. That is different from LR/SC or AMO instructions.

>
>         David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
>
Peter Zijlstra April 14, 2021, 7:08 a.m. UTC | #29
On Wed, Apr 14, 2021 at 10:26:57AM +0800, Guo Ren wrote:
> Thx Peter,
> 
> On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:
> >
> > > For ticket locks you really only needs atomic_fetch_add() and
> > > smp_store_release() and an architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >       u16 ticket = val >> 16;
> > >
> > >       for (;;) {
> > >               if (ticket == (u16)val)
> > >                       break;
> > >               cpu_relax();
> > >               val = atomic_read_acquire(lock);
> > >       }
> Should it be?
>        for (;;) {
>                if (ticket == (u16)val) {
>                        __atomic_acquire_fence();
>                        break;
>                }

No, atomic_fetch_add() implies a full smp_mb(); it even has a comment that
says so.

Also, __atomic_acquire_fence() is an implementation detail of atomic,
and architectures need not provide it. On top of that, IIRC the atomic
_acquire/_release have RCpc ordering, where we want our locks to have
RCsc ordering (and very much not weaker than RCtso). Even more so,
adding barriers to atomics should really not be conditional.
Peter Zijlstra April 14, 2021, 9:05 a.m. UTC | #30
On Wed, Apr 14, 2021 at 09:08:18AM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 10:26:57AM +0800, Guo Ren wrote:
> > Thx Peter,
> > 
> > On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:
> > >
> > > > For ticket locks you really only needs atomic_fetch_add() and
> > > > smp_store_release() and an architectural guarantees that the
> > > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > > store (through smp_store_release()) will fail the SC.
> > > >
> > > > Then you can do something like:
> > > >
> > > > void lock(atomic_t *lock)
> > > > {
> > > >       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > > >       u16 ticket = val >> 16;
> > > >
> > > >       for (;;) {
> > > >               if (ticket == (u16)val)
> > > >                       break;
> > > >               cpu_relax();
> > > >               val = atomic_read_acquire(lock);
> > > >       }
> > Should it be?
> >        for (;;) {
> >                if (ticket == (u16)val) {
> >                        __atomic_acquire_fence();
> >                        break;
> >                }
> 
> No, atomic_fetch_add() is full smp_mb(), it even has a comment on that
> says so.
> 
> Also, __atomic_acquire_fence() is an implementation detail of atomic,
> and architectures need not provide it. On top of that, IIRC the atomic
> _acquire/_release have RCpc ordering, where we want our locks to have
> RCsc ordering (and very much not weaker than RCtso). Even more so,
> adding barriers to atomics should really not be conditional.

That made me look at the qspinlock code, and queued_spin_*lock() uses
atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
and has RCpc atomics will give us massive pain.

Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
openrisc (WTF?!).

Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
atomics, power has RCtso atomics (and is the arch we all hate for having
RCtso locks).

Now MIPS has all sorts of ill-specified barriers, but the last time I looked
at it, it didn't actually use any of that and stuck to using smp_mb(), so
it will have RCsc atomics.

/me goes look at wth openrisc is..  doesn't even appear to have
asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
actually have hardware ...

I'm thinking openrisc is a prime candidate for this ticket_lock.h we're
all talking about.
Catalin Marinas April 14, 2021, 9:17 a.m. UTC | #31
On Wed, Apr 14, 2021 at 08:23:51AM +0800, Guo Ren wrote:
> On Tue, Apr 13, 2021 at 5:31 PM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > > > likely have some hooks for architectures that can make this go faster).
> > > > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > > > being that smaller systems will perform better with ticket locks and
> > > > > > larger systems will perform better with queued locks.  The main goal
> > > > > > here is to give the less widely used architectures an easy way to have
> > > > > > fair locks, as right now we've got a lot of code duplication because any
> > > > > > architecture that wants ticket locks has to do it themselves.
> > > > >
> > > > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > > > on RISC-V. My concern with a pure-C implementation would be that
> > > > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > > > and I don't know of a way to abort the build in case this limit exceeds.
> > > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > > > was to use closure-like LL/SC macros, where you can write the loop
> > > > > in form of C code).
> > > >
> > > > For ticket locks you really only needs atomic_fetch_add() and
> > > > smp_store_release() and an architectural guarantees that the
> > > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > > store (through smp_store_release()) will fail the SC.
> > > >
> > > > Then you can do something like:
> > > >
> > > > void lock(atomic_t *lock)
> > > > {
> > > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > > >         u16 ticket = val >> 16;
> > > >
> > > >         for (;;) {
> > > >                 if (ticket == (u16)val)
> > > >                         break;
> > > >                 cpu_relax();
> > > >                 val = atomic_read_acquire(lock);
> > > >         }
> > > > }
> > > >
> > > > void unlock(atomic_t *lock)
> > > > {
> > > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > > >         u32 val = atomic_read(lock);
> > > >
> > > >         smp_store_release(ptr, (u16)val + 1);
> > > > }
> > > >
> > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > > on x86 for not being allowed to use a memop on unlock, since its being
> > > > forced into a load-store because of all the volatile, but whatever.
> > >
> > > What about trylock()?
> > > I.e. one could implement trylock() without a loop, by letting
> > > trylock() fail if the SC fails.
> > > That looks safe on first view, but nobody does this right now.
> 
> I think it's safe for riscv LR/SC, because in spec A 8.3 section:
> "As a consequence of the eventuality guarantee, if some harts in an
> execution environment are executing constrained LR/SC loops, and no
> other harts or devices in the execution environment execute an
> unconditional store or AMO to that reservation set, then at least one
> hart will eventually exit its constrained LR/SC loop."

This is clearly talking about _loops_ and that one hart will
_eventually_ exit the loop. It does not say that any individual LR/SC
sequence is guaranteed to succeed, nor that a single iteration of the loop is enough.

> So it guarantees LR/SC pair:
> 
> CPU0                   CPU1
> =======             =======
> LR addr1
>                             LR addr1
>                             SC addr1 // guarantee success.
> SC addr1

I don't see the RISC-V spec guaranteeing the (eventual) success of the
SC on CPU1 _without_ a loop.

> But not guarantee, another hart unconditional store (which I mentioned before):
> u32 a = 0x55aa66bb;
> u16 *ptr = &a;
> 
> CPU0                       CPU1
> =========             =========
> xchg16(ptr, new)     while(1)
>                                     WRITE_ONCE(*(ptr + 1), x);

If xchg16() is implemented with LR/SC, that's not guaranteed either. If
it is implemented as some form of swap, the architecture may guarantee its
success (or, more likely, that it won't deadlock).

> > Not familiar with RISC-V but I'd recommend that a trylock only fails if
> > the lock is locked (after LR). A SC may fail for other reasons
> > (cacheline eviction; depending on the microarchitecture) even if the
> > lock is unlocked. At least on arm64 we had this issue with an
> > implementation having a tendency to always fail the first STXR.
> 
> I think it's a broken implementation for riscv. SC couldn't fail by
> cache line bouncing and only could fail by another real write.
> That means the HW implementation should use a per-hart address monitor
> not just grab the cache line into the exclusive state without lockdown
> the SNOOP channel.
> I think the implementation of LR/SC you mentioned is a gambling style
> but broke the riscv spec.

Arm has the notion of exclusive monitors (local, global) but an
implementation may fake them by using cacheline states. And that's
allowed since the monitor doesn't need to guarantee success on the first
try, only the eventual success of an ldxr/stxr loop.

> Is the patch of Will's would fix up the problem you mentioned?
> ----
> commit 9bb17be062de6f5a9c9643258951aa0935652ec3
> Author: Will Deacon <will.deacon@arm.com>
> Date:   Tue Jul 2 14:54:33 2013 +0100
> 
>     ARM: locks: prefetch the destination word for write prior to strex

No, that's only an optimisation. A prefetch would not guarantee that the
cacheline stays in a certain state. There can be a long time between the
LR and SC, especially if interrupts are enabled or if you add
virtualisation into the mix.

The commit I was referring to is 4ecf7ccb1973 ("arm64: spinlock: retry
trylock operation if strex fails on free lock").
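
Transposed onto the atomic_t ticket layout discussed earlier in the thread,
the idea of that commit is roughly the following (a sketch, not the arm64
code; trylock only gives up once it has actually observed the lock taken):

static inline bool ticket_trylock(atomic_t *lock)
{
	u32 old;

	do {
		old = atomic_read(lock);
		if ((old >> 16) != (old & 0xffff))
			return false;		/* genuinely locked: give up */
		/*
		 * Otherwise retry, so a (possibly spurious) store-conditional
		 * failure on a free lock does not fail the trylock.
		 */
	} while (!atomic_try_cmpxchg_acquire(lock, &old, old + (1 << 16)));

	return true;
}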
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 87d7b52..7c56a20 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -30,6 +30,7 @@  config RISCV
 	select ARCH_HAS_STRICT_KERNEL_RWX if MMU
 	select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
+	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 445ccc9..e57ef80 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -3,5 +3,6 @@  generic-y += early_ioremap.h
 generic-y += extable.h
 generic-y += flat.h
 generic-y += kvm_para.h
+generic-y += qrwlock.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
index f4f7fa1..2c81764 100644
--- a/arch/riscv/include/asm/spinlock.h
+++ b/arch/riscv/include/asm/spinlock.h
@@ -7,129 +7,91 @@ 
 #ifndef _ASM_RISCV_SPINLOCK_H
 #define _ASM_RISCV_SPINLOCK_H
 
-#include <linux/kernel.h>
-#include <asm/current.h>
-#include <asm/fence.h>
-
 /*
- * Simple spin lock operations.  These provide no fairness guarantees.
+ * Ticket-based spin-locking.
  */
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+	arch_spinlock_t lockval;
+	u32 tmp;
+
+	asm volatile (
+		"1:	lr.w	%0, %2		\n"
+		"	mv	%1, %0		\n"
+		"	addw	%0, %0, %3	\n"
+		"	sc.w	%0, %0, %2	\n"
+		"	bnez	%0, 1b		\n"
+		: "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
+		: "r" (1 << TICKET_NEXT)
+		: "memory");
 
-/* FIXME: Replace this with a ticket lock, like MIPS. */
-
-#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)
+	while (lockval.tickets.next != lockval.tickets.owner) {
+		/*
+		 * FIXME - we need wfi/wfe here to prevent:
+		 *  - cache line bouncing
+		 *  - saving cpu pipeline in multi-harts-per-core
+		 *    processor
+		 */
+		lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
+	}
 
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
+	__atomic_acquire_fence();
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
-	int tmp = 1, busy;
-
-	__asm__ __volatile__ (
-		"	amoswap.w %0, %2, %1\n"
-		RISCV_ACQUIRE_BARRIER
-		: "=r" (busy), "+A" (lock->lock)
-		: "r" (tmp)
+	u32 tmp, contended, res;
+
+	do {
+		asm volatile (
+		"	lr.w	%0, %3		\n"
+		"	srliw	%1, %0, %5	\n"
+		"	slliw	%2, %0, %5	\n"
+		"	or	%1, %2, %1	\n"
+		"	li	%2, 0		\n"
+		"	sub	%1, %1, %0	\n"
+		"	bnez	%1, 1f		\n"
+		"	addw	%0, %0, %4	\n"
+		"	sc.w	%2, %0, %3	\n"
+		"1:				\n"
+		: "=&r" (tmp), "=&r" (contended), "=&r" (res),
+		  "+A" (lock->lock)
+		: "r" (1 << TICKET_NEXT), "I" (TICKET_NEXT)
 		: "memory");
+	} while (res);
 
-	return !busy;
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	while (1) {
-		if (arch_spin_is_locked(lock))
-			continue;
-
-		if (arch_spin_trylock(lock))
-			break;
+	if (!contended) {
+		__atomic_acquire_fence();
+		return 1;
+	} else {
+		return 0;
 	}
 }
 
-/***********************************************************/
-
-static inline void arch_read_lock(arch_rwlock_t *lock)
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1b\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
+	smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
+	/* FIXME - we need ipi/sev here to notify above */
 }
 
-static inline void arch_write_lock(arch_rwlock_t *lock)
+static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1b\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
+	return lock.tickets.owner == lock.tickets.next;
 }
 
-static inline int arch_read_trylock(arch_rwlock_t *lock)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1f\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
-
-	return !busy;
+	return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
-static inline int arch_write_trylock(arch_rwlock_t *lock)
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1f\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
+	struct __raw_tickets tickets = READ_ONCE(lock->tickets);
 
-	return !busy;
+	return (tickets.next - tickets.owner) > 1;
 }
+#define arch_spin_is_contended	arch_spin_is_contended
 
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	__asm__ __volatile__(
-		RISCV_RELEASE_BARRIER
-		"	amoadd.w x0, %1, %0\n"
-		: "+A" (lock->lock)
-		: "r" (-1)
-		: "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
-}
+#include <asm/qrwlock.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_H */
diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h
index f398e76..d7b38bf 100644
--- a/arch/riscv/include/asm/spinlock_types.h
+++ b/arch/riscv/include/asm/spinlock_types.h
@@ -10,16 +10,21 @@ 
 # error "please don't include this file directly"
 #endif
 
+#define TICKET_NEXT	16
+
 typedef struct {
-	volatile unsigned int lock;
+	union {
+		u32 lock;
+		struct __raw_tickets {
+			/* little endian */
+			u16 owner;
+			u16 next;
+		} tickets;
+	};
 } arch_spinlock_t;
 
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
-
-typedef struct {
-	volatile unsigned int lock;
-} arch_rwlock_t;
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
 
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
+#include <asm-generic/qrwlock_types.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */