
[2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS support

Message ID 1606225437-22948-2-git-send-email-guoren@kernel.org (mailing list archive)
State New, archived
Series [1/5] riscv: Coding convention for xchg

Commit Message

Guo Ren Nov. 24, 2020, 1:43 p.m. UTC
From: Guo Ren <guoren@linux.alibaba.com>

riscv only supports lr.w/lr.d and sc.w/sc.d, i.e. word (double-word)
sized, naturally aligned accesses. There are no lr.h/sc.h instructions,
but qspinlock.c needs xchg on a short (16-bit) variable:

xchg_tail -> xchg_relaxed(&lock->tail, ...

typedef struct qspinlock {
        union {
                atomic_t val;

                /*
                 * By using the whole 2nd least significant byte for the
                 * pending bit, we can allow better optimization of the lock
                 * acquisition for the pending bit holder.
                 */
                struct {
                        u8      locked;
                        u8      pending;
                };
                struct {
                        u16     locked_pending;
                        u16     tail; /* half word*/
                };
        };
} arch_spinlock_t;

So we emulate the short (16-bit) xchg with word-sized lr.w/sc.w
sequences; this only covers what qspinlock requires.
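
In plain C, the word-based emulation amounts to roughly the following
(illustrative sketch only, with an invented helper name; the actual
patch does the same splice with lr.w/sc.w inline assembly and relies on
RISC-V being little-endian):

/*
 * Sketch: emulate a 16-bit xchg with a compare-and-swap loop on the
 * containing naturally aligned 32-bit word.  Kernel-style types and
 * helpers (u16/u32, READ_ONCE, cmpxchg_relaxed) are assumed.
 */
static inline u16 xchg16_sketch(volatile u16 *p, u16 newval)
{
	u32 *wp = (u32 *)((unsigned long)p & ~0x3UL);
	int shift = ((unsigned long)p & 0x2) * 8;	/* 0 or 16 (little-endian) */
	u32 mask = 0xffffU << shift;
	u32 old, new;

	do {
		old = READ_ONCE(*wp);
		new = (old & ~mask) | ((u32)newval << shift);
	} while (cmpxchg_relaxed(wp, old, new) != old);

	return (u16)(old >> shift);
}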

Michael has sent a qspinlock patch before; see the Link below.

Link: https://lore.kernel.org/linux-riscv/20190211043829.30096-1-michaeljclark@mac.com/
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Michael Clark <michaeljclark@mac.com>
---
 arch/riscv/Kconfig                      |   2 +
 arch/riscv/include/asm/Kbuild           |   3 +
 arch/riscv/include/asm/cmpxchg.h        |  36 +++++++++
 arch/riscv/include/asm/spinlock.h       | 126 +-------------------------------
 arch/riscv/include/asm/spinlock_types.h |  15 +---
 5 files changed, 46 insertions(+), 136 deletions(-)

Comments

Peter Zijlstra Nov. 24, 2020, 2:39 p.m. UTC | #1
On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> index 59dd7be..6f5f438 100644
> --- a/arch/riscv/include/asm/Kbuild
> +++ b/arch/riscv/include/asm/Kbuild
> @@ -6,3 +6,6 @@ generic-y += kvm_para.h
>  generic-y += local64.h
>  generic-y += user.h
>  generic-y += vmlinux.lds.h
> +generic-y += mcs_spinlock.h
> +generic-y += qrwlock.h
> +generic-y += qspinlock.h
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 5609185..e178700 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -16,7 +16,43 @@
>  	__typeof__(ptr) __ptr = (ptr);					\
>  	__typeof__(new) __new = (new);					\
>  	__typeof__(*(ptr)) __ret;					\
> +	register unsigned long __rc, tmp, align, addr;			\
>  	switch (size) {							\
> +	case 2:								\
> +		align = ((unsigned long) __ptr & 0x3);			\
> +		addr = ((unsigned long) __ptr & ~0x3);			\
> +		if (align) {						\
> +		__asm__ __volatile__ (					\
> +			"0:	lr.w %0, 0(%z4)\n"			\
> +			"	move %1, %0\n"				\
> +			"	slli %1, %1, 16\n"			\
> +			"	srli %1, %1, 16\n"			\
> +			"	move %2, %z3\n"				\
> +			"	slli %2, %2, 16\n"			\
> +			"	or   %1, %2, %1\n"			\
> +			"	sc.w %2, %1, 0(%z4)\n"			\
> +			"	bnez %2, 0b\n"				\
> +			"	srli %0, %0, 16\n"			\
> +			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
> +			: "rJ" (__new), "rJ"(addr)			\
> +			: "memory");					\
> +		} else {						\
> +		__asm__ __volatile__ (					\
> +			"0:	lr.w %0, (%z4)\n"			\
> +			"	move %1, %0\n"				\
> +			"	srli %1, %1, 16\n"			\
> +			"	slli %1, %1, 16\n"			\
> +			"	move %2, %z3\n"				\
> +			"	or   %1, %2, %1\n"			\
> +			"	sc.w %2, %1, 0(%z4)\n"			\
> +			"	bnez %2, 0b\n"				\
> +			"	slli %0, %0, 16\n"			\
> +			"	srli %0, %0, 16\n"			\
> +			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
> +			: "rJ" (__new), "rJ"(addr)			\
> +			: "memory");					\
> +		}							\
> +		break;							\
>  	case 4:								\
>  		__asm__ __volatile__ (					\
>  			"	amoswap.w %0, %2, %1\n"			\

I'm pretty sure there's a handful of implementations like this out
there... if only we could share.

Anyway, this too should be an independent patch.
Arnd Bergmann Nov. 24, 2020, 3 p.m. UTC | #2
On Tue, Nov 24, 2020 at 3:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild

> > +             if (align) {                                            \
> > +             __asm__ __volatile__ (                                  \
> > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > +                     "       move %1, %0\n"                          \
> > +                     "       slli %1, %1, 16\n"                      \
> > +                     "       srli %1, %1, 16\n"                      \
> > +                     "       move %2, %z3\n"                         \
> > +                     "       slli %2, %2, 16\n"                      \
> > +                     "       or   %1, %2, %1\n"                      \
> > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > +                     "       bnez %2, 0b\n"                          \
> > +                     "       srli %0, %0, 16\n"                      \
> > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > +                     : "rJ" (__new), "rJ"(addr)                      \
> > +                     : "memory");                                    \
>
> I'm pretty sure there's a handful of implementations like this out
> there... if only we could share.

Isn't this effectively the same as the "_Q_PENDING_BITS != 8"
version of xchg_tail()?

If nothing else needs xchg() on a 16-bit value, maybe
changing the #ifdef in the qspinlock code is enough.

Only around half the architectures actually implement 8-bit
and 16-bit cmpxchg() and xchg(), it might even be worth trying
to eventually change the interface to not do it at all, but
instead have explicit cmpxchg8() and cmpxchg16() helpers
for the few files that do use them.

     Arnd
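
Concretely, the explicit helpers Arnd describes would amount to an
interface along these lines (hypothetical prototypes, not an existing
kernel API; fixed-width types from linux/types.h assumed):

/* Fixed-width helpers, provided only by arches that can implement them. */
u8  cmpxchg8(u8 *ptr, u8 old, u8 new);
u16 cmpxchg16(u16 *ptr, u16 old, u16 new);
u8  xchg8(u8 *ptr, u8 new);
u16 xchg16(u16 *ptr, u16 new);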
Guo Ren Nov. 25, 2020, 12:52 a.m. UTC | #3
Thx Peter,

On Tue, Nov 24, 2020 at 10:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > index 59dd7be..6f5f438 100644
> > --- a/arch/riscv/include/asm/Kbuild
> > +++ b/arch/riscv/include/asm/Kbuild
> > @@ -6,3 +6,6 @@ generic-y += kvm_para.h
> >  generic-y += local64.h
> >  generic-y += user.h
> >  generic-y += vmlinux.lds.h
> > +generic-y += mcs_spinlock.h
> > +generic-y += qrwlock.h
> > +generic-y += qspinlock.h
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 5609185..e178700 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -16,7 +16,43 @@
> >       __typeof__(ptr) __ptr = (ptr);                                  \
> >       __typeof__(new) __new = (new);                                  \
> >       __typeof__(*(ptr)) __ret;                                       \
> > +     register unsigned long __rc, tmp, align, addr;                  \
> >       switch (size) {                                                 \
> > +     case 2:                                                         \
> > +             align = ((unsigned long) __ptr & 0x3);                  \
> > +             addr = ((unsigned long) __ptr & ~0x3);                  \
> > +             if (align) {                                            \
> > +             __asm__ __volatile__ (                                  \
> > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > +                     "       move %1, %0\n"                          \
> > +                     "       slli %1, %1, 16\n"                      \
> > +                     "       srli %1, %1, 16\n"                      \
> > +                     "       move %2, %z3\n"                         \
> > +                     "       slli %2, %2, 16\n"                      \
> > +                     "       or   %1, %2, %1\n"                      \
> > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > +                     "       bnez %2, 0b\n"                          \
> > +                     "       srli %0, %0, 16\n"                      \
> > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > +                     : "rJ" (__new), "rJ"(addr)                      \
> > +                     : "memory");                                    \
> > +             } else {                                                \
> > +             __asm__ __volatile__ (                                  \
> > +                     "0:     lr.w %0, (%z4)\n"                       \
> > +                     "       move %1, %0\n"                          \
> > +                     "       srli %1, %1, 16\n"                      \
> > +                     "       slli %1, %1, 16\n"                      \
> > +                     "       move %2, %z3\n"                         \
> > +                     "       or   %1, %2, %1\n"                      \
> > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > +                     "       bnez %2, 0b\n"                          \
> > +                     "       slli %0, %0, 16\n"                      \
> > +                     "       srli %0, %0, 16\n"                      \
> > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > +                     : "rJ" (__new), "rJ"(addr)                      \
> > +                     : "memory");                                    \
> > +             }                                                       \
> > +             break;                                                  \
> >       case 4:                                                         \
> >               __asm__ __volatile__ (                                  \
> >                       "       amoswap.w %0, %2, %1\n"                 \
>
> I'm pretty sure there's a handful of implementations like this out
> there... if only we could share.
Michael has sent a qspinlock patch before; see the Link below. He reused the MIPS code.

Link: https://lore.kernel.org/linux-riscv/20190211043829.30096-1-michaeljclark@mac.com/

Which short xchg implementation do you prefer (Mine or his)?

>
> Anyway, this too should be an independent patch.
Ok, I'll separate it into two patches:
1. implement the short xchg
2. enable qspinlock, based on Michael's patch
Guo Ren Nov. 25, 2020, 2:09 p.m. UTC | #4
On Tue, Nov 24, 2020 at 11:00 PM Arnd Bergmann <arnd@kernel.org> wrote:
>
> On Tue, Nov 24, 2020 at 3:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
>
> > > +             if (align) {                                            \
> > > +             __asm__ __volatile__ (                                  \
> > > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > > +                     "       move %1, %0\n"                          \
> > > +                     "       slli %1, %1, 16\n"                      \
> > > +                     "       srli %1, %1, 16\n"                      \
> > > +                     "       move %2, %z3\n"                         \
> > > +                     "       slli %2, %2, 16\n"                      \
> > > +                     "       or   %1, %2, %1\n"                      \
> > > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > > +                     "       bnez %2, 0b\n"                          \
> > > +                     "       srli %0, %0, 16\n"                      \
> > > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > > +                     : "rJ" (__new), "rJ"(addr)                      \
> > > +                     : "memory");                                    \
> >
> > I'm pretty sure there's a handful of implementations like this out
> > there... if only we could share.
>
> Isn't this effectively the same as the "_Q_PENDING_BITS != 8"
> version of xchg_tail()?

It comes down to the relative efficiency of cmpxchg versus xchg. For an
arch that only has lr/sc instructions, cmpxchg and xchg end up similar:

#if _Q_PENDING_BITS == 8

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
        /*
         * We can use relaxed semantics since the caller ensures that the
         * MCS node is properly initialized before updating the tail.
         */
        return (u32)xchg_relaxed(&lock->tail,
                                 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}

#else /* _Q_PENDING_BITS == 8 */

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
        u32 old, new, val = atomic_read(&lock->val);

        for (;;) {
                new = (val & _Q_LOCKED_PENDING_MASK) | tail;
                /*
                 * We can use relaxed semantics since the caller ensures that
                 * the MCS node is properly initialized before updating the
                 * tail.
                 */
                old = atomic_cmpxchg_relaxed(&lock->val, val, new);
                if (old == val)
                        break;

                val = old;
        }
        return old;
}
#endif /* _Q_PENDING_BITS == 8 */


>
> If nothing else needs xchg() on a 16-bit value, maybe
> changing the #ifdef in the qspinlock code is enough.
>
> Only around half the architectures actually implement 8-bit
> and 16-bit cmpxchg() and xchg(), it might even be worth trying
> to eventually change the interface to not do it at all, but
> instead have explicit cmpxchg8() and cmpxchg16() helpers
> for the few files that do use them.
>
>      Arnd
Peter Zijlstra Nov. 25, 2020, 2:16 p.m. UTC | #5
On Tue, Nov 24, 2020 at 04:00:14PM +0100, Arnd Bergmann wrote:
> On Tue, Nov 24, 2020 at 3:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> 
> > > +             if (align) {                                            \
> > > +             __asm__ __volatile__ (                                  \
> > > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > > +                     "       move %1, %0\n"                          \
> > > +                     "       slli %1, %1, 16\n"                      \
> > > +                     "       srli %1, %1, 16\n"                      \
> > > +                     "       move %2, %z3\n"                         \
> > > +                     "       slli %2, %2, 16\n"                      \
> > > +                     "       or   %1, %2, %1\n"                      \
> > > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > > +                     "       bnez %2, 0b\n"                          \
> > > +                     "       srli %0, %0, 16\n"                      \
> > > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > > +                     : "rJ" (__new), "rJ"(addr)                      \
> > > +                     : "memory");                                    \
> >
> > I'm pretty sure there's a handful of implementations like this out
> > there... if only we could share.
> 
> Isn't this effectively the same as the "_Q_PENDING_BITS != 8"
> version of xchg_tail()?

Pretty much I suppose.

> If nothing else needs xchg() on a 16-bit value, maybe
> changing the #ifdef in the qspinlock code is enough.

Like the below? I think the native LL/SC variant is slightly better than
the cmpxchg loop. The problem is that cmpxchg() is itself an ll/sc loop,
and then we loop around that, instead of having just the one ll/sc loop.
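
Spelled out, the retry structure being contrasted looks roughly like
this (sketch only; the real code is the pair of xchg_tail() variants
quoted earlier in the thread):

	/* (a) cmpxchg-based fallback: an outer C retry loop ... */
	u32 old = atomic_read(&lock->val), prev;

	for (;;) {
		/* ... around cmpxchg(), which is itself an lr/sc retry loop */
		prev = atomic_cmpxchg_relaxed(&lock->val, old,
					(old & _Q_LOCKED_PENDING_MASK) | tail);
		if (prev == old)
			break;
		old = prev;
	}

	/* (b) native 16-bit xchg: a single lr/sc retry loop, no outer loop */
	u32 prev_tail = (u32)xchg_relaxed(&lock->tail,
					tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;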

> Only around half the architectures actually implement 8-bit
> and 16-bit cmpxchg() and xchg(), it might even be worth trying
> to eventually change the interface to not do it at all, but
> instead have explicit cmpxchg8() and cmpxchg16() helpers
> for the few files that do use them.

Yeah, many RISCs don't have sub-word atomics. Not sure how many other
users there are. qspinlock certainly is the most popular I suppose.

---


diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..7049fb2af0b2 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -163,26 +163,6 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
 }
 
-/*
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail), which heads an address dependency
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
-	/*
-	 * We can use relaxed semantics since the caller ensures that the
-	 * MCS node is properly initialized before updating the tail.
-	 */
-	return (u32)xchg_relaxed(&lock->tail,
-				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
-}
-
 #else /* _Q_PENDING_BITS == 8 */
 
 /**
@@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
 }
 
+#endif /* _Q_PENDING_BITS == 8 */
+
+#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	/*
+	 * We can use relaxed semantics since the caller ensures that the
+	 * MCS node is properly initialized before updating the tail.
+	 */
+	return (u32)xchg_relaxed(&lock->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */
+
 /**
  * xchg_tail - Put in the new queue tail code word & retrieve previous one
  * @lock : Pointer to queued spinlock structure
@@ -236,7 +242,8 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 	}
 	return old;
 }
-#endif /* _Q_PENDING_BITS == 8 */
+
+#endif /* _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16 */
 
 /**
  * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
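
Under this sketch, an architecture with a native 16-bit xchg() would opt
in with something like the following (hypothetical placement;
ARCH_HAS_XCHG16 is just the placeholder name from the diff above):

/* e.g. in the arch's asm/qspinlock.h, included ahead of the generic code */
#define ARCH_HAS_XCHG16	1

/*
 * Arches that do not define it fall back automatically: an undefined
 * identifier evaluates to 0 in the "#if ... && ARCH_HAS_XCHG16" test,
 * so the cmpxchg-loop version of xchg_tail() is used instead.
 */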
Peter Zijlstra Nov. 25, 2020, 2:18 p.m. UTC | #6
On Wed, Nov 25, 2020 at 08:52:23AM +0800, Guo Ren wrote:

> > I'm pretty sure there's a handful of implementations like this out
> > there... if only we could share.
> Michael has sent a qspinlock patch before; see the Link below. He reused the MIPS code.
> 
> Link: https://lore.kernel.org/linux-riscv/20190211043829.30096-1-michaeljclark@mac.com/
> 
> Which short xchg implementation do you prefer (Mine or his)?

Well, it would be very nice to have mips/riscv/csky all use the same
header to implement these I suppose.

But then we're back to a cmpxchg-loop, in which case Arnd's suggestion
isn't worse.
Will Deacon Nov. 25, 2020, 2:31 p.m. UTC | #7
On Wed, Nov 25, 2020 at 03:16:45PM +0100, Peter Zijlstra wrote:
> @@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
>  	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
>  }
>  
> +#endif /* _Q_PENDING_BITS == 8 */
> +
> +#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
> +
> +/*
> + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> + * @lock : Pointer to queued spinlock structure
> + * @tail : The new queue tail code word
> + * Return: The previous queue tail code word
> + *
> + * xchg(lock, tail), which heads an address dependency
> + *
> + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> + */
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> +	/*
> +	 * We can use relaxed semantics since the caller ensures that the
> +	 * MCS node is properly initialized before updating the tail.
> +	 */
> +	return (u32)xchg_relaxed(&lock->tail,
> +				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> +}
> +
> +#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */

Why can't architectures just implement this with a 32-bit xchg instruction
if they don't have one that operates on 16 bits? Sure, they'll store more
data, but it's atomic so you shouldn't be able to tell... (ignoring parisc
crazy).

Also, I'm surprised qspinlock benefits riscv. On arm64, there's nothing in
it over tickets for <= 16 CPUs.

Will
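
For context, the ticket lock being compared against looks roughly like
this (illustrative sketch using kernel-style atomics; not an existing
riscv implementation, and the names are invented):

typedef struct {
	atomic_t val;	/* low 16 bits: owner, high 16 bits: next ticket */
} ticket_lock_t;

static inline void ticket_lock(ticket_lock_t *lock)
{
	/* atomic_fetch_add() is fully ordered and returns the old value */
	u32 ticket = (u32)atomic_fetch_add(1 << 16, &lock->val) >> 16;

	while ((atomic_read_acquire(&lock->val) & 0xffff) != ticket)
		cpu_relax();
}

static inline void ticket_unlock(ticket_lock_t *lock)
{
	u16 *owner = (u16 *)&lock->val;	/* little-endian layout assumed */

	/* only the lock holder writes the owner half, so a plain add is fine */
	smp_store_release(owner, (u16)(*owner + 1));
}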
Guo Ren Nov. 26, 2020, 1:36 a.m. UTC | #8
Hi Will,

On Wed, Nov 25, 2020 at 10:31 PM Will Deacon <will@kernel.org> wrote:
>
> On Wed, Nov 25, 2020 at 03:16:45PM +0100, Peter Zijlstra wrote:
> > @@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
> >       atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
> >  }
> >
> > +#endif /* _Q_PENDING_BITS == 8 */
> > +
> > +#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
> > +
> > +/*
> > + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> > + * @lock : Pointer to queued spinlock structure
> > + * @tail : The new queue tail code word
> > + * Return: The previous queue tail code word
> > + *
> > + * xchg(lock, tail), which heads an address dependency
> > + *
> > + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> > + */
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +     /*
> > +      * We can use relaxed semantics since the caller ensures that the
> > +      * MCS node is properly initialized before updating the tail.
> > +      */
> > +     return (u32)xchg_relaxed(&lock->tail,
> > +                              tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> > +}
> > +
> > +#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */
>
> Why can't architectures just implement this with a 32-bit xchg instruction
> if they don't have one that operates on 16 bits? Sure, they'll store more
> data, but it's atomic so you shouldn't be able to tell... (ignoring parisc
> crazy).
>
> Also, I'm surprised qspinlock benefits riscv. On arm64, there's nothing in
> it over tickets for <= 16 CPUs.
NUMA is on the way:
https://lore.kernel.org/linux-riscv/20201119003829.1282810-1-atish.patra@wdc.com/

With your advice, I think we could use ticket locks when <= 16 CPUs
and qspinlock when > 16 CPUs.
Is that right?

The plan for the next patchset is:
 - Use ticket locks & qspinlock together in riscv. Abandon the 16-bit
xchg/cmpxchg implementation.
 - Abandon qspinlock in csky, because it only supports 4-CPU SMP.

>
> Will
Will Deacon Nov. 26, 2020, 8:53 a.m. UTC | #9
On Thu, Nov 26, 2020 at 09:36:34AM +0800, Guo Ren wrote:
> On Wed, Nov 25, 2020 at 10:31 PM Will Deacon <will@kernel.org> wrote:
> > On Wed, Nov 25, 2020 at 03:16:45PM +0100, Peter Zijlstra wrote:
> > > @@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
> > >       atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
> > >  }
> > >
> > > +#endif /* _Q_PENDING_BITS == 8 */
> > > +
> > > +#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
> > > +
> > > +/*
> > > + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> > > + * @lock : Pointer to queued spinlock structure
> > > + * @tail : The new queue tail code word
> > > + * Return: The previous queue tail code word
> > > + *
> > > + * xchg(lock, tail), which heads an address dependency
> > > + *
> > > + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> > > + */
> > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > > +{
> > > +     /*
> > > +      * We can use relaxed semantics since the caller ensures that the
> > > +      * MCS node is properly initialized before updating the tail.
> > > +      */
> > > +     return (u32)xchg_relaxed(&lock->tail,
> > > +                              tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> > > +}
> > > +
> > > +#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */
> >
> > Why can't architectures just implement this with a 32-bit xchg instruction
> > if they don't have one that operates on 16 bits? Sure, they'll store more
> > data, but it's atomic so you shouldn't be able to tell... (ignoring parisc
> > crazy).
> >
> > Also, I'm surprised qspinlock benefits riscv. On arm64, there's nothing in
> > it over tickets for <= 16 CPUs.
> NUMA is on the way:
> https://lore.kernel.org/linux-riscv/20201119003829.1282810-1-atish.patra@wdc.com/

Well, they're patches not hardware (and I only see mention of QEMU over
there for the RISCV platform) ;)

> With your advice, I think we could use ticket locks when <= 16 CPUs
> and qspinlock when > 16 CPUs.
> Is that right?

No, when I say "there's nothing in it", it means they're interchangeable.
It's just that qspinlock introduces a lot of complexity and I'm not
massively keen on changing the core code (which is used by many other
architectures) just because you don't have a 16-bit xchg() implementation.

So if you need qspinlock on riscv (not sure you do), then go ahead and make
that your one true lock implementation, but implement 16-bit xchg() at the
same time. Bonus points if you implement that in terms of 32-bit xchg() in
generic code (might need to watch out for endianness when shifting the new
value after aligning the base pointer).

Will
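
The endianness caveat in that last suggestion comes down to where a u16
sits inside its naturally aligned 32-bit word. A generic helper would
need something like this (hypothetical sketch; the function name is
invented):

/*
 * Shift count of a u16 within its containing aligned u32.  On
 * little-endian the lower address holds the low half-word; on
 * big-endian it is the other way around.  Relies on the __BIG_ENDIAN
 * test from asm/byteorder.h and BITS_PER_BYTE from linux/bits.h.
 */
static inline int xchg16_shift(const volatile u16 *p)
{
#if defined(__BIG_ENDIAN)
	return (((unsigned long)p & 0x2) ^ 0x2) * BITS_PER_BYTE;
#else
	return ((unsigned long)p & 0x2) * BITS_PER_BYTE;
#endif
}

With that shift in hand, the rest is the same word-sized cmpxchg splice
sketched under the commit message above.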

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 2d61c4c..d757ba4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -36,6 +36,8 @@  config RISCV
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 	select CLONE_BACKWARDS
 	select CLINT_TIMER if !MMU
 	select COMMON_CLK
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 59dd7be..6f5f438 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -6,3 +6,6 @@  generic-y += kvm_para.h
 generic-y += local64.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
+generic-y += mcs_spinlock.h
+generic-y += qrwlock.h
+generic-y += qspinlock.h
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 5609185..e178700 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -16,7 +16,43 @@ 
 	__typeof__(ptr) __ptr = (ptr);					\
 	__typeof__(new) __new = (new);					\
 	__typeof__(*(ptr)) __ret;					\
+	register unsigned long __rc, tmp, align, addr;			\
 	switch (size) {							\
+	case 2:								\
+		align = ((unsigned long) __ptr & 0x3);			\
+		addr = ((unsigned long) __ptr & ~0x3);			\
+		if (align) {						\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, 0(%z4)\n"			\
+			"	move %1, %0\n"				\
+			"	slli %1, %1, 16\n"			\
+			"	srli %1, %1, 16\n"			\
+			"	move %2, %z3\n"				\
+			"	slli %2, %2, 16\n"			\
+			"	or   %1, %2, %1\n"			\
+			"	sc.w %2, %1, 0(%z4)\n"			\
+			"	bnez %2, 0b\n"				\
+			"	srli %0, %0, 16\n"			\
+			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
+			: "rJ" (__new), "rJ"(addr)			\
+			: "memory");					\
+		} else {						\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, (%z4)\n"			\
+			"	move %1, %0\n"				\
+			"	srli %1, %1, 16\n"			\
+			"	slli %1, %1, 16\n"			\
+			"	move %2, %z3\n"				\
+			"	or   %1, %2, %1\n"			\
+			"	sc.w %2, %1, 0(%z4)\n"			\
+			"	bnez %2, 0b\n"				\
+			"	slli %0, %0, 16\n"			\
+			"	srli %0, %0, 16\n"			\
+			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
+			: "rJ" (__new), "rJ"(addr)			\
+			: "memory");					\
+		}							\
+		break;							\
 	case 4:								\
 		__asm__ __volatile__ (					\
 			"	amoswap.w %0, %2, %1\n"			\
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
index f4f7fa1..b8deb3a 100644
--- a/arch/riscv/include/asm/spinlock.h
+++ b/arch/riscv/include/asm/spinlock.h
@@ -7,129 +7,7 @@ 
 #ifndef _ASM_RISCV_SPINLOCK_H
 #define _ASM_RISCV_SPINLOCK_H
 
-#include <linux/kernel.h>
-#include <asm/current.h>
-#include <asm/fence.h>
-
-/*
- * Simple spin lock operations.  These provide no fairness guarantees.
- */
-
-/* FIXME: Replace this with a ticket lock, like MIPS. */
-
-#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	int tmp = 1, busy;
-
-	__asm__ __volatile__ (
-		"	amoswap.w %0, %2, %1\n"
-		RISCV_ACQUIRE_BARRIER
-		: "=r" (busy), "+A" (lock->lock)
-		: "r" (tmp)
-		: "memory");
-
-	return !busy;
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	while (1) {
-		if (arch_spin_is_locked(lock))
-			continue;
-
-		if (arch_spin_trylock(lock))
-			break;
-	}
-}
-
-/***********************************************************/
-
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1b\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
-}
-
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1b\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1f\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
-
-	return !busy;
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1f\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
-
-	return !busy;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	__asm__ __volatile__(
-		RISCV_RELEASE_BARRIER
-		"	amoadd.w x0, %1, %0\n"
-		: "+A" (lock->lock)
-		: "r" (-1)
-		: "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
-}
+#include <asm/qrwlock.h>
+#include <asm/qspinlock.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_H */
diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h
index f398e76..d033a97 100644
--- a/arch/riscv/include/asm/spinlock_types.h
+++ b/arch/riscv/include/asm/spinlock_types.h
@@ -6,20 +6,11 @@ 
 #ifndef _ASM_RISCV_SPINLOCK_TYPES_H
 #define _ASM_RISCV_SPINLOCK_TYPES_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(_ASM_RISCV_SPINLOCK_H)
 # error "please don't include this file directly"
 #endif
 
-typedef struct {
-	volatile unsigned int lock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
-
-typedef struct {
-	volatile unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */