
[RFC,v3,4/5] riscv/cmpxchg: Implement cmpxchg for variables of size 1 and 2

Message ID 20230804084900.1135660-6-leobras@redhat.com (mailing list archive)
State Superseded
Delegated to: Palmer Dabbelt
Series: Rework & improve riscv cmpxchg.h and atomic.h

Commit Message

Leonardo Bras Aug. 4, 2023, 8:48 a.m. UTC
cmpxchg for variables of size 1 byte and 2 bytes is not yet available on
riscv, even though it is present in other architectures such as arm64 and
x86. Without it, some locking mechanisms either cannot be implemented or
require extra rework to work properly.

Implement 1-byte and 2-byte cmpxchg in order to achieve parity with other
architectures.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
---
 arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
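
For context, the technique used throughout this series can be sketched in
plain C: a 1- or 2-byte cmpxchg is emulated by operating on the aligned
32-bit word that contains the value, masking off the neighbouring bytes.
The sketch below only illustrates that idea; it assumes a little-endian
layout (as on riscv), uses GCC's __atomic_compare_exchange_n as a stand-in
for the lr.w/sc.w loop, and the function name is made up.

#include <stdint.h>

/*
 * Illustration only: emulate a 1-byte cmpxchg with a 32-bit compare-and-swap
 * on the aligned word containing the byte. The patch does the equivalent
 * with an lr.w/sc.w loop in inline assembly.
 */
static uint8_t cmpxchg_u8_sketch(volatile uint8_t *p, uint8_t old_val, uint8_t new_val)
{
	/* Align the pointer down to the containing 32-bit word. */
	volatile uint32_t *wp = (volatile uint32_t *)((uintptr_t)p & ~(uintptr_t)0x3);
	unsigned int shift = ((uintptr_t)p & 0x3) * 8;	/* little-endian byte offset */
	uint32_t mask = 0xffu << shift;
	uint32_t oldw, neww;

	do {
		oldw = *wp;
		/* Bail out if the target byte no longer matches old_val. */
		if (((oldw & mask) >> shift) != old_val)
			return (uint8_t)((oldw & mask) >> shift);
		/* Splice the new byte into the word, keeping the neighbours. */
		neww = (oldw & ~mask) | ((uint32_t)new_val << shift);
	} while (!__atomic_compare_exchange_n(wp, &oldw, neww, 0,
					      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
	return old_val;
}

int main(void)
{
	/* The union guarantees the bytes live in one aligned 32-bit word. */
	union { uint32_t word; uint8_t bytes[4]; } u = { .word = 0x44332211 };
	uint8_t prev = cmpxchg_u8_sketch(&u.bytes[1], 0x22, 0xaa);

	/* On little-endian: prev == 0x22 and u.word == 0x4433aa11. */
	(void)prev;
	return 0;
}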

Comments

Guo Ren Aug. 4, 2023, 5:45 p.m. UTC | #1
On Fri, Aug 4, 2023 at 4:49 AM Leonardo Bras <leobras@redhat.com> wrote:
>
> cmpxchg for variables of size 1-byte and 2-bytes is not yet available for
> riscv, even though its present in other architectures such as arm64 and
> x86. This could lead to not being able to implement some locking mechanisms
> or requiring some rework to make it work properly.
>
> Implement 1-byte and 2-bytes cmpxchg in order to achieve parity with other
> architectures.
>
> Signed-off-by: Leonardo Bras <leobras@redhat.com>
> ---
>  arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
>  1 file changed, 35 insertions(+)
>
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 5a07646fae65..dfb433ac544f 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -72,6 +72,36 @@
>   * indicated by comparing RETURN with OLD.
>   */
>
> +#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> +({                                                                     \
> +       /* Depends on 2-byte variables being 2-byte aligned */          \
> +       ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> +       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> +                       << __s;                                         \
> +       ulong __newx = (ulong)(n) << __s;                               \
> +       ulong __oldx = (ulong)(o) << __s;                               \
> +       ulong __retx;                                                   \
> +       register unsigned int __rc;                                     \
> +                                                                       \
> +       __asm__ __volatile__ (                                          \
> +               prepend                                                 \
> +               "0:     lr.w %0, %2\n"                                  \
> +               "       and  %0, %0, %z5\n"                             \
> +               "       bne  %0, %z3, 1f\n"                             \
bug:
-               "       and  %0, %0, %z5\n"                             \
-               "       bne  %0, %z3, 1f\n"                             \
+               "       and  %1, %0, %z5\n"                             \
+               "       bne  %1, %z3, 1f\n"                             \
Your code breaks the %0.



> +               "       and  %1, %0, %z6\n"                             \
> +               "       or   %1, %1, %z4\n"                             \
> +               "       sc.w" sc_sfx " %1, %1, %2\n"                    \
> +               "       bnez %1, 0b\n"                                  \
> +               append                                                  \
> +               "1:\n"                                                  \
> +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> +               : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> +                 "rJ" (__mask), "rJ" (~__mask)                         \
> +               : "memory");                                            \
> +                                                                       \
> +       r = (__typeof__(*(p)))(__retx >> __s);                          \
> +})
> +
>
>  #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)        \
>  ({                                                                     \
> @@ -98,6 +128,11 @@
>         __typeof__(*(ptr)) __ret;                                       \
>                                                                         \
>         switch (sizeof(*__ptr)) {                                       \
> +       case 1:                                                         \
> +       case 2:                                                         \
> +               __arch_cmpxchg_mask(sc_sfx, prepend, append,            \
> +                                       __ret, __ptr, __old, __new);    \
> +               break;                                                  \
>         case 4:                                                         \
>                 __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,      \
>                                 __ret, __ptr, (long), __old, __new);    \
> --
> 2.41.0
>
Leonardo Bras Aug. 5, 2023, 3:14 a.m. UTC | #2
Hello Guo Ren, thanks for the feedback!

On Fri, Aug 4, 2023 at 2:45 PM Guo Ren <guoren@kernel.org> wrote:
>
> On Fri, Aug 4, 2023 at 4:49 AM Leonardo Bras <leobras@redhat.com> wrote:
> >
> > cmpxchg for variables of size 1-byte and 2-bytes is not yet available for
> > riscv, even though its present in other architectures such as arm64 and
> > x86. This could lead to not being able to implement some locking mechanisms
> > or requiring some rework to make it work properly.
> >
> > Implement 1-byte and 2-bytes cmpxchg in order to achieve parity with other
> > architectures.
> >
> > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > ---
> >  arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
> >  1 file changed, 35 insertions(+)
> >
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 5a07646fae65..dfb433ac544f 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -72,6 +72,36 @@
> >   * indicated by comparing RETURN with OLD.
> >   */
> >
> > +#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > +({                                                                     \
> > +       /* Depends on 2-byte variables being 2-byte aligned */          \
> > +       ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > +       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > +                       << __s;                                         \
> > +       ulong __newx = (ulong)(n) << __s;                               \
> > +       ulong __oldx = (ulong)(o) << __s;                               \
> > +       ulong __retx;                                                   \
> > +       register unsigned int __rc;                                     \
> > +                                                                       \
> > +       __asm__ __volatile__ (                                          \
> > +               prepend                                                 \
> > +               "0:     lr.w %0, %2\n"                                  \
> > +               "       and  %0, %0, %z5\n"                             \
> > +               "       bne  %0, %z3, 1f\n"                             \

> bug:
> -               "       and  %0, %0, %z5\n"                             \
> -               "       bne  %0, %z3, 1f\n"                             \
> +               "       and  %1, %0, %z5\n"                             \
> +               "       bne  %1, %z3, 1f\n"                             \
> Your code breaks the %0.

What do you mean by breaks here?

At the end of this macro, I intended to have __retx = (*p & __mask),
which means the value is already clean and only needs to be shifted back
at the end of the macro (no need to apply the mask again): r = __ret >> __s;

Also, I assumed we are supposed to return the same variable type
as the pointer, so this is valid:
u8 a, *b, c;
a = xchg(b, c);

Is this correct?
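
In C terms, the extraction described above amounts to something like the
following (illustration only; 'word' stands for the value loaded by lr.w,
'mask' and 'shift' for __mask and __s):

#include <stdint.h>

/* Illustration: __retx already holds (word & mask), so only the shift back
 * and the cast to the pointed-to type are needed. */
uint8_t extract_u8(uint32_t word, uint32_t mask, unsigned int shift)
{
	uint32_t retx = word & mask;		/* computed inside the asm */

	return (uint8_t)(retx >> shift);	/* r = (__typeof__(*(p)))(__retx >> __s) */
}

So in the u8 example above, 'a' receives a plain u8 value.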

> > +               append                                                  \
> > +               "1:\n"                                                  \
> > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > +               : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > +                 "rJ" (__mask), "rJ" (~__mask)                         \
> > +               : "memory");                                            \
> > +                                                                       \
> > +       r = (__typeof__(*(p)))(__retx >> __s);                          \
> > +})
> > +
> >
> >  #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)        \
> >  ({                                                                     \
> > @@ -98,6 +128,11 @@
> >         __typeof__(*(ptr)) __ret;                                       \
> >                                                                         \
> >         switch (sizeof(*__ptr)) {                                       \
> > +       case 1:                                                         \
> > +       case 2:                                                         \
> > +               __arch_cmpxchg_mask(sc_sfx, prepend, append,            \
> > +                                       __ret, __ptr, __old, __new);    \
> > +               break;                                                  \
> >         case 4:                                                         \
> >                 __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,      \
> >                                 __ret, __ptr, (long), __old, __new);    \
> > --
> > 2.41.0
> >
>
>
> --
> Best Regards
>  Guo Ren
>
Guo Ren Aug. 5, 2023, 4:24 a.m. UTC | #3
On Sat, Aug 5, 2023 at 11:14 AM Leonardo Bras Soares Passos
<leobras@redhat.com> wrote:
>
> Hello Guo Ren, thanks for the feedback!
>
> On Fri, Aug 4, 2023 at 2:45 PM Guo Ren <guoren@kernel.org> wrote:
> >
> > On Fri, Aug 4, 2023 at 4:49 AM Leonardo Bras <leobras@redhat.com> wrote:
> > >
> > > cmpxchg for variables of size 1-byte and 2-bytes is not yet available for
> > > riscv, even though its present in other architectures such as arm64 and
> > > x86. This could lead to not being able to implement some locking mechanisms
> > > or requiring some rework to make it work properly.
> > >
> > > Implement 1-byte and 2-bytes cmpxchg in order to achieve parity with other
> > > architectures.
> > >
> > > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > > ---
> > >  arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
> > >  1 file changed, 35 insertions(+)
> > >
> > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > index 5a07646fae65..dfb433ac544f 100644
> > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > @@ -72,6 +72,36 @@
> > >   * indicated by comparing RETURN with OLD.
> > >   */
> > >
> > > +#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > > +({                                                                     \
> > > +       /* Depends on 2-byte variables being 2-byte aligned */          \
> > > +       ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > > +       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > > +                       << __s;                                         \
> > > +       ulong __newx = (ulong)(n) << __s;                               \
> > > +       ulong __oldx = (ulong)(o) << __s;                               \
> > > +       ulong __retx;                                                   \
> > > +       register unsigned int __rc;                                     \
> > > +                                                                       \
> > > +       __asm__ __volatile__ (                                          \
> > > +               prepend                                                 \
> > > +               "0:     lr.w %0, %2\n"                                  \
> > > +               "       and  %0, %0, %z5\n"                             \
> > > +               "       bne  %0, %z3, 1f\n"                             \
>
> > bug:
> > -               "       and  %0, %0, %z5\n"                             \
> > -               "       bne  %0, %z3, 1f\n"                             \
> > +               "       and  %1, %0, %z5\n"                             \
> > +               "       bne  %1, %z3, 1f\n"                             \
> > Your code breaks the %0.
>
> What do you mean by breaks here?
>
> In the end of this macro, I intended  to have __retx = (*p & __mask)
> which means the value is clean to be rotated at the end of the macro
> (no need to apply the mask again): r = __ret >> __s;
>
> Also, I assumed we are supposed to return the same variable type
> as the pointer, so this is valid:
> u8 a, *b, c;
> a = xchg(b, c);
>
> Is this correct?
I missed that you removed the "__ret & mask" at the end, so this may not be the problem.

Your patch can't boot. After chewing your code for several hours, I
found a problem:
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 943f094375c7..67bcce63b267 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -14,6 +14,7 @@
 #define __arch_xchg_mask(prepend, append, r, p, n)                     \
 ({                                                                     \
        /* Depends on 2-byte variables being 2-byte aligned */          \
+       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
        ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
        ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
                        << __s;                                         \
@@ -29,7 +30,7 @@
               "        sc.w %1, %1, %2\n"                              \
               "        bnez %1, 0b\n"                                  \
               append                                                   \
-              : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))              \
+              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))            \
               : "rJ" (__newx), "rJ" (~__mask)                          \
               : "memory");                                             \
                                                                        \
@@ -106,6 +107,7 @@
 #define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
 ({                                                                     \
        /* Depends on 2-byte variables being 2-byte aligned */          \
+       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
        ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
        ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
                        << __s;                                         \
@@ -125,7 +127,7 @@
                "       bnez %1, 0b\n"                                  \
                append                                                  \
                "1:\n"                                                  \
-               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
+               : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))           \
                : "rJ" ((long)__oldx), "rJ" (__newx),                   \
                  "rJ" (__mask), "rJ" (~__mask)                         \
                : "memory");                                            \

But the lkvm-static still can't boot with paravirt_spinlock... Are there
any atomic tests in Linux?

I noticed you use some "register int" variables. Could that be causing the problem?

You can reference this file, and it has passed the lock torture test:
https://github.com/guoren83/linux/blob/sg2042-master-qspinlock-64ilp32_v4/arch/riscv/include/asm/cmpxchg.h

I also merged your patches with the qspinlock series: (Use the above
cmpxchg.h the lkvm would run normally.)
https://github.com/guoren83/linux/tree/qspinlock_v11



>
> > > +               append                                                  \
> > > +               "1:\n"                                                  \
> > > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > > +               : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > > +                 "rJ" (__mask), "rJ" (~__mask)                         \
> > > +               : "memory");                                            \
> > > +                                                                       \
> > > +       r = (__typeof__(*(p)))(__retx >> __s);                          \
> > > +})
> > > +
> > >
> > >  #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)        \
> > >  ({                                                                     \
> > > @@ -98,6 +128,11 @@
> > >         __typeof__(*(ptr)) __ret;                                       \
> > >                                                                         \
> > >         switch (sizeof(*__ptr)) {                                       \
> > > +       case 1:                                                         \
> > > +       case 2:                                                         \
> > > +               __arch_cmpxchg_mask(sc_sfx, prepend, append,            \
> > > +                                       __ret, __ptr, __old, __new);    \
> > > +               break;                                                  \
> > >         case 4:                                                         \
> > >                 __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,      \
> > >                                 __ret, __ptr, (long), __old, __new);    \
> > > --
> > > 2.41.0
> > >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
>


--
Best Regards
 Guo Ren
Leonardo Bras Aug. 7, 2023, 4:17 p.m. UTC | #4
On Sat, Aug 5, 2023 at 1:24 AM Guo Ren <guoren@kernel.org> wrote:
>
> On Sat, Aug 5, 2023 at 11:14 AM Leonardo Bras Soares Passos
> <leobras@redhat.com> wrote:
> >
> > Hello Guo Ren, thanks for the feedback!
> >
> > On Fri, Aug 4, 2023 at 2:45 PM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > On Fri, Aug 4, 2023 at 4:49 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > >
> > > > cmpxchg for variables of size 1-byte and 2-bytes is not yet available for
> > > > riscv, even though its present in other architectures such as arm64 and
> > > > x86. This could lead to not being able to implement some locking mechanisms
> > > > or requiring some rework to make it work properly.
> > > >
> > > > Implement 1-byte and 2-bytes cmpxchg in order to achieve parity with other
> > > > architectures.
> > > >
> > > > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > > > ---
> > > >  arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
> > > >  1 file changed, 35 insertions(+)
> > > >
> > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > index 5a07646fae65..dfb433ac544f 100644
> > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > @@ -72,6 +72,36 @@
> > > >   * indicated by comparing RETURN with OLD.
> > > >   */
> > > >
> > > > +#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > > > +({                                                                     \
> > > > +       /* Depends on 2-byte variables being 2-byte aligned */          \
> > > > +       ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > > > +       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > > > +                       << __s;                                         \
> > > > +       ulong __newx = (ulong)(n) << __s;                               \
> > > > +       ulong __oldx = (ulong)(o) << __s;                               \
> > > > +       ulong __retx;                                                   \
> > > > +       register unsigned int __rc;                                     \
> > > > +                                                                       \
> > > > +       __asm__ __volatile__ (                                          \
> > > > +               prepend                                                 \
> > > > +               "0:     lr.w %0, %2\n"                                  \
> > > > +               "       and  %0, %0, %z5\n"                             \
> > > > +               "       bne  %0, %z3, 1f\n"                             \
> >
> > > bug:
> > > -               "       and  %0, %0, %z5\n"                             \
> > > -               "       bne  %0, %z3, 1f\n"                             \
> > > +               "       and  %1, %0, %z5\n"                             \
> > > +               "       bne  %1, %z3, 1f\n"                             \
> > > Your code breaks the %0.
> >
> > What do you mean by breaks here?
> >
> > In the end of this macro, I intended  to have __retx = (*p & __mask)
> > which means the value is clean to be rotated at the end of the macro
> > (no need to apply the mask again): r = __ret >> __s;
> >
> > Also, I assumed we are supposed to return the same variable type
> > as the pointer, so this is valid:
> > u8 a, *b, c;
> > a = xchg(b, c);
> >
> > Is this correct?
> I missed your removing "__ret & mask" at the end. So this may not the problem.
>
> Your patch can't boot. After chewing your code for several hours, I
> found a problem:
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 943f094375c7..67bcce63b267 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -14,6 +14,7 @@
>  #define __arch_xchg_mask(prepend, append, r, p, n)                     \
>  ({                                                                     \
>         /* Depends on 2-byte variables being 2-byte aligned */          \
> +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \

Wow, I totally missed this one: I should not assume the processor
behavior on unaligned lr/sc.
Thanks for spotting this :)

>         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
>         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
>                         << __s;                                         \
> @@ -29,7 +30,7 @@
>                "        sc.w %1, %1, %2\n"                              \
>                "        bnez %1, 0b\n"                                  \
>                append                                                   \
> -              : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))              \
> +              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))            \
>                : "rJ" (__newx), "rJ" (~__mask)                          \
>                : "memory");                                             \
>                                                                         \
> @@ -106,6 +107,7 @@
>  #define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
>  ({                                                                     \
>         /* Depends on 2-byte variables being 2-byte aligned */          \
> +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
>         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
>         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
>                         << __s;                                         \
> @@ -125,7 +127,7 @@
>                 "       bnez %1, 0b\n"                                  \
>                 append                                                  \
>                 "1:\n"                                                  \
> -               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))           \
>                 : "rJ" ((long)__oldx), "rJ" (__newx),                   \
>                   "rJ" (__mask), "rJ" (~__mask)                         \
>                 : "memory");                                            \
>
> But the lkvm-static still can't boot with paravirt_spinlock .... Are
> there any atomic tests in the Linux?

I recall reading something about 'locktorture' or so, not sure if useful:

https://docs.kernel.org/locking/locktorture.html

>
> I found you use some "register int variables". Would it cause the problem?

Honestly, not sure.
I will try inspecting the generated asm for any unexpected changes
compared to your version.

>
> You can reference this file, and it has passed the lock torture test:
> https://github.com/guoren83/linux/blob/sg2042-master-qspinlock-64ilp32_v4/arch/riscv/include/asm/cmpxchg.h
>
> I also merged your patches with the qspinlock series: (Use the above
> cmpxchg.h the lkvm would run normally.)
> https://github.com/guoren83/linux/tree/qspinlock_v11

Thanks!

I should reply as soon as I find anything.

Best regards,
Leo

>
>
>
> >
> > > > +               append                                                  \
> > > > +               "1:\n"                                                  \
> > > > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > > > +               : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > > > +                 "rJ" (__mask), "rJ" (~__mask)                         \
> > > > +               : "memory");                                            \
> > > > +                                                                       \
> > > > +       r = (__typeof__(*(p)))(__retx >> __s);                          \
> > > > +})
> > > > +
> > > >
> > > >  #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)        \
> > > >  ({                                                                     \
> > > > @@ -98,6 +128,11 @@
> > > >         __typeof__(*(ptr)) __ret;                                       \
> > > >                                                                         \
> > > >         switch (sizeof(*__ptr)) {                                       \
> > > > +       case 1:                                                         \
> > > > +       case 2:                                                         \
> > > > +               __arch_cmpxchg_mask(sc_sfx, prepend, append,            \
> > > > +                                       __ret, __ptr, __old, __new);    \
> > > > +               break;                                                  \
> > > >         case 4:                                                         \
> > > >                 __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,      \
> > > >                                 __ret, __ptr, (long), __old, __new);    \
> > > > --
> > > > 2.41.0
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
> >
>
>
> --
> Best Regards
>  Guo Ren
>
Leonardo Bras Aug. 9, 2023, 2:02 a.m. UTC | #5
Hello Guo Ren,

On Mon, Aug 7, 2023 at 1:17 PM Leonardo Bras Soares Passos
<leobras@redhat.com> wrote:
>
> On Sat, Aug 5, 2023 at 1:24 AM Guo Ren <guoren@kernel.org> wrote:
> >
> > On Sat, Aug 5, 2023 at 11:14 AM Leonardo Bras Soares Passos
> > <leobras@redhat.com> wrote:
> > >
> > > Hello Guo Ren, thanks for the feedback!
> > >
> > > On Fri, Aug 4, 2023 at 2:45 PM Guo Ren <guoren@kernel.org> wrote:
> > > >
> > > > On Fri, Aug 4, 2023 at 4:49 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > > >
> > > > > cmpxchg for variables of size 1-byte and 2-bytes is not yet available for
> > > > > riscv, even though its present in other architectures such as arm64 and
> > > > > x86. This could lead to not being able to implement some locking mechanisms
> > > > > or requiring some rework to make it work properly.
> > > > >
> > > > > Implement 1-byte and 2-bytes cmpxchg in order to achieve parity with other
> > > > > architectures.
> > > > >
> > > > > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > > > > ---
> > > > >  arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 35 insertions(+)
> > > > >
> > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > index 5a07646fae65..dfb433ac544f 100644
> > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > @@ -72,6 +72,36 @@
> > > > >   * indicated by comparing RETURN with OLD.
> > > > >   */
> > > > >
> > > > > +#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > > > > +({                                                                     \
> > > > > +       /* Depends on 2-byte variables being 2-byte aligned */          \
> > > > > +       ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > > > > +       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > > > > +                       << __s;                                         \
> > > > > +       ulong __newx = (ulong)(n) << __s;                               \
> > > > > +       ulong __oldx = (ulong)(o) << __s;                               \
> > > > > +       ulong __retx;                                                   \
> > > > > +       register unsigned int __rc;                                     \
> > > > > +                                                                       \
> > > > > +       __asm__ __volatile__ (                                          \
> > > > > +               prepend                                                 \
> > > > > +               "0:     lr.w %0, %2\n"                                  \

> > > > > > +               "       and  %0, %0, %z5\n"                             \
> > > > > > +               "       bne  %0, %z3, 1f\n"                             \
> > >
> > > > bug:
> > > > -               "       and  %0, %0, %z5\n"                             \
> > > > -               "       bne  %0, %z3, 1f\n"                             \
> > > > +               "       and  %1, %0, %z5\n"                             \
> > > > +               "       bne  %1, %z3, 1f\n"                             \
> > > > Your code breaks the %0.
> > >
> > > What do you mean by breaks here?
> > >
> > > In the end of this macro, I intended  to have __retx = (*p & __mask)
> > > which means the value is clean to be rotated at the end of the macro
> > > (no need to apply the mask again): r = __ret >> __s;
> > >
> > > Also, I assumed we are supposed to return the same variable type
> > > as the pointer, so this is valid:
> > > u8 a, *b, c;
> > > a = xchg(b, c);
> > >
> > > Is this correct?
> > I missed your removing "__ret & mask" at the end. So this may not the problem.

It was actually the problem :)
Even though I revised it a lot, I kept missing this, which became clear
after test-running it on a VM:

        "0:    lr.w %0, %2\n"                    \
        "    and  %0, %0, %z5\n"                \
        "    bne  %0, %z3, 1f\n"                \
        "    and  %1, %0, %z6\n"                \
        "    or   %1, %1, %z4\n"                \
        "    sc.w" sc_sfx " %1, %1, %2\n"            \
        "    bnez %1, 0b\n"

If I do "and %0, %0, %z5" there, the line  "and  %1, %0, %z6"  will
output %1 = 0, which is not the intended.
You were right! The above fix solves the issue, and I just have to
mask it after.

Sorry it took me a few days to realize this.
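
A plain-C model of the fixed loop (illustration only, little-endian;
'loaded' plays the role of %0 and 'scratch' of %1) makes the failure mode
visible: the masked comparison has to go through a scratch register,
because masking the loaded word in place leaves nothing of the neighbouring
bytes for the store-conditional to write back.

#include <stdint.h>

/* Model of the masked cmpxchg path; the real code retries when sc.w fails. */
static uint16_t cmpxchg_u16_model(uint32_t *word, unsigned int shift,
				  uint16_t old_val, uint16_t new_val)
{
	uint32_t mask = 0xffffu << shift;
	uint32_t oldx = (uint32_t)old_val << shift;
	uint32_t newx = (uint32_t)new_val << shift;
	uint32_t loaded, scratch;

	loaded = *word;				/* lr.w %0, %2                    */
	scratch = loaded & mask;		/* and  %1, %0, __mask (the fix)  */
	if (scratch == oldx) {			/* bne  %1, %z3, 1f               */
		scratch = loaded & ~mask;	/* and  %1, %0, ~__mask           */
		scratch |= newx;		/* or   %1, %1, __newx            */
		*word = scratch;		/* sc.w %1, %1, %2                */
	}
	return (uint16_t)((loaded & mask) >> shift);
}

int main(void)
{
	uint32_t word = 0x44332211;	/* two u16 halves: 0x2211 and 0x4433 */
	uint16_t prev = cmpxchg_u16_model(&word, 16, 0x4433, 0xbeef);

	/* prev == 0x4433 and word == 0xbeef2211 afterwards. */
	(void)prev;
	return 0;
}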


> >
> > Your patch can't boot. After chewing your code for several hours, I
> > found a problem:
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 943f094375c7..67bcce63b267 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -14,6 +14,7 @@
> >  #define __arch_xchg_mask(prepend, append, r, p, n)                     \
> >  ({                                                                     \
> >         /* Depends on 2-byte variables being 2-byte aligned */          \
> > +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
>
> Wow, I totally missed this one: I should not assume the processor
> behavior on unaligned lr/sc.
> Thanks for spotting this :)
>
> >         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> >         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> >                         << __s;                                         \
> > @@ -29,7 +30,7 @@
> >                "        sc.w %1, %1, %2\n"                              \
> >                "        bnez %1, 0b\n"                                  \
> >                append                                                   \
> > -              : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))              \
> > +              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))            \
> >                : "rJ" (__newx), "rJ" (~__mask)                          \
> >                : "memory");                                             \
> >                                                                         \
> > @@ -106,6 +107,7 @@
> >  #define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> >  ({                                                                     \
> >         /* Depends on 2-byte variables being 2-byte aligned */          \
> > +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
> >         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> >         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> >                         << __s;                                         \
> > @@ -125,7 +127,7 @@
> >                 "       bnez %1, 0b\n"                                  \
> >                 append                                                  \
> >                 "1:\n"                                                  \
> > -               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))           \
> >                 : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> >                   "rJ" (__mask), "rJ" (~__mask)                         \
> >                 : "memory");                                            \
> >
> > But the lkvm-static still can't boot with paravirt_spinlock .... Are
> > there any atomic tests in the Linux?
>
> I recall reading something about 'locktorture' or so, not sure if useful:
>
> https://docs.kernel.org/locking/locktorture.html
>
> >
> > I found you use some "register int variables". Would it cause the problem?
>
> Honestly, not sure.
> I will try inspecting the generated asm for any unexpected changes
> compared to your version.

I changed the type to ulong; no changes detected in the generated asm.

>
> >
> > You can reference this file, and it has passed the lock torture test:
> > https://github.com/guoren83/linux/blob/sg2042-master-qspinlock-64ilp32_v4/arch/riscv/include/asm/cmpxchg.h
> >
> > I also merged your patches with the qspinlock series: (Use the above
> > cmpxchg.h the lkvm would run normally.)
> > https://github.com/guoren83/linux/tree/qspinlock_v11
>
> Thanks!
>
> I should reply as soon as I find anything.
>

Some changes found (for xchg16):
- pointer alignment was wrong (needs masking with ~0x3): solved. It doesn't
seem to need to be volatile, though.

- AND with 0x3 to get the shift: I was trusting 2-byte vars to be
2-byte aligned, i.e. p & 0x1 == 0, which may not be true. Solved (it's
simple).

- in your code, IIUC you also assume ptr & 0x1 == 0, in
        ulong *__ptr = (ulong *)((ulong)ptr & ~2);
if it's not aligned, you may end up with __ptr = 0xYYYYYY01
(not saying it's wrong to assume this, just noting it differs from my code;
see the short sketch after this list)

- I also verified that your code generates addw and sllw where mine
generates add and sll; I don't see any issue there

- There are extra sll and srl in my asm, because I end up casting to
typeof(*ptr) at the end.

Changes found in cmpxchg (__cmpxchg_small_relaxed):
- mostly the same as above;

- the beginning of my function ends up being smaller, since I do my
masking differently.
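
A quick illustration of the alignment point from the list above (made-up
address, for illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t ptr = 0x80001001;	/* hypothetical misaligned u16 address */

	/* Masking with ~2 leaves the address misaligned; ~0x3 yields the word. */
	printf("ptr & ~2    = %#lx\n", (unsigned long)(ptr & ~(uintptr_t)2));
	printf("ptr & ~0x3  = %#lx\n", (unsigned long)(ptr & ~(uintptr_t)0x3));
	return 0;
}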

I fixed the above issues and ran some userspace debugging tests on a VM;
everything seems to be correct now.

I then fetched your github repo, replaced my patches (v3 -> v4) in your
tree, and added the linux boot parameter "qspinlock" to my qemu cmdline
(-machine virt): it boots and works just fine.

I will send a v4 soon, please give it a test when possible.

Best regards,
Leo



> >
> >
> >
> > >
> > > > > +               append                                                  \
> > > > > +               "1:\n"                                                  \
> > > > > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > > > > +               : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > > > > +                 "rJ" (__mask), "rJ" (~__mask)                         \
> > > > > +               : "memory");                                            \
> > > > > +                                                                       \
> > > > > +       r = (__typeof__(*(p)))(__retx >> __s);                          \
> > > > > +})
> > > > > +
> > > > >
> > > > >  #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)        \
> > > > >  ({                                                                     \
> > > > > @@ -98,6 +128,11 @@
> > > > >         __typeof__(*(ptr)) __ret;                                       \
> > > > >                                                                         \
> > > > >         switch (sizeof(*__ptr)) {                                       \
> > > > > +       case 1:                                                         \
> > > > > +       case 2:                                                         \
> > > > > +               __arch_cmpxchg_mask(sc_sfx, prepend, append,            \
> > > > > +                                       __ret, __ptr, __old, __new);    \
> > > > > +               break;                                                  \
> > > > >         case 4:                                                         \
> > > > >                 __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,      \
> > > > >                                 __ret, __ptr, (long), __old, __new);    \
> > > > > --
> > > > > 2.41.0
> > > > >
> > > >
> > > >
> > > > --
> > > > Best Regards
> > > >  Guo Ren
> > > >
> > >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
Leonardo Bras Aug. 10, 2023, 4:06 a.m. UTC | #6
v5: https://lore.kernel.org/all/20230810040349.92279-2-leobras@redhat.com/

On Tue, Aug 8, 2023 at 11:02 PM Leonardo Bras Soares Passos
<leobras@redhat.com> wrote:
>
> Hello Guo Ren,
>
> On Mon, Aug 7, 2023 at 1:17 PM Leonardo Bras Soares Passos
> <leobras@redhat.com> wrote:
> >
> > On Sat, Aug 5, 2023 at 1:24 AM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > On Sat, Aug 5, 2023 at 11:14 AM Leonardo Bras Soares Passos
> > > <leobras@redhat.com> wrote:
> > > >
> > > > Hello Guo Ren, thanks for the feedback!
> > > >
> > > > On Fri, Aug 4, 2023 at 2:45 PM Guo Ren <guoren@kernel.org> wrote:
> > > > >
> > > > > On Fri, Aug 4, 2023 at 4:49 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > > > >
> > > > > > cmpxchg for variables of size 1-byte and 2-bytes is not yet available for
> > > > > > riscv, even though its present in other architectures such as arm64 and
> > > > > > x86. This could lead to not being able to implement some locking mechanisms
> > > > > > or requiring some rework to make it work properly.
> > > > > >
> > > > > > Implement 1-byte and 2-bytes cmpxchg in order to achieve parity with other
> > > > > > architectures.
> > > > > >
> > > > > > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > > > > > ---
> > > > > >  arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
> > > > > >  1 file changed, 35 insertions(+)
> > > > > >
> > > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > > index 5a07646fae65..dfb433ac544f 100644
> > > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > > @@ -72,6 +72,36 @@
> > > > > >   * indicated by comparing RETURN with OLD.
> > > > > >   */
> > > > > >
> > > > > > +#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > > > > > +({                                                                     \
> > > > > > +       /* Depends on 2-byte variables being 2-byte aligned */          \
> > > > > > +       ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > > > > > +       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > > > > > +                       << __s;                                         \
> > > > > > +       ulong __newx = (ulong)(n) << __s;                               \
> > > > > > +       ulong __oldx = (ulong)(o) << __s;                               \
> > > > > > +       ulong __retx;                                                   \
> > > > > > +       register unsigned int __rc;                                     \
> > > > > > +                                                                       \
> > > > > > +       __asm__ __volatile__ (                                          \
> > > > > > +               prepend                                                 \
> > > > > > +               "0:     lr.w %0, %2\n"                                  \
>
>       bne  %0, %z3, 1f\n"                             \
> > > >
> > > > > bug:
> > > > > -               "       and  %0, %0, %z5\n"                             \
> > > > > -               "       bne  %0, %z3, 1f\n"                             \
> > > > > +               "       and  %1, %0, %z5\n"                             \
> > > > > +               "       bne  %1, %z3, 1f\n"                             \
> > > > > Your code breaks the %0.
> > > >
> > > > What do you mean by breaks here?
> > > >
> > > > In the end of this macro, I intended  to have __retx = (*p & __mask)
> > > > which means the value is clean to be rotated at the end of the macro
> > > > (no need to apply the mask again): r = __ret >> __s;
> > > >
> > > > Also, I assumed we are supposed to return the same variable type
> > > > as the pointer, so this is valid:
> > > > u8 a, *b, c;
> > > > a = xchg(b, c);
> > > >
> > > > Is this correct?
> > > I missed your removing "__ret & mask" at the end. So this may not the problem.
>
> It was actually the problem :)
> Even though I revised a lot, I was missing this which was made clear
> by test-running on a VM:
>
>         "0:    lr.w %0, %2\n"                    \
>         "    and  %0, %0, %z5\n"                \
>         "    bne  %0, %z3, 1f\n"                \
>         "    and  %1, %0, %z6\n"                \
>         "    or   %1, %1, %z4\n"                \
>         "    sc.w" sc_sfx " %1, %1, %2\n"            \
>         "    bnez %1, 0b\n"
>
> If I do "and %0, %0, %z5" there, the line  "and  %1, %0, %z6"  will
> output %1 = 0, which is not the intended.
> You were right! The above fix solves the issue, and I just have to
> mask it after.
>
> Sorry it took me a few days to realize this.
>
>
> > >
> > > Your patch can't boot. After chewing your code for several hours, I
> > > found a problem:
> > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > index 943f094375c7..67bcce63b267 100644
> > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > @@ -14,6 +14,7 @@
> > >  #define __arch_xchg_mask(prepend, append, r, p, n)                     \
> > >  ({                                                                     \
> > >         /* Depends on 2-byte variables being 2-byte aligned */          \
> > > +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
> >
> > Wow, I totally missed this one: I should not assume the processor
> > behavior on unaligned lr/sc.
> > Thanks for spotting this :)
> >
> > >         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > >         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > >                         << __s;                                         \
> > > @@ -29,7 +30,7 @@
> > >                "        sc.w %1, %1, %2\n"                              \
> > >                "        bnez %1, 0b\n"                                  \
> > >                append                                                   \
> > > -              : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))              \
> > > +              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))            \
> > >                : "rJ" (__newx), "rJ" (~__mask)                          \
> > >                : "memory");                                             \
> > >                                                                         \
> > > @@ -106,6 +107,7 @@
> > >  #define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > >  ({                                                                     \
> > >         /* Depends on 2-byte variables being 2-byte aligned */          \
> > > +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
> > >         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > >         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > >                         << __s;                                         \
> > > @@ -125,7 +127,7 @@
> > >                 "       bnez %1, 0b\n"                                  \
> > >                 append                                                  \
> > >                 "1:\n"                                                  \
> > > -               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))           \
> > >                 : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > >                   "rJ" (__mask), "rJ" (~__mask)                         \
> > >                 : "memory");                                            \
> > >
> > > But the lkvm-static still can't boot with paravirt_spinlock .... Are
> > > there any atomic tests in the Linux?
> >
> > I recall reading something about 'locktorture' or so, not sure if useful:
> >
> > https://docs.kernel.org/locking/locktorture.html
> >
> > >
> > > I found you use some "register int variables". Would it cause the problem?
> >
> > Honestly, not sure.
> > I will try inspecting the generated asm for any unexpected changes
> > compared to your version.
>
> changed type to ulong, no changes detected in asm
>
> >
> > >
> > > You can reference this file, and it has passed the lock torture test:
> > > https://github.com/guoren83/linux/blob/sg2042-master-qspinlock-64ilp32_v4/arch/riscv/include/asm/cmpxchg.h
> > >
> > > I also merged your patches with the qspinlock series: (Use the above
> > > cmpxchg.h the lkvm would run normally.)
> > > https://github.com/guoren83/linux/tree/qspinlock_v11
> >
> > Thanks!
> >
> > I should reply as soon as I find anything.
> >
>
> Some changes found (for xchg16):
> - pointer alignment wrong (need mask with ~0x3) : solved. Doesn't seem
> to need to be volatile, though.
>
> - AND with 0x3 for getting shift: I was trusting 2-byte vars to be
> 2-byte aligned , ie. p & 0x1 == 0: may not be true. Solved (it's
> simple)
>
> - on your code, IIUC you also assume ptr & 0x1 == 0, on
>         ulong *__ptr = (ulong *)((ulong)ptr & ~2);
> if it's not aligned, you may have __ptr = 0xYYYYYY01
> (Not saying it's wrong to assume this, just listing It differs on my code)
>
> - also verified your code generates addw and sllw on some points mine
> generates add and sll, don't see any issue there
>
> - There are extra sll and srl in my asm, because I end up casting to
> typeof(*ptr) at the end.
>
> changes found in cmpxchg (__cmpxchg_small_relaxed)
> - mostly the same as above,
>
> - the beginning of my function ends up being smaller, since I do my
> masking differently.
>
> I fixed the above issues, and  ran some userspace debugging tests on a
> VM, and all seems to be correct now.
>
> I then fetched your github repo, replaced my patches v3->v4 on your
> tree, added linux boot parameter "qspinlock" on my qemu cmdline
> (-machine virt): booting and working just fine.
>
> I will send a v4 soon, please give it a test when possible.
>
> Best regards,
> Leo
>
>
>
> > >
> > >
> > >
> > > >
> > > > > > +               append                                                  \
> > > > > > +               "1:\n"                                                  \
> > > > > > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > > > > > +               : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > > > > > +                 "rJ" (__mask), "rJ" (~__mask)                         \
> > > > > > +               : "memory");                                            \
> > > > > > +                                                                       \
> > > > > > +       r = (__typeof__(*(p)))(__retx >> __s);                          \
> > > > > > +})
> > > > > > +
> > > > > >
> > > > > >  #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)        \
> > > > > >  ({                                                                     \
> > > > > > @@ -98,6 +128,11 @@
> > > > > >         __typeof__(*(ptr)) __ret;                                       \
> > > > > >                                                                         \
> > > > > >         switch (sizeof(*__ptr)) {                                       \
> > > > > > +       case 1:                                                         \
> > > > > > +       case 2:                                                         \
> > > > > > +               __arch_cmpxchg_mask(sc_sfx, prepend, append,            \
> > > > > > +                                       __ret, __ptr, __old, __new);    \
> > > > > > +               break;                                                  \
> > > > > >         case 4:                                                         \
> > > > > >                 __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,      \
> > > > > >                                 __ret, __ptr, (long), __old, __new);    \
> > > > > > --
> > > > > > 2.41.0
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Best Regards
> > > > >  Guo Ren
> > > > >
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
Guo Ren Aug. 11, 2023, 2:32 a.m. UTC | #7
On Wed, Aug 9, 2023 at 10:03 AM Leonardo Bras Soares Passos
<leobras@redhat.com> wrote:
>
> Hello Guo Ren,
>
> On Mon, Aug 7, 2023 at 1:17 PM Leonardo Bras Soares Passos
> <leobras@redhat.com> wrote:
> >
> > On Sat, Aug 5, 2023 at 1:24 AM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > On Sat, Aug 5, 2023 at 11:14 AM Leonardo Bras Soares Passos
> > > <leobras@redhat.com> wrote:
> > > >
> > > > Hello Guo Ren, thanks for the feedback!
> > > >
> > > > On Fri, Aug 4, 2023 at 2:45 PM Guo Ren <guoren@kernel.org> wrote:
> > > > >
> > > > > On Fri, Aug 4, 2023 at 4:49 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > > > >
> > > > > > cmpxchg for variables of size 1-byte and 2-bytes is not yet available for
> > > > > > riscv, even though its present in other architectures such as arm64 and
> > > > > > x86. This could lead to not being able to implement some locking mechanisms
> > > > > > or requiring some rework to make it work properly.
> > > > > >
> > > > > > Implement 1-byte and 2-bytes cmpxchg in order to achieve parity with other
> > > > > > architectures.
> > > > > >
> > > > > > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > > > > > ---
> > > > > >  arch/riscv/include/asm/cmpxchg.h | 35 ++++++++++++++++++++++++++++++++
> > > > > >  1 file changed, 35 insertions(+)
> > > > > >
> > > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > > index 5a07646fae65..dfb433ac544f 100644
> > > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > > @@ -72,6 +72,36 @@
> > > > > >   * indicated by comparing RETURN with OLD.
> > > > > >   */
> > > > > >
> > > > > > +#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > > > > > +({                                                                     \
> > > > > > +       /* Depends on 2-byte variables being 2-byte aligned */          \
> > > > > > +       ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > > > > > +       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > > > > > +                       << __s;                                         \
> > > > > > +       ulong __newx = (ulong)(n) << __s;                               \
> > > > > > +       ulong __oldx = (ulong)(o) << __s;                               \
> > > > > > +       ulong __retx;                                                   \
> > > > > > +       register unsigned int __rc;                                     \
> > > > > > +                                                                       \
> > > > > > +       __asm__ __volatile__ (                                          \
> > > > > > +               prepend                                                 \
> > > > > > +               "0:     lr.w %0, %2\n"                                  \
>
>       bne  %0, %z3, 1f\n"                             \
> > > >
> > > > > bug:
> > > > > -               "       and  %0, %0, %z5\n"                             \
> > > > > -               "       bne  %0, %z3, 1f\n"                             \
> > > > > +               "       and  %1, %0, %z5\n"                             \
> > > > > +               "       bne  %1, %z3, 1f\n"                             \
> > > > > Your code breaks the %0.
> > > >
> > > > What do you mean by breaks here?
> > > >
> > > > In the end of this macro, I intended  to have __retx = (*p & __mask)
> > > > which means the value is clean to be rotated at the end of the macro
> > > > (no need to apply the mask again): r = __ret >> __s;
> > > >
> > > > Also, I assumed we are supposed to return the same variable type
> > > > as the pointer, so this is valid:
> > > > u8 a, *b, c;
> > > > a = xchg(b, c);
> > > >
> > > > Is this correct?
> > > I missed your removing "__ret & mask" at the end. So this may not the problem.
>
> It was actually the problem :)
> Even though I revised a lot, I was missing this which was made clear
> by test-running on a VM:
>
>         "0:    lr.w %0, %2\n"                    \
>         "    and  %0, %0, %z5\n"                \
>         "    bne  %0, %z3, 1f\n"                \
>         "    and  %1, %0, %z6\n"                \
>         "    or   %1, %1, %z4\n"                \
>         "    sc.w" sc_sfx " %1, %1, %2\n"            \
>         "    bnez %1, 0b\n"
>
> If I do "and %0, %0, %z5" there, the line  "and  %1, %0, %z6"  will
> output %1 = 0, which is not the intended.
> You were right! The above fix solves the issue, and I just have to
> mask it after.
>
> Sorry it took me a few days to realize this.
Great job! I will test your 5th patch series, thanks.

>
>
> > >
> > > Your patch can't boot. After chewing your code for several hours, I
> > > found a problem:
> > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > index 943f094375c7..67bcce63b267 100644
> > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > @@ -14,6 +14,7 @@
> > >  #define __arch_xchg_mask(prepend, append, r, p, n)                     \
> > >  ({                                                                     \
> > >         /* Depends on 2-byte variables being 2-byte aligned */          \
> > > +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
> >
> > Wow, I totally missed this one: I should not assume the processor
> > behavior on unaligned lr/sc.
> > Thanks for spotting this :)
> >
> > >         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > >         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > >                         << __s;                                         \
> > > @@ -29,7 +30,7 @@
> > >                "        sc.w %1, %1, %2\n"                              \
> > >                "        bnez %1, 0b\n"                                  \
> > >                append                                                   \
> > > -              : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))              \
> > > +              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))            \
> > >                : "rJ" (__newx), "rJ" (~__mask)                          \
> > >                : "memory");                                             \
> > >                                                                         \
> > > @@ -106,6 +107,7 @@
> > >  #define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)       \
> > >  ({                                                                     \
> > >         /* Depends on 2-byte variables being 2-byte aligned */          \
> > > +       volatile ulong *__p = (ulong *)((ulong)(p) & ~0x3);             \
> > >         ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;                 \
> > >         ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
> > >                         << __s;                                         \
> > > @@ -125,7 +127,7 @@
> > >                 "       bnez %1, 0b\n"                                  \
> > >                 append                                                  \
> > >                 "1:\n"                                                  \
> > > -               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(__p))           \
> > >                 : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > >                   "rJ" (__mask), "rJ" (~__mask)                         \
> > >                 : "memory");                                            \
> > >
> > > But the lkvm-static still can't boot with paravirt_spinlock... Are
> > > there any atomic tests in the Linux kernel?
> >
> > I recall reading something about 'locktorture' or so, not sure if useful:
> >
> > https://docs.kernel.org/locking/locktorture.html
> >
> > >
> > > I noticed you use some "register int" variables. Could they be causing the problem?
> >
> > Honestly, not sure.
> > I will try inspecting the generated asm for any unexpected changes
> > compared to your version.
>
> Changed the type to ulong; no changes detected in the asm.
>
> >
> > >
> > > You can use this file as a reference; it has passed the lock torture test:
> > > https://github.com/guoren83/linux/blob/sg2042-master-qspinlock-64ilp32_v4/arch/riscv/include/asm/cmpxchg.h
> > >
> > > I also merged your patches with the qspinlock series (with the above
> > > cmpxchg.h, the lkvm runs normally):
> > > https://github.com/guoren83/linux/tree/qspinlock_v11
> >
> > Thanks!
> >
> > I will reply as soon as I find anything.
> >
>
> Some changes found (for xchg16):
> - Pointer alignment was wrong (needs to be masked with ~0x3): solved
> (see the sketch after this list). It doesn't seem to need to be volatile, though.
>
> - AND with 0x3 for getting the shift: I was trusting 2-byte vars to be
> 2-byte aligned, i.e. p & 0x1 == 0, which may not be true. Solved (it's
> simple).
>
> - On your code, IIUC you also assume ptr & 0x1 == 0, in
>         ulong *__ptr = (ulong *)((ulong)ptr & ~2);
> If it's not aligned, you may end up with __ptr = 0xYYYYYY01.
> (Not saying it's wrong to assume this, just noting that it differs from my code.)
>
> - Also verified that your code generates addw and sllw at some points where
> mine generates add and sll; I don't see any issue there.
>
> - There are extra sll and srl in my asm, because I end up casting to
> typeof(*ptr) at the end.
>
> Changes found in cmpxchg (__cmpxchg_small_relaxed):
> - Mostly the same as above.
>
> - The beginning of my function ends up being smaller, since I do my
> masking differently.
>
> I fixed the above issues and ran some userspace debugging tests on a
> VM, and everything seems to be correct now.
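To make the alignment point concrete, a small sketch (plain C, hypothetical
address, mirroring the v3 derivation plus the ~0x3 fix; it assumes, as the v3
macro does, that the u16 itself is at least 2-byte aligned):

	/* Suppose p is a u16 * and (ulong)p == 0x1002 (hypothetical). */
	ulong addr  = (ulong)p & ~0x3UL;                 /* 0x1000: word address for lr.w/sc.w    */
	ulong shift = ((ulong)p & 0x3) * BITS_PER_BYTE;  /* 16: bit offset of the u16 in the word */
	ulong mask  = GENMASK(sizeof(*p) * BITS_PER_BYTE - 1, 0) << shift;  /* 0xffff0000 */
	/* lr.w/sc.w must target *(u32 *)addr; only the bits under 'mask' belong
	 * to *p, so the other byte lanes have to be preserved on the store. */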
>
> I then fetched your github repo, replaced my v3 patches with v4 on your
> tree, and added the linux boot parameter "qspinlock" to my qemu cmdline
> (-machine virt): it boots and works just fine.
>
> I will send a v4 soon; please give it a test when possible.
>
> Best regards,
> Leo
>
>
>
> > >
> > >
> > >
> > > >
> > > > > > +               append                                                  \
> > > > > > +               "1:\n"                                                  \
> > > > > > +               : "=&r" (__retx), "=&r" (__rc), "+A" (*(p))             \
> > > > > > +               : "rJ" ((long)__oldx), "rJ" (__newx),                   \
> > > > > > +                 "rJ" (__mask), "rJ" (~__mask)                         \
> > > > > > +               : "memory");                                            \
> > > > > > +                                                                       \
> > > > > > +       r = (__typeof__(*(p)))(__retx >> __s);                          \
> > > > > > +})
> > > > > > +
> > > > > >
> > > > > >  #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)        \
> > > > > >  ({                                                                     \
> > > > > > @@ -98,6 +128,11 @@
> > > > > >         __typeof__(*(ptr)) __ret;                                       \
> > > > > >                                                                         \
> > > > > >         switch (sizeof(*__ptr)) {                                       \
> > > > > > +       case 1:                                                         \
> > > > > > +       case 2:                                                         \
> > > > > > +               __arch_cmpxchg_mask(sc_sfx, prepend, append,            \
> > > > > > +                                       __ret, __ptr, __old, __new);    \
> > > > > > +               break;                                                  \
> > > > > >         case 4:                                                         \
> > > > > >                 __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,      \
> > > > > >                                 __ret, __ptr, (long), __old, __new);    \
> > > > > > --
> > > > > > 2.41.0
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Best Regards
> > > > >  Guo Ren
> > > > >
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
>
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 5a07646fae65..dfb433ac544f 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -72,6 +72,36 @@ 
  * indicated by comparing RETURN with OLD.
  */
 
+#define __arch_cmpxchg_mask(sc_sfx, prepend, append, r, p, o, n)	\
+({									\
+	/* Depends on 2-byte variables being 2-byte aligned */		\
+	ulong __s = ((ulong)(p) & 0x3) * BITS_PER_BYTE;			\
+	ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)	\
+			<< __s;						\
+	ulong __newx = (ulong)(n) << __s;				\
+	ulong __oldx = (ulong)(o) << __s;				\
+	ulong __retx;							\
+	register unsigned int __rc;					\
+									\
+	__asm__ __volatile__ (						\
+		prepend							\
+		"0:	lr.w %0, %2\n"					\
+		"	and  %0, %0, %z5\n"				\
+		"	bne  %0, %z3, 1f\n"				\
+		"	and  %1, %0, %z6\n"				\
+		"	or   %1, %1, %z4\n"				\
+		"	sc.w" sc_sfx " %1, %1, %2\n"			\
+		"	bnez %1, 0b\n"					\
+		append							\
+		"1:\n"							\
+		: "=&r" (__retx), "=&r" (__rc), "+A" (*(p))		\
+		: "rJ" ((long)__oldx), "rJ" (__newx),			\
+		  "rJ" (__mask), "rJ" (~__mask)				\
+		: "memory");						\
+									\
+	r = (__typeof__(*(p)))(__retx >> __s);				\
+})
+
 
 #define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)	\
 ({									\
@@ -98,6 +128,11 @@ 
 	__typeof__(*(ptr)) __ret;					\
 									\
 	switch (sizeof(*__ptr)) {					\
+	case 1:								\
+	case 2:								\
+		__arch_cmpxchg_mask(sc_sfx, prepend, append,		\
+					__ret, __ptr, __old, __new);	\
+		break;							\
 	case 4:								\
 		__arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,	\
 				__ret, __ptr, (long), __old, __new);	\
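For context, once the case 1/case 2 branches above are in place, generic
callers can use cmpxchg() directly on byte and halfword fields. A hypothetical
example (not taken from this series) of the kind of caller this unblocks:

	/* Hypothetical caller -- not part of this series. */
	struct foo {
		u8  state;	/* 0 = free, 1 = claimed */
		u16 owner;
	};

	static bool try_claim(struct foo *f)
	{
		/* Atomically move state 0 -> 1; cmpxchg() returns the old value,
		 * so this is true only for the CPU that won the race. */
		return cmpxchg(&f->state, 0, 1) == 0;
	}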