
[1/4] arm64: Avoid redundant type conversions in xchg() and cmpxchg()

Message ID 1543347887-21101-2-git-send-email-will.deacon@arm.com
State New, archived
Series Rewrite of percpu atomics and introduction of LSE

Commit Message

Will Deacon Nov. 27, 2018, 7:44 p.m. UTC
Our atomic instructions (either LSE atomics or LDXR/STXR sequences)
natively support byte, half-word, word and double-word memory accesses,
so there is no need to mask the data register before it is stored.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/atomic_ll_sc.h |  53 ++++++++--------
 arch/arm64/include/asm/atomic_lse.h   |  46 +++++++-------
 arch/arm64/include/asm/cmpxchg.h      | 116 +++++++++++++++++-----------------
 3 files changed, 108 insertions(+), 107 deletions(-)
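
For illustration, here is roughly what the reworked LL/SC macro expands
to for the acquire byte case, __CMPXCHG_CASE(w, b, acq_, 8, , a, , "memory").
This is a hand-expanded sketch (with the empty barrier line dropped), not
verified preprocessor output; note the u##sz parameters becoming u8, which
is what makes the masking at the callers unnecessary:

__LL_SC_INLINE u8
__LL_SC_PREFIX(__cmpxchg_case_acq_8(volatile void *ptr,
				    unsigned long old,
				    u8 new))
{
	unsigned long tmp;
	u8 oldval;

	asm volatile(
	"	prfm	pstl1strm, %[v]\n"
	"1:	ldaxrb	%w[oldval], %[v]\n"		/* acquire exclusive load */
	"	eor	%w[tmp], %w[oldval], %w[old]\n"
	"	cbnz	%w[tmp], 2f\n"			/* bail out on mismatch */
	"	stxrb	%w[tmp], %w[new], %[v]\n"	/* exclusive byte store */
	"	cbnz	%w[tmp], 1b\n"			/* retry if we lost exclusivity */
	"2:"
	: [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),
	  [v] "+Q" (*(u8 *)ptr)
	: [old] "Lr" (old), [new] "r" (new)
	: "memory");

	return oldval;
}
__LL_SC_EXPORT(__cmpxchg_case_acq_8);

Relative to the old name-only scheme, the function name, return type and
oldval now carry the access size explicitly, so callers no longer receive
an unsigned long that has to be masked back down.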

Comments

Ard Biesheuvel Dec. 4, 2018, 5:29 p.m. UTC | #1
On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
>
> Our atomic instructions (either LSE atomics or LDXR/STXR sequences)
> natively support byte, half-word, word and double-word memory accesses,
> so there is no need to mask the data register before it is stored.
>
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |  53 ++++++++--------
>  arch/arm64/include/asm/atomic_lse.h   |  46 +++++++-------
>  arch/arm64/include/asm/cmpxchg.h      | 116 +++++++++++++++++-----------------
>  3 files changed, 108 insertions(+), 107 deletions(-)
>
> diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> index f5a2d09afb38..f02d3bf7b9e6 100644
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -248,48 +248,49 @@ __LL_SC_PREFIX(atomic64_dec_if_positive(atomic64_t *v))
>  }
>  __LL_SC_EXPORT(atomic64_dec_if_positive);
>
> -#define __CMPXCHG_CASE(w, sz, name, mb, acq, rel, cl)                  \
> -__LL_SC_INLINE unsigned long                                           \
> -__LL_SC_PREFIX(__cmpxchg_case_##name(volatile void *ptr,               \
> -                                    unsigned long old,                 \
> -                                    unsigned long new))                \
> +#define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl)             \
> +__LL_SC_INLINE u##sz                                                   \
> +__LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,           \
> +                                        unsigned long old,             \
> +                                        u##sz new))                    \

Same question as before: doesn't the narrowing of these types force
the compiler to perform the cast before populating the register for
the inline asm?
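
A minimal stand-alone reduction of that concern (a hypothetical helper,
not from the patch, with a local stand-in for the kernel's u8):

typedef unsigned char u8;	/* stand-in for <linux/types.h> */

static inline void asm_store_byte(u8 new, volatile u8 *ptr)
{
	/* strb only ever reads the low 8 bits of the source register */
	asm volatile("strb	%w[v], %[p]"
		     : [p] "+Q" (*ptr)
		     : [v] "r" (new));
}

void caller(unsigned long x, volatile u8 *ptr)
{
	asm_store_byte(x, ptr);	/* C requires a conversion to u8 here */
}

The question is whether that conversion obliges the compiler to emit an
explicit mask (an and/uxtb, say) before the register is handed to the
asm, even though the byte access ignores the upper bits anyway.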

>  {                                                                      \
> -       unsigned long tmp, oldval;                                      \
> +       unsigned long tmp;                                              \
> +       u##sz oldval;                                                   \
>                                                                         \
>         asm volatile(                                                   \
>         "       prfm    pstl1strm, %[v]\n"                              \
> -       "1:     ld" #acq "xr" #sz "\t%" #w "[oldval], %[v]\n"           \
> +       "1:     ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"          \
>         "       eor     %" #w "[tmp], %" #w "[oldval], %" #w "[old]\n"  \
>         "       cbnz    %" #w "[tmp], 2f\n"                             \
> -       "       st" #rel "xr" #sz "\t%w[tmp], %" #w "[new], %[v]\n"     \
> +       "       st" #rel "xr" #sfx "\t%w[tmp], %" #w "[new], %[v]\n"    \
>         "       cbnz    %w[tmp], 1b\n"                                  \
>         "       " #mb "\n"                                              \
>         "2:"                                                            \
>         : [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),                   \
> -         [v] "+Q" (*(unsigned long *)ptr)                              \
> +         [v] "+Q" (*(u##sz *)ptr)                                      \
>         : [old] "Lr" (old), [new] "r" (new)                             \
>         : cl);                                                          \
>                                                                         \
>         return oldval;                                                  \
>  }                                                                      \
> -__LL_SC_EXPORT(__cmpxchg_case_##name);
> +__LL_SC_EXPORT(__cmpxchg_case_##name##sz);
>
> -__CMPXCHG_CASE(w, b,     1,        ,  ,  ,         )
> -__CMPXCHG_CASE(w, h,     2,        ,  ,  ,         )
> -__CMPXCHG_CASE(w,  ,     4,        ,  ,  ,         )
> -__CMPXCHG_CASE( ,  ,     8,        ,  ,  ,         )
> -__CMPXCHG_CASE(w, b, acq_1,        , a,  , "memory")
> -__CMPXCHG_CASE(w, h, acq_2,        , a,  , "memory")
> -__CMPXCHG_CASE(w,  , acq_4,        , a,  , "memory")
> -__CMPXCHG_CASE( ,  , acq_8,        , a,  , "memory")
> -__CMPXCHG_CASE(w, b, rel_1,        ,  , l, "memory")
> -__CMPXCHG_CASE(w, h, rel_2,        ,  , l, "memory")
> -__CMPXCHG_CASE(w,  , rel_4,        ,  , l, "memory")
> -__CMPXCHG_CASE( ,  , rel_8,        ,  , l, "memory")
> -__CMPXCHG_CASE(w, b,  mb_1, dmb ish,  , l, "memory")
> -__CMPXCHG_CASE(w, h,  mb_2, dmb ish,  , l, "memory")
> -__CMPXCHG_CASE(w,  ,  mb_4, dmb ish,  , l, "memory")
> -__CMPXCHG_CASE( ,  ,  mb_8, dmb ish,  , l, "memory")
> +__CMPXCHG_CASE(w, b,     ,  8,        ,  ,  ,         )
> +__CMPXCHG_CASE(w, h,     , 16,        ,  ,  ,         )
> +__CMPXCHG_CASE(w,  ,     , 32,        ,  ,  ,         )
> +__CMPXCHG_CASE( ,  ,     , 64,        ,  ,  ,         )
> +__CMPXCHG_CASE(w, b, acq_,  8,        , a,  , "memory")
> +__CMPXCHG_CASE(w, h, acq_, 16,        , a,  , "memory")
> +__CMPXCHG_CASE(w,  , acq_, 32,        , a,  , "memory")
> +__CMPXCHG_CASE( ,  , acq_, 64,        , a,  , "memory")
> +__CMPXCHG_CASE(w, b, rel_,  8,        ,  , l, "memory")
> +__CMPXCHG_CASE(w, h, rel_, 16,        ,  , l, "memory")
> +__CMPXCHG_CASE(w,  , rel_, 32,        ,  , l, "memory")
> +__CMPXCHG_CASE( ,  , rel_, 64,        ,  , l, "memory")
> +__CMPXCHG_CASE(w, b,  mb_,  8, dmb ish,  , l, "memory")
> +__CMPXCHG_CASE(w, h,  mb_, 16, dmb ish,  , l, "memory")
> +__CMPXCHG_CASE(w,  ,  mb_, 32, dmb ish,  , l, "memory")
> +__CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,  , l, "memory")
>
>  #undef __CMPXCHG_CASE
>
> diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> index f9b0b09153e0..4d6f917b654e 100644
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -446,22 +446,22 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
>
>  #define __LL_SC_CMPXCHG(op)    __LL_SC_CALL(__cmpxchg_case_##op)
>
> -#define __CMPXCHG_CASE(w, sz, name, mb, cl...)                         \
> -static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,  \
> -                                                 unsigned long old,    \
> -                                                 unsigned long new)    \
> +#define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
> +static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> +                                             unsigned long old,        \
> +                                             u##sz new)                \
>  {                                                                      \
>         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
>         register unsigned long x1 asm ("x1") = old;                     \
> -       register unsigned long x2 asm ("x2") = new;                     \
> +       register u##sz x2 asm ("x2") = new;                             \
>                                                                         \
>         asm volatile(ARM64_LSE_ATOMIC_INSN(                             \
>         /* LL/SC */                                                     \
> -       __LL_SC_CMPXCHG(name)                                           \
> +       __LL_SC_CMPXCHG(name##sz)                                       \
>         __nops(2),                                                      \
>         /* LSE atomics */                                               \
>         "       mov     " #w "30, %" #w "[old]\n"                       \
> -       "       cas" #mb #sz "\t" #w "30, %" #w "[new], %[v]\n"         \
> +       "       cas" #mb #sfx "\t" #w "30, %" #w "[new], %[v]\n"        \
>         "       mov     %" #w "[ret], " #w "30")                        \
>         : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr)             \
>         : [old] "r" (x1), [new] "r" (x2)                                \
> @@ -470,22 +470,22 @@ static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,     \
>         return x0;                                                      \
>  }
>
> -__CMPXCHG_CASE(w, b,     1,   )
> -__CMPXCHG_CASE(w, h,     2,   )
> -__CMPXCHG_CASE(w,  ,     4,   )
> -__CMPXCHG_CASE(x,  ,     8,   )
> -__CMPXCHG_CASE(w, b, acq_1,  a, "memory")
> -__CMPXCHG_CASE(w, h, acq_2,  a, "memory")
> -__CMPXCHG_CASE(w,  , acq_4,  a, "memory")
> -__CMPXCHG_CASE(x,  , acq_8,  a, "memory")
> -__CMPXCHG_CASE(w, b, rel_1,  l, "memory")
> -__CMPXCHG_CASE(w, h, rel_2,  l, "memory")
> -__CMPXCHG_CASE(w,  , rel_4,  l, "memory")
> -__CMPXCHG_CASE(x,  , rel_8,  l, "memory")
> -__CMPXCHG_CASE(w, b,  mb_1, al, "memory")
> -__CMPXCHG_CASE(w, h,  mb_2, al, "memory")
> -__CMPXCHG_CASE(w,  ,  mb_4, al, "memory")
> -__CMPXCHG_CASE(x,  ,  mb_8, al, "memory")
> +__CMPXCHG_CASE(w, b,     ,  8,   )
> +__CMPXCHG_CASE(w, h,     , 16,   )
> +__CMPXCHG_CASE(w,  ,     , 32,   )
> +__CMPXCHG_CASE(x,  ,     , 64,   )
> +__CMPXCHG_CASE(w, b, acq_,  8,  a, "memory")
> +__CMPXCHG_CASE(w, h, acq_, 16,  a, "memory")
> +__CMPXCHG_CASE(w,  , acq_, 32,  a, "memory")
> +__CMPXCHG_CASE(x,  , acq_, 64,  a, "memory")
> +__CMPXCHG_CASE(w, b, rel_,  8,  l, "memory")
> +__CMPXCHG_CASE(w, h, rel_, 16,  l, "memory")
> +__CMPXCHG_CASE(w,  , rel_, 32,  l, "memory")
> +__CMPXCHG_CASE(x,  , rel_, 64,  l, "memory")
> +__CMPXCHG_CASE(w, b,  mb_,  8, al, "memory")
> +__CMPXCHG_CASE(w, h,  mb_, 16, al, "memory")
> +__CMPXCHG_CASE(w,  ,  mb_, 32, al, "memory")
> +__CMPXCHG_CASE(x,  ,  mb_, 64, al, "memory")
>
>  #undef __LL_SC_CMPXCHG
>  #undef __CMPXCHG_CASE
> diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
> index 3b0938281541..1f0340fc6dad 100644
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -30,46 +30,46 @@
>   * barrier case is generated as release+dmb for the former and
>   * acquire+release for the latter.
>   */
> -#define __XCHG_CASE(w, sz, name, mb, nop_lse, acq, acq_lse, rel, cl)   \
> -static inline unsigned long __xchg_case_##name(unsigned long x,                \
> -                                              volatile void *ptr)      \
> -{                                                                      \
> -       unsigned long ret, tmp;                                         \
> -                                                                       \
> -       asm volatile(ARM64_LSE_ATOMIC_INSN(                             \
> -       /* LL/SC */                                                     \
> -       "       prfm    pstl1strm, %2\n"                                \
> -       "1:     ld" #acq "xr" #sz "\t%" #w "0, %2\n"                    \
> -       "       st" #rel "xr" #sz "\t%w1, %" #w "3, %2\n"               \
> -       "       cbnz    %w1, 1b\n"                                      \
> -       "       " #mb,                                                  \
> -       /* LSE atomics */                                               \
> -       "       swp" #acq_lse #rel #sz "\t%" #w "3, %" #w "0, %2\n"     \
> -               __nops(3)                                               \
> -       "       " #nop_lse)                                             \
> -       : "=&r" (ret), "=&r" (tmp), "+Q" (*(unsigned long *)ptr)        \
> -       : "r" (x)                                                       \
> -       : cl);                                                          \
> -                                                                       \
> -       return ret;                                                     \
> +#define __XCHG_CASE(w, sfx, name, sz, mb, nop_lse, acq, acq_lse, rel, cl)      \
> +static inline u##sz __xchg_case_##name##sz(u##sz x, volatile void *ptr)                \
> +{                                                                              \
> +       u##sz ret;                                                              \
> +       unsigned long tmp;                                                      \
> +                                                                               \
> +       asm volatile(ARM64_LSE_ATOMIC_INSN(                                     \
> +       /* LL/SC */                                                             \
> +       "       prfm    pstl1strm, %2\n"                                        \
> +       "1:     ld" #acq "xr" #sfx "\t%" #w "0, %2\n"                           \
> +       "       st" #rel "xr" #sfx "\t%w1, %" #w "3, %2\n"                      \
> +       "       cbnz    %w1, 1b\n"                                              \
> +       "       " #mb,                                                          \
> +       /* LSE atomics */                                                       \
> +       "       swp" #acq_lse #rel #sfx "\t%" #w "3, %" #w "0, %2\n"            \
> +               __nops(3)                                                       \
> +       "       " #nop_lse)                                                     \
> +       : "=&r" (ret), "=&r" (tmp), "+Q" (*(u##sz *)ptr)                        \
> +       : "r" (x)                                                               \
> +       : cl);                                                                  \
> +                                                                               \
> +       return ret;                                                             \
>  }
>
> -__XCHG_CASE(w, b,     1,        ,    ,  ,  ,  ,         )
> -__XCHG_CASE(w, h,     2,        ,    ,  ,  ,  ,         )
> -__XCHG_CASE(w,  ,     4,        ,    ,  ,  ,  ,         )
> -__XCHG_CASE( ,  ,     8,        ,    ,  ,  ,  ,         )
> -__XCHG_CASE(w, b, acq_1,        ,    , a, a,  , "memory")
> -__XCHG_CASE(w, h, acq_2,        ,    , a, a,  , "memory")
> -__XCHG_CASE(w,  , acq_4,        ,    , a, a,  , "memory")
> -__XCHG_CASE( ,  , acq_8,        ,    , a, a,  , "memory")
> -__XCHG_CASE(w, b, rel_1,        ,    ,  ,  , l, "memory")
> -__XCHG_CASE(w, h, rel_2,        ,    ,  ,  , l, "memory")
> -__XCHG_CASE(w,  , rel_4,        ,    ,  ,  , l, "memory")
> -__XCHG_CASE( ,  , rel_8,        ,    ,  ,  , l, "memory")
> -__XCHG_CASE(w, b,  mb_1, dmb ish, nop,  , a, l, "memory")
> -__XCHG_CASE(w, h,  mb_2, dmb ish, nop,  , a, l, "memory")
> -__XCHG_CASE(w,  ,  mb_4, dmb ish, nop,  , a, l, "memory")
> -__XCHG_CASE( ,  ,  mb_8, dmb ish, nop,  , a, l, "memory")
> +__XCHG_CASE(w, b,     ,  8,        ,    ,  ,  ,  ,         )
> +__XCHG_CASE(w, h,     , 16,        ,    ,  ,  ,  ,         )
> +__XCHG_CASE(w,  ,     , 32,        ,    ,  ,  ,  ,         )
> +__XCHG_CASE( ,  ,     , 64,        ,    ,  ,  ,  ,         )
> +__XCHG_CASE(w, b, acq_,  8,        ,    , a, a,  , "memory")
> +__XCHG_CASE(w, h, acq_, 16,        ,    , a, a,  , "memory")
> +__XCHG_CASE(w,  , acq_, 32,        ,    , a, a,  , "memory")
> +__XCHG_CASE( ,  , acq_, 64,        ,    , a, a,  , "memory")
> +__XCHG_CASE(w, b, rel_,  8,        ,    ,  ,  , l, "memory")
> +__XCHG_CASE(w, h, rel_, 16,        ,    ,  ,  , l, "memory")
> +__XCHG_CASE(w,  , rel_, 32,        ,    ,  ,  , l, "memory")
> +__XCHG_CASE( ,  , rel_, 64,        ,    ,  ,  , l, "memory")
> +__XCHG_CASE(w, b,  mb_,  8, dmb ish, nop,  , a, l, "memory")
> +__XCHG_CASE(w, h,  mb_, 16, dmb ish, nop,  , a, l, "memory")
> +__XCHG_CASE(w,  ,  mb_, 32, dmb ish, nop,  , a, l, "memory")
> +__XCHG_CASE( ,  ,  mb_, 64, dmb ish, nop,  , a, l, "memory")
>
>  #undef __XCHG_CASE
>
> @@ -80,13 +80,13 @@ static inline unsigned long __xchg##sfx(unsigned long x,            \
>  {                                                                      \
>         switch (size) {                                                 \
>         case 1:                                                         \
> -               return __xchg_case##sfx##_1(x, ptr);                    \
> +               return __xchg_case##sfx##_8(x, ptr);                    \
>         case 2:                                                         \
> -               return __xchg_case##sfx##_2(x, ptr);                    \
> +               return __xchg_case##sfx##_16(x, ptr);                   \
>         case 4:                                                         \
> -               return __xchg_case##sfx##_4(x, ptr);                    \
> +               return __xchg_case##sfx##_32(x, ptr);                   \
>         case 8:                                                         \
> -               return __xchg_case##sfx##_8(x, ptr);                    \
> +               return __xchg_case##sfx##_64(x, ptr);                   \
>         default:                                                        \
>                 BUILD_BUG();                                            \
>         }                                                               \
> @@ -123,13 +123,13 @@ static inline unsigned long __cmpxchg##sfx(volatile void *ptr,            \
>  {                                                                      \
>         switch (size) {                                                 \
>         case 1:                                                         \
> -               return __cmpxchg_case##sfx##_1(ptr, (u8)old, new);      \
> +               return __cmpxchg_case##sfx##_8(ptr, (u8)old, new);      \
>         case 2:                                                         \
> -               return __cmpxchg_case##sfx##_2(ptr, (u16)old, new);     \
> +               return __cmpxchg_case##sfx##_16(ptr, (u16)old, new);    \
>         case 4:                                                         \
> -               return __cmpxchg_case##sfx##_4(ptr, old, new);          \
> +               return __cmpxchg_case##sfx##_32(ptr, old, new);         \
>         case 8:                                                         \
> -               return __cmpxchg_case##sfx##_8(ptr, old, new);          \
> +               return __cmpxchg_case##sfx##_64(ptr, old, new);         \
>         default:                                                        \
>                 BUILD_BUG();                                            \
>         }                                                               \
> @@ -197,16 +197,16 @@ __CMPXCHG_GEN(_mb)
>         __ret; \
>  })
>
> -#define __CMPWAIT_CASE(w, sz, name)                                    \
> -static inline void __cmpwait_case_##name(volatile void *ptr,           \
> -                                        unsigned long val)             \
> +#define __CMPWAIT_CASE(w, sfx, sz)                                     \
> +static inline void __cmpwait_case_##sz(volatile void *ptr,             \
> +                                      unsigned long val)               \
>  {                                                                      \
>         unsigned long tmp;                                              \
>                                                                         \
>         asm volatile(                                                   \
>         "       sevl\n"                                                 \
>         "       wfe\n"                                                  \
> -       "       ldxr" #sz "\t%" #w "[tmp], %[v]\n"                      \
> +       "       ldxr" #sfx "\t%" #w "[tmp], %[v]\n"                     \
>         "       eor     %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"     \
>         "       cbnz    %" #w "[tmp], 1f\n"                             \
>         "       wfe\n"                                                  \
> @@ -215,10 +215,10 @@ static inline void __cmpwait_case_##name(volatile void *ptr,              \
>         : [val] "r" (val));                                             \
>  }
>
> -__CMPWAIT_CASE(w, b, 1);
> -__CMPWAIT_CASE(w, h, 2);
> -__CMPWAIT_CASE(w,  , 4);
> -__CMPWAIT_CASE( ,  , 8);
> +__CMPWAIT_CASE(w, b, 8);
> +__CMPWAIT_CASE(w, h, 16);
> +__CMPWAIT_CASE(w,  , 32);
> +__CMPWAIT_CASE( ,  , 64);
>
>  #undef __CMPWAIT_CASE
>
> @@ -229,13 +229,13 @@ static inline void __cmpwait##sfx(volatile void *ptr,                     \
>  {                                                                      \
>         switch (size) {                                                 \
>         case 1:                                                         \
> -               return __cmpwait_case##sfx##_1(ptr, (u8)val);           \
> +               return __cmpwait_case##sfx##_8(ptr, (u8)val);           \
>         case 2:                                                         \
> -               return __cmpwait_case##sfx##_2(ptr, (u16)val);          \
> +               return __cmpwait_case##sfx##_16(ptr, (u16)val);         \
>         case 4:                                                         \
> -               return __cmpwait_case##sfx##_4(ptr, val);               \
> +               return __cmpwait_case##sfx##_32(ptr, val);              \
>         case 8:                                                         \
> -               return __cmpwait_case##sfx##_8(ptr, val);               \
> +               return __cmpwait_case##sfx##_64(ptr, val);              \
>         default:                                                        \
>                 BUILD_BUG();                                            \
>         }                                                               \
> --
> 2.1.4
>
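
As a concrete illustration of the cmpxchg.h comment about how the
full-barrier case is generated (release+dmb for LL/SC, acquire+release
for LSE), the fully-ordered byte case __XCHG_CASE(w, b, mb_, 8, dmb ish,
nop, , a, l, "memory") hand-expands to roughly the following; again a
sketch, not verified preprocessor output:

static inline u8 __xchg_case_mb_8(u8 x, volatile void *ptr)
{
	u8 ret;
	unsigned long tmp;

	asm volatile(ARM64_LSE_ATOMIC_INSN(
	/* LL/SC: release store, then a full barrier */
	"	prfm	pstl1strm, %2\n"
	"1:	ldxrb	%w0, %2\n"
	"	stlxrb	%w1, %w3, %2\n"
	"	cbnz	%w1, 1b\n"
	"	dmb ish",
	/* LSE: a single acquire+release swap */
	"	swpalb	%w3, %w0, %2\n"
		__nops(3)
	"	nop")
	: "=&r" (ret), "=&r" (tmp), "+Q" (*(u8 *)ptr)
	: "r" (x)
	: "memory");

	return ret;
}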

Patch

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index f5a2d09afb38..f02d3bf7b9e6 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -248,48 +248,49 @@  __LL_SC_PREFIX(atomic64_dec_if_positive(atomic64_t *v))
 }
 __LL_SC_EXPORT(atomic64_dec_if_positive);
 
-#define __CMPXCHG_CASE(w, sz, name, mb, acq, rel, cl)			\
-__LL_SC_INLINE unsigned long						\
-__LL_SC_PREFIX(__cmpxchg_case_##name(volatile void *ptr,		\
-				     unsigned long old,			\
-				     unsigned long new))		\
+#define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl)		\
+__LL_SC_INLINE u##sz							\
+__LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,		\
+					 unsigned long old,		\
+					 u##sz new))			\
 {									\
-	unsigned long tmp, oldval;					\
+	unsigned long tmp;						\
+	u##sz oldval;							\
 									\
 	asm volatile(							\
 	"	prfm	pstl1strm, %[v]\n"				\
-	"1:	ld" #acq "xr" #sz "\t%" #w "[oldval], %[v]\n"		\
+	"1:	ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"		\
 	"	eor	%" #w "[tmp], %" #w "[oldval], %" #w "[old]\n"	\
 	"	cbnz	%" #w "[tmp], 2f\n"				\
-	"	st" #rel "xr" #sz "\t%w[tmp], %" #w "[new], %[v]\n"	\
+	"	st" #rel "xr" #sfx "\t%w[tmp], %" #w "[new], %[v]\n"	\
 	"	cbnz	%w[tmp], 1b\n"					\
 	"	" #mb "\n"						\
 	"2:"								\
 	: [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),			\
-	  [v] "+Q" (*(unsigned long *)ptr)				\
+	  [v] "+Q" (*(u##sz *)ptr)					\
 	: [old] "Lr" (old), [new] "r" (new)				\
 	: cl);								\
 									\
 	return oldval;							\
 }									\
-__LL_SC_EXPORT(__cmpxchg_case_##name);
+__LL_SC_EXPORT(__cmpxchg_case_##name##sz);
 
-__CMPXCHG_CASE(w, b,     1,        ,  ,  ,         )
-__CMPXCHG_CASE(w, h,     2,        ,  ,  ,         )
-__CMPXCHG_CASE(w,  ,     4,        ,  ,  ,         )
-__CMPXCHG_CASE( ,  ,     8,        ,  ,  ,         )
-__CMPXCHG_CASE(w, b, acq_1,        , a,  , "memory")
-__CMPXCHG_CASE(w, h, acq_2,        , a,  , "memory")
-__CMPXCHG_CASE(w,  , acq_4,        , a,  , "memory")
-__CMPXCHG_CASE( ,  , acq_8,        , a,  , "memory")
-__CMPXCHG_CASE(w, b, rel_1,        ,  , l, "memory")
-__CMPXCHG_CASE(w, h, rel_2,        ,  , l, "memory")
-__CMPXCHG_CASE(w,  , rel_4,        ,  , l, "memory")
-__CMPXCHG_CASE( ,  , rel_8,        ,  , l, "memory")
-__CMPXCHG_CASE(w, b,  mb_1, dmb ish,  , l, "memory")
-__CMPXCHG_CASE(w, h,  mb_2, dmb ish,  , l, "memory")
-__CMPXCHG_CASE(w,  ,  mb_4, dmb ish,  , l, "memory")
-__CMPXCHG_CASE( ,  ,  mb_8, dmb ish,  , l, "memory")
+__CMPXCHG_CASE(w, b,     ,  8,        ,  ,  ,         )
+__CMPXCHG_CASE(w, h,     , 16,        ,  ,  ,         )
+__CMPXCHG_CASE(w,  ,     , 32,        ,  ,  ,         )
+__CMPXCHG_CASE( ,  ,     , 64,        ,  ,  ,         )
+__CMPXCHG_CASE(w, b, acq_,  8,        , a,  , "memory")
+__CMPXCHG_CASE(w, h, acq_, 16,        , a,  , "memory")
+__CMPXCHG_CASE(w,  , acq_, 32,        , a,  , "memory")
+__CMPXCHG_CASE( ,  , acq_, 64,        , a,  , "memory")
+__CMPXCHG_CASE(w, b, rel_,  8,        ,  , l, "memory")
+__CMPXCHG_CASE(w, h, rel_, 16,        ,  , l, "memory")
+__CMPXCHG_CASE(w,  , rel_, 32,        ,  , l, "memory")
+__CMPXCHG_CASE( ,  , rel_, 64,        ,  , l, "memory")
+__CMPXCHG_CASE(w, b,  mb_,  8, dmb ish,  , l, "memory")
+__CMPXCHG_CASE(w, h,  mb_, 16, dmb ish,  , l, "memory")
+__CMPXCHG_CASE(w,  ,  mb_, 32, dmb ish,  , l, "memory")
+__CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,  , l, "memory")
 
 #undef __CMPXCHG_CASE
 
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index f9b0b09153e0..4d6f917b654e 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -446,22 +446,22 @@  static inline long atomic64_dec_if_positive(atomic64_t *v)
 
 #define __LL_SC_CMPXCHG(op)	__LL_SC_CALL(__cmpxchg_case_##op)
 
-#define __CMPXCHG_CASE(w, sz, name, mb, cl...)				\
-static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,	\
-						  unsigned long old,	\
-						  unsigned long new)	\
+#define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)			\
+static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,	\
+					      unsigned long old,	\
+					      u##sz new)		\
 {									\
 	register unsigned long x0 asm ("x0") = (unsigned long)ptr;	\
 	register unsigned long x1 asm ("x1") = old;			\
-	register unsigned long x2 asm ("x2") = new;			\
+	register u##sz x2 asm ("x2") = new;				\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_CMPXCHG(name)						\
+	__LL_SC_CMPXCHG(name##sz)					\
 	__nops(2),							\
 	/* LSE atomics */						\
 	"	mov	" #w "30, %" #w "[old]\n"			\
-	"	cas" #mb #sz "\t" #w "30, %" #w "[new], %[v]\n"		\
+	"	cas" #mb #sfx "\t" #w "30, %" #w "[new], %[v]\n"	\
 	"	mov	%" #w "[ret], " #w "30")			\
 	: [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr)		\
 	: [old] "r" (x1), [new] "r" (x2)				\
@@ -470,22 +470,22 @@  static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,	\
 	return x0;							\
 }
 
-__CMPXCHG_CASE(w, b,     1,   )
-__CMPXCHG_CASE(w, h,     2,   )
-__CMPXCHG_CASE(w,  ,     4,   )
-__CMPXCHG_CASE(x,  ,     8,   )
-__CMPXCHG_CASE(w, b, acq_1,  a, "memory")
-__CMPXCHG_CASE(w, h, acq_2,  a, "memory")
-__CMPXCHG_CASE(w,  , acq_4,  a, "memory")
-__CMPXCHG_CASE(x,  , acq_8,  a, "memory")
-__CMPXCHG_CASE(w, b, rel_1,  l, "memory")
-__CMPXCHG_CASE(w, h, rel_2,  l, "memory")
-__CMPXCHG_CASE(w,  , rel_4,  l, "memory")
-__CMPXCHG_CASE(x,  , rel_8,  l, "memory")
-__CMPXCHG_CASE(w, b,  mb_1, al, "memory")
-__CMPXCHG_CASE(w, h,  mb_2, al, "memory")
-__CMPXCHG_CASE(w,  ,  mb_4, al, "memory")
-__CMPXCHG_CASE(x,  ,  mb_8, al, "memory")
+__CMPXCHG_CASE(w, b,     ,  8,   )
+__CMPXCHG_CASE(w, h,     , 16,   )
+__CMPXCHG_CASE(w,  ,     , 32,   )
+__CMPXCHG_CASE(x,  ,     , 64,   )
+__CMPXCHG_CASE(w, b, acq_,  8,  a, "memory")
+__CMPXCHG_CASE(w, h, acq_, 16,  a, "memory")
+__CMPXCHG_CASE(w,  , acq_, 32,  a, "memory")
+__CMPXCHG_CASE(x,  , acq_, 64,  a, "memory")
+__CMPXCHG_CASE(w, b, rel_,  8,  l, "memory")
+__CMPXCHG_CASE(w, h, rel_, 16,  l, "memory")
+__CMPXCHG_CASE(w,  , rel_, 32,  l, "memory")
+__CMPXCHG_CASE(x,  , rel_, 64,  l, "memory")
+__CMPXCHG_CASE(w, b,  mb_,  8, al, "memory")
+__CMPXCHG_CASE(w, h,  mb_, 16, al, "memory")
+__CMPXCHG_CASE(w,  ,  mb_, 32, al, "memory")
+__CMPXCHG_CASE(x,  ,  mb_, 64, al, "memory")
 
 #undef __LL_SC_CMPXCHG
 #undef __CMPXCHG_CASE
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 3b0938281541..1f0340fc6dad 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -30,46 +30,46 @@ 
  * barrier case is generated as release+dmb for the former and
  * acquire+release for the latter.
  */
-#define __XCHG_CASE(w, sz, name, mb, nop_lse, acq, acq_lse, rel, cl)	\
-static inline unsigned long __xchg_case_##name(unsigned long x,		\
-					       volatile void *ptr)	\
-{									\
-	unsigned long ret, tmp;						\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	"	prfm	pstl1strm, %2\n"				\
-	"1:	ld" #acq "xr" #sz "\t%" #w "0, %2\n"			\
-	"	st" #rel "xr" #sz "\t%w1, %" #w "3, %2\n"		\
-	"	cbnz	%w1, 1b\n"					\
-	"	" #mb,							\
-	/* LSE atomics */						\
-	"	swp" #acq_lse #rel #sz "\t%" #w "3, %" #w "0, %2\n"	\
-		__nops(3)						\
-	"	" #nop_lse)						\
-	: "=&r" (ret), "=&r" (tmp), "+Q" (*(unsigned long *)ptr)	\
-	: "r" (x)							\
-	: cl);								\
-									\
-	return ret;							\
+#define __XCHG_CASE(w, sfx, name, sz, mb, nop_lse, acq, acq_lse, rel, cl)	\
+static inline u##sz __xchg_case_##name##sz(u##sz x, volatile void *ptr)		\
+{										\
+	u##sz ret;								\
+	unsigned long tmp;							\
+										\
+	asm volatile(ARM64_LSE_ATOMIC_INSN(					\
+	/* LL/SC */								\
+	"	prfm	pstl1strm, %2\n"					\
+	"1:	ld" #acq "xr" #sfx "\t%" #w "0, %2\n"				\
+	"	st" #rel "xr" #sfx "\t%w1, %" #w "3, %2\n"			\
+	"	cbnz	%w1, 1b\n"						\
+	"	" #mb,								\
+	/* LSE atomics */							\
+	"	swp" #acq_lse #rel #sfx "\t%" #w "3, %" #w "0, %2\n"		\
+		__nops(3)							\
+	"	" #nop_lse)							\
+	: "=&r" (ret), "=&r" (tmp), "+Q" (*(u##sz *)ptr)			\
+	: "r" (x)								\
+	: cl);									\
+										\
+	return ret;								\
 }
 
-__XCHG_CASE(w, b,     1,        ,    ,  ,  ,  ,         )
-__XCHG_CASE(w, h,     2,        ,    ,  ,  ,  ,         )
-__XCHG_CASE(w,  ,     4,        ,    ,  ,  ,  ,         )
-__XCHG_CASE( ,  ,     8,        ,    ,  ,  ,  ,         )
-__XCHG_CASE(w, b, acq_1,        ,    , a, a,  , "memory")
-__XCHG_CASE(w, h, acq_2,        ,    , a, a,  , "memory")
-__XCHG_CASE(w,  , acq_4,        ,    , a, a,  , "memory")
-__XCHG_CASE( ,  , acq_8,        ,    , a, a,  , "memory")
-__XCHG_CASE(w, b, rel_1,        ,    ,  ,  , l, "memory")
-__XCHG_CASE(w, h, rel_2,        ,    ,  ,  , l, "memory")
-__XCHG_CASE(w,  , rel_4,        ,    ,  ,  , l, "memory")
-__XCHG_CASE( ,  , rel_8,        ,    ,  ,  , l, "memory")
-__XCHG_CASE(w, b,  mb_1, dmb ish, nop,  , a, l, "memory")
-__XCHG_CASE(w, h,  mb_2, dmb ish, nop,  , a, l, "memory")
-__XCHG_CASE(w,  ,  mb_4, dmb ish, nop,  , a, l, "memory")
-__XCHG_CASE( ,  ,  mb_8, dmb ish, nop,  , a, l, "memory")
+__XCHG_CASE(w, b,     ,  8,        ,    ,  ,  ,  ,         )
+__XCHG_CASE(w, h,     , 16,        ,    ,  ,  ,  ,         )
+__XCHG_CASE(w,  ,     , 32,        ,    ,  ,  ,  ,         )
+__XCHG_CASE( ,  ,     , 64,        ,    ,  ,  ,  ,         )
+__XCHG_CASE(w, b, acq_,  8,        ,    , a, a,  , "memory")
+__XCHG_CASE(w, h, acq_, 16,        ,    , a, a,  , "memory")
+__XCHG_CASE(w,  , acq_, 32,        ,    , a, a,  , "memory")
+__XCHG_CASE( ,  , acq_, 64,        ,    , a, a,  , "memory")
+__XCHG_CASE(w, b, rel_,  8,        ,    ,  ,  , l, "memory")
+__XCHG_CASE(w, h, rel_, 16,        ,    ,  ,  , l, "memory")
+__XCHG_CASE(w,  , rel_, 32,        ,    ,  ,  , l, "memory")
+__XCHG_CASE( ,  , rel_, 64,        ,    ,  ,  , l, "memory")
+__XCHG_CASE(w, b,  mb_,  8, dmb ish, nop,  , a, l, "memory")
+__XCHG_CASE(w, h,  mb_, 16, dmb ish, nop,  , a, l, "memory")
+__XCHG_CASE(w,  ,  mb_, 32, dmb ish, nop,  , a, l, "memory")
+__XCHG_CASE( ,  ,  mb_, 64, dmb ish, nop,  , a, l, "memory")
 
 #undef __XCHG_CASE
 
@@ -80,13 +80,13 @@  static inline unsigned long __xchg##sfx(unsigned long x,		\
 {									\
 	switch (size) {							\
 	case 1:								\
-		return __xchg_case##sfx##_1(x, ptr);			\
+		return __xchg_case##sfx##_8(x, ptr);			\
 	case 2:								\
-		return __xchg_case##sfx##_2(x, ptr);			\
+		return __xchg_case##sfx##_16(x, ptr);			\
 	case 4:								\
-		return __xchg_case##sfx##_4(x, ptr);			\
+		return __xchg_case##sfx##_32(x, ptr);			\
 	case 8:								\
-		return __xchg_case##sfx##_8(x, ptr);			\
+		return __xchg_case##sfx##_64(x, ptr);			\
 	default:							\
 		BUILD_BUG();						\
 	}								\
@@ -123,13 +123,13 @@  static inline unsigned long __cmpxchg##sfx(volatile void *ptr,		\
 {									\
 	switch (size) {							\
 	case 1:								\
-		return __cmpxchg_case##sfx##_1(ptr, (u8)old, new);	\
+		return __cmpxchg_case##sfx##_8(ptr, (u8)old, new);	\
 	case 2:								\
-		return __cmpxchg_case##sfx##_2(ptr, (u16)old, new);	\
+		return __cmpxchg_case##sfx##_16(ptr, (u16)old, new);	\
 	case 4:								\
-		return __cmpxchg_case##sfx##_4(ptr, old, new);		\
+		return __cmpxchg_case##sfx##_32(ptr, old, new);		\
 	case 8:								\
-		return __cmpxchg_case##sfx##_8(ptr, old, new);		\
+		return __cmpxchg_case##sfx##_64(ptr, old, new);		\
 	default:							\
 		BUILD_BUG();						\
 	}								\
@@ -197,16 +197,16 @@  __CMPXCHG_GEN(_mb)
 	__ret; \
 })
 
-#define __CMPWAIT_CASE(w, sz, name)					\
-static inline void __cmpwait_case_##name(volatile void *ptr,		\
-					 unsigned long val)		\
+#define __CMPWAIT_CASE(w, sfx, sz)					\
+static inline void __cmpwait_case_##sz(volatile void *ptr,		\
+				       unsigned long val)		\
 {									\
 	unsigned long tmp;						\
 									\
 	asm volatile(							\
 	"	sevl\n"							\
 	"	wfe\n"							\
-	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
+	"	ldxr" #sfx "\t%" #w "[tmp], %[v]\n"			\
 	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
 	"	cbnz	%" #w "[tmp], 1f\n"				\
 	"	wfe\n"							\
@@ -215,10 +215,10 @@  static inline void __cmpwait_case_##name(volatile void *ptr,		\
 	: [val] "r" (val));						\
 }
 
-__CMPWAIT_CASE(w, b, 1);
-__CMPWAIT_CASE(w, h, 2);
-__CMPWAIT_CASE(w,  , 4);
-__CMPWAIT_CASE( ,  , 8);
+__CMPWAIT_CASE(w, b, 8);
+__CMPWAIT_CASE(w, h, 16);
+__CMPWAIT_CASE(w,  , 32);
+__CMPWAIT_CASE( ,  , 64);
 
 #undef __CMPWAIT_CASE
 
@@ -229,13 +229,13 @@  static inline void __cmpwait##sfx(volatile void *ptr,			\
 {									\
 	switch (size) {							\
 	case 1:								\
-		return __cmpwait_case##sfx##_1(ptr, (u8)val);		\
+		return __cmpwait_case##sfx##_8(ptr, (u8)val);		\
 	case 2:								\
-		return __cmpwait_case##sfx##_2(ptr, (u16)val);		\
+		return __cmpwait_case##sfx##_16(ptr, (u16)val);		\
 	case 4:								\
-		return __cmpwait_case##sfx##_4(ptr, val);		\
+		return __cmpwait_case##sfx##_32(ptr, val);		\
 	case 8:								\
-		return __cmpwait_case##sfx##_8(ptr, val);		\
+		return __cmpwait_case##sfx##_64(ptr, val);		\
 	default:							\
 		BUILD_BUG();						\
 	}								\