
[RFC,05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()

Message ID 20221219154119.154045458@infradead.org (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Series Introduce cmpxchg128() -- aka. the demise of cmpxchg_double()

Commit Message

Peter Zijlstra Dec. 19, 2022, 3:35 p.m. UTC
For all architectures that currently support cmpxchg_double()
implement the cmpxchg128() family of functions that is basically the
same but with a saner interface.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
 arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
 arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
 arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
 arch/x86/include/asm/cmpxchg_32.h     |    3 +
 arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
 6 files changed, 185 insertions(+), 3 deletions(-)
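
For a sense of the interface change: cmpxchg_double() takes two pointers to adjacent 64-bit words plus their old and new halves as six separate arguments, while the new API operates on a single u128. A hypothetical caller-side sketch follows (struct, field and wrapper names are illustrative, not from this patch; it assumes the series' generic try_cmpxchg128() on top of the arch_ helpers added here):

struct slot {
	union {
		struct {
			void		*obj;
			unsigned long	counter;
		};
		u128	full;
	};
} __attribute__((aligned(16)));	/* cmpxchg16b/casp/cdsg want 16-byte alignment */

static bool slot_update(struct slot *s, struct slot old, struct slot new)
{
	/* Old interface: two pointers plus four 64-bit halves:
	 *   cmpxchg_double(&s->obj, &s->counter,
	 *                  old.obj, old.counter, new.obj, new.counter);
	 */

	/* New interface: one 128-bit compare-and-exchange. */
	return try_cmpxchg128(&s->full, &old.full, new.full);
}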

Comments

Boqun Feng Dec. 19, 2022, 8:07 p.m. UTC | #1
On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
> 
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
>  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
>  
>  #undef __CMPXCHG_DBL
> +
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +#define __CMPXCHG128(name, mb, rel, cl)					\
> +static __always_inline u128						\
> +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +									\
> +	asm volatile("// __cmpxchg128" #name "\n"			\
> +	"	prfm	pstl1strm, %2\n"				\
> +	"1:	ldxp	%0, %1, %2\n"					\
> +	"	eor	%3, %0, %3\n"					\
> +	"	eor	%4, %1, %4\n"					\
> +	"	orr	%3, %4, %3\n"					\
> +	"	cbnz	%3, 2f\n"					\
> +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> +	"	cbnz	%w3, 1b\n"					\
> +	"	" #mb "\n"						\
> +	"2:"								\
> +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\

I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
long *) ptr)"? Because compilers may think only 64bit value pointed by
"ptr" gets modified, and they are allowed to do "useful" optimization.

Same for lse and s390.

Regards,
Boqun

> +	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
> +	: cl);								\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,        ,  ,         )
> +__CMPXCHG128(_mb, dmb ish, l, "memory")
> +
> +#undef __CMPXCHG128
> +
>  #undef K
>  
>  #endif	/* __ASM_ATOMIC_LL_SC_H */
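
As a minimal standalone illustration of the hazard Boqun describes (hypothetical helper, aarch64; Mark demonstrates the same effect with the real cmpxchg_double_local() further down the thread): the asm below only declares an 8-byte hazard, so the compiler is free to assume the upper 8 bytes of the pair are unchanged across the call and reuse a previously loaded copy.

#include <stdint.h>

struct pair {
	uint64_t lo, hi;
};

/* Stand-in for the real asm: declares a read/write hazard on only the
 * first 8 bytes of *p, just like the "+Q" (*(unsigned long *)ptr) operand. */
static inline void narrow_hazard(struct pair *p)
{
	asm volatile("" : "+Q" (*(unsigned long *)p));
}

uint64_t demo(struct pair *p)
{
	uint64_t hi_old = p->hi;

	narrow_hazard(p);		/* the real instruction may rewrite p->hi too */

	return hi_old ^ p->hi;		/* may be constant-folded to 0 */
}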
Peter Zijlstra Dec. 20, 2022, 11:08 a.m. UTC | #2
On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > For all architectures that currently support cmpxchg_double()
> > implement the cmpxchg128() family of functions that is basically the
> > same but with a saner interface.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> >  6 files changed, 185 insertions(+), 3 deletions(-)
> > 
> > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> >  
> >  #undef __CMPXCHG_DBL
> > +
> > +union __u128_halves {
> > +	u128 full;
> > +	struct {
> > +		u64 low, high;
> > +	};
> > +};
> > +
> > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > +static __always_inline u128						\
> > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > +{									\
> > +	union __u128_halves r, o = { .full = (old) },			\
> > +			       n = { .full = (new) };			\
> > +									\
> > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > +	"	prfm	pstl1strm, %2\n"				\
> > +	"1:	ldxp	%0, %1, %2\n"					\
> > +	"	eor	%3, %0, %3\n"					\
> > +	"	eor	%4, %1, %4\n"					\
> > +	"	orr	%3, %4, %3\n"					\
> > +	"	cbnz	%3, 2f\n"					\
> > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > +	"	cbnz	%w3, 1b\n"					\
> > +	"	" #mb "\n"						\
> > +	"2:"								\
> > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> 
> I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> long *) ptr)"? Because compilers may think only 64bit value pointed by
> "ptr" gets modified, and they are allowed to do "useful" optimization.

In this I've copied the existing cmpxchg_double() code; I'll have to let
the arch folks speak here, I've no clue.
Linus Torvalds Dec. 20, 2022, 2:31 p.m. UTC | #3
On Tue, Dec 20, 2022 at 5:09 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> >
> > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > "ptr" gets modified, and they are allowed to do "useful" optimization.
>
> In this I've copied the existing cmpxchg_double() code; I'll have to let
> the arch folks speak here, I've no clue.

It does sound like the right thing to do. I doubt it ends up making a
difference in practice, but yes, the asm doesn't have a memory
clobber, so the input/output types should be the right ones for the
compiler to not possibly do something odd and cache the part that it
doesn't see as being accessed.

              Linus
Peter Zijlstra Dec. 20, 2022, 3:09 p.m. UTC | #4
On Tue, Dec 20, 2022 at 08:31:19AM -0600, Linus Torvalds wrote:
> On Tue, Dec 20, 2022 at 5:09 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > >
> > > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > > "ptr" gets modified, and they are allowed to do "useful" optimization.
> >
> > In this I've copied the existing cmpxchg_double() code; I'll have to let
> > the arch folks speak here, I've no clue.
> 
> It does sound like the right thing to do. I doubt it ends up making a
> difference in practice, but yes, the asm doesn't have a memory
> clobber, so the input/output types should be the right ones for the
> compiler to not possibly do something odd and cache the part that it
> doesn't see as being accessed.

Right, and x86 does just *ptr, without trying to cast away the volatile
even.

I've pushed out a *(u128 *)ptr variant for arm64 and s390, then at least
we'll know if the compiler objects.
Boqun Feng Dec. 22, 2022, 1:25 a.m. UTC | #5
On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
> 
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
>  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
>  
>  #undef __CMPXCHG_DBL
> +
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +#define __CMPXCHG128(name, mb, rel, cl)					\
> +static __always_inline u128						\
> +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +									\
> +	asm volatile("// __cmpxchg128" #name "\n"			\
> +	"	prfm	pstl1strm, %2\n"				\
> +	"1:	ldxp	%0, %1, %2\n"					\
> +	"	eor	%3, %0, %3\n"					\
> +	"	eor	%4, %1, %4\n"					\
> +	"	orr	%3, %4, %3\n"					\
> +	"	cbnz	%3, 2f\n"					\
> +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> +	"	cbnz	%w3, 1b\n"					\
> +	"	" #mb "\n"						\
> +	"2:"								\
> +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> +	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
> +	: cl);								\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,        ,  ,         )
> +__CMPXCHG128(_mb, dmb ish, l, "memory")
> +
> +#undef __CMPXCHG128
> +
>  #undef K
>  
>  #endif	/* __ASM_ATOMIC_LL_SC_H */
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a
>  	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
>  	: [v] "+Q" (v->counter),					\
>  	  [old] "=r" (old)						\
> -	: [i] "r" (i) 							\
> +	: [i] "r" (i)							\
>  	: cl);								\
>  									\
>  	return old;							\
> @@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory")
>  
>  #undef __CMPXCHG_DBL
>  
> +#define __CMPXCHG128(name, mb, cl...)					\
> +static __always_inline u128						\
> +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +	register unsigned long x0 asm ("x0") = o.low;			\
> +	register unsigned long x1 asm ("x1") = o.high;			\
> +	register unsigned long x2 asm ("x2") = n.low;			\
> +	register unsigned long x3 asm ("x3") = n.high;			\
> +	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
> +									\
> +	asm volatile(							\
> +	__LSE_PREAMBLE							\
> +	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
> +	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
> +	  [v] "+Q" (*(unsigned long *)ptr)				\
> +	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\

Issue #1: the line below can be removed, otherwise..

> +	  [oldval1] "r" (r.low), [oldval2] "r" (r.high)			\

warning:

	./arch/arm64/include/asm/atomic_lse.h: In function '__lse__cmpxchg128_mb':
	./arch/arm64/include/asm/atomic_lse.h:309:27: warning: 'r.<U97b8>.low' is used uninitialized [-Wuninitialized]
	  309 |           [oldval1] "r" (r.low), [oldval2] "r" (r.high)


> +	: cl);								\
> +									\
> +	r.low = x0; r.high = x1;					\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,   )
> +__CMPXCHG128(_mb, al, "memory")
> +
> +#undef __CMPXCHG128
> +
>  #endif	/* __ASM_ATOMIC_LSE_H */
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb)
>  
>  #undef __CMPXCHG_DBL
>  
> +#define __CMPXCHG128(name)						\
> +static inline long __cmpxchg128##name(volatile u128 *ptr,		\

Issue #2: this should be

static inline u128 __cmpxchg128##name(..)

because cmpxchg* needs to return the old value.

Regards,
Boqun

> +				      u128 old, u128 new)		\
> +{									\
> +	return __lse_ll_sc_body(_cmpxchg128##name,			\
> +				ptr, old, new);				\
> +}
> +
> +__CMPXCHG128(   )
> +__CMPXCHG128(_mb)
> +
> +#undef __CMPXCHG128
> +
>  #define __CMPXCHG_GEN(sfx)						\
>  static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr,	\
>  					   unsigned long old,		\
> @@ -229,6 +242,19 @@ __CMPXCHG_GEN(_mb)
>  	__ret;									\
>  })
>  
> +/* cmpxchg128 */
> +#define system_has_cmpxchg128()		1
> +
> +#define arch_cmpxchg128(ptr, o, n)						\
> +({										\
> +	__cmpxchg128_mb((ptr), (o), (n));					\
> +})
> +
> +#define arch_cmpxchg128_local(ptr, o, n)					\
> +({										\
> +	__cmpxchg128((ptr), (o), (n));						\
> +})
> +
>  #define __CMPWAIT_CASE(w, sfx, sz)					\
>  static inline void __cmpwait_case_##sz(volatile void *ptr,		\
>  				       unsigned long val)		\
> --- a/arch/s390/include/asm/cmpxchg.h
> +++ b/arch/s390/include/asm/cmpxchg.h
> @@ -201,4 +201,37 @@ static __always_inline int __cmpxchg_dou
>  			 (unsigned long)(n1), (unsigned long)(n2));	\
>  })
>  
> +#define system_has_cmpxchg128()		1
> +
> +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		: [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +	return old;
> +}
> +
> +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
> +{
> +	u128 old = *oldp;
> +	int cc;
> +
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		"	ipm	%[cc]\n"
> +		"	srl	%[cc],28\n"
> +		: [cc] "=&d" (cc), [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +
> +	if (unlikely(!cc))
> +		*oldp = old;
> +
> +	return likely(cc);
> +}
> +
>  #endif /* __ASM_CMPXCHG_H */
> --- a/arch/x86/include/asm/cmpxchg_32.h
> +++ b/arch/x86/include/asm/cmpxchg_32.h
> @@ -103,6 +103,7 @@ static inline bool __try_cmpxchg64(volat
>  
>  #endif
>  
> -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
> +#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX8)
> +#define system_has_cmpxchg64()		boot_cpu_has(X86_FEATURE_CX8)
>  
>  #endif /* _ASM_X86_CMPXCHG_32_H */
> --- a/arch/x86/include/asm/cmpxchg_64.h
> +++ b/arch/x86/include/asm/cmpxchg_64.h
> @@ -20,6 +20,59 @@
>  	arch_try_cmpxchg((ptr), (po), (n));				\
>  })
>  
> -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	union __u128_halves o = { .full = old, }, n = { .full = new, };
> +
> +	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
> +		     : [ptr] "+m" (*ptr),
> +		       "+a" (o.low), "+d" (o.high)
> +		     : "b" (n.low), "c" (n.high)
> +		     : "memory");
> +
> +	return o.full;
> +}
> +
> +static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	union __u128_halves o = { .full = old, }, n = { .full = new, };
> +
> +	asm volatile("cmpxchg16b %[ptr]"
> +		     : [ptr] "+m" (*ptr),
> +		       "+a" (o.low), "+d" (o.high)
> +		     : "b" (n.low), "c" (n.high)
> +		     : "memory");
> +
> +	return o.full;
> +}
> +
> +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *old, u128 new)
> +{
> +	union __u128_halves o = { .full = *old, }, n = { .full = new, };
> +	bool ret;
> +
> +	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
> +		     CC_SET(e)
> +		     : CC_OUT(e) (ret),
> +		       [ptr] "+m" (*ptr),
> +		       "+a" (o.low), "+d" (o.high)
> +		     : "b" (n.low), "c" (n.high)
> +		     : "memory");
> +
> +	if (unlikely(!ret))
> +		*old = o.full;
> +
> +	return likely(ret);
> +}
> +
> +#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX16)
> +#define system_has_cmpxchg128()		boot_cpu_has(X86_FEATURE_CX16)
>  
>  #endif /* _ASM_X86_CMPXCHG_64_H */
> 
>
Peter Zijlstra Dec. 22, 2022, 1:16 p.m. UTC | #6
On Wed, Dec 21, 2022 at 05:25:20PM -0800, Boqun Feng wrote:

> > +#define __CMPXCHG128(name, mb, cl...)					\
> > +static __always_inline u128						\
> > +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
> > +{									\
> > +	union __u128_halves r, o = { .full = (old) },			\
> > +			       n = { .full = (new) };			\
> > +	register unsigned long x0 asm ("x0") = o.low;			\
> > +	register unsigned long x1 asm ("x1") = o.high;			\
> > +	register unsigned long x2 asm ("x2") = n.low;			\
> > +	register unsigned long x3 asm ("x3") = n.high;			\
> > +	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
> > +									\
> > +	asm volatile(							\
> > +	__LSE_PREAMBLE							\
> > +	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
> > +	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
> > +	  [v] "+Q" (*(unsigned long *)ptr)				\
> > +	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
> 
> Issue #1: the line below can be removed, otherwise..
> 
> > +	  [oldval1] "r" (r.low), [oldval2] "r" (r.high)			\
> 
> warning:
> 
> 	./arch/arm64/include/asm/atomic_lse.h: In function '__lse__cmpxchg128_mb':
> 	./arch/arm64/include/asm/atomic_lse.h:309:27: warning: 'r.<U97b8>.low' is used uninitialized [-Wuninitialized]
> 	  309 |           [oldval1] "r" (r.low), [oldval2] "r" (r.high)
> 
> 
> > +	: cl);								\
> > +									\
> > +	r.low = x0; r.high = x1;					\
> > +									\
> > +	return r.full;							\
> > +}
> > +
> > +__CMPXCHG128(   ,   )
> > +__CMPXCHG128(_mb, al, "memory")
> > +
> > +#undef __CMPXCHG128
> > +
> >  #endif	/* __ASM_ATOMIC_LSE_H */
> > --- a/arch/arm64/include/asm/cmpxchg.h
> > +++ b/arch/arm64/include/asm/cmpxchg.h
> > @@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb)
> >  
> >  #undef __CMPXCHG_DBL
> >  
> > +#define __CMPXCHG128(name)						\
> > +static inline long __cmpxchg128##name(volatile u128 *ptr,		\
> 
> Issue #2: this should be
> 
> static inline u128 __cmpxchg128##name(..)
> 
> because cmpxchg* needs to return the old value.

Duh.. fixed both. Pushed out to queue/core/wip-u128. I'll probably
continue all this in two weeks (yay xmas break!).
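
For readers following along, the two fixes boil down to dropping the unused [oldval1]/[oldval2] inputs from the LSE asm and returning u128 from the wrapper; the cmpxchg.h side then reads roughly as follows (a sketch of the obvious correction, not necessarily the exact queue/core/wip-u128 commit):

#define __CMPXCHG128(name)						\
static inline u128 __cmpxchg128##name(volatile u128 *ptr,		\
				      u128 old, u128 new)		\
{									\
	return __lse_ll_sc_body(_cmpxchg128##name,			\
				ptr, old, new);				\
}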
Mark Rutland Jan. 3, 2023, 1:25 p.m. UTC | #7
On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > > For all architectures that currently support cmpxchg_double()
> > > implement the cmpxchg128() family of functions that is basically the
> > > same but with a saner interface.
> > > 
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> > >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> > >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> > >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> > >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> > >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> > >  6 files changed, 185 insertions(+), 3 deletions(-)
> > > 
> > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> > >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> > >  
> > >  #undef __CMPXCHG_DBL
> > > +
> > > +union __u128_halves {
> > > +	u128 full;
> > > +	struct {
> > > +		u64 low, high;
> > > +	};
> > > +};
> > > +
> > > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > > +static __always_inline u128						\
> > > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > > +{									\
> > > +	union __u128_halves r, o = { .full = (old) },			\
> > > +			       n = { .full = (new) };			\
> > > +									\
> > > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > > +	"	prfm	pstl1strm, %2\n"				\
> > > +	"1:	ldxp	%0, %1, %2\n"					\
> > > +	"	eor	%3, %0, %3\n"					\
> > > +	"	eor	%4, %1, %4\n"					\
> > > +	"	orr	%3, %4, %3\n"					\
> > > +	"	cbnz	%3, 2f\n"					\
> > > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > > +	"	cbnz	%w3, 1b\n"					\
> > > +	"	" #mb "\n"						\
> > > +	"2:"								\
> > > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> > 
> > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > "ptr" gets modified, and they are allowed to do "useful" optimization.
> 
> In this I've copied the existing cmpxchg_double() code; I'll have to let
> the arch folks speak here, I've no clue.

We definitely need to ensure the compiler sees we poke the whole thing, or it
can get this horribly wrong, so that is a latent bug.

See commit:

  fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable")

... for examples of GCC being clever, where I overlooked the *_double() cases.

I'll go spin a quick fix for that so that we can have something go to stable
before we rejig the API.

Mark.
Mark Rutland Jan. 3, 2023, 2:03 p.m. UTC | #8
On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> > On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > > On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > > > For all architectures that currently support cmpxchg_double()
> > > > implement the cmpxchg128() family of functions that is basically the
> > > > same but with a saner interface.
> > > > 
> > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > > ---
> > > >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> > > >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> > > >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> > > >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> > > >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> > > >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> > > >  6 files changed, 185 insertions(+), 3 deletions(-)
> > > > 
> > > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> > > >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> > > >  
> > > >  #undef __CMPXCHG_DBL
> > > > +
> > > > +union __u128_halves {
> > > > +	u128 full;
> > > > +	struct {
> > > > +		u64 low, high;
> > > > +	};
> > > > +};
> > > > +
> > > > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > > > +static __always_inline u128						\
> > > > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > > > +{									\
> > > > +	union __u128_halves r, o = { .full = (old) },			\
> > > > +			       n = { .full = (new) };			\
> > > > +									\
> > > > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > > > +	"	prfm	pstl1strm, %2\n"				\
> > > > +	"1:	ldxp	%0, %1, %2\n"					\
> > > > +	"	eor	%3, %0, %3\n"					\
> > > > +	"	eor	%4, %1, %4\n"					\
> > > > +	"	orr	%3, %4, %3\n"					\
> > > > +	"	cbnz	%3, 2f\n"					\
> > > > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > > > +	"	cbnz	%w3, 1b\n"					\
> > > > +	"	" #mb "\n"						\
> > > > +	"2:"								\
> > > > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> > > 
> > > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > > "ptr" gets modified, and they are allowed to do "useful" optimization.
> > 
> > In this I've copied the existing cmpxchg_double() code; I'll have to let
> > the arch folks speak here, I've no clue.
> 
> We definitely need to ensure the compiler sees we poke the whole thing, or it
> can get this horribly wrong, so that is a latent bug.
> 
> See commit:
> 
>   fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable")
> 
> ... for examples of GCC being clever, where I overlooked the *_double() cases.

Ugh; with GCC 12.1.0, arm64 defconfig, and the following:

| struct big {
|         u64 lo, hi;
| } __aligned(128);
| 
| unsigned long foo(struct big *b)
| {
|         u64 hi_old, hi_new;
| 
|         hi_old = b->hi;
| 
|         cmpxchg_double_local(&b->lo, &b->hi, 0x12, 0x34, 0x56, 0x78);
| 
|         hi_new = b->hi;
| 
|         return hi_old ^ hi_new;
| }

GCC clearly figures out the high half isn't modified, and constant folds hi_old
^ hi_new down to zero, regardless of whether we use LL/SC or LSE:

<foo>:
   0:   d503233f        paciasp
   4:   aa0003e4        mov     x4, x0
   8:   1400000e        b       40 <foo+0x40>
   c:   d2800240        mov     x0, #0x12                       // #18
  10:   d2800681        mov     x1, #0x34                       // #52
  14:   aa0003e5        mov     x5, x0
  18:   aa0103e6        mov     x6, x1
  1c:   d2800ac2        mov     x2, #0x56                       // #86
  20:   d2800f03        mov     x3, #0x78                       // #120
  24:   48207c82        casp    x0, x1, x2, x3, [x4]
  28:   ca050000        eor     x0, x0, x5
  2c:   ca060021        eor     x1, x1, x6
  30:   aa010000        orr     x0, x0, x1
  34:   d2800000        mov     x0, #0x0                        // #0    <--- BANG
  38:   d50323bf        autiasp
  3c:   d65f03c0        ret
  40:   d2800240        mov     x0, #0x12                       // #18
  44:   d2800681        mov     x1, #0x34                       // #52
  48:   d2800ac2        mov     x2, #0x56                       // #86
  4c:   d2800f03        mov     x3, #0x78                       // #120
  50:   f9800091        prfm    pstl1strm, [x4]
  54:   c87f1885        ldxp    x5, x6, [x4]
  58:   ca0000a5        eor     x5, x5, x0
  5c:   ca0100c6        eor     x6, x6, x1
  60:   aa0600a6        orr     x6, x5, x6
  64:   b5000066        cbnz    x6, 70 <foo+0x70>
  68:   c8250c82        stxp    w5, x2, x3, [x4]
  6c:   35ffff45        cbnz    w5, 54 <foo+0x54>
  70:   d2800000        mov     x0, #0x0                        // #0     <--- BANG
  74:   d50323bf        autiasp
  78:   d65f03c0        ret
  7c:   d503201f        nop

... so we *definitely* need to fix that.

Using __uint128_t instead, e.g.

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index 0890e4f568fb7..cbb3d961123b1 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -315,7 +315,7 @@ __ll_sc__cmpxchg_double##name(unsigned long old1,                   \
        "       cbnz    %w0, 1b\n"                                      \
        "       " #mb "\n"                                              \
        "2:"                                                            \
-       : "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr)        \
+       : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr)          \
        : "r" (old1), "r" (old2), "r" (new1), "r" (new2)                \
        : cl);                                                          \
                                                                        \
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 52075e93de6c0..a94d6dacc0292 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -311,7 +311,7 @@ __lse__cmpxchg_double##name(unsigned long old1,                             \
        "       eor     %[old2], %[old2], %[oldval2]\n"                 \
        "       orr     %[old1], %[old1], %[old2]"                      \
        : [old1] "+&r" (x0), [old2] "+&r" (x1),                         \
-         [v] "+Q" (*(unsigned long *)ptr)                              \
+         [v] "+Q" (*(__uint128_t *)ptr)                                \
        : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),             \
          [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)              \
        : cl);                                                          \

... makes GCC much happier:

<foo>:
   0:   f9400407        ldr     x7, [x0, #8]
   4:   d503233f        paciasp
   8:   aa0003e4        mov     x4, x0
   c:   1400000f        b       48 <foo+0x48>
  10:   d2800240        mov     x0, #0x12                       // #18
  14:   d2800681        mov     x1, #0x34                       // #52
  18:   aa0003e5        mov     x5, x0
  1c:   aa0103e6        mov     x6, x1
  20:   d2800ac2        mov     x2, #0x56                       // #86
  24:   d2800f03        mov     x3, #0x78                       // #120
  28:   48207c82        casp    x0, x1, x2, x3, [x4]
  2c:   ca050000        eor     x0, x0, x5
  30:   ca060021        eor     x1, x1, x6
  34:   aa010000        orr     x0, x0, x1
  38:   f9400480        ldr     x0, [x4, #8]
  3c:   d50323bf        autiasp
  40:   ca0000e0        eor     x0, x7, x0
  44:   d65f03c0        ret
  48:   d2800240        mov     x0, #0x12                       // #18
  4c:   d2800681        mov     x1, #0x34                       // #52
  50:   d2800ac2        mov     x2, #0x56                       // #86
  54:   d2800f03        mov     x3, #0x78                       // #120
  58:   f9800091        prfm    pstl1strm, [x4]
  5c:   c87f1885        ldxp    x5, x6, [x4]
  60:   ca0000a5        eor     x5, x5, x0
  64:   ca0100c6        eor     x6, x6, x1
  68:   aa0600a6        orr     x6, x5, x6
  6c:   b5000066        cbnz    x6, 78 <foo+0x78>
  70:   c8250c82        stxp    w5, x2, x3, [x4]
  74:   35ffff45        cbnz    w5, 5c <foo+0x5c>
  78:   f9400480        ldr     x0, [x4, #8]
  7c:   d50323bf        autiasp
  80:   ca0000e0        eor     x0, x7, x0
  84:   d65f03c0        ret
  88:   d503201f        nop
  8c:   d503201f        nop

... I'll go check whether clang is happy with that, and how far back that can
go, otherwise we'll need to blat the high half with a separate constraint that
(ideally) doesn't end up allocating a pointless address register.

Mark.
Mark Rutland Jan. 3, 2023, 4:19 p.m. UTC | #9
On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> > > On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > > > On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > > > > For all architectures that currently support cmpxchg_double()
> > > > > implement the cmpxchg128() family of functions that is basically the
> > > > > same but with a saner interface.
> > > > > 
> > > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > > > ---
> > > > >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> > > > >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> > > > >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> > > > >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> > > > >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> > > > >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> > > > >  6 files changed, 185 insertions(+), 3 deletions(-)
> > > > > 
> > > > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> > > > >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> > > > >  
> > > > >  #undef __CMPXCHG_DBL
> > > > > +
> > > > > +union __u128_halves {
> > > > > +	u128 full;
> > > > > +	struct {
> > > > > +		u64 low, high;
> > > > > +	};
> > > > > +};
> > > > > +
> > > > > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > > > > +static __always_inline u128						\
> > > > > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > > > > +{									\
> > > > > +	union __u128_halves r, o = { .full = (old) },			\
> > > > > +			       n = { .full = (new) };			\
> > > > > +									\
> > > > > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > > > > +	"	prfm	pstl1strm, %2\n"				\
> > > > > +	"1:	ldxp	%0, %1, %2\n"					\
> > > > > +	"	eor	%3, %0, %3\n"					\
> > > > > +	"	eor	%4, %1, %4\n"					\
> > > > > +	"	orr	%3, %4, %3\n"					\
> > > > > +	"	cbnz	%3, 2f\n"					\
> > > > > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > > > > +	"	cbnz	%w3, 1b\n"					\
> > > > > +	"	" #mb "\n"						\
> > > > > +	"2:"								\
> > > > > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> > > > 
> > > > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > > > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > > > "ptr" gets modified, and they are allowed to do "useful" optimization.
> > > 
> > > In this I've copied the existing cmpxchg_double() code; I'll have to let
> > > the arch folks speak here, I've no clue.
> > 
> > We definitely need to ensure the compiler sees we poke the whole thing, or it
> > can get this horribly wrong, so that is a latent bug.
> > 
> > See commit:
> > 
> >   fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable")
> > 
> > ... for examples of GCC being clever, where I overlooked the *_double() cases.

> Using __uint128_t instead, e.g.
> 
> diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> index 0890e4f568fb7..cbb3d961123b1 100644
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -315,7 +315,7 @@ __ll_sc__cmpxchg_double##name(unsigned long old1,                   \
>         "       cbnz    %w0, 1b\n"                                      \
>         "       " #mb "\n"                                              \
>         "2:"                                                            \
> -       : "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr)        \
> +       : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr)          \
>         : "r" (old1), "r" (old2), "r" (new1), "r" (new2)                \
>         : cl);                                                          \
>                                                                         \
> diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> index 52075e93de6c0..a94d6dacc0292 100644
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -311,7 +311,7 @@ __lse__cmpxchg_double##name(unsigned long old1,                             \
>         "       eor     %[old2], %[old2], %[oldval2]\n"                 \
>         "       orr     %[old1], %[old1], %[old2]"                      \
>         : [old1] "+&r" (x0), [old2] "+&r" (x1),                         \
> -         [v] "+Q" (*(unsigned long *)ptr)                              \
> +         [v] "+Q" (*(__uint128_t *)ptr)                                \
>         : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),             \
>           [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)              \
>         : cl);                                                          \
> 
> ... makes GCC much happier:

> ... I'll go check whether clang is happy with that, and how far back that can
> go, otherwise we'll need to blat the high half with a separate constraint that
> (ideally) doesn't end up allocating a pointless address register.

Hmm... from the commit history it looks like GCC prior to 5.1 might not be
happy with that, but that *might* just be if we actually do arithmetic on the
value, and we might be ok just using it for memory effects. I can't currently
get such an old GCC to run on my machines so I haven't been able to check.

I'll dig into this a bit more tomorrow, but it looks like the options (for a
backport-suitable fix) will be:

(a) use a __uint128_t input+output, as above, if we're lucky

(b) introduce a second 64-bit input+output for the high half (likely a "+o")

(c) use a full memory clobber for ancient compilers.

Mark.
Arnd Bergmann Jan. 3, 2023, 4:50 p.m. UTC | #10
On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote:
> On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
>> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
>> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:

>> ... makes GCC much happier:
>
>> ... I'll go check whether clang is happy with that, and how far back that can
>> go, otherwise we'll need to blat the high half with a separate constraint that
>> (ideally) doesn't end up allocating a pointless address register.
>
> Hmm... from the commit history it looks like GCC prior to 5.1 might not be
> happy with that, but that *might* just be if we actually do arithmetic on the
> value, and we might be ok just using it for memory effects. I can't currently
> get such an old GCC to run on my machines so I haven't been able to check.

gcc-5.1 is the oldest (barely) supported compiler, the minimum was
last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are
affected, we're good on mainline but may still want a fix for stable
kernels.

I checked that the cross-compiler binaries from [1] still work, but I noticed
that this version is missing the native aarch64-to-aarch64 compiler (x86 to
aarch64 and vice versa are there), and you need to install libmpfr4 [2]
as a dependency. The newer compilers (6.5.0 and up) don't have these problems.

     Arnd

[1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/arm64/5.5.0/
[2] http://ftp.uk.debian.org/debian/pool/main/m/mpfr4/libmpfr4_3.1.5-1_arm64.deb
Heiko Carstens Jan. 3, 2023, 5:12 p.m. UTC | #11
On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
...
> --- a/arch/s390/include/asm/cmpxchg.h
> +++ b/arch/s390/include/asm/cmpxchg.h
> @@ -201,4 +201,37 @@ static __always_inline int __cmpxchg_dou
>  			 (unsigned long)(n1), (unsigned long)(n2));	\
>  })
>  
> +#define system_has_cmpxchg128()		1
> +
> +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		: [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +	return old;
> +}
> +
> +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
> +{
> +	u128 old = *oldp;
> +	int cc;
> +
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		"	ipm	%[cc]\n"
> +		"	srl	%[cc],28\n"
> +		: [cc] "=&d" (cc), [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +
> +	if (unlikely(!cc))
> +		*oldp = old;
> +
> +	return likely(cc);
> +}
> +

I was wondering why arch_try_cmpxchg128() isn't even used by later
code. It turned out this is because of a missing

#define arch_try_cmpxchg128 arch_try_cmpxchg128

which in turn means that the generic fallback variant is used.

The above arch_try_cmpxchg128() implementation is broken, since it has
inverted condition code handling (cc == 0 means compare and swap
succeeded, cc == 1 means it failed).

However I would prefer to use the generic fallback variant anyway.
Could you please merge the below into your current patch?

It also addresses the oddity that *ptr within arch_cmpxchg128() is
only specified as an input, while it should be input/output; it doesn't
matter due to the memory clobber, but let's have that correct anyway.

diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index 527c968945e8..0b98f57bbe9e 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -173,31 +173,12 @@ static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 n
 {
 	asm volatile(
 		"	cdsg	%[old],%[new],%[ptr]\n"
-		: [old] "+&d" (old)
-		: [new] "d" (new),
-		  [ptr] "QS" (*(u128 *)ptr)
+		: [old] "+d" (old), [ptr] "+QS" (*ptr)
+		: [new] "d" (new)
 		: "memory", "cc");
 	return old;
 }
 
-static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
-{
-	u128 old = *oldp;
-	int cc;
-
-	asm volatile(
-		"	cdsg	%[old],%[new],%[ptr]\n"
-		"	ipm	%[cc]\n"
-		"	srl	%[cc],28\n"
-		: [cc] "=&d" (cc), [old] "+&d" (old)
-		: [new] "d" (new),
-		  [ptr] "QS" (*(u128 *)ptr)
-		: "memory", "cc");
-
-	if (unlikely(!cc))
-		*oldp = old;
-
-	return likely(cc);
-}
+#define arch_cmpxchg128		arch_cmpxchg128
 
 #endif /* __ASM_CMPXCHG_H */
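
For reference, the generic fallback Heiko prefers builds try_cmpxchg128() out of arch_cmpxchg128() in the same way as the other atomic fallbacks; the generated template is roughly of this shape (a sketch of the usual pattern, not the exact generated header):

#ifndef arch_try_cmpxchg128
#define arch_try_cmpxchg128(_ptr, _oldp, _new)				\
({									\
	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r;		\
	___r = arch_cmpxchg128((_ptr), ___o, (_new));			\
	if (unlikely(___r != ___o))					\
		*___op = ___r;						\
	likely(___r == ___o);						\
})
#endif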
Mark Rutland Jan. 4, 2023, 11:36 a.m. UTC | #12
On Tue, Jan 03, 2023 at 05:50:00PM +0100, Arnd Bergmann wrote:
> On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote:
> > On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
> >> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> >> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> 
> >> ... makes GCC much happier:
> >
> >> ... I'll go check whether clang is happy with that, and how far back that can
> >> go, otherwise we'll need to blat the high half with a separate constraint that
> >> (ideally) doesn't end up allocating a pointless address register.
> >
> > Hmm... from the commit history it looks like GCC prior to 5.1 might not be
> > happy with that, but that *might* just be if we actually do arithmetic on the
> > value, and we might be ok just using it for memory effects. I can't currently
> > get such an old GCC to run on my machines so I haven't been able to check.
> 
> gcc-5.1 is the oldest (barely) supported compiler, the minimum was
> last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are
> affected, we're good on mainline but may still want a fix for stable
> kernels.

Yup; I just wanted something that would easily backport to stable, at least as
far as linux-4.9.y (where I couldn't find the minimum GCC version when I looked
yesterday).

> I checked that the cross-compiler binaries from [1] still work, but I noticed
> that this version is missing the native aarch64-to-aarch64 compiler (x86 to
> aarch64 and vice versa are there), and you need to install libmpfr4 [2]
> as a dependency. The newer compilers (6.5.0 and up) don't have these problems.

I was trying the old kernel.org crosstool binaries, but I was either missing a
library (or I have an incompatible version) on my x86_64 host. I'll have
another look today -- thanks for the pointers!

Mark.

>      Arnd
> 
> [1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/arm64/5.5.0/
> [2] http://ftp.uk.debian.org/debian/pool/main/m/mpfr4/libmpfr4_3.1.5-1_arm64.deb
Mark Rutland Jan. 4, 2023, 1:55 p.m. UTC | #13
On Wed, Jan 04, 2023 at 11:36:18AM +0000, Mark Rutland wrote:
> On Tue, Jan 03, 2023 at 05:50:00PM +0100, Arnd Bergmann wrote:
> > On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote:
> > > On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
> > >> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> > >> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> > 
> > >> ... makes GCC much happier:
> > >
> > >> ... I'll go check whether clang is happy with that, and how far back that can
> > >> go, otherwise we'll need to blat the high half with a separate constraint that
> > >> (ideally) doesn't end up allocating a pointless address register.
> > >
> > > Hmm... from the commit history it looks like GCC prior to 5.1 might not be
> > > happy with that, but that *might* just be if we actually do arithmetic on the
> > > value, and we might be ok just using it for memory effects. I can't currently
> > > get such an old GCC to run on my machines so I haven't been able to check.
> > 
> > gcc-5.1 is the oldest (barely) supported compiler, the minimum was
> > last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are
> > affected, we're good on mainline but may still want a fix for stable
> > kernels.
> 
> Yup; I just wanted something that would easily backport to stable, at least as
> far as linux-4.9.y (where I couldn't find the minimum GCC version when I looked
> yesterday).

I'd missed that we backported commit:

  dca5244d2f5b94f1 ("compiler.h: Raise minimum version of GCC to 5.1 for arm64")

... all the way back to v4.4.y, so we can assume v5.1 even in stable.

The earliest toolchain I could get running was GCC 4.8.5, and that was happy
with the __uint128_t cast for the asm.

Looking back through the history, the reason for the GCC 5.1 check was that
prior to GCC 5.1 GCC would output library calls for arithmetic on 128-bit
types, as noted in commit:

  fb8722735f50cd51 ("arm64: support __int128 on gcc 5+")

... but since we're not doing any actual manipulation of the value, that should
be fine.
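
To make the distinction concrete (a standalone aarch64 sketch; both functions are made up for illustration): it is arithmetic on the 128-bit type that old GCC may lower to libgcc calls, whereas using the type only as an asm memory operand generates no 128-bit arithmetic at all and merely widens the hazard the compiler sees.

#include <stdint.h>

/* Real arithmetic: pre-5.1 GCC may emit a libgcc call here (e.g. __multi3),
 * which the kernel does not provide. */
__uint128_t mul128(__uint128_t a, __uint128_t b)
{
	return a * b;
}

/* Memory operand only: no 128-bit arithmetic is generated; the cast just
 * tells the compiler all 16 bytes behind the pointer may be read/written. */
static inline void hazard128(__uint128_t *p)
{
	asm volatile("" : "+Q" (*p));
}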

I'll go write a commit message and send that out as a fix.

> > I checked that the cross-compiler binaries from [1] still work, but I noticed
> > that this version is missing the native aarch64-to-aarch64 compiler (x86 to
> > aarch64 and vice versa are there), and you need to install libmpfr4 [2]
> > as a dependency. The newer compilers (6.5.0 and up) don't have these problems.
> 
> I was trying the old kernel.org crosstool binaries, but I was either missing a
> library (or I have an incompatible version) on my x86_64 host. I'll have
> another look today -- thanks for the pointers!

It turns out I'd just missed that at some point the prefix used by the
kernel.org cross compilers changed from:

  aarch64-linux-gnu-

to:

  aarch64-linux-

... and I'd become so used to the latter that I was trying to invoke a binary
that didn't exist. With the older prefix I could use the kernel.org GCC 4.8.5
without issue.

Thanks,
Mark.
Mark Rutland Jan. 9, 2023, 6:50 p.m. UTC | #14
Hi Peter,

On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

I tried giving this a go, specifically your queue core/wip-u128 branch with
HEAD commit c05419246aa69cd3, but it locked up at boot.

I spotted a couple of problems there which also apply here, noted below with
suggested fixes.

> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
> 
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
>  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
>  
>  #undef __CMPXCHG_DBL
> +
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +#define __CMPXCHG128(name, mb, rel, cl)					\
> +static __always_inline u128						\
> +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +									\
> +	asm volatile("// __cmpxchg128" #name "\n"			\
> +	"	prfm	pstl1strm, %2\n"				\
> +	"1:	ldxp	%0, %1, %2\n"					\
> +	"	eor	%3, %0, %3\n"					\
> +	"	eor	%4, %1, %4\n"					\
> +	"	orr	%3, %4, %3\n"					\
> +	"	cbnz	%3, 2f\n"					\

These lines clobber %3 and %4, but per below, those are input operands, and so this blows up.

> +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> +	"	cbnz	%w3, 1b\n"					\
> +	"	" #mb "\n"						\
> +	"2:"								\
> +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> +	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
> +	: cl);								\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,        ,  ,         )
> +__CMPXCHG128(_mb, dmb ish, l, "memory")
> +
> +#undef __CMPXCHG128

I think we can do this simpler and more clearly if we use the u128 operand
directly, with the 'H' modifier to get at the high register of the pair:

| #define __CMPXCHG128(name, mb, rel, cl...)                              \
| static __always_inline u128                                             \
| __ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)       \
| {                                                                       \
|         u128 ret;                                                       \
|         unsigned int tmp;                                               \
|                                                                         \
|         asm volatile("// __cmpxchg128" #name "\n"                       \
|         "       prfm    pstl1strm, %[v]\n"                              \
|         "1:     ldxp    %[ret], %H[ret], %[v]\n"                        \
|         "       cmp     %[ret], %[old]\n"                               \
|         "       ccmp    %H[ret], %H[old], #4, ne\n"                     \
|         "       b.ne    2f\n"                                           \
|         "       st" #rel "xp %w[tmp], %[new], %H[new], %[v]\n"          \
|         "       cbnz    %w[tmp], 1b\n"                                  \
|         "       " #mb "\n"                                              \
|         "2:"                                                            \
|         : [ret] "=&r" (ret),                                            \
|           [tmp] "=&r" (tmp),                                            \
|           [v] "+Q" (*ptr)                                               \
|         : [old] "r" (old),                                              \
|           [new] "r" (new)                                               \
|         : "cc" , ##cl);                                                 \
|                                                                         \
|         return ret;                                                     \
| }
| 
| __CMPXCHG128(   ,        ,  )
| __CMPXCHG128(_mb, dmb ish, l, "memory")
| 
| #undef __CMPXCHG128

Note: I've used CMP and CCMP to simplify the equality check, which clobbers the
flags/condition-codes ("cc"), but requires two fewer GPRs. I'm assuming that's
the better tradeoff here.

The existing cmpxchg_double() code clobbers the loaded value as part of
checking whether it was equal, but to preserve the value and still be able
to replay the loop (which for hilarious LL/SC reasons *must* be in asm), we
can't do the same here.

I've boot-tested the suggestion with GCC 12.1.0.

> +
>  #undef K
>  
>  #endif	/* __ASM_ATOMIC_LL_SC_H */
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a
>  	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
>  	: [v] "+Q" (v->counter),					\
>  	  [old] "=r" (old)						\
> -	: [i] "r" (i) 							\
> +	: [i] "r" (i)							\
>  	: cl);								\
>  									\
>  	return old;							\

Spurious whitespace change?

> @@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory")
>  
>  #undef __CMPXCHG_DBL
>  
> +#define __CMPXCHG128(name, mb, cl...)					\
> +static __always_inline u128						\
> +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +	register unsigned long x0 asm ("x0") = o.low;			\
> +	register unsigned long x1 asm ("x1") = o.high;			\
> +	register unsigned long x2 asm ("x2") = n.low;			\
> +	register unsigned long x3 asm ("x3") = n.high;			\
> +	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
> +									\
> +	asm volatile(							\
> +	__LSE_PREAMBLE							\
> +	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
> +	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
> +	  [v] "+Q" (*(unsigned long *)ptr)				\
> +	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
> +	  [oldval1] "r" (r.low), [oldval2] "r" (r.high)			\
> +	: cl);								\
> +									\
> +	r.low = x0; r.high = x1;					\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,   )
> +__CMPXCHG128(_mb, al, "memory")
> +
> +#undef __CMPXCHG128

Similarly, I'd suggest:

| #define __CMPXCHG128(name, mb, cl...)                                   \
| static __always_inline u128                                             \
| __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)         \
| {                                                                       \
|         asm volatile(                                                   \
|         __LSE_PREAMBLE                                                  \
|         "       casp" #mb "\t%[old], %H[old], %[new], %H[new], %[v]\n"  \
|         : [old] "+&r" (old),                                            \
|           [v] "+Q" (*(u128 *)ptr)                                       \
|         : [new] "r" (new)                                               \
|         : cl);                                                          \
|                                                                         \
|         return old;                                                     \
| }
| 
| __CMPXCHG128(   ,   )   
| __CMPXCHG128(_mb, al, "memory")
| 
| #undef __CMPXCHG128

Thanks,
Mark.
Peter Zijlstra Jan. 12, 2023, 10:35 a.m. UTC | #15
On Mon, Jan 09, 2023 at 06:50:24PM +0000, Mark Rutland wrote:

> Similarly, I'd suggest:
> 
> | #define __CMPXCHG128(name, mb, cl...)                                   \
> | static __always_inline u128                                             \
> | __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)         \
> | {                                                                       \
> |         asm volatile(                                                   \
> |         __LSE_PREAMBLE                                                  \
> |         "       casp" #mb "\t%[old], %H[old], %[new], %H[new], %[v]\n"  \
> |         : [old] "+&r" (old),                                            \
> |           [v] "+Q" (*(u128 *)ptr)                                       \
> |         : [new] "r" (new)                                               \
> |         : cl);                                                          \
> |                                                                         \
> |         return old;                                                     \
> | }
> | 
> | __CMPXCHG128(   ,   )   
> | __CMPXCHG128(_mb, al, "memory")
> | 
> | #undef __CMPXCHG128

clang-16 seems to hate on this like:

arch/arm64/include/asm/atomic_lse.h:342:1: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
arch/arm64/include/asm/atomic_lse.h:334:17: note: expanded from macro '__CMPXCHG128'
	: [old] "+&r" (old),                                            \
		       ^

(much the same for the ll_sc version; if you want the full build thing,
holler and I'll bounce you the robot mail).

Patch

--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -326,6 +326,44 @@  __CMPXCHG_DBL(   ,        ,  ,         )
 __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
 
 #undef __CMPXCHG_DBL
+
+union __u128_halves {
+	u128 full;
+	struct {
+		u64 low, high;
+	};
+};
+
+#define __CMPXCHG128(name, mb, rel, cl)					\
+static __always_inline u128						\
+__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
+{									\
+	union __u128_halves r, o = { .full = (old) },			\
+			       n = { .full = (new) };			\
+									\
+	asm volatile("// __cmpxchg128" #name "\n"			\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ldxp	%0, %1, %2\n"					\
+	"	eor	%3, %0, %3\n"					\
+	"	eor	%4, %1, %4\n"					\
+	"	orr	%3, %4, %3\n"					\
+	"	cbnz	%3, 2f\n"					\
+	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
+	"	cbnz	%w3, 1b\n"					\
+	"	" #mb "\n"						\
+	"2:"								\
+	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
+	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
+	: cl);								\
+									\
+	return r.full;							\
+}
+
+__CMPXCHG128(   ,        ,  ,         )
+__CMPXCHG128(_mb, dmb ish, l, "memory")
+
+#undef __CMPXCHG128
+
 #undef K
 
 #endif	/* __ASM_ATOMIC_LL_SC_H */
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -151,7 +151,7 @@  __lse_atomic64_fetch_##op##name(s64 i, a
 	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
 	: [v] "+Q" (v->counter),					\
 	  [old] "=r" (old)						\
-	: [i] "r" (i) 							\
+	: [i] "r" (i)							\
 	: cl);								\
 									\
 	return old;							\
@@ -324,4 +324,35 @@  __CMPXCHG_DBL(_mb, al, "memory")
 
 #undef __CMPXCHG_DBL
 
+#define __CMPXCHG128(name, mb, cl...)					\
+static __always_inline u128						\
+__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
+{									\
+	union __u128_halves r, o = { .full = (old) },			\
+			       n = { .full = (new) };			\
+	register unsigned long x0 asm ("x0") = o.low;			\
+	register unsigned long x1 asm ("x1") = o.high;			\
+	register unsigned long x2 asm ("x2") = n.low;			\
+	register unsigned long x3 asm ("x3") = n.high;			\
+	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
+									\
+	asm volatile(							\
+	__LSE_PREAMBLE							\
+	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
+	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
+	  [v] "+Q" (*(unsigned long *)ptr)				\
+	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
+	  [oldval1] "r" (r.low), [oldval2] "r" (r.high)			\
+	: cl);								\
+									\
+	r.low = x0; r.high = x1;					\
+									\
+	return r.full;							\
+}
+
+__CMPXCHG128(   ,   )
+__CMPXCHG128(_mb, al, "memory")
+
+#undef __CMPXCHG128
+
 #endif	/* __ASM_ATOMIC_LSE_H */
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -147,6 +147,19 @@  __CMPXCHG_DBL(_mb)
 
 #undef __CMPXCHG_DBL
 
+#define __CMPXCHG128(name)						\
+static inline long __cmpxchg128##name(volatile u128 *ptr,		\
+				      u128 old, u128 new)		\
+{									\
+	return __lse_ll_sc_body(_cmpxchg128##name,			\
+				ptr, old, new);				\
+}
+
+__CMPXCHG128(   )
+__CMPXCHG128(_mb)
+
+#undef __CMPXCHG128
+
 #define __CMPXCHG_GEN(sfx)						\
 static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr,	\
 					   unsigned long old,		\
@@ -229,6 +242,19 @@  __CMPXCHG_GEN(_mb)
 	__ret;									\
 })
 
+/* cmpxchg128 */
+#define system_has_cmpxchg128()		1
+
+#define arch_cmpxchg128(ptr, o, n)						\
+({										\
+	__cmpxchg128_mb((ptr), (o), (n));					\
+})
+
+#define arch_cmpxchg128_local(ptr, o, n)					\
+({										\
+	__cmpxchg128((ptr), (o), (n));						\
+})
+
 #define __CMPWAIT_CASE(w, sfx, sz)					\
 static inline void __cmpwait_case_##sz(volatile void *ptr,		\
 				       unsigned long val)		\
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -201,4 +201,37 @@  static __always_inline int __cmpxchg_dou
 			 (unsigned long)(n1), (unsigned long)(n2));	\
 })
 
+#define system_has_cmpxchg128()		1
+
+static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
+{
+	asm volatile(
+		"	cdsg	%[old],%[new],%[ptr]\n"
+		: [old] "+&d" (old)
+		: [new] "d" (new),
+		  [ptr] "QS" (*(unsigned long *)ptr)
+		: "memory", "cc");
+	return old;
+}
+
+static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
+{
+	u128 old = *oldp;
+	int cc;
+
+	asm volatile(
+		"	cdsg	%[old],%[new],%[ptr]\n"
+		"	ipm	%[cc]\n"
+		"	srl	%[cc],28\n"
+		: [cc] "=&d" (cc), [old] "+&d" (old)
+		: [new] "d" (new),
+		  [ptr] "QS" (*(unsigned long *)ptr)
+		: "memory", "cc");
+
+	if (unlikely(!cc))
+		*oldp = old;
+
+	return likely(cc);
+}
+
 #endif /* __ASM_CMPXCHG_H */
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -103,6 +103,7 @@  static inline bool __try_cmpxchg64(volat
 
 #endif
 
-#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
+#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX8)
+#define system_has_cmpxchg64()		boot_cpu_has(X86_FEATURE_CX8)
 
 #endif /* _ASM_X86_CMPXCHG_32_H */
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,6 +20,59 @@ 
 	arch_try_cmpxchg((ptr), (po), (n));				\
 })
 
-#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
+union __u128_halves {
+	u128 full;
+	struct {
+		u64 low, high;
+	};
+};
+
+static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
+{
+	union __u128_halves o = { .full = old, }, n = { .full = new, };
+
+	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
+		     : [ptr] "+m" (*ptr),
+		       "+a" (o.low), "+d" (o.high)
+		     : "b" (n.low), "c" (n.high)
+		     : "memory");
+
+	return o.full;
+}
+
+static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new)
+{
+	union __u128_halves o = { .full = old, }, n = { .full = new, };
+
+	asm volatile("cmpxchg16b %[ptr]"
+		     : [ptr] "+m" (*ptr),
+		       "+a" (o.low), "+d" (o.high)
+		     : "b" (n.low), "c" (n.high)
+		     : "memory");
+
+	return o.full;
+}
+
+static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *old, u128 new)
+{
+	union __u128_halves o = { .full = *old, }, n = { .full = new, };
+	bool ret;
+
+	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
+		     CC_SET(e)
+		     : CC_OUT(e) (ret),
+		       [ptr] "+m" (*ptr),
+		       "+a" (o.low), "+d" (o.high)
+		     : "b" (n.low), "c" (n.high)
+		     : "memory");
+
+	if (unlikely(!ret))
+		*old = o.full;
+
+	return likely(ret);
+}
+
+#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX16)
+#define system_has_cmpxchg128()		boot_cpu_has(X86_FEATURE_CX16)
 
 #endif /* _ASM_X86_CMPXCHG_64_H */
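
As a closing illustration of how the try_ variant is meant to be used (a hypothetical caller, not part of the patch): the bool return plus the write-back of the observed value make the usual retry loop cheap, since a failed attempt has already fetched the current contents.

/* Hypothetical: atomically replace the low half of a 16-byte slot while
 * preserving whatever is currently in the high half. */
static void set_low_half(u128 *slot, u64 new_low)
{
	union __u128_halves old, new;

	/* A plain (possibly torn) read is fine to seed the loop; a stale
	 * value only costs one extra iteration. */
	old.full = *slot;
	do {
		new.high = old.high;
		new.low  = new_low;
	} while (!arch_try_cmpxchg128(slot, &old.full, new.full));
}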